From 795607a3529bcdeea14c8ce51cf35793dd986128 Mon Sep 17 00:00:00 2001
From: Max Horn <max@quendi.de>
Date: Sat, 19 Feb 2022 16:41:17 +0100
Subject: [PATCH 01/13] Fix get/set_fpcr_aarch64 (#44256)

On Aarch64, the `fpcr` register is 64bit wide, although the top 32bit
are currently unused and reserved for future usage. Nevertheless, we
should save and restore the full 64 bit, not just 32 bit. This also
silences a compiler warning about this. Reference:
<https://developer.arm.com/documentation/ddi0595/2021-06/AArch64-Registers/FPCR--Floating-point-Control-Register>

(cherry picked from commit 5bd0545fb05b1930f38bdd0ee93c3603b89cbc5f)
---
 src/processor_arm.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp
index f5cc2a42a4870..ea8dddf629d62 100644
--- a/src/processor_arm.cpp
+++ b/src/processor_arm.cpp
@@ -1843,20 +1843,20 @@ extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature)
 
 #ifdef _CPU_AARCH64_
 // FPCR FZ, bit [24]
-static constexpr uint32_t fpcr_fz_mask = 1 << 24;
+static constexpr uint64_t fpcr_fz_mask = 1 << 24;
 // FPCR FZ16, bit [19]
-static constexpr uint32_t fpcr_fz16_mask = 1 << 19;
+static constexpr uint64_t fpcr_fz16_mask = 1 << 19;
 // FPCR DN, bit [25]
-static constexpr uint32_t fpcr_dn_mask = 1 << 25;
+static constexpr uint64_t fpcr_dn_mask = 1 << 25;
 
-static inline uint32_t get_fpcr_aarch64(void)
+static inline uint64_t get_fpcr_aarch64(void)
 {
-    uint32_t fpcr;
+    uint64_t fpcr;
     asm volatile("mrs %0, fpcr" : "=r"(fpcr));
     return fpcr;
 }
 
-static inline void set_fpcr_aarch64(uint32_t fpcr)
+static inline void set_fpcr_aarch64(uint64_t fpcr)
 {
     asm volatile("msr fpcr, %0" :: "r"(fpcr));
 }
@@ -1868,8 +1868,8 @@ extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void)
 
 extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero)
 {
-    uint32_t fpcr = get_fpcr_aarch64();
-    static uint32_t mask = fpcr_fz_mask | (jl_test_cpu_feature(JL_AArch64_fullfp16) ? fpcr_fz16_mask : 0);
+    uint64_t fpcr = get_fpcr_aarch64();
+    static uint64_t mask = fpcr_fz_mask | (jl_test_cpu_feature(JL_AArch64_fullfp16) ? fpcr_fz16_mask : 0);
     fpcr = isZero ? (fpcr | mask) : (fpcr & ~mask);
     set_fpcr_aarch64(fpcr);
     return 0;
@@ -1882,7 +1882,7 @@ extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void)
 
 extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault)
 {
-    uint32_t fpcr = get_fpcr_aarch64();
+    uint64_t fpcr = get_fpcr_aarch64();
     fpcr = isDefault ? (fpcr | fpcr_dn_mask) : (fpcr & ~fpcr_dn_mask);
     set_fpcr_aarch64(fpcr);
     return 0;

From 01a01ef063c6d6e7d81cd663e85cc4af034585f1 Mon Sep 17 00:00:00 2001
From: Denis Barucic <barucden@fel.cvut.cz>
Date: Mon, 21 Mar 2022 13:06:29 +0100
Subject: [PATCH 02/13] MPFR: Fix `round(Integer, big(Inf))` (#44676)

It also fixes `round(Integer, big(NaN))`.

Solves #44662

(cherry picked from commit ecf3558c94898ddd4272b319d3405cf7256c6db7)
---
 base/mpfr.jl | 9 ++++++++-
 test/mpfr.jl | 4 ++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/base/mpfr.jl b/base/mpfr.jl
index e85f281619ac0..60f59cdb0af7e 100644
--- a/base/mpfr.jl
+++ b/base/mpfr.jl
@@ -294,7 +294,14 @@ function round(::Type{T}, x::BigFloat, r::Union{RoundingMode, MPFRRoundingMode})
     end
     return unsafe_trunc(T, res)
 end
-round(::Type{BigInt}, x::BigFloat, r::Union{RoundingMode, MPFRRoundingMode}) = _unchecked_cast(BigInt, x, r)
+
+function round(::Type{BigInt}, x::BigFloat, r::Union{RoundingMode, MPFRRoundingMode})
+    clear_flags()
+    res = _unchecked_cast(BigInt, x, r)
+    had_range_exception() && throw(InexactError(:round, BigInt, x))
+    return res
+end
+
 round(::Type{T}, x::BigFloat, r::RoundingMode) where T<:Union{Signed, Unsigned} =
     invoke(round, Tuple{Type{<:Union{Signed, Unsigned}}, BigFloat, Union{RoundingMode, MPFRRoundingMode}}, T, x, r)
 round(::Type{BigInt}, x::BigFloat, r::RoundingMode) =
diff --git a/test/mpfr.jl b/test/mpfr.jl
index a1039a7c5a810..1a0a0041bf94e 100644
--- a/test/mpfr.jl
+++ b/test/mpfr.jl
@@ -653,6 +653,10 @@ end
     @test typeof(round(Int64, x)) == Int64 && round(Int64, x) == 42
     @test typeof(round(Int, x)) == Int && round(Int, x) == 42
     @test typeof(round(UInt, x)) == UInt && round(UInt, x) == 0x2a
+
+    # Issue #44662
+    @test_throws InexactError round(Integer, big(Inf))
+    @test_throws InexactError round(Integer, big(NaN))
 end
 @testset "string representation" begin
     str = "1.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000012"

From c5ac53c1498a5ad83892235b8ab6ee39fecfbab3 Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Tue, 22 Mar 2022 10:03:13 -0400
Subject: [PATCH 03/13] Add system info to start of testsuite. Profile: don't
 spawn profile listener on windows (#44639)

(cherry picked from commit 95da0d81729a1cc414b1e6a06601f8152861af6f)
---
 stdlib/Profile/src/Profile.jl   |  9 ++++++---
 stdlib/Profile/test/runtests.jl | 10 ++++++++--
 test/runtests.jl                |  9 +++++++++
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/stdlib/Profile/src/Profile.jl b/stdlib/Profile/src/Profile.jl
index b11dfb488c373..d3d5300c87527 100644
--- a/stdlib/Profile/src/Profile.jl
+++ b/stdlib/Profile/src/Profile.jl
@@ -140,9 +140,12 @@ function __init__()
         delay = 0.001
     end
     init(n, delay, limitwarn = false)
-    PROFILE_PRINT_COND[] = Base.AsyncCondition()
-    ccall(:jl_set_peek_cond, Cvoid, (Ptr{Cvoid},), PROFILE_PRINT_COND[].handle)
-    errormonitor(Threads.@spawn(profile_printing_listener()))
+    @static if !Sys.iswindows()
+        # triggering a profile via signals is not implemented on windows
+        PROFILE_PRINT_COND[] = Base.AsyncCondition()
+        ccall(:jl_set_peek_cond, Cvoid, (Ptr{Cvoid},), PROFILE_PRINT_COND[].handle)
+        errormonitor(Threads.@spawn(profile_printing_listener()))
+    end
 end
 
 """
diff --git a/stdlib/Profile/test/runtests.jl b/stdlib/Profile/test/runtests.jl
index c6f78cbed0522..f8f8c52b93123 100644
--- a/stdlib/Profile/test/runtests.jl
+++ b/stdlib/Profile/test/runtests.jl
@@ -170,7 +170,11 @@ let cmd = Base.julia_cmd()
     script = """
         using Profile
         f(::Val) = GC.safepoint()
-        @profile for i = 1:10^3; f(Val(i)); end
+        @profile for i = 1:10^3
+            println(i)
+            f(Val(i))
+        end
+        println("done")
         print(Profile.len_data())
         """
     p = open(`$cmd -e $script`)
@@ -184,7 +188,9 @@ let cmd = Base.julia_cmd()
     s = read(p, String)
     close(t)
     @test success(p)
-    @test parse(Int, s) > 100
+    @test !isempty(s)
+    @test occursin("done", s)
+    @test parse(Int, split(s, '\n')[end]) > 100
 end
 
 if Sys.isbsd() || Sys.islinux()
diff --git a/test/runtests.jl b/test/runtests.jl
index aa9e101fa2182..4c9ac1cfd869c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -124,6 +124,15 @@ cd(@__DIR__) do
         Distributed.remotecall_eval(Main, workers(), revise_init_expr)
     end
 
+    println("""
+        Running parallel tests with:
+          nworkers() = $(nworkers())
+          nthreads() = $(Threads.nthreads())
+          Sys.CPU_THREADS = $(Sys.CPU_THREADS)
+          Sys.total_memory() = $(Base.format_bytes(Sys.total_memory()))
+          Sys.free_memory() = $(Base.format_bytes(Sys.free_memory()))
+        """)
+
     #pretty print the information about gc and mem usage
     testgroupheader = "Test"
     workerheader = "(Worker)"

From c004dccd65f5cfe91ec6b6fc9b28fdae63214afd Mon Sep 17 00:00:00 2001
From: Daniel Karrasch <daniel.karrasch@posteo.de>
Date: Tue, 22 Mar 2022 21:44:56 +0100
Subject: [PATCH 04/13] Fix performance bug for `*` with `AbstractQ` (#44615)

(cherry picked from commit fc9c280584f63236a1b97da4178a41eba65b6da2)
---
 stdlib/LinearAlgebra/src/bidiag.jl   | 10 ++--
 stdlib/LinearAlgebra/src/diagonal.jl | 12 ++++-
 stdlib/LinearAlgebra/src/special.jl  | 69 ++++++++++++++++++++++++----
 stdlib/LinearAlgebra/src/tridiag.jl  |  9 ++--
 stdlib/LinearAlgebra/test/special.jl | 32 +++++++------
 5 files changed, 96 insertions(+), 36 deletions(-)

diff --git a/stdlib/LinearAlgebra/src/bidiag.jl b/stdlib/LinearAlgebra/src/bidiag.jl
index 243553ebc64c6..dfcbec69c6de2 100644
--- a/stdlib/LinearAlgebra/src/bidiag.jl
+++ b/stdlib/LinearAlgebra/src/bidiag.jl
@@ -168,15 +168,13 @@ end
 function Matrix{T}(A::Bidiagonal) where T
     n = size(A, 1)
     B = zeros(T, n, n)
-    if n == 0
-        return B
-    end
-    for i = 1:n - 1
+    n == 0 && return B
+    @inbounds for i = 1:n - 1
         B[i,i] = A.dv[i]
         if A.uplo == 'U'
-            B[i, i + 1] = A.ev[i]
+            B[i,i+1] = A.ev[i]
         else
-            B[i + 1, i] = A.ev[i]
+            B[i+1,i] = A.ev[i]
         end
     end
     B[n,n] = A.dv[n]
diff --git a/stdlib/LinearAlgebra/src/diagonal.jl b/stdlib/LinearAlgebra/src/diagonal.jl
index 4b7d9bd9d4af1..11f3fff9cb3e2 100644
--- a/stdlib/LinearAlgebra/src/diagonal.jl
+++ b/stdlib/LinearAlgebra/src/diagonal.jl
@@ -77,8 +77,16 @@ Diagonal{T}(D::Diagonal{T}) where {T} = D
 Diagonal{T}(D::Diagonal) where {T} = Diagonal{T}(D.diag)
 
 AbstractMatrix{T}(D::Diagonal) where {T} = Diagonal{T}(D)
-Matrix(D::Diagonal) = diagm(0 => D.diag)
-Array(D::Diagonal) = Matrix(D)
+Matrix(D::Diagonal{T}) where {T} = Matrix{T}(D)
+Array(D::Diagonal{T}) where {T} = Matrix{T}(D)
+function Matrix{T}(D::Diagonal) where {T}
+    n = size(D, 1)
+    B = zeros(T, n, n)
+    @inbounds for i in 1:n
+        B[i,i] = D.diag[i]
+    end
+    return B
+end
 
 """
     Diagonal{T}(undef, n)
diff --git a/stdlib/LinearAlgebra/src/special.jl b/stdlib/LinearAlgebra/src/special.jl
index 39b62d5e3ca03..beac0c524f2f4 100644
--- a/stdlib/LinearAlgebra/src/special.jl
+++ b/stdlib/LinearAlgebra/src/special.jl
@@ -292,16 +292,65 @@ function (-)(A::UniformScaling, B::Diagonal{<:Number})
     Diagonal(A.λ .- B.diag)
 end
 
-rmul!(A::AbstractTriangular, adjB::Adjoint{<:Any,<:Union{QRCompactWYQ,QRPackedQ}}) =
-    rmul!(full!(A), adjB)
-*(A::AbstractTriangular, adjB::Adjoint{<:Any,<:Union{QRCompactWYQ,QRPackedQ}}) =
-    *(copyto!(similar(parent(A)), A), adjB)
-*(A::BiTriSym, adjB::Adjoint{<:Any,<:Union{QRCompactWYQ, QRPackedQ}}) =
-    rmul!(copyto!(Array{promote_type(eltype(A), eltype(adjB))}(undef, size(A)...), A), adjB)
-*(adjA::Adjoint{<:Any,<:Union{QRCompactWYQ, QRPackedQ}}, B::Diagonal) =
-    lmul!(adjA, copyto!(Array{promote_type(eltype(adjA), eltype(B))}(undef, size(B)...), B))
-*(adjA::Adjoint{<:Any,<:Union{QRCompactWYQ, QRPackedQ}}, B::BiTriSym) =
-    lmul!(adjA, copyto!(Array{promote_type(eltype(adjA), eltype(B))}(undef, size(B)...), B))
+lmul!(Q::AbstractQ, B::AbstractTriangular) = lmul!(Q, full!(B))
+lmul!(Q::QRPackedQ, B::AbstractTriangular) = lmul!(Q, full!(B)) # disambiguation
+lmul!(Q::Adjoint{<:Any,<:AbstractQ}, B::AbstractTriangular) = lmul!(Q, full!(B))
+lmul!(Q::Adjoint{<:Any,<:QRPackedQ}, B::AbstractTriangular) = lmul!(Q, full!(B)) # disambiguation
+
+function _qlmul(Q::AbstractQ, B)
+    TQB = promote_type(eltype(Q), eltype(B))
+    if size(Q.factors, 1) == size(B, 1)
+        Bnew = Matrix{TQB}(B)
+    elseif size(Q.factors, 2) == size(B, 1)
+        Bnew = [Matrix{TQB}(B); zeros(TQB, size(Q.factors, 1) - size(B,1), size(B, 2))]
+    else
+        throw(DimensionMismatch("first dimension of matrix must have size either $(size(Q.factors, 1)) or $(size(Q.factors, 2))"))
+    end
+    lmul!(convert(AbstractMatrix{TQB}, Q), Bnew)
+end
+function _qlmul(adjQ::Adjoint{<:Any,<:AbstractQ}, B)
+    TQB = promote_type(eltype(adjQ), eltype(B))
+    lmul!(adjoint(convert(AbstractMatrix{TQB}, parent(adjQ))), Matrix{TQB}(B))
+end
+
+*(Q::AbstractQ, B::AbstractTriangular) = _qlmul(Q, B)
+*(Q::Adjoint{<:Any,<:AbstractQ}, B::AbstractTriangular) = _qlmul(Q, B)
+*(Q::AbstractQ, B::BiTriSym) = _qlmul(Q, B)
+*(Q::Adjoint{<:Any,<:AbstractQ}, B::BiTriSym) = _qlmul(Q, B)
+*(Q::AbstractQ, B::Diagonal) = _qlmul(Q, B)
+*(Q::Adjoint{<:Any,<:AbstractQ}, B::Diagonal) = _qlmul(Q, B)
+
+rmul!(A::AbstractTriangular, Q::AbstractQ) = rmul!(full!(A), Q)
+rmul!(A::AbstractTriangular, Q::Adjoint{<:Any,<:AbstractQ}) = rmul!(full!(A), Q)
+
+function _qrmul(A, Q::AbstractQ)
+    TAQ = promote_type(eltype(A), eltype(Q))
+    return rmul!(Matrix{TAQ}(A), convert(AbstractMatrix{TAQ}, Q))
+end
+function _qrmul(A, adjQ::Adjoint{<:Any,<:AbstractQ})
+    Q = adjQ.parent
+    TAQ = promote_type(eltype(A), eltype(Q))
+    if size(A,2) == size(Q.factors, 1)
+        Anew = Matrix{TAQ}(A)
+    elseif size(A,2) == size(Q.factors,2)
+        Anew = [Matrix{TAQ}(A) zeros(TAQ, size(A, 1), size(Q.factors, 1) - size(Q.factors, 2))]
+    else
+        throw(DimensionMismatch("matrix A has dimensions $(size(A)) but matrix B has dimensions $(size(Q))"))
+    end
+    return rmul!(Anew, adjoint(convert(AbstractMatrix{TAQ}, Q)))
+end
+
+*(A::AbstractTriangular, Q::AbstractQ) = _qrmul(A, Q)
+*(A::AbstractTriangular, Q::Adjoint{<:Any,<:AbstractQ}) = _qrmul(A, Q)
+*(A::BiTriSym, Q::AbstractQ) = _qrmul(A, Q)
+*(A::BiTriSym, Q::Adjoint{<:Any,<:AbstractQ}) = _qrmul(A, Q)
+*(A::Diagonal, Q::AbstractQ) = _qrmul(A, Q)
+*(A::Diagonal, Q::Adjoint{<:Any,<:AbstractQ}) = _qrmul(A, Q)
+
+*(Q::AbstractQ, B::AbstractQ) = _qlmul(Q, B)
+*(Q::Adjoint{<:Any,<:AbstractQ}, B::AbstractQ) = _qrmul(Q, B)
+*(Q::AbstractQ, B::Adjoint{<:Any,<:AbstractQ}) = _qlmul(Q, B)
+*(Q::Adjoint{<:Any,<:AbstractQ}, B::Adjoint{<:Any,<:AbstractQ}) = _qrmul(Q, B)
 
 # fill[stored]! methods
 fillstored!(A::Diagonal, x) = (fill!(A.diag, x); A)
diff --git a/stdlib/LinearAlgebra/src/tridiag.jl b/stdlib/LinearAlgebra/src/tridiag.jl
index 5a3c7612f6784..4b1d3add5df5b 100644
--- a/stdlib/LinearAlgebra/src/tridiag.jl
+++ b/stdlib/LinearAlgebra/src/tridiag.jl
@@ -571,15 +571,16 @@ function size(M::Tridiagonal, d::Integer)
     end
 end
 
-function Matrix{T}(M::Tridiagonal{T}) where T
+function Matrix{T}(M::Tridiagonal) where {T}
     A = zeros(T, size(M))
-    for i = 1:length(M.d)
+    n = length(M.d)
+    n == 0 && return A
+    for i in 1:n-1
         A[i,i] = M.d[i]
-    end
-    for i = 1:length(M.d)-1
         A[i+1,i] = M.dl[i]
         A[i,i+1] = M.du[i]
     end
+    A[n,n] = M.d[n]
     A
 end
 Matrix(M::Tridiagonal{T}) where {T} = Matrix{T}(M)
diff --git a/stdlib/LinearAlgebra/test/special.jl b/stdlib/LinearAlgebra/test/special.jl
index ced2681ff0969..84c1bb006280b 100644
--- a/stdlib/LinearAlgebra/test/special.jl
+++ b/stdlib/LinearAlgebra/test/special.jl
@@ -188,16 +188,21 @@ end
 
 
 @testset "Triangular Types and QR" begin
-    for typ in [UpperTriangular,LowerTriangular,LinearAlgebra.UnitUpperTriangular,LinearAlgebra.UnitLowerTriangular]
+    for typ in (UpperTriangular, LowerTriangular, UnitUpperTriangular, UnitLowerTriangular)
         a = rand(n,n)
         atri = typ(a)
+        matri = Matrix(atri)
         b = rand(n,n)
         qrb = qr(b, ColumnNorm())
-        @test *(atri, adjoint(qrb.Q)) ≈ Matrix(atri) * qrb.Q'
-        @test rmul!(copy(atri), adjoint(qrb.Q)) ≈ Matrix(atri) * qrb.Q'
+        @test atri * qrb.Q ≈ matri * qrb.Q ≈ rmul!(copy(atri), qrb.Q)
+        @test atri * qrb.Q' ≈ matri * qrb.Q' ≈ rmul!(copy(atri), qrb.Q')
+        @test qrb.Q * atri ≈ qrb.Q * matri ≈ lmul!(qrb.Q, copy(atri))
+        @test qrb.Q' * atri ≈ qrb.Q' * matri ≈ lmul!(qrb.Q', copy(atri))
         qrb = qr(b, NoPivot())
-        @test *(atri, adjoint(qrb.Q)) ≈ Matrix(atri) * qrb.Q'
-        @test rmul!(copy(atri), adjoint(qrb.Q)) ≈ Matrix(atri) * qrb.Q'
+        @test atri * qrb.Q ≈ matri * qrb.Q ≈ rmul!(copy(atri), qrb.Q)
+        @test atri * qrb.Q' ≈ matri * qrb.Q' ≈ rmul!(copy(atri), qrb.Q')
+        @test qrb.Q * atri ≈ qrb.Q * matri ≈ lmul!(qrb.Q, copy(atri))
+        @test qrb.Q' * atri ≈ qrb.Q' * matri ≈ lmul!(qrb.Q', copy(atri))
     end
 end
 
@@ -421,19 +426,18 @@ end
 end
 
 @testset "BiTriSym*Q' and Q'*BiTriSym" begin
-    dl = [1, 1, 1];
-    d = [1, 1, 1, 1];
-    Tri = Tridiagonal(dl, d, dl)
+    dl = [1, 1, 1]
+    d = [1, 1, 1, 1]
+    D = Diagonal(d)
     Bi = Bidiagonal(d, dl, :L)
+    Tri = Tridiagonal(dl, d, dl)
     Sym = SymTridiagonal(d, dl)
     F = qr(ones(4, 1))
     A = F.Q'
-    @test Tri*A ≈ Matrix(Tri)*A
-    @test A*Tri ≈ A*Matrix(Tri)
-    @test Bi*A ≈ Matrix(Bi)*A
-    @test A*Bi ≈ A*Matrix(Bi)
-    @test Sym*A ≈ Matrix(Sym)*A
-    @test A*Sym ≈ A*Matrix(Sym)
+    for A in (F.Q, F.Q'), B in (D, Bi, Tri, Sym)
+        @test B*A ≈ Matrix(B)*A
+        @test A*B ≈ A*Matrix(B)
+    end
 end
 
 @testset "Ops on SymTridiagonal ev has the same length as dv" begin

From 5005faedb73b47c7514ccecf6795896a2e6819a3 Mon Sep 17 00:00:00 2001
From: Elliot Saba <staticfloat@gmail.com>
Date: Mon, 21 Mar 2022 13:33:24 -0700
Subject: [PATCH 05/13] More flexible test affinity setting (#44677)

* More flexibly test affinity setting

When running on a machine with `cpusets` applied, we are unable to
assign CPU affinity to CPUs 1 and 2; we may be locked to CPUs 9-16, for
example.  So we must inspect what our current cpumask is, and from that
select CPUs that we can safely assign affinity to in our tests.

* Import `uv_thread_getaffinity` from `print_process_affinity.jl`

* Call `uv_thread_getaffinity` only if `AFFINITY_SUPPORTED`

* Fix a syntax error

Co-authored-by: Takafumi Arakaki <aka.tkf@gmail.com>
(cherry picked from commit 32b1305c78c353c7b9241a2f1bac43788041240f)
---
 test/threads.jl | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/test/threads.jl b/test/threads.jl
index 718358f847dd5..8322bcfa2ebbd 100644
--- a/test/threads.jl
+++ b/test/threads.jl
@@ -4,6 +4,8 @@ using Test
 
 using Base.Threads
 
+include("print_process_affinity.jl") # import `uv_thread_getaffinity`
+
 # simple sanity tests for locks under cooperative concurrent access
 let lk = ReentrantLock()
     c1 = Event()
@@ -93,10 +95,13 @@ else
 end
 # Note also that libuv does not support affinity in macOS and it is known to
 # hang in FreeBSD. So, it's tested only in Linux and Windows:
-if Sys.islinux() || Sys.iswindows()
-    if Sys.CPU_THREADS > 1 && !running_under_rr()
-        @test run_with_affinity([2]) == "2"
-        @test run_with_affinity([1, 2]) == "1,2"
+const AFFINITY_SUPPORTED = (Sys.islinux() || Sys.iswindows()) && !running_under_rr()
+
+if AFFINITY_SUPPORTED
+    allowed_cpus = findall(uv_thread_getaffinity())
+    if length(allowed_cpus) ≥ 2
+        @test run_with_affinity(allowed_cpus[1:1]) == "$(allowed_cpus[1])"
+        @test run_with_affinity(allowed_cpus[1:2]) == "$(allowed_cpus[1]),$(allowed_cpus[2])"
     end
 end
 

From f9176aedb4c346bf29626a0a4ee0998b5dcea75e Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Tue, 15 Mar 2022 15:18:41 -0400
Subject: [PATCH 06/13] make fieldtype computation stable/pure

(cherry picked from commit 99bdd00183d77c9ce15877dc524d0029f2087e8a)
---
 src/datatype.c            | 4 ++--
 src/jl_exported_funcs.inc | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/datatype.c b/src/datatype.c
index e7f1ab22365b8..eca51949cd40e 100644
--- a/src/datatype.c
+++ b/src/datatype.c
@@ -1787,9 +1787,9 @@ JL_DLLEXPORT int jl_field_isdefined(jl_value_t *v, size_t i) JL_NOTSAFEPOINT
     return fval != NULL ? 1 : 0;
 }
 
-JL_DLLEXPORT size_t jl_get_field_offset(jl_datatype_t *ty, int field) JL_NOTSAFEPOINT
+JL_DLLEXPORT size_t jl_get_field_offset(jl_datatype_t *ty, int field)
 {
-    if (ty->layout == NULL || field > jl_datatype_nfields(ty) || field < 1)
+    if (!jl_struct_try_layout(ty) || field > jl_datatype_nfields(ty) || field < 1)
         jl_bounds_error_int((jl_value_t*)ty, field);
     return jl_field_offset(ty, field - 1);
 }
diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc
index d63500fe21736..614ed0d9d16af 100644
--- a/src/jl_exported_funcs.inc
+++ b/src/jl_exported_funcs.inc
@@ -214,7 +214,6 @@
     XX(jl_get_excstack) \
     XX(jl_get_fenv_consts) \
     XX(jl_get_field) \
-    XX(jl_get_field_offset) \
     XX(jl_get_global) \
     XX(jl_get_image_file) \
     XX(jl_get_JIT) \

From 9b7990d8c909901c9ba410469df2fcf36e0f4e35 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Tue, 15 Mar 2022 15:59:57 -0400
Subject: [PATCH 07/13] types: fix layout issues for Tuple

Fix #44614

(cherry picked from commit e9ba166674e46d4082495993a70095ecea340d84)
---
 src/jltypes.c | 12 ++++++++++--
 test/core.jl  | 23 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/jltypes.c b/src/jltypes.c
index 7d8b1ac6bb32b..bb1cdd6c49b97 100644
--- a/src/jltypes.c
+++ b/src/jltypes.c
@@ -63,6 +63,12 @@ static int layout_uses_free_typevars(jl_value_t *v, jl_typeenv_t *env)
             return 0;
         if (dt->name == jl_namedtuple_typename)
             return layout_uses_free_typevars(jl_tparam0(dt), env) || layout_uses_free_typevars(jl_tparam1(dt), env);
+        if (dt->name == jl_tuple_typename)
+            // conservative, since we don't want to inline an abstract tuple,
+            // and we currently declare !has_fixed_layout for these, but that
+            // means we also won't be able to inline a tuple which is concrete
+            // except for the use of free type-vars
+            return 1;
         jl_svec_t *types = jl_get_fieldtypes(dt);
         size_t i, l = jl_svec_len(types);
         for (i = 0; i < l; i++) {
@@ -227,8 +233,10 @@ int jl_has_fixed_layout(jl_datatype_t *dt)
         return 1;
     if (dt->name->abstract)
         return 0;
-    if (jl_is_tuple_type(dt) || jl_is_namedtuple_type(dt))
-        return 0; // TODO: relax more?
+    if (dt->name == jl_namedtuple_typename)
+        return !layout_uses_free_typevars(jl_tparam0(dt), NULL) && !layout_uses_free_typevars(jl_tparam1(dt), NULL);
+    if (dt->name == jl_tuple_typename)
+        return 0;
     jl_svec_t *types = jl_get_fieldtypes(dt);
     size_t i, l = jl_svec_len(types);
     for (i = 0; i < l; i++) {
diff --git a/test/core.jl b/test/core.jl
index 93ba97df60420..0294a61c7d146 100644
--- a/test/core.jl
+++ b/test/core.jl
@@ -7336,6 +7336,29 @@ struct A43411{S, T}
 end
 @test isbitstype(A43411{(:a,), Tuple{Int}})
 
+# issue #44614
+struct T44614_1{T}
+    m::T
+end
+struct T44614_2{L}
+    tuple::NTuple{3, Int64}
+    T44614_2{L}(t::NTuple{3, Int64}) where {L} = new{sum(t)}(t)
+end
+struct T44614_3{L, N}
+    a::Tuple{T44614_2{L}}
+    param::NTuple{N, T44614_1}
+    T44614_3(a::Tuple{T44614_2{L}}, pars::NTuple{N, T44614_1}) where {L, N} = new{L, N}(a, pars)
+end
+@test sizeof((T44614_2{L} where L).body) == 24
+let T = T44614_3{L,2} where L
+    # these values are computable, but we currently don't know how to compute them properly
+    ex = ErrorException("Argument is an incomplete T44614_3 type and does not have a definite size.")
+    @test_throws ex sizeof(T.body)
+    @test_throws ex sizeof(T)
+    @test_throws BoundsError fieldoffset(T.body, 2)
+    @test fieldoffset(T{1}, 2) == 24
+end
+
 # Issue #34206/34207
 function mre34206(a, n)
     va = view(a, :)

From 94b7154a089b17bca235bc346a8b8d41f4f71e79 Mon Sep 17 00:00:00 2001
From: Shuhei Kadowaki <40514306+aviatesk@users.noreply.github.com>
Date: Tue, 22 Mar 2022 11:44:08 +0900
Subject: [PATCH 08/13] inference: override `InterConditional` result with
 `Const` carefully (#44668)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I found that a tricky thing can happen when constant inference derives
`Const`-result while non-constant inference has derived (non-constant)
`InterConditional` result beforehand. In such a case, currently we discard
the result with constant inference (since `!(Const ⊑ InterConditional)`),
but we can achieve more accuracy by not discarding that `Const`-information, e.g.:
```julia
julia> iszero_simple(x) = x === 0
iszero_simple (generic function with 1 method)

julia> @test Base.return_types() do
           iszero_simple(0) ? nothing : missing
       end |> only === Nothing
Test Passed
```
---
 base/compiler/abstractinterpretation.jl | 31 ++++++++++++++-----------
 test/compiler/inference.jl              |  7 ++++++
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/base/compiler/abstractinterpretation.jl b/base/compiler/abstractinterpretation.jl
index 2efb660a32fb5..3cc709bc35849 100644
--- a/base/compiler/abstractinterpretation.jl
+++ b/base/compiler/abstractinterpretation.jl
@@ -110,10 +110,9 @@ function abstract_call_gf_by_type(interp::AbstractInterpreter, @nospecialize(f),
             splitsigs = switchtupleunion(sig)
             for sig_n in splitsigs
                 result = abstract_call_method(interp, method, sig_n, svec(), multiple_matches, sv)
-                rt, edge = result.rt, result.edge
-                if edge !== nothing
-                    push!(edges, edge)
-                end
+                rt = result.rt
+                edge = result.edge
+                edge !== nothing && push!(edges, edge)
                 this_argtypes = isa(matches, MethodMatches) ? argtypes : matches.applicable_argtypes[i]
                 this_arginfo = ArgInfo(fargs, this_argtypes)
                 const_call_result = abstract_call_method_with_const_args(interp, result,
@@ -121,8 +120,10 @@ function abstract_call_gf_by_type(interp::AbstractInterpreter, @nospecialize(f),
                 effects = result.edge_effects
                 const_result = nothing
                 if const_call_result !== nothing
-                    if const_call_result.rt ⊑ rt
-                        (; rt, effects, const_result) = const_call_result
+                    const_rt = const_call_result.rt
+                    if const_rt ⊑ rt
+                        rt = const_rt
+                        (; effects, const_result) = const_call_result
                     end
                 end
                 tristate_merge!(sv, effects)
@@ -135,6 +136,8 @@ function abstract_call_gf_by_type(interp::AbstractInterpreter, @nospecialize(f),
                     break
                 end
             end
+            this_conditional = ignorelimited(this_rt)
+            this_rt = widenwrappedconditional(this_rt)
         else
             if infer_compilation_signature(interp)
                 # Also infer the compilation signature for this method, so it's available
@@ -151,10 +154,10 @@ function abstract_call_gf_by_type(interp::AbstractInterpreter, @nospecialize(f),
             end
 
             result = abstract_call_method(interp, method, sig, match.sparams, multiple_matches, sv)
-            this_rt, edge = result.rt, result.edge
-            if edge !== nothing
-                push!(edges, edge)
-            end
+            this_conditional = ignorelimited(result.rt)
+            this_rt = widenwrappedconditional(result.rt)
+            edge = result.edge
+            edge !== nothing && push!(edges, edge)
             # try constant propagation with argtypes for this match
             # this is in preparation for inlining, or improving the return result
             this_argtypes = isa(matches, MethodMatches) ? argtypes : matches.applicable_argtypes[i]
@@ -164,10 +167,12 @@ function abstract_call_gf_by_type(interp::AbstractInterpreter, @nospecialize(f),
             effects = result.edge_effects
             const_result = nothing
             if const_call_result !== nothing
-                this_const_rt = const_call_result.rt
-                # return type of const-prop' inference can be wider than  that of non const-prop' inference
+                this_const_conditional = ignorelimited(const_call_result.rt)
+                this_const_rt = widenwrappedconditional(const_call_result.rt)
+                # return type of const-prop' inference can be wider than that of non const-prop' inference
                 # e.g. in cases when there are cycles but cached result is still accurate
                 if this_const_rt ⊑ this_rt
+                    this_conditional = this_const_conditional
                     this_rt = this_const_rt
                     (; effects, const_result) = const_call_result
                 end
@@ -178,8 +183,6 @@ function abstract_call_gf_by_type(interp::AbstractInterpreter, @nospecialize(f),
                 any_const_result = true
             end
         end
-        this_conditional = ignorelimited(this_rt)
-        this_rt = widenwrappedconditional(this_rt)
         @assert !(this_conditional isa Conditional) "invalid lattice element returned from inter-procedural context"
         seen += 1
         rettype = tmerge(rettype, this_rt)
diff --git a/test/compiler/inference.jl b/test/compiler/inference.jl
index 218e484b2beca..e6f113f2b9062 100644
--- a/test/compiler/inference.jl
+++ b/test/compiler/inference.jl
@@ -2032,6 +2032,13 @@ end
         end
         @test ts == Any[Any]
     end
+
+    # a tricky case: if constant inference derives `Const` while non-constant inference has
+    # derived `InterConditional`, we should not discard that constant information
+    iszero_simple(x) = x === 0
+    @test Base.return_types() do
+        iszero_simple(0) ? nothing : missing
+    end |> only === Nothing
 end
 
 @testset "branching on conditional object" begin

From b5ca35b08e717a857974ea5ea9d2887a5ae0348b Mon Sep 17 00:00:00 2001
From: Mirek Kratochvil <exa.exa@gmail.com>
Date: Wed, 23 Mar 2022 11:14:15 +0100
Subject: [PATCH 09/13] avoid using `@sync_add` on remotecalls (#44671)

* avoid using `@sync_add` on remotecalls

It seems like @sync_add adds the Futures to a queue (Channel) for @sync, which
in turn calls wait() for all the futures synchronously. Not only is that
slightly detrimental for network operations (latencies add up), but in the case
of Distributed the call to wait() may actually cause some compilation on remote
processes, which is also wait()ed for. As a result, some operations took a great
amount of "serial" processing time if executed on many workers at once.

For me, this closes #44645.

The major change can be illustrated as follows: First add some workers:

```
using Distributed
addprocs(10)
```

and then trigger something that, for example, causes package imports on the
workers:

```
using SomeTinyPackage
```

In my case (importing UnicodePlots on 10 workers), this improves the loading
time over 10 workers from ~11s to ~5.5s.

This is a far bigger issue when worker count gets high. The time of the
processing on each worker is usually around 0.3s, so triggering this problem
even on a relatively small cluster (64 workers) causes a really annoying delay,
and running `@everywhere` for the first time on reasonable clusters (I tested
with 1024 workers, see #44645) usually takes more than 5 minutes. Which sucks.

Anyway, on 64 workers this reduces the "first import" time from ~30s to ~6s,
and on 1024 workers this seems to reduce the time from over 5 minutes (I didn't
bother to measure that precisely now, sorry) to ~11s.

Related issues:
- Probably fixes #39291.
- #42156 is kind of complementary -- it removes the most painful source of
  slowness (the 0.3s precompilation on the workers), but the fact that the
  wait()ing is serial remains a problem if the network latencies are high.

May help with #38931

Co-authored-by: Valentin Churavy <vchuravy@users.noreply.github.com>
(cherry picked from commit 62e0729dbc5f9d5d93d14dcd49457f02a0c6d3a7)
---
 base/task.jl                               | 37 +++++++++++++++++++++-
 stdlib/Distributed/src/Distributed.jl      |  4 +--
 stdlib/Distributed/src/clusterserialize.jl |  2 +-
 stdlib/Distributed/src/macros.jl           |  4 +--
 4 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/base/task.jl b/base/task.jl
index 90dea508383b1..dc08a0257a376 100644
--- a/base/task.jl
+++ b/base/task.jl
@@ -471,6 +471,12 @@ isolating the asynchronous code from changes to the variable's value in the curr
     Interpolating values via `\$` is available as of Julia 1.4.
 """
 macro async(expr)
+    do_async_macro(expr)
+end
+
+# generate the code for @async, possibly wrapping the task in something before
+# pushing it to the wait queue.
+function do_async_macro(expr; wrap=identity)
     letargs = Base._lift_one_interp!(expr)
 
     thunk = esc(:(()->($expr)))
@@ -479,7 +485,7 @@ macro async(expr)
         let $(letargs...)
             local task = Task($thunk)
             if $(Expr(:islocal, var))
-                put!($var, task)
+                put!($var, $(wrap(:task)))
             end
             schedule(task)
             task
@@ -487,6 +493,35 @@ macro async(expr)
     end
 end
 
+# task wrapper that doesn't create exceptions wrapped in TaskFailedException
+struct UnwrapTaskFailedException
+    task::Task
+end
+
+# common code for wait&fetch for UnwrapTaskFailedException
+function unwrap_task_failed(f::Function, t::UnwrapTaskFailedException)
+    try
+        f(t.task)
+    catch ex
+        if ex isa TaskFailedException
+            throw(ex.task.exception)
+        else
+            rethrow()
+        end
+    end
+end
+
+# the unwrapping for above task wrapper (gets triggered in sync_end())
+wait(t::UnwrapTaskFailedException) = unwrap_task_failed(wait, t)
+
+# same for fetching the tasks, for convenience
+fetch(t::UnwrapTaskFailedException) = unwrap_task_failed(fetch, t)
+
+# macro for running async code that doesn't throw wrapped exceptions
+macro async_unwrap(expr)
+    do_async_macro(expr, wrap=task->:(Base.UnwrapTaskFailedException($task)))
+end
+
 """
     errormonitor(t::Task)
 
diff --git a/stdlib/Distributed/src/Distributed.jl b/stdlib/Distributed/src/Distributed.jl
index d428a6df0e683..3bcbc7b67f60d 100644
--- a/stdlib/Distributed/src/Distributed.jl
+++ b/stdlib/Distributed/src/Distributed.jl
@@ -10,7 +10,7 @@ import Base: getindex, wait, put!, take!, fetch, isready, push!, length,
              hash, ==, kill, close, isopen, showerror
 
 # imports for use
-using Base: Process, Semaphore, JLOptions, buffer_writes, @sync_add,
+using Base: Process, Semaphore, JLOptions, buffer_writes, @async_unwrap,
             VERSION_STRING, binding_module, atexit, julia_exename,
             julia_cmd, AsyncGenerator, acquire, release, invokelatest,
             shell_escape_posixly, shell_escape_csh,
@@ -76,7 +76,7 @@ function _require_callback(mod::Base.PkgId)
         # broadcast top-level (e.g. from Main) import/using from node 1 (only)
         @sync for p in procs()
             p == 1 && continue
-            @sync_add remotecall(p) do
+            @async_unwrap remotecall_wait(p) do
                 Base.require(mod)
                 nothing
             end
diff --git a/stdlib/Distributed/src/clusterserialize.jl b/stdlib/Distributed/src/clusterserialize.jl
index e37987c5bf875..28025ae867c78 100644
--- a/stdlib/Distributed/src/clusterserialize.jl
+++ b/stdlib/Distributed/src/clusterserialize.jl
@@ -243,7 +243,7 @@ An exception is raised if a global constant is requested to be cleared.
 """
 function clear!(syms, pids=workers(); mod=Main)
     @sync for p in pids
-        @sync_add remotecall(clear_impl!, p, syms, mod)
+        @async_unwrap remotecall_wait(clear_impl!, p, syms, mod)
     end
 end
 clear!(sym::Symbol, pid::Int; mod=Main) = clear!([sym], [pid]; mod=mod)
diff --git a/stdlib/Distributed/src/macros.jl b/stdlib/Distributed/src/macros.jl
index 0a62fdd5439f0..a767c7a40d9c9 100644
--- a/stdlib/Distributed/src/macros.jl
+++ b/stdlib/Distributed/src/macros.jl
@@ -222,10 +222,10 @@ function remotecall_eval(m::Module, procs, ex)
             if pid == myid()
                 run_locally += 1
             else
-                @sync_add remotecall(Core.eval, pid, m, ex)
+                @async_unwrap remotecall_wait(Core.eval, pid, m, ex)
             end
         end
-        yield() # ensure that the remotecall_fetch have had a chance to start
+        yield() # ensure that the remotecalls have had a chance to start
 
         # execute locally last as we do not want local execution to block serialization
         # of the request to remote nodes.

From 50a722b04eb723db4e2af8a77a2f90acd5760a4e Mon Sep 17 00:00:00 2001
From: Daniel Karrasch <daniel.karrasch@posteo.de>
Date: Thu, 24 Mar 2022 10:09:05 +0100
Subject: [PATCH 10/13] Make `Matrix` cntr work for structured matrices for
 `zero(T) !isa T` (#44707)

(cherry picked from commit 20999873e8aa78f41555ea4ca63f6d89db43376d)
---
 stdlib/LinearAlgebra/src/bidiag.jl   |  2 +-
 stdlib/LinearAlgebra/src/dense.jl    |  9 +++++++++
 stdlib/LinearAlgebra/src/diagonal.jl |  4 ++--
 stdlib/LinearAlgebra/src/tridiag.jl  |  4 ++--
 stdlib/LinearAlgebra/test/special.jl | 22 ++++++++++++++++++++++
 5 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/stdlib/LinearAlgebra/src/bidiag.jl b/stdlib/LinearAlgebra/src/bidiag.jl
index dfcbec69c6de2..317ed15af770c 100644
--- a/stdlib/LinearAlgebra/src/bidiag.jl
+++ b/stdlib/LinearAlgebra/src/bidiag.jl
@@ -180,7 +180,7 @@ function Matrix{T}(A::Bidiagonal) where T
     B[n,n] = A.dv[n]
     return B
 end
-Matrix(A::Bidiagonal{T}) where {T} = Matrix{T}(A)
+Matrix(A::Bidiagonal{T}) where {T} = Matrix{promote_type(T, typeof(zero(T)))}(A)
 Array(A::Bidiagonal) = Matrix(A)
 promote_rule(::Type{Matrix{T}}, ::Type{<:Bidiagonal{S}}) where {T,S} =
     @isdefined(T) && @isdefined(S) ? Matrix{promote_type(T,S)} : Matrix
diff --git a/stdlib/LinearAlgebra/src/dense.jl b/stdlib/LinearAlgebra/src/dense.jl
index ffcd9e64e0752..249010adb4e5c 100644
--- a/stdlib/LinearAlgebra/src/dense.jl
+++ b/stdlib/LinearAlgebra/src/dense.jl
@@ -257,6 +257,8 @@ Vector `kv.second` will be placed on the `kv.first` diagonal.
 By default the matrix is square and its size is inferred
 from `kv`, but a non-square size `m`×`n` (padded with zeros as needed)
 can be specified by passing `m,n` as the first arguments.
+For repeated diagonal indices `kv.first` the values in the corresponding
+vectors `kv.second` will be added.
 
 `diagm` constructs a full matrix; if you want storage-efficient
 versions with fast arithmetic, see [`Diagonal`](@ref), [`Bidiagonal`](@ref)
@@ -277,6 +279,13 @@ julia> diagm(1 => [1,2,3], -1 => [4,5])
  4  0  2  0
  0  5  0  3
  0  0  0  0
+
+julia> diagm(1 => [1,2,3], 1 => [1,2,3])
+4×4 Matrix{Int64}:
+ 0  2  0  0
+ 0  0  4  0
+ 0  0  0  6
+ 0  0  0  0
 ```
 """
 diagm(kv::Pair{<:Integer,<:AbstractVector}...) = _diagm(nothing, kv...)
diff --git a/stdlib/LinearAlgebra/src/diagonal.jl b/stdlib/LinearAlgebra/src/diagonal.jl
index 11f3fff9cb3e2..5d17049cfa4e1 100644
--- a/stdlib/LinearAlgebra/src/diagonal.jl
+++ b/stdlib/LinearAlgebra/src/diagonal.jl
@@ -77,8 +77,8 @@ Diagonal{T}(D::Diagonal{T}) where {T} = D
 Diagonal{T}(D::Diagonal) where {T} = Diagonal{T}(D.diag)
 
 AbstractMatrix{T}(D::Diagonal) where {T} = Diagonal{T}(D)
-Matrix(D::Diagonal{T}) where {T} = Matrix{T}(D)
-Array(D::Diagonal{T}) where {T} = Matrix{T}(D)
+Matrix(D::Diagonal{T}) where {T} = Matrix{promote_type(T, typeof(zero(T)))}(D)
+Array(D::Diagonal{T}) where {T} = Matrix(D)
 function Matrix{T}(D::Diagonal) where {T}
     n = size(D, 1)
     B = zeros(T, n, n)
diff --git a/stdlib/LinearAlgebra/src/tridiag.jl b/stdlib/LinearAlgebra/src/tridiag.jl
index 4b1d3add5df5b..e5c31856d3f0a 100644
--- a/stdlib/LinearAlgebra/src/tridiag.jl
+++ b/stdlib/LinearAlgebra/src/tridiag.jl
@@ -134,7 +134,7 @@ function Matrix{T}(M::SymTridiagonal) where T
     Mf[n,n] = symmetric(M.dv[n], :U)
     return Mf
 end
-Matrix(M::SymTridiagonal{T}) where {T} = Matrix{T}(M)
+Matrix(M::SymTridiagonal{T}) where {T} = Matrix{promote_type(T, typeof(zero(T)))}(M)
 Array(M::SymTridiagonal) = Matrix(M)
 
 size(A::SymTridiagonal) = (length(A.dv), length(A.dv))
@@ -583,7 +583,7 @@ function Matrix{T}(M::Tridiagonal) where {T}
     A[n,n] = M.d[n]
     A
 end
-Matrix(M::Tridiagonal{T}) where {T} = Matrix{T}(M)
+Matrix(M::Tridiagonal{T}) where {T} = Matrix{promote_type(T, typeof(zero(T)))}(M)
 Array(M::Tridiagonal) = Matrix(M)
 
 similar(M::Tridiagonal, ::Type{T}) where {T} = Tridiagonal(similar(M.dl, T), similar(M.d, T), similar(M.du, T))
diff --git a/stdlib/LinearAlgebra/test/special.jl b/stdlib/LinearAlgebra/test/special.jl
index 84c1bb006280b..277c51bb9f7bb 100644
--- a/stdlib/LinearAlgebra/test/special.jl
+++ b/stdlib/LinearAlgebra/test/special.jl
@@ -104,6 +104,28 @@ Random.seed!(1)
             @test LowerTriangular(C) == LowerTriangular(Cdense)
         end
     end
+
+    @testset "Matrix constructor for !isa(zero(T), T)" begin
+        # the following models JuMP.jl's VariableRef and AffExpr, resp.
+        struct TypeWithoutZero end
+        struct TypeWithZero end
+        Base.promote_rule(::Type{TypeWithoutZero}, ::Type{TypeWithZero}) = TypeWithZero
+        Base.convert(::Type{TypeWithZero}, ::TypeWithoutZero) = TypeWithZero()
+        Base.zero(::Type{<:Union{TypeWithoutZero, TypeWithZero}}) = TypeWithZero()
+        LinearAlgebra.symmetric(::TypeWithoutZero, ::Symbol) = TypeWithoutZero()
+        Base.transpose(::TypeWithoutZero) = TypeWithoutZero()
+        d  = fill(TypeWithoutZero(), 3)
+        du = fill(TypeWithoutZero(), 2)
+        dl = fill(TypeWithoutZero(), 2)
+        D  = Diagonal(d)
+        Bu = Bidiagonal(d, du, :U)
+        Bl = Bidiagonal(d, dl, :L)
+        Tri = Tridiagonal(dl, d, du)
+        Sym = SymTridiagonal(d, dl)
+        for M in (D, Bu, Bl, Tri, Sym)
+            @test Matrix(M) == zeros(TypeWithZero, 3, 3)
+        end
+    end
 end
 
 @testset "Binary ops among special types" begin

From a3c8e37c4e7f609f3fa2d86430ebcc38db04aa1b Mon Sep 17 00:00:00 2001
From: N5N3 <2642243996@qq.com>
Date: Fri, 25 Mar 2022 16:15:18 +0800
Subject: [PATCH 11/13] Recover 1.7's behavior (#44736)

Fix #44734.

(cherry picked from commit 68e2969217112c6a9ee576183af20101a6132b71)
---
 stdlib/LinearAlgebra/src/blas.jl     | 7 +++++--
 stdlib/LinearAlgebra/test/generic.jl | 8 ++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/stdlib/LinearAlgebra/src/blas.jl b/stdlib/LinearAlgebra/src/blas.jl
index 7d7f77c282bc0..caa61cf94a52d 100644
--- a/stdlib/LinearAlgebra/src/blas.jl
+++ b/stdlib/LinearAlgebra/src/blas.jl
@@ -510,7 +510,9 @@ for (fname, elty) in ((:daxpy_,:Float64),
         end
     end
 end
-function axpy!(alpha::Number, x::AbstractArray{T}, y::AbstractArray{T}) where T<:BlasFloat
+
+#TODO: replace with `x::AbstractArray{T}` once we separate `BLAS.axpy!` and `LinearAlgebra.axpy!`
+function axpy!(alpha::Number, x::Union{DenseArray{T},StridedVector{T}}, y::Union{DenseArray{T},StridedVector{T}}) where T<:BlasFloat
     if length(x) != length(y)
         throw(DimensionMismatch(lazy"x has length $(length(x)), but y has length $(length(y))"))
     end
@@ -582,7 +584,8 @@ for (fname, elty) in ((:daxpby_,:Float64), (:saxpby_,:Float32),
     end
 end
 
-function axpby!(alpha::Number, x::AbstractArray{T}, beta::Number, y::AbstractArray{T}) where T<:BlasFloat
+#TODO: replace with `x::AbstractArray{T}` once we separate `BLAS.axpby!` and `LinearAlgebra.axpby!`
+function axpby!(alpha::Number, x::Union{DenseArray{T},AbstractVector{T}}, beta::Number, y::Union{DenseArray{T},AbstractVector{T}},) where T<:BlasFloat
     require_one_based_indexing(x, y)
     if length(x) != length(y)
         throw(DimensionMismatch(lazy"x has length $(length(x)), but y has length $(length(y))"))
diff --git a/stdlib/LinearAlgebra/test/generic.jl b/stdlib/LinearAlgebra/test/generic.jl
index b56edf9439fe0..cd52d30da6c8d 100644
--- a/stdlib/LinearAlgebra/test/generic.jl
+++ b/stdlib/LinearAlgebra/test/generic.jl
@@ -295,6 +295,14 @@ end
     @test LinearAlgebra.axpy!(α, x, rx, y, ry) == [1 1 1 1; 11 1 1 26]
 end
 
+@testset "LinearAlgebra.axp(b)y! for non strides input" begin
+    a = rand(5, 5)
+    @test LinearAlgebra.axpby!(1, Hermitian(a), 1, zeros(size(a))) == Hermitian(a)
+    @test_broken LinearAlgebra.axpby!(1, 1.:5, 1, zeros(5)) == 1.:5
+    @test LinearAlgebra.axpy!(1, Hermitian(a), zeros(size(a))) == Hermitian(a)
+    @test LinearAlgebra.axpy!(1, 1.:5, zeros(5)) == 1.:5
+end
+
 @testset "norm and normalize!" begin
     vr = [3.0, 4.0]
     for Tr in (Float32, Float64)

From 730edd49b72f142a4ac27b49d370bd6e11f9f9c2 Mon Sep 17 00:00:00 2001
From: Sheehan Olver <solver@mac.com>
Date: Fri, 25 Mar 2022 20:14:50 +0000
Subject: [PATCH 12/13] Avoid copy in getindex(::AbstractQ, ...) (#44729)

(cherry picked from commit ea82910042ff6e5e34e96a2a8711c3087bc7b209)
---
 stdlib/LinearAlgebra/src/qr.jl  | 5 +++--
 stdlib/LinearAlgebra/test/qr.jl | 6 ++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/stdlib/LinearAlgebra/src/qr.jl b/stdlib/LinearAlgebra/src/qr.jl
index 16e066ed1e030..4e1cc83b468f5 100644
--- a/stdlib/LinearAlgebra/src/qr.jl
+++ b/stdlib/LinearAlgebra/src/qr.jl
@@ -582,8 +582,9 @@ size(F::Union{QR,QRCompactWY,QRPivoted}) = size(getfield(F, :factors))
 size(Q::AbstractQ, dim::Integer) = size(getfield(Q, :factors), dim == 2 ? 1 : dim)
 size(Q::AbstractQ) = size(Q, 1), size(Q, 2)
 
-copy(Q::AbstractQ{T}) where {T} = lmul!(Q, Matrix{T}(I, size(Q)))
-getindex(Q::AbstractQ, inds...) = copy(Q)[inds...]
+copymutable(Q::AbstractQ{T}) where {T} = lmul!(Q, Matrix{T}(I, size(Q)))
+copy(Q::AbstractQ) = copymutable(Q)
+getindex(Q::AbstractQ, inds...) = copymutable(Q)[inds...]
 getindex(Q::AbstractQ, ::Colon, ::Colon) = copy(Q)
 
 function getindex(Q::AbstractQ, ::Colon, j::Int)
diff --git a/stdlib/LinearAlgebra/test/qr.jl b/stdlib/LinearAlgebra/test/qr.jl
index f9acbdb376465..a7b24f08385f2 100644
--- a/stdlib/LinearAlgebra/test/qr.jl
+++ b/stdlib/LinearAlgebra/test/qr.jl
@@ -449,6 +449,12 @@ end
         @test Q2[:, :] ≈ M[:, :]
         @test Q2[:, :, :] ≈ M[:, :, :]
     end
+    # Check that getindex works if copy returns itself (#44729)
+    struct MyIdentity{T} <: LinearAlgebra.AbstractQ{T} end
+    Base.size(::MyIdentity, dim::Integer) = dim in (1,2) ? 2 : 1
+    Base.copy(J::MyIdentity) = J
+    LinearAlgebra.lmul!(::MyIdentity{T}, M::Array{T}) where {T} = M
+    @test MyIdentity{Float64}()[1,:] == [1.0, 0.0]
 end
 
 end # module TestQR

From 6a88865f231e188ddba89a5a19fae274b7c2be8a Mon Sep 17 00:00:00 2001
From: Simeon Schaub <schaub@mit.edu>
Date: Sun, 27 Mar 2022 02:05:55 -0400
Subject: [PATCH 13/13] fix oc lowering with return type annotations (#44727)

fixes #44723

Co-authored-by: Takafumi Arakaki <aka.tkf@gmail.com>
(cherry picked from commit 19eb3073561266f5e1699e9f4f9d52c65b42d76f)
---
 src/julia-syntax.scm | 20 ++++++++++++--------
 test/syntax.jl       |  4 ++++
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/julia-syntax.scm b/src/julia-syntax.scm
index 0f0e5c032d377..7538b89d32cb3 100644
--- a/src/julia-syntax.scm
+++ b/src/julia-syntax.scm
@@ -4105,7 +4105,7 @@ f(x) = yt(x)
            (cons (car e)
                  (map-cl-convert (cdr e) fname lam namemap defined toplevel interp opaq globals))))))))
 
-(define (closure-convert e) (cl-convert e #f #f #f #f #f #f #f))
+(define (closure-convert e) (cl-convert e #f #f (table) (table) #f #f #f))
 
 ;; pass 5: convert to linear IR
 
@@ -4205,17 +4205,21 @@ f(x) = yt(x)
                                      (loop (cdr s))))))
             `(pop_exception ,restore-token))))
     (define (emit-return x)
-      (define (actually-return x)
-        (let* ((x   (if rett
-                        (compile (convert-for-type-decl x rett) '() #t #f)
-                        x))
-               (tmp (if ((if (null? catch-token-stack) valid-ir-return? simple-atom?) x)
+      (define (emit- x)
+        (let* ((tmp (if ((if (null? catch-token-stack) valid-ir-return? simple-atom?) x)
                         #f
                         (make-ssavalue))))
-          (if tmp (emit `(= ,tmp ,x)))
+          (if tmp
+              (begin (emit `(= ,tmp ,x)) tmp)
+              x)))
+      (define (actually-return x)
+        (let* ((x (if rett
+                      (compile (convert-for-type-decl (emit- x) rett) '() #t #f)
+                      x))
+               (x (emit- x)))
           (let ((pexc (pop-exc-expr catch-token-stack '())))
             (if pexc (emit pexc)))
-          (emit `(return ,(or tmp x)))))
+          (emit `(return ,x))))
       (if x
           (if (> handler-level 0)
               (let ((tmp (cond ((and (simple-atom? x) (or (not (ssavalue? x)) (not finally-handler))) #f)
diff --git a/test/syntax.jl b/test/syntax.jl
index ff392d5069708..805d006f971ba 100644
--- a/test/syntax.jl
+++ b/test/syntax.jl
@@ -3265,3 +3265,7 @@ end
     @test m.Foo.bar === 1
     @test Core.get_binding_type(m.Foo, :bar) == Any
 end
+
+# issue 44723
+demo44723()::Any = Base.Experimental.@opaque () -> true ? 1 : 2
+@test demo44723()() == 1