From 0f57da64cdfd81f42b1e2ee875deb3ac760fb68f Mon Sep 17 00:00:00 2001 From: "Luiz M. Faria" Date: Wed, 22 Jun 2022 17:45:06 +0200 Subject: [PATCH] extract `TiledFactorization` module --- Project.toml | 4 - docs/src/references.md | 3 +- src/DataFlowTasks.jl | 3 - src/TiledFactorization/TiledFactorization.jl | 36 ------ src/TiledFactorization/cholesky.jl | 117 ------------------- src/TiledFactorization/lu.jl | 73 ------------ src/TiledFactorization/tiledmatrix.jl | 34 ------ test/Project.toml | 2 - test/juliascheduler_test.jl | 19 --- test/logging_dag_test.jl | 47 ++++++++ test/priorityscheduler_test.jl | 20 ---- 11 files changed, 48 insertions(+), 310 deletions(-) delete mode 100644 src/TiledFactorization/TiledFactorization.jl delete mode 100644 src/TiledFactorization/cholesky.jl delete mode 100644 src/TiledFactorization/lu.jl delete mode 100644 src/TiledFactorization/tiledmatrix.jl create mode 100644 test/logging_dag_test.jl diff --git a/Project.toml b/Project.toml index 741de8de..82cb74c3 100644 --- a/Project.toml +++ b/Project.toml @@ -8,13 +8,9 @@ DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" -LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" -Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" -RecursiveFactorization = "f2c3362d-daeb-58d1-803e-2bc74f2840b4" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" ThreadPools = "b189fb0b-2eb5-4ed4-bc0c-d34c51242431" -TriangularSolve = "d5829a12-d9aa-46ab-831f-fb7c9ab06edf" [compat] julia = "^1" diff --git a/docs/src/references.md b/docs/src/references.md index b1d5890d..bbe7d232 100644 --- a/docs/src/references.md +++ b/docs/src/references.md @@ -1,6 +1,5 @@ # [References](@id references-section) ```@autodocs -Modules = [DataFlowTasks, - DataFlowTasks.TiledFactorization] +Modules = [DataFlowTasks] ``` diff --git a/src/DataFlowTasks.jl b/src/DataFlowTasks.jl index ffcadb0f..d9f6462c 100644 --- a/src/DataFlowTasks.jl +++ b/src/DataFlowTasks.jl @@ -43,9 +43,6 @@ export @dasync, @dspawn -# a module showcasing some usecases. To be removed eventually... -include("TiledFactorization/TiledFactorization.jl") - function __init__() # default scheduler capacity = 50 diff --git a/src/TiledFactorization/TiledFactorization.jl b/src/TiledFactorization/TiledFactorization.jl deleted file mode 100644 index 62dae72f..00000000 --- a/src/TiledFactorization/TiledFactorization.jl +++ /dev/null @@ -1,36 +0,0 @@ -""" - module TiledFactorization - -Tiled algorithms for factoring dense matrices. -""" -module TiledFactorization - -using LinearAlgebra -using LinearAlgebra: BlasInt -using LoopVectorization -using TriangularSolve -using RecursiveFactorization -using Octavian - -using DataFlowTasks -using DataFlowTasks: R,W,RW - -function schur_complement!(C,A,B,tturbo::Val{T}) where {T} - # RecursiveFactorization.schur_complement!(C,A,B,tturbo) # usually slower than Octavian - if T - Octavian.matmul!(C,A,B,-1,1) - else - Octavian.matmul_serial!(C,A,B,-1,1) - end -end - -const TILESIZE = Ref(256) - -include("tiledmatrix.jl") -include("cholesky.jl") -include("lu.jl") - -export - PseudoTiledMatrix - -end diff --git a/src/TiledFactorization/cholesky.jl b/src/TiledFactorization/cholesky.jl deleted file mode 100644 index c3901050..00000000 --- a/src/TiledFactorization/cholesky.jl +++ /dev/null @@ -1,117 +0,0 @@ -#= - Tiled Cholesky factorization in pure Julia. The serial performance - essentially comes from the `TriangularSolve` and `LoopVectorization` - packages. The parallelizatino is handled by `DataFlowTask`s. -=# - -cholesky(A::Matrix,args...) = cholesky!(deepcopy(A),args...) - -function cholesky!(A::Matrix,s=TILESIZE[],tturbo::Val{T}=Val(false)) where {T} - _cholesky!(PseudoTiledMatrix(A,s),tturbo) -end - -# tiled cholesky factorization -function _cholesky!(A::PseudoTiledMatrix,tturbo::Val{T}=Val(false)) where {T} - m,n = size(A) # number of blocks - for i in 1:m - # _chol!(A[i,i],UpperTriangular) - @dspawn _chol!(A[i,i],UpperTriangular,tturbo) (A[i,i],) (RW,) - Aii = A[i,i] - U = UpperTriangular(Aii) - L = adjoint(U) - for j in i+1:n - Aij = A[i,j] - # TriangularSolve.ldiv!(L,Aij,tturbo) - @dspawn TriangularSolve.ldiv!(L,Aij,tturbo) (Aii,Aij) (R,RW) - end - for j in i+1:m - Aij = A[i,j] - for k in j:n - # TODO: for k = j, only the upper part needs to be updated, - # dividing the cost of that operation by two - Ajk = A[j,k] - Aji = adjoint(Aij) - Aik = A[i,k] - # schur_complement!(Ajk,Aji,Aik,tturbo) - @dspawn schur_complement!(Ajk,Aji,Aik,tturbo) (Ajk,Aij,Aik) (RW,R,R) - end - end - end - # wait for all computations before returning - DataFlowTasks.sync() - return Cholesky(A.data,'U',zero(BlasInt)) -end - -# a fork-join approach for comparison with the data-flow parallelism -function _cholesky_forkjoin!(A::PseudoTiledMatrix,tturbo::Val{T}=Val(false)) where {T} - m,n = size(A) # number of blocks - for i in 1:m - _chol!(A[i,i],UpperTriangular,tturbo) - Aii = A[i,i] - U = UpperTriangular(Aii) - L = adjoint(U) - Threads.@threads for j in i+1:n - Aij = A[i,j] - TriangularSolve.ldiv!(L,Aij,tturbo) - end - # spawn m*(m+1)/2 tasks and sync them at the end - @sync for j in i+1:m - Aij = A[i,j] - for k in j:n - Ajk = A[j,k] - Aji = adjoint(Aij) - Aik = A[i,k] - Threads.@spawn schur_complement!(Ajk,Aji,Aik,tturbo) - end - end - end - return Cholesky(A.data,'U',zero(Int32)) -end - -# Modified from the generic version from LinearAlgebra (MIT license). -function _chol!(A::AbstractMatrix{<:Real}, ::Type{UpperTriangular},tturbo::Val{T}=Val(false)) where {T} - Base.require_one_based_indexing(A) - n = LinearAlgebra.checksquare(A) - @inbounds begin - for k = 1:n - Akk = A[k,k] - for i = 1:k - 1 - Akk -= A[i,k]*A[i,k] - end - A[k,k] = Akk - Akk, info = _chol!(Akk, UpperTriangular) - if info != 0 - return UpperTriangular(A), info - end - A[k,k] = Akk - AkkInv = inv(Akk') - if T - @tturbo warn_check_args=false for j = k + 1:n - for i = 1:k - 1 - A[k,j] -= A[i,k]*A[i,j] - end - end - @tturbo warn_check_args=false for j in k+1:n - A[k,j] = AkkInv*A[k,j] - end - else - @turbo warn_check_args=false for j = k + 1:n - for i = 1:k - 1 - A[k,j] -= A[i,k]*A[i,j] - end - end - @turbo warn_check_args=false for j in k+1:n - A[k,j] = AkkInv*A[k,j] - end - end - end - end - return UpperTriangular(A), convert(Int32, 0) -end -## Numbers -function _chol!(x::Number, uplo) - rx = real(x) - rxr = sqrt(abs(rx)) - rval = convert(promote_type(typeof(x), typeof(rxr)), rxr) - rx == abs(x) ? (rval, convert(Int32, 0)) : (rval, convert(Int32, 1)) -end diff --git a/src/TiledFactorization/lu.jl b/src/TiledFactorization/lu.jl deleted file mode 100644 index d5f7f0e2..00000000 --- a/src/TiledFactorization/lu.jl +++ /dev/null @@ -1,73 +0,0 @@ -#= - Tiled LU factorization in pure Julia. The serial performance essentially - comes from the `TriangularSolve` and `LoopVectorization`, and - `RecursiveFactorization` packages. The parallelization is handled by - `DataFlowTask`s. -=# - -lu(A::Matrix,args...) = lu!(deepcopy(A),args...) - -function lu!(A::Matrix,s=TILESIZE[],tturbo::Val{T}=Val(false)) where {T} - _lu!(PseudoTiledMatrix(A,s),tturbo) -end - -function _lu!(A::PseudoTiledMatrix,tturbo::Val{T}=Val(false)) where {T} - m,n = size(A) - for i in 1:m - Aii = A[i,i] - # TODO: for simplicity, no pivot is allowed. Pivoting the diagonal - # blocks requires permuting the corresponding row/columns before continuining - @dspawn RecursiveFactorization.lu!(Aii,NoPivot(),tturbo) (Aii,) (RW,) - # @dspawn LinearAlgebra.lu!(Aii) (Aii,) (RW,) - for j in i+1:n - Aij = A[i,j] - Aji = A[j,i] - @dspawn begin - TriangularSolve.ldiv!(UnitLowerTriangular(Aii),Aij,tturbo) - TriangularSolve.rdiv!(Aji,UpperTriangular(Aii),tturbo) - end (Aii,Aij,Aji) (R,RW,RW) - # TriangularSolve.ldiv!(UnitLowerTriangular(Aii),Aij,tturbo) - # TriangularSolve.rdiv!(Aji,UpperTriangular(Aii),tturbo) - end - for j in i+1:m - for k in i+1:n - Ajk = A[j,k] - Aji = A[j,i] - Aik = A[i,k] - @dspawn schur_complement!(Ajk,Aji,Aik,tturbo) (Ajk,Aji,Aik) (RW,R,R) - # schur_complement!(Ajk,Aji,Aik,tturbo) - end - end - end - # wait for all computations before returning - DataFlowTasks.sync() - return LU(A.data,BlasInt[],zero(BlasInt)) -end - -# a fork-join approach for comparison with the data-flow parallelism -function _lu_forkjoin!(A::PseudoTiledMatrix,tturbo::Val{T}=Val(false)) where {T} - m,n = size(A) - for i in 1:m - Aii = A[i,i] - # FIXME: for simplicity, no pivot is allowed. Pivoting the diagonal - # blocks requires permuting the corresponding row/columns before continuining - RecursiveFactorization.lu!(Aii,NoPivot(),tturbo) - Threads.@threads for j in i+1:n - Aij = A[i,j] - Aji = A[j,i] - TriangularSolve.ldiv!(UnitLowerTriangular(Aii),Aij,tturbo) - TriangularSolve.rdiv!(Aji,UpperTriangular(Aii)) - end - @sync for j in i+1:m - for k in i+1:n - Ajk = A[j,k] - Aji = A[j,i] - Aik = A[i,k] - Threads.@spawn schur_complement!(Ajk,Aji,Aik,tturbo) - end - end - end - # wait for all computations before returning - DataFlowTasks.sync() - return LU(A.data,BlasInt[],zero(BlasInt)) -end diff --git a/src/TiledFactorization/tiledmatrix.jl b/src/TiledFactorization/tiledmatrix.jl deleted file mode 100644 index 3b29bfa0..00000000 --- a/src/TiledFactorization/tiledmatrix.jl +++ /dev/null @@ -1,34 +0,0 @@ -""" - PseudoTiledMatrix(data::Matrix,sz::Int) - -Wrap a `Matrix` in a tiled structure of size `sz`, where `getindex(A,i,j)` -returns a view of the `(i,j)` block (of size `(sz × sz)`). No copy of `data` is -made, but the elements in a block are not continguos in memory. If `sz` is not a -divisor of the matrix size, one last row/column block will be included of size -given by the remainder fo the division. -""" -struct PseudoTiledMatrix{T} - data::Matrix{T} - # rows/columns of block i are given by partition[i]:partition[i+1]-1 - partition::Vector{Int} - function PseudoTiledMatrix(data::Matrix,s::Int) - T = eltype(data) - m,n = size(data) - p = 1:s:m |> collect - push!(p,m+1) - new{T}(data,p) - end -end - -function Base.size(A::PseudoTiledMatrix) - length(A.partition)-1,length(A.partition)-1 -end - -function Base.getindex(A::PseudoTiledMatrix,i::Int,j::Int) - @assert 1 ≤ i ≤ size(A)[1] - @assert 1 ≤ j ≤ size(A)[2] - p = A.partition - I = p[i]:(p[i+1]-1) - J = p[j]:(p[j+1]-1) - view(A.data,I,J) -end diff --git a/test/Project.toml b/test/Project.toml index 547d2b8c..7dc2899f 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,6 +1,4 @@ [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -TriangularSolve = "d5829a12-d9aa-46ab-831f-fb7c9ab06edf" diff --git a/test/juliascheduler_test.jl b/test/juliascheduler_test.jl index 9fad6cfd..d16bfa77 100644 --- a/test/juliascheduler_test.jl +++ b/test/juliascheduler_test.jl @@ -2,7 +2,6 @@ using Test using DataFlowTasks using DataFlowTasks: R,W,RW using LinearAlgebra -using DataFlowTasks.TiledFactorization include(joinpath(DataFlowTasks.PROJECT_ROOT,"test","testutils.jl")) @@ -39,22 +38,4 @@ DataFlowTasks.setscheduler!(sch) @test abs(t1-t2) < 1e-2 end - @testset "Tiled cholesky factorization" begin - m = 100 - bsize = div(m,5) - # create an SPD matrix - A = rand(m,m) - A = (A + adjoint(A))/2 - A = A + m*I - F = TiledFactorization.cholesky(A) - @test F.L*F.U ≈ A - end - - @testset "Tiled lu factorization" begin - m = 100 - bsize = div(m,5) - A = rand(m,m) - F = TiledFactorization.lu(A) - @test F.L*F.U ≈ A - end end diff --git a/test/logging_dag_test.jl b/test/logging_dag_test.jl new file mode 100644 index 00000000..f0f3e175 --- /dev/null +++ b/test/logging_dag_test.jl @@ -0,0 +1,47 @@ +using Test +using DataFlowTasks +using DataFlowTasks: Logger, parse!, PlotFinished, PlotRunnable, TiledFactorization +using LinearAlgebra +using Logging +using Plots + +GC.gc() + +sch = DataFlowTasks.JuliaScheduler(1000) +# sch = DataFlowTasks.PriorityScheduler(200,true) +DataFlowTasks.setscheduler!(sch) + +include(joinpath(DataFlowTasks.PROJECT_ROOT,"test","testutils.jl")) + +m = 4000 +A = rand(m,m) +A = (A+A') + 2*size(A,1)*I + +TiledFactorization.TILESIZE[] = 256 +io = open("forkjoin.log","w+") +logger = Logger(io) +TiledFactorization.cholesky!(copy(A)) # run once to compile +DataFlowTasks.TASKCOUNTER[] = 0 +with_logger(logger) do + DataFlowTasks.reset_timer!(logger) + F = TiledFactorization.cholesky!(A) +end +parse!(logger) + +# plot(PlotRunnable(),logger) +# plot(PlotFinished(),logger) +@info DataFlowTasks.TASKCOUNTER[] +plot(logger) + +# tl = logger.tasklogs +# chol = filter(t-> occursin("chol",t.task_label),tl) +# schur = filter(t-> occursin("schur",t.task_label),tl) +# ldiv = filter(t-> occursin("ldiv",t.task_label),tl) + +# t_chol = map(t -> (t.time_finish-t.time_start)/1e9,chol) +# t_ldiv = map(t -> (t.time_finish-t.time_start)/1e9,ldiv) +# t_schur = map(t -> (t.time_finish-t.time_start)/1e9,schur) +# histogram(t_chol,label="cholesky",xlabel="time (s)",alpha=0.8) +# histogram!(t_ldiv,label="ldiv",xlabel="time (s)",alpha=0.5) +# histogram!(t_schur,label="schur",xlabel="time (s)",alpha=0.5) +# plot(p1,p2,p3,layout=(1,3)) diff --git a/test/priorityscheduler_test.jl b/test/priorityscheduler_test.jl index 9731d058..4f3f465c 100644 --- a/test/priorityscheduler_test.jl +++ b/test/priorityscheduler_test.jl @@ -2,7 +2,6 @@ using Test using DataFlowTasks using DataFlowTasks: R,W,RW using LinearAlgebra -using DataFlowTasks.TiledFactorization background = false sch = DataFlowTasks.PriorityScheduler(100,background) @@ -23,23 +22,4 @@ include(joinpath(DataFlowTasks.PROJECT_ROOT,"test","testutils.jl")) @test abs(t1-t2) < 1e-2 end - @testset "Tiled cholesky factorization" begin - m = 100 - bsize = div(m,5) - # create an SPD matrix - A = rand(m,m) - A = (A + adjoint(A))/2 - A = A + m*I - F = TiledFactorization.cholesky(A) - @test F.L*F.U ≈ A - end - - @testset "Tiled lu factorization" begin - m = 100 - bsize = div(m,5) - A = rand(m,m) - F = TiledFactorization.lu(A) - @test F.L*F.U ≈ A - end - end