diff --git a/Project.toml b/Project.toml
index 741de8de..82cb74c3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,13 +8,9 @@ DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
-Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
-RecursiveFactorization = "f2c3362d-daeb-58d1-803e-2bc74f2840b4"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 ThreadPools = "b189fb0b-2eb5-4ed4-bc0c-d34c51242431"
-TriangularSolve = "d5829a12-d9aa-46ab-831f-fb7c9ab06edf"
 
 [compat]
 julia = "^1"
diff --git a/docs/src/references.md b/docs/src/references.md
index b1d5890d..bbe7d232 100644
--- a/docs/src/references.md
+++ b/docs/src/references.md
@@ -1,6 +1,5 @@
 # [References](@id references-section)
 
 ```@autodocs
-Modules = [DataFlowTasks,
-           DataFlowTasks.TiledFactorization]
+Modules = [DataFlowTasks]
 ```
diff --git a/src/DataFlowTasks.jl b/src/DataFlowTasks.jl
index ffcadb0f..d9f6462c 100644
--- a/src/DataFlowTasks.jl
+++ b/src/DataFlowTasks.jl
@@ -43,9 +43,6 @@ export
     @dasync,
     @dspawn
 
-# a module showcasing some use cases. To be removed eventually...
-include("TiledFactorization/TiledFactorization.jl")
-
 function __init__()
     # default scheduler
     capacity = 50
diff --git a/src/TiledFactorization/TiledFactorization.jl b/src/TiledFactorization/TiledFactorization.jl
deleted file mode 100644
index 62dae72f..00000000
--- a/src/TiledFactorization/TiledFactorization.jl
+++ /dev/null
@@ -1,36 +0,0 @@
-"""
-    module TiledFactorization
-
-Tiled algorithms for factoring dense matrices.
-"""
-module TiledFactorization
-
-using LinearAlgebra
-using LinearAlgebra: BlasInt
-using LoopVectorization
-using TriangularSolve
-using RecursiveFactorization
-using Octavian
-
-using DataFlowTasks
-using DataFlowTasks: R,W,RW
-
-function schur_complement!(C,A,B,tturbo::Val{T}) where {T}
-    # RecursiveFactorization.schur_complement!(C,A,B,tturbo) # usually slower than Octavian
-    if T
-        Octavian.matmul!(C,A,B,-1,1)
-    else
-        Octavian.matmul_serial!(C,A,B,-1,1)
-    end
-end
-
-const TILESIZE = Ref(256)
-
-include("tiledmatrix.jl")
-include("cholesky.jl")
-include("lu.jl")
-
-export
-    PseudoTiledMatrix
-
-end
diff --git a/src/TiledFactorization/cholesky.jl b/src/TiledFactorization/cholesky.jl
deleted file mode 100644
index c3901050..00000000
--- a/src/TiledFactorization/cholesky.jl
+++ /dev/null
@@ -1,117 +0,0 @@
-#=
-    Tiled Cholesky factorization in pure Julia. The serial performance
-    essentially comes from the `TriangularSolve` and `LoopVectorization`
-    packages. The parallelization is handled by `DataFlowTask`s.
-=#
-
-cholesky(A::Matrix,args...) = cholesky!(deepcopy(A),args...)
-
-function cholesky!(A::Matrix,s=TILESIZE[],tturbo::Val{T}=Val(false)) where {T}
-    _cholesky!(PseudoTiledMatrix(A,s),tturbo)
-end
-
-# tiled cholesky factorization
-function _cholesky!(A::PseudoTiledMatrix,tturbo::Val{T}=Val(false)) where {T}
-    m,n = size(A) # number of blocks
-    for i in 1:m
-        # _chol!(A[i,i],UpperTriangular)
-        @dspawn _chol!(A[i,i],UpperTriangular,tturbo) (A[i,i],) (RW,)
-        Aii = A[i,i]
-        U = UpperTriangular(Aii)
-        L = adjoint(U)
-        for j in i+1:n
-            Aij = A[i,j]
-            # TriangularSolve.ldiv!(L,Aij,tturbo)
-            @dspawn TriangularSolve.ldiv!(L,Aij,tturbo) (Aii,Aij) (R,RW)
-        end
-        for j in i+1:m
-            Aij = A[i,j]
-            for k in j:n
-                # TODO: for k = j, only the upper part needs to be updated,
-                # dividing the cost of that operation by two
-                Ajk = A[j,k]
-                Aji = adjoint(Aij)
-                Aik = A[i,k]
-                # schur_complement!(Ajk,Aji,Aik,tturbo)
-                @dspawn schur_complement!(Ajk,Aji,Aik,tturbo) (Ajk,Aij,Aik) (RW,R,R)
-            end
-        end
-    end
-    # wait for all computations before returning
-    DataFlowTasks.sync()
-    return Cholesky(A.data,'U',zero(BlasInt))
-end
-
-# a fork-join approach for comparison with the data-flow parallelism
-function _cholesky_forkjoin!(A::PseudoTiledMatrix,tturbo::Val{T}=Val(false)) where {T}
-    m,n = size(A) # number of blocks
-    for i in 1:m
-        _chol!(A[i,i],UpperTriangular,tturbo)
-        Aii = A[i,i]
-        U = UpperTriangular(Aii)
-        L = adjoint(U)
-        Threads.@threads for j in i+1:n
-            Aij = A[i,j]
-            TriangularSolve.ldiv!(L,Aij,tturbo)
-        end
-        # spawn m*(m+1)/2 tasks and sync them at the end
-        @sync for j in i+1:m
-            Aij = A[i,j]
-            for k in j:n
-                Ajk = A[j,k]
-                Aji = adjoint(Aij)
-                Aik = A[i,k]
-                Threads.@spawn schur_complement!(Ajk,Aji,Aik,tturbo)
-            end
-        end
-    end
-    return Cholesky(A.data,'U',zero(Int32))
-end
-
-# Modified from the generic version from LinearAlgebra (MIT license).
-function _chol!(A::AbstractMatrix{<:Real}, ::Type{UpperTriangular},tturbo::Val{T}=Val(false)) where {T}
-    Base.require_one_based_indexing(A)
-    n = LinearAlgebra.checksquare(A)
-    @inbounds begin
-        for k = 1:n
-            Akk = A[k,k]
-            for i = 1:k - 1
-                Akk -= A[i,k]*A[i,k]
-            end
-            A[k,k] = Akk
-            Akk, info = _chol!(Akk, UpperTriangular)
-            if info != 0
-                return UpperTriangular(A), info
-            end
-            A[k,k] = Akk
-            AkkInv = inv(Akk')
-            if T
-                @tturbo warn_check_args=false for j = k + 1:n
-                    for i = 1:k - 1
-                        A[k,j] -= A[i,k]*A[i,j]
-                    end
-                end
-                @tturbo warn_check_args=false for j in k+1:n
-                    A[k,j] = AkkInv*A[k,j]
-                end
-            else
-                @turbo warn_check_args=false for j = k + 1:n
-                    for i = 1:k - 1
-                        A[k,j] -= A[i,k]*A[i,j]
-                    end
-                end
-                @turbo warn_check_args=false for j in k+1:n
-                    A[k,j] = AkkInv*A[k,j]
-                end
-            end
-        end
-    end
-    return UpperTriangular(A), convert(Int32, 0)
-end
-## Numbers
-function _chol!(x::Number, uplo)
-    rx = real(x)
-    rxr = sqrt(abs(rx))
-    rval = convert(promote_type(typeof(x), typeof(rxr)), rxr)
-    rx == abs(x) ? (rval, convert(Int32, 0)) : (rval, convert(Int32, 1))
-end
diff --git a/src/TiledFactorization/lu.jl b/src/TiledFactorization/lu.jl
deleted file mode 100644
index d5f7f0e2..00000000
--- a/src/TiledFactorization/lu.jl
+++ /dev/null
@@ -1,73 +0,0 @@
-#=
-    Tiled LU factorization in pure Julia. The serial performance essentially
-    comes from the `TriangularSolve`, `LoopVectorization`, and
-    `RecursiveFactorization` packages. The parallelization is handled by
-    `DataFlowTask`s.
-=#
-
-lu(A::Matrix,args...) = lu!(deepcopy(A),args...)
-
-function lu!(A::Matrix,s=TILESIZE[],tturbo::Val{T}=Val(false)) where {T}
-    _lu!(PseudoTiledMatrix(A,s),tturbo)
-end
-
-function _lu!(A::PseudoTiledMatrix,tturbo::Val{T}=Val(false)) where {T}
-    m,n = size(A)
-    for i in 1:m
-        Aii = A[i,i]
-        # TODO: for simplicity, no pivot is allowed. Pivoting the diagonal
-        # blocks requires permuting the corresponding row/columns before continuing
-        @dspawn RecursiveFactorization.lu!(Aii,NoPivot(),tturbo) (Aii,) (RW,)
-        # @dspawn LinearAlgebra.lu!(Aii) (Aii,) (RW,)
-        for j in i+1:n
-            Aij = A[i,j]
-            Aji = A[j,i]
-            @dspawn begin
-                TriangularSolve.ldiv!(UnitLowerTriangular(Aii),Aij,tturbo)
-                TriangularSolve.rdiv!(Aji,UpperTriangular(Aii),tturbo)
-            end (Aii,Aij,Aji) (R,RW,RW)
-            # TriangularSolve.ldiv!(UnitLowerTriangular(Aii),Aij,tturbo)
-            # TriangularSolve.rdiv!(Aji,UpperTriangular(Aii),tturbo)
-        end
-        for j in i+1:m
-            for k in i+1:n
-                Ajk = A[j,k]
-                Aji = A[j,i]
-                Aik = A[i,k]
-                @dspawn schur_complement!(Ajk,Aji,Aik,tturbo) (Ajk,Aji,Aik) (RW,R,R)
-                # schur_complement!(Ajk,Aji,Aik,tturbo)
-            end
-        end
-    end
-    # wait for all computations before returning
-    DataFlowTasks.sync()
-    return LU(A.data,BlasInt[],zero(BlasInt))
-end
-
-# a fork-join approach for comparison with the data-flow parallelism
-function _lu_forkjoin!(A::PseudoTiledMatrix,tturbo::Val{T}=Val(false)) where {T}
-    m,n = size(A)
-    for i in 1:m
-        Aii = A[i,i]
-        # FIXME: for simplicity, no pivot is allowed. Pivoting the diagonal
-        # blocks requires permuting the corresponding row/columns before continuing
-        RecursiveFactorization.lu!(Aii,NoPivot(),tturbo)
-        Threads.@threads for j in i+1:n
-            Aij = A[i,j]
-            Aji = A[j,i]
-            TriangularSolve.ldiv!(UnitLowerTriangular(Aii),Aij,tturbo)
-            TriangularSolve.rdiv!(Aji,UpperTriangular(Aii))
-        end
-        @sync for j in i+1:m
-            for k in i+1:n
-                Ajk = A[j,k]
-                Aji = A[j,i]
-                Aik = A[i,k]
-                Threads.@spawn schur_complement!(Ajk,Aji,Aik,tturbo)
-            end
-        end
-    end
-    # wait for all computations before returning
-    DataFlowTasks.sync()
-    return LU(A.data,BlasInt[],zero(BlasInt))
-end
diff --git a/src/TiledFactorization/tiledmatrix.jl b/src/TiledFactorization/tiledmatrix.jl
deleted file mode 100644
index 3b29bfa0..00000000
--- a/src/TiledFactorization/tiledmatrix.jl
+++ /dev/null
@@ -1,34 +0,0 @@
-"""
-    PseudoTiledMatrix(data::Matrix,sz::Int)
-
-Wrap a `Matrix` in a tiled structure of size `sz`, where `getindex(A,i,j)`
-returns a view of the `(i,j)` block (of size `(sz × sz)`). No copy of `data` is
-made, but the elements in a block are not contiguous in memory. If `sz` is not
-a divisor of the matrix size, a final row/column block of size given by the
-remainder of the division will be included.
-"""
-struct PseudoTiledMatrix{T}
-    data::Matrix{T}
-    # rows/columns of block i are given by partition[i]:partition[i+1]-1
-    partition::Vector{Int}
-    function PseudoTiledMatrix(data::Matrix,s::Int)
-        T = eltype(data)
-        m,n = size(data)
-        p = 1:s:m |> collect
-        push!(p,m+1)
-        new{T}(data,p)
-    end
-end
-
-function Base.size(A::PseudoTiledMatrix)
-    length(A.partition)-1,length(A.partition)-1
-end
-
-function Base.getindex(A::PseudoTiledMatrix,i::Int,j::Int)
-    @assert 1 ≤ i ≤ size(A)[1]
-    @assert 1 ≤ j ≤ size(A)[2]
-    p = A.partition
-    I = p[i]:(p[i+1]-1)
-    J = p[j]:(p[j+1]-1)
-    view(A.data,I,J)
-end
diff --git a/test/Project.toml b/test/Project.toml
index 547d2b8c..7dc2899f 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,6 +1,4 @@
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-TriangularSolve = "d5829a12-d9aa-46ab-831f-fb7c9ab06edf"
diff --git a/test/juliascheduler_test.jl b/test/juliascheduler_test.jl
index 9fad6cfd..d16bfa77 100644
--- a/test/juliascheduler_test.jl
+++ b/test/juliascheduler_test.jl
@@ -2,7 +2,6 @@ using Test
 using DataFlowTasks
 using DataFlowTasks: R,W,RW
 using LinearAlgebra
-using DataFlowTasks.TiledFactorization
 
 include(joinpath(DataFlowTasks.PROJECT_ROOT,"test","testutils.jl"))
 
@@ -39,22 +38,4 @@
         @test abs(t1-t2) < 1e-2
     end
 
-    @testset "Tiled cholesky factorization" begin
-        m = 100
-        bsize = div(m,5)
-        # create an SPD matrix
-        A = rand(m,m)
-        A = (A + adjoint(A))/2
-        A = A + m*I
-        F = TiledFactorization.cholesky(A)
-        @test F.L*F.U ≈ A
-    end
-
-    @testset "Tiled lu factorization" begin
-        m = 100
-        bsize = div(m,5)
-        A = rand(m,m)
-        F = TiledFactorization.lu(A)
-        @test F.L*F.U ≈ A
-    end
 end
diff --git a/test/logging_dag_test.jl b/test/logging_dag_test.jl
new file mode 100644
index 00000000..f0f3e175
--- /dev/null
+++ b/test/logging_dag_test.jl
@@ -0,0 +1,69 @@
+using Test
+using DataFlowTasks
+using DataFlowTasks: Logger, parse!, PlotFinished, PlotRunnable, R, W, RW
+using LinearAlgebra
+using Logging
+using Plots
+
+GC.gc()
+
+sch = DataFlowTasks.JuliaScheduler(1000)
+# sch = DataFlowTasks.PriorityScheduler(200,true)
+DataFlowTasks.setscheduler!(sch)
+
+include(joinpath(DataFlowTasks.PROJECT_ROOT,"test","testutils.jl"))
+
+# a self-contained tiled Cholesky workload to exercise the logger. The
+# TiledFactorization demo module no longer ships with the package, so the same
+# task pattern is reproduced here with `@dspawn` and stdlib LinearAlgebra. For
+# simplicity the tile size `s` must divide the matrix size.
+function tiled_cholesky!(A::Matrix,s::Int)
+    m = size(A,1)
+    @assert m % s == 0
+    n = div(m,s)
+    B = [view(A,(i-1)*s+1:i*s,(j-1)*s+1:j*s) for i in 1:n, j in 1:n]
+    for i in 1:n
+        @dspawn cholesky!(Hermitian(B[i,i])) (B[i,i],) (RW,)
+        for j in i+1:n
+            @dspawn ldiv!(UpperTriangular(B[i,i])',B[i,j]) (B[i,i],B[i,j]) (R,RW)
+        end
+        for j in i+1:n, k in j:n
+            @dspawn mul!(B[j,k],adjoint(B[i,j]),B[i,k],-1,1) (B[j,k],B[i,j],B[i,k]) (RW,R,R)
+        end
+    end
+    DataFlowTasks.sync()
+    return A
+end
+
+m = 4000
+A = rand(m,m)
+A = (A+A') + 2*size(A,1)*I
+
+tilesize = 250 # must divide m
+io = open("forkjoin.log","w+")
+logger = Logger(io)
+tiled_cholesky!(copy(A),tilesize) # run once to compile
+DataFlowTasks.TASKCOUNTER[] = 0
+with_logger(logger) do
+    DataFlowTasks.reset_timer!(logger)
+    tiled_cholesky!(A,tilesize)
+end
+parse!(logger)
+
+# plot(PlotRunnable(),logger)
+# plot(PlotFinished(),logger)
+@info DataFlowTasks.TASKCOUNTER[]
+plot(logger)
+
+# tl   = logger.tasklogs
+# chol = filter(t -> occursin("chol",t.task_label),tl)
+# mul  = filter(t -> occursin("mul",t.task_label),tl)
+# ldiv = filter(t -> occursin("ldiv",t.task_label),tl)
+
+# t_chol = map(t -> (t.time_finish-t.time_start)/1e9,chol)
+# t_ldiv = map(t -> (t.time_finish-t.time_start)/1e9,ldiv)
+# t_mul  = map(t -> (t.time_finish-t.time_start)/1e9,mul)
+# histogram(t_chol,label="cholesky",xlabel="time (s)",alpha=0.8)
+# histogram!(t_ldiv,label="ldiv",xlabel="time (s)",alpha=0.5)
+# histogram!(t_mul,label="mul",xlabel="time (s)",alpha=0.5)
+# plot(p1,p2,p3,layout=(1,3))
diff --git a/test/priorityscheduler_test.jl b/test/priorityscheduler_test.jl
index 9731d058..4f3f465c 100644
--- a/test/priorityscheduler_test.jl
+++ b/test/priorityscheduler_test.jl
@@ -2,7 +2,6 @@ using Test
 using DataFlowTasks
 using DataFlowTasks: R,W,RW
 using LinearAlgebra
-using DataFlowTasks.TiledFactorization
 
 background = false
 sch = DataFlowTasks.PriorityScheduler(100,background)
@@ -23,23 +22,4 @@ include(joinpath(DataFlowTasks.PROJECT_ROOT,"test","testutils.jl"))
         @test abs(t1-t2) < 1e-2
     end
 
-    @testset "Tiled cholesky factorization" begin
-        m = 100
-        bsize = div(m,5)
-        # create an SPD matrix
-        A = rand(m,m)
-        A = (A + adjoint(A))/2
-        A = A + m*I
-        F = TiledFactorization.cholesky(A)
-        @test F.L*F.U ≈ A
-    end
-
-    @testset "Tiled lu factorization" begin
-        m = 100
-        bsize = div(m,5)
-        A = rand(m,m)
-        F = TiledFactorization.lu(A)
-        @test F.L*F.U ≈ A
-    end
-
 end
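
Note: with the `TiledFactorization` showcase module gone (along with the `LoopVectorization`, `Octavian`, `RecursiveFactorization`, and `TriangularSolve` dependencies), the tiled Cholesky/LU demos and their testsets leave the package. The data/mode annotation pattern they exercised remains available through the public `@dspawn` API. Below is a minimal sketch of that pattern; the array names and sizes are illustrative, not part of the package.

```julia
using DataFlowTasks
using DataFlowTasks: R, W, RW
using LinearAlgebra

a = ones(10_000)
b = zeros(10_000)

# task 1 mutates `a`; task 2 reads `a` and writes `b`. The scheduler infers
# the read-after-write dependency from the annotated data and access modes,
# so task 2 only runs once task 1 has finished.
@dspawn rmul!(a,2) (a,) (RW,)
@dspawn begin
    b .= a .+ 1
end (a,b) (R,W)
DataFlowTasks.sync() # wait for all spawned tasks
@assert all(b .== 3)
```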