Fix LAPACK non-orthogonality (#842)

Co-authored-by: Michael F. Herbst <[email protected]>
JuliaMolSim · Apr 14, 2023 · 6a8c25c · 6a8c25c
1 parent f31d9a9
commit 6a8c25c
Show file tree

Hide file tree

Showing 9 changed files with 80 additions and 40 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -9,7 +9,8 @@ stages:
 julia/1.8-n2:
   stage: test
   variables:
-    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 16 --gres=gpu:a100:1 --qos=devel -p dgx -t 00:15:00 -A hpc-prf-dftkjl"
+    # SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 16 --gres=gpu:a100:1 --qos=devel -p dgx -t 00:15:00 -A hpc-prf-dftkjl"
+    SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 16 --gres=gpu:a100:1             -p gpu -t 00:30:00 -A hpc-prf-dftkjl"
     JULIA_NUM_THREADS: "1"  # GPU and multi-threading not yet compatible
   coverage: '/\(\d+.\d+\%\) covered/'
   rules:

diff --git a/src/density_methods.jl b/src/density_methods.jl
@@ -87,7 +87,7 @@ function atomic_density(basis::PlaneWaveBasis, method::AtomicDensity, magnetic_m
     ρtot = atomic_total_density(basis, method)
     ρspin = atomic_spin_density(basis, method, magnetic_moments)
     ρ = ρ_from_total_and_spin(ρtot, ρspin)
-    
+
     N = sum(ρ) * basis.model.unit_cell_volume / prod(basis.fft_size)
 
     if !isnothing(n_electrons) && (N > 0)

diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
@@ -28,6 +28,10 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
         end
         # Get ψguessk
         if !isnothing(ψguess)
+            if n_Gk != size(ψguess[ik], 1)
+                error("Mismatch in dimension between guess ($(size(ψguess, 1)) and " *
+                      "Hamiltonian $n_Gk")
+            end
             nev_guess = size(ψguess[ik], 2)
             if nev_guess > nev_per_kpoint
                 ψguessk = ψguess[ik][:, 1:nev_per_kpoint]

diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
@@ -39,6 +39,7 @@
 ## TODO it seems there is a lack of orthogonalization immediately after locking, maybe investigate this to save on some choleskys
 ## TODO debug orthogonalizations when A=I
 
+# TODO Use @debug for this
 # vprintln(args...) = println(args...)  # Uncomment for output
 vprintln(args...) = nothing
 
@@ -54,7 +55,7 @@ involving this structure will always yield a plain array (and not a LazyHcat str
 LazyHcat is a lightweight subset of BlockArrays.jl's functionalities, but has the
 advantage to be able to store GPU Arrays (BlockArrays is heavily built on Julia's CPU Array).
 """
-struct LazyHcat{T <: Number, D <: Tuple} <: AbstractMatrix{T}
+struct LazyHcat{T<:Number, D<:Tuple} <: AbstractMatrix{T}
     blocks::D
 end
 
@@ -78,16 +79,16 @@ Base.Array(A::LazyHcat)  = hcat(A.blocks...)
 
 Base.adjoint(A::LazyHcat) = Adjoint(A)
 
-@views function Base.:*(Aadj::Adjoint{T, <: LazyHcat}, B::LazyHcat) where {T}
+@views function Base.:*(Aadj::Adjoint{T,<:LazyHcat}, B::LazyHcat) where {T}
     A = Aadj.parent
     rows = size(A)[2]
     cols = size(B)[2]
     ret = similar(A.blocks[1], rows, cols)
 
     orow = 0  # row offset
-    for (iA, blA) in enumerate(A.blocks)
+    for blA in A.blocks
         ocol = 0  # column offset
-        for (iB, blB) in enumerate(B.blocks)
+        for blB in B.blocks
             ret[orow .+ (1:size(blA, 2)), ocol .+ (1:size(blB, 2))] .= blA' * blB
             ocol += size(blB, 2)
         end
@@ -96,7 +97,7 @@ Base.adjoint(A::LazyHcat) = Adjoint(A)
     ret
 end
 
-Base.:*(Aadj::Adjoint{T, <: LazyHcat}, B::AbstractMatrix) where {T} = Aadj * LazyHcat(B)
+Base.:*(Aadj::Adjoint{T,<:LazyHcat}, B::AbstractMatrix) where {T} = Aadj * LazyHcat(B)
 
 @views function *(Ablock::LazyHcat, B::AbstractMatrix)
     res = Ablock.blocks[1] * B[1:size(Ablock.blocks[1], 2), :]  # First multiplication
@@ -117,8 +118,29 @@ end
 @timing function rayleigh_ritz(X, AX, N)
     XAX = X' * AX
     @assert all(!isnan, XAX)
-    F = eigen(Hermitian(XAX))
-    F.vectors[:,1:N], F.values[1:N]
+    rayleigh_ritz(Hermitian(XAX), N)
+end
+@views function rayleigh_ritz(XAX::Hermitian, N)
+    # Fallback: Use whatever is the default dense eigensolver.
+    # Note: GenericLinearAlgebra uses a QR-based algorithm, which is pretty safe in terms
+    #       of keeping the vectors orthogonal
+    values, vectors = eigen(XAX)
+    vectors[:, 1:N], values[1:N]
+end
+@views function rayleigh_ritz(XAX::Hermitian{T,<:Array}, N) where {T <: Union{ComplexF32,ComplexF64,
+                                                                              Float32,Float64}}
+    # LAPACK sysevr (the Julia default dense eigensolver) can actually return eigenvectors
+    # that are significantly non-orthogonal (1e-4 in Float32 in some tests) here,
+    # presumably because it tries hard to make them eigenvectors in the presence
+    # of small gaps. Since we care more about them being orthogonal
+    # than about them being eigenvectors, re-orthogonalize.
+    # TODO To avoid orthogonalization: Use syevd,
+    # which does a much better job, see https://github.com/JuliaLang/julia/pull/49262
+    # and https://github.com/JuliaLang/julia/pull/49355
+    values, vectors = eigen(XAX)
+    v = vectors[:, 1:N]
+    ortho!(v)
+    v, values[1:N]
 end
 
 # B-orthogonalize X (in place) using only one B apply.
@@ -146,7 +168,7 @@ normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M)))
     # U,S,V = svd(X)
     # return U*V', 1, 1
 
-    growth_factor = 1
+    growth_factor = one(real(T))
 
     success = false
     nchol = 0
@@ -209,7 +231,7 @@ normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M)))
 
     # @assert norm(X'X - I) < tol
 
-    X, nchol, growth_factor
+    (; X, nchol, growth_factor)
 end
 
 # Randomize the columns of X if the norm is below tol
@@ -262,7 +284,7 @@ end
 
         # If we're at a fixed point, growth_factor is 1 and if tol >
         # eps(), the loop will terminate, even if BY'Y != 0
-        growth_factor*eps(real(T)) < tol && break
+        growth_factor * eps(real(T)) < tol && break
 
         niter > 10 && error("Ortho(X,Y) is failing badly, this should never happen")
         niter += 1
@@ -372,14 +394,17 @@ end
         end
 
         ### Compute new residuals
-        new_R = new_AX .- new_BX .* λs'
+        @timing "Update residuals" begin
+            new_R = new_AX .- new_BX .* λs'
+            @views for i = 1:size(X, 2)
+                resid_history[i + nlocked, niter+1] = norm(new_R[:, i])
+            end
+        end
+        vprintln(niter, "   ", resid_history[:, niter+1])
+
         # it is actually a good question of knowing when to
         # precondition. Here seems sensible, but it could plausibly be
         # done before or after
-        @views for i=1:size(X,2)
-            resid_history[i + nlocked, niter+1] = norm(new_R[:, i])
-        end
-        vprintln(niter, "   ", resid_history[:, niter+1])
         if precon !== I
             @timing "preconditioning" begin
                 precondprep!(precon, X)  # update preconditioner if needed; defaults to noop
@@ -454,10 +479,6 @@ end
         # Quick sanity check
         for i = 1:size(X, 2)
             @views if abs(BX[:, i]'X[:, i] - 1) >= sqrt(eps(real(eltype(X))))
-                # TODO error-ing is too harsh here. Better throw some exception (e.g. in
-                # reduced precision this can be interpreted as an indicator to re-try the
-                # orthogonalisation in elevated precision or to stop the reduced precision
-                # iterations as a whole.
                 error("LOBPCG is badly failing to keep the vectors normalized; this should never happen")
             end
         end

diff --git a/src/external/jld2io.jl b/src/external/jld2io.jl
@@ -28,7 +28,7 @@ function save_scfres_master(file::AbstractString, scfres::NamedTuple, ::Val{:jld
 
     JLD2.jldopen(file, "w") do jld
         jld["__propertynames"] = propertynames(scfres)
-        jld["ρ_real"]          = scfres.ρ
+        jld["ρ"]               = scfres.ρ
         jld["basis"]           = scfres.basis
 
         for symbol in propertynames(scfres)
@@ -44,7 +44,7 @@ end
 function load_scfres(jld::JLD2.JLDFile)
     basis = jld["basis"]
     scfdict = Dict{Symbol, Any}(
-        :ρ     => jld["ρ_real"],
+        :ρ     => jld["ρ"],
         :basis => basis
     )
 

diff --git a/src/terms/xc.jl b/src/terms/xc.jl
@@ -28,12 +28,12 @@ end
 
 function (xc::Xc)(basis::PlaneWaveBasis{T}) where {T}
     isempty(xc.functionals) && return TermNoop()
+
     # Charge density for non-linear core correction
+    ρcore = nothing
     if any(a -> a.use_nlcc, basis.model.atoms)
         ρcore = ρ_from_total(basis, atomic_total_density(basis, CoreDensity()))
         minimum(ρcore) < -sqrt(eps(T)) && @warn("Negative ρcore detected: $(minimum(ρcore))")
-    else
-        ρcore = ρ_from_total(basis, zeros(T, basis.fft_size))
     end
     functionals = map(xc.functionals) do fun
         # Strip duals from functional parameters if needed
@@ -45,11 +45,11 @@ function (xc::Xc)(basis::PlaneWaveBasis{T}) where {T}
            T(xc.potential_threshold), ρcore)
 end
 
-struct TermXc{T} <: TermNonlinear where {T}
+struct TermXc{T,CT} <: TermNonlinear where {T,CT}
     functionals::Vector{Functional}
     scaling_factor::T
     potential_threshold::T
-    ρcore::Array{T,4}
+    ρcore::CT
 end
 
 function xc_potential_real(term::TermXc, basis::PlaneWaveBasis{T}, ψ, occupation;
@@ -62,7 +62,9 @@ function xc_potential_real(term::TermXc, basis::PlaneWaveBasis{T}, ψ, occupatio
     @assert all(family(xc) in (:lda, :gga, :mgga, :mggal) for xc in term.functionals)
 
     # Add the model core charge density (non-linear core correction)
-    ρ = ρ + term.ρcore
+    if !isnothing(term.ρcore)
+        ρ = ρ + term.ρcore
+    end
 
     # Compute kinetic energy density, if needed.
     if isnothing(τ) && any(needs_τ, term.functionals)

diff --git a/src/workarounds/cuda_arrays.jl b/src/workarounds/cuda_arrays.jl
@@ -1,14 +1,14 @@
 using LinearAlgebra
 
 # https://github.com/JuliaGPU/CUDA.jl/issues/1572
-function LinearAlgebra.eigen(A::Hermitian{T,AT}) where {T<:Complex,AT<:CUDA.CuArray}
-    vals, vects = CUDA.CUSOLVER.heevd!('V', 'U', A.data)
-    (vectors = vects, values = vals)
+function LinearAlgebra.eigen(A::Hermitian{<:Complex,<:CUDA.CuArray})
+    values, vectors = CUDA.CUSOLVER.heevd!('V', 'U', A.data)
+    (; values, vectors)
 end
 
-function LinearAlgebra.eigen(A::Hermitian{T,AT}) where {T<:Real,AT<:CUDA.CuArray}
-    vals, vects = CUDA.CUSOLVER.syevd!('V', 'U', A.data)
-    (vectors = vects, values = vals)
+function LinearAlgebra.eigen(A::Hermitian{<:Real,<:CUDA.CuArray})
+    values, vectors = CUDA.CUSOLVER.syevd!('V', 'U', A.data)
+    (; values, vectors)
 end
 
 synchronize_device(::GPU{<:CUDA.CuArray}) = CUDA.synchronize()

diff --git a/test/gpu.jl b/test/gpu.jl
@@ -10,7 +10,13 @@ include("testcases.jl")
     function run_problem(; architecture)
         model = model_PBE(silicon.lattice, silicon.atoms, silicon.positions)
         basis = PlaneWaveBasis(model; Ecut=10, kgrid=(3, 3, 3), architecture)
-        self_consistent_field(basis; tol=1e-10, solver=scf_damping_solver(1.0))
+
+        # TODO Right now guess generation on the GPU does not work
+        basis_cpu = PlaneWaveBasis(model; basis.Ecut, basis.kgrid)
+        ρ_guess = guess_density(basis_cpu)
+        ρ = DFTK.to_device(architecture, ρ_guess)
+
+        self_consistent_field(basis; tol=1e-10, solver=scf_damping_solver(1.0), ρ)
     end
 
     scfres_cpu = run_problem(; architecture=DFTK.CPU())
@@ -26,19 +32,25 @@ end
                           magnetic_moments, smearing=Smearing.Gaussian(), temperature=1e-3)
         basis = PlaneWaveBasis(model; Ecut=20, kgrid=(4, 4, 4), architecture)
 
-        ρ = guess_density(basis, magnetic_moments)
+        # TODO Right now guess generation on the GPU does not work
+        basis_cpu = PlaneWaveBasis(model; basis.Ecut, basis.kgrid)
+        ρ_guess = guess_density(basis_cpu, magnetic_moments)
+        ρ = DFTK.to_device(architecture, ρ_guess)
+        # ρ = guess_density(basis, magnetic_moments)
+
         # TODO Bump tolerance a bit here ... still leads to NaNs unfortunately
-        self_consistent_field(basis; ρ, tol=1e-8, mixing=KerkerMixing(),
+        self_consistent_field(basis; ρ, tol=1e-7, mixing=KerkerMixing(),
                               solver=scf_damping_solver(1.0))
     end
 
     scfres_cpu = run_problem(; architecture=DFTK.CPU())
     scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
-    @test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-8
-    @test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-7
+    @test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-7
+    @test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-6
 end
 
 
+# TODO Fix guess density generation
 # TODO Float32
 # TODO meta GGA
 # TODO Aluminium with LdosMixing

diff --git a/test/silicon_redHF.jl b/test/silicon_redHF.jl
@@ -1,7 +1,7 @@
 include("run_scf_and_compare.jl")
 include("testcases.jl")
-using DoubleFloats
 using GenericLinearAlgebra
+using DoubleFloats
 
 # Silicon redHF (without xc) is a metal, so we add a bit of temperature to it
 function run_silicon_redHF(T; Ecut=5, grid_size=15, spin_polarization=:none, kwargs...)