Skip to content

Commit

Permalink
Fix LAPACK non-orthogonality (#842)
Browse files Browse the repository at this point in the history
Co-authored-by: Michael F. Herbst <[email protected]>
  • Loading branch information
antoine-levitt and mfherbst authored Apr 14, 2023
1 parent f31d9a9 commit 6a8c25c
Show file tree
Hide file tree
Showing 9 changed files with 80 additions and 40 deletions.
3 changes: 2 additions & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ stages:
julia/1.8-n2:
stage: test
variables:
SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 16 --gres=gpu:a100:1 --qos=devel -p dgx -t 00:15:00 -A hpc-prf-dftkjl"
# SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 16 --gres=gpu:a100:1 --qos=devel -p dgx -t 00:15:00 -A hpc-prf-dftkjl"
SCHEDULER_PARAMETERS: "-N 1 -n 1 -c 16 --gres=gpu:a100:1 -p gpu -t 00:30:00 -A hpc-prf-dftkjl"
JULIA_NUM_THREADS: "1" # GPU and multi-threading not yet compatible
coverage: '/\(\d+.\d+\%\) covered/'
rules:
Expand Down
2 changes: 1 addition & 1 deletion src/density_methods.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ function atomic_density(basis::PlaneWaveBasis, method::AtomicDensity, magnetic_m
ρtot = atomic_total_density(basis, method)
ρspin = atomic_spin_density(basis, method, magnetic_moments)
ρ = ρ_from_total_and_spin(ρtot, ρspin)

N = sum(ρ) * basis.model.unit_cell_volume / prod(basis.fft_size)

if !isnothing(n_electrons) && (N > 0)
Expand Down
4 changes: 4 additions & 0 deletions src/eigen/diag.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
end
# Get ψguessk
if !isnothing(ψguess)
if n_Gk != size(ψguess[ik], 1)
error("Mismatch in dimension between guess ($(size(ψguess, 1)) and " *
"Hamiltonian $n_Gk")
end
nev_guess = size(ψguess[ik], 2)
if nev_guess > nev_per_kpoint
ψguessk = ψguess[ik][:, 1:nev_per_kpoint]
Expand Down
59 changes: 40 additions & 19 deletions src/eigen/lobpcg_hyper_impl.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
## TODO it seems there is a lack of orthogonalization immediately after locking, maybe investigate this to save on some choleskys
## TODO debug orthogonalizations when A=I

# TODO Use @debug for this
# vprintln(args...) = println(args...) # Uncomment for output
vprintln(args...) = nothing

Expand All @@ -54,7 +55,7 @@ involving this structure will always yield a plain array (and not a LazyHcat str
LazyHcat is a lightweight subset of BlockArrays.jl's functionalities, but has the
advantage to be able to store GPU Arrays (BlockArrays is heavily built on Julia's CPU Array).
"""
struct LazyHcat{T <: Number, D <: Tuple} <: AbstractMatrix{T}
struct LazyHcat{T<:Number, D<:Tuple} <: AbstractMatrix{T}
blocks::D
end

Expand All @@ -78,16 +79,16 @@ Base.Array(A::LazyHcat) = hcat(A.blocks...)

Base.adjoint(A::LazyHcat) = Adjoint(A)

@views function Base.:*(Aadj::Adjoint{T, <: LazyHcat}, B::LazyHcat) where {T}
@views function Base.:*(Aadj::Adjoint{T,<:LazyHcat}, B::LazyHcat) where {T}
A = Aadj.parent
rows = size(A)[2]
cols = size(B)[2]
ret = similar(A.blocks[1], rows, cols)

orow = 0 # row offset
for (iA, blA) in enumerate(A.blocks)
for blA in A.blocks
ocol = 0 # column offset
for (iB, blB) in enumerate(B.blocks)
for blB in B.blocks
ret[orow .+ (1:size(blA, 2)), ocol .+ (1:size(blB, 2))] .= blA' * blB
ocol += size(blB, 2)
end
Expand All @@ -96,7 +97,7 @@ Base.adjoint(A::LazyHcat) = Adjoint(A)
ret
end

Base.:*(Aadj::Adjoint{T, <: LazyHcat}, B::AbstractMatrix) where {T} = Aadj * LazyHcat(B)
Base.:*(Aadj::Adjoint{T,<:LazyHcat}, B::AbstractMatrix) where {T} = Aadj * LazyHcat(B)

@views function *(Ablock::LazyHcat, B::AbstractMatrix)
res = Ablock.blocks[1] * B[1:size(Ablock.blocks[1], 2), :] # First multiplication
Expand All @@ -117,8 +118,29 @@ end
@timing function rayleigh_ritz(X, AX, N)
XAX = X' * AX
@assert all(!isnan, XAX)
F = eigen(Hermitian(XAX))
F.vectors[:,1:N], F.values[1:N]
rayleigh_ritz(Hermitian(XAX), N)
end
@views function rayleigh_ritz(XAX::Hermitian, N)
# Fallback: Use whatever is the default dense eigensolver.
# Note: GenericLinearAlgebra uses a QR-based algorithm, which is pretty safe in terms
# of keeping the vectors orthogonal
values, vectors = eigen(XAX)
vectors[:, 1:N], values[1:N]
end
@views function rayleigh_ritz(XAX::Hermitian{T,<:Array}, N) where {T <: Union{ComplexF32,ComplexF64,
Float32,Float64}}
# LAPACK sysevr (the Julia default dense eigensolver) can actually return eigenvectors
# that are significantly non-orthogonal (1e-4 in Float32 in some tests) here,
# presumably because it tries hard to make them eigenvectors in the presence
# of small gaps. Since we care more about them being orthogonal
# than about them being eigenvectors, re-orthogonalize.
# TODO To avoid orthogonalization: Use syevd,
# which does a much better job, see https://github.com/JuliaLang/julia/pull/49262
# and https://github.com/JuliaLang/julia/pull/49355
values, vectors = eigen(XAX)
v = vectors[:, 1:N]
ortho!(v)
v, values[1:N]
end

# B-orthogonalize X (in place) using only one B apply.
Expand Down Expand Up @@ -146,7 +168,7 @@ normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M)))
# U,S,V = svd(X)
# return U*V', 1, 1

growth_factor = 1
growth_factor = one(real(T))

success = false
nchol = 0
Expand Down Expand Up @@ -209,7 +231,7 @@ normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M)))

# @assert norm(X'X - I) < tol

X, nchol, growth_factor
(; X, nchol, growth_factor)
end

# Randomize the columns of X if the norm is below tol
Expand Down Expand Up @@ -262,7 +284,7 @@ end

# If we're at a fixed point, growth_factor is 1 and if tol >
# eps(), the loop will terminate, even if BY'Y != 0
growth_factor*eps(real(T)) < tol && break
growth_factor * eps(real(T)) < tol && break

niter > 10 && error("Ortho(X,Y) is failing badly, this should never happen")
niter += 1
Expand Down Expand Up @@ -372,14 +394,17 @@ end
end

### Compute new residuals
new_R = new_AX .- new_BX .* λs'
@timing "Update residuals" begin
new_R = new_AX .- new_BX .* λs'
@views for i = 1:size(X, 2)
resid_history[i + nlocked, niter+1] = norm(new_R[:, i])
end
end
vprintln(niter, " ", resid_history[:, niter+1])

# it is actually a good question of knowing when to
# precondition. Here seems sensible, but it could plausibly be
# done before or after
@views for i=1:size(X,2)
resid_history[i + nlocked, niter+1] = norm(new_R[:, i])
end
vprintln(niter, " ", resid_history[:, niter+1])
if precon !== I
@timing "preconditioning" begin
precondprep!(precon, X) # update preconditioner if needed; defaults to noop
Expand Down Expand Up @@ -454,10 +479,6 @@ end
# Quick sanity check
for i = 1:size(X, 2)
@views if abs(BX[:, i]'X[:, i] - 1) >= sqrt(eps(real(eltype(X))))
# TODO error-ing is too harsh here. Better throw some exception (e.g. in
# reduced precision this can be interpreted as an indicator to re-try the
# orthogonalisation in elevated precision or to stop the reduced precision
# iterations as a whole.
error("LOBPCG is badly failing to keep the vectors normalized; this should never happen")
end
end
Expand Down
4 changes: 2 additions & 2 deletions src/external/jld2io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ function save_scfres_master(file::AbstractString, scfres::NamedTuple, ::Val{:jld

JLD2.jldopen(file, "w") do jld
jld["__propertynames"] = propertynames(scfres)
jld["ρ_real"] = scfres.ρ
jld["ρ"] = scfres.ρ
jld["basis"] = scfres.basis

for symbol in propertynames(scfres)
Expand All @@ -44,7 +44,7 @@ end
function load_scfres(jld::JLD2.JLDFile)
basis = jld["basis"]
scfdict = Dict{Symbol, Any}(
=> jld["ρ_real"],
=> jld["ρ"],
:basis => basis
)

Expand Down
12 changes: 7 additions & 5 deletions src/terms/xc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@ end

function (xc::Xc)(basis::PlaneWaveBasis{T}) where {T}
isempty(xc.functionals) && return TermNoop()

# Charge density for non-linear core correction
ρcore = nothing
if any(a -> a.use_nlcc, basis.model.atoms)
ρcore = ρ_from_total(basis, atomic_total_density(basis, CoreDensity()))
minimum(ρcore) < -sqrt(eps(T)) && @warn("Negative ρcore detected: $(minimum(ρcore))")
else
ρcore = ρ_from_total(basis, zeros(T, basis.fft_size))
end
functionals = map(xc.functionals) do fun
# Strip duals from functional parameters if needed
Expand All @@ -45,11 +45,11 @@ function (xc::Xc)(basis::PlaneWaveBasis{T}) where {T}
T(xc.potential_threshold), ρcore)
end

struct TermXc{T} <: TermNonlinear where {T}
struct TermXc{T,CT} <: TermNonlinear where {T,CT}
functionals::Vector{Functional}
scaling_factor::T
potential_threshold::T
ρcore::Array{T,4}
ρcore::CT
end

function xc_potential_real(term::TermXc, basis::PlaneWaveBasis{T}, ψ, occupation;
Expand All @@ -62,7 +62,9 @@ function xc_potential_real(term::TermXc, basis::PlaneWaveBasis{T}, ψ, occupatio
@assert all(family(xc) in (:lda, :gga, :mgga, :mggal) for xc in term.functionals)

# Add the model core charge density (non-linear core correction)
ρ = ρ + term.ρcore
if !isnothing(term.ρcore)
ρ = ρ + term.ρcore
end

# Compute kinetic energy density, if needed.
if isnothing(τ) && any(needs_τ, term.functionals)
Expand Down
12 changes: 6 additions & 6 deletions src/workarounds/cuda_arrays.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
using LinearAlgebra

# https://github.com/JuliaGPU/CUDA.jl/issues/1572
function LinearAlgebra.eigen(A::Hermitian{T,AT}) where {T<:Complex,AT<:CUDA.CuArray}
vals, vects = CUDA.CUSOLVER.heevd!('V', 'U', A.data)
(vectors = vects, values = vals)
function LinearAlgebra.eigen(A::Hermitian{<:Complex,<:CUDA.CuArray})
values, vectors = CUDA.CUSOLVER.heevd!('V', 'U', A.data)
(; values, vectors)
end

function LinearAlgebra.eigen(A::Hermitian{T,AT}) where {T<:Real,AT<:CUDA.CuArray}
vals, vects = CUDA.CUSOLVER.syevd!('V', 'U', A.data)
(vectors = vects, values = vals)
function LinearAlgebra.eigen(A::Hermitian{<:Real,<:CUDA.CuArray})
values, vectors = CUDA.CUSOLVER.syevd!('V', 'U', A.data)
(; values, vectors)
end

synchronize_device(::GPU{<:CUDA.CuArray}) = CUDA.synchronize()
Expand Down
22 changes: 17 additions & 5 deletions test/gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@ include("testcases.jl")
function run_problem(; architecture)
model = model_PBE(silicon.lattice, silicon.atoms, silicon.positions)
basis = PlaneWaveBasis(model; Ecut=10, kgrid=(3, 3, 3), architecture)
self_consistent_field(basis; tol=1e-10, solver=scf_damping_solver(1.0))

# TODO Right now guess generation on the GPU does not work
basis_cpu = PlaneWaveBasis(model; basis.Ecut, basis.kgrid)
ρ_guess = guess_density(basis_cpu)
ρ = DFTK.to_device(architecture, ρ_guess)

self_consistent_field(basis; tol=1e-10, solver=scf_damping_solver(1.0), ρ)
end

scfres_cpu = run_problem(; architecture=DFTK.CPU())
Expand All @@ -26,19 +32,25 @@ end
magnetic_moments, smearing=Smearing.Gaussian(), temperature=1e-3)
basis = PlaneWaveBasis(model; Ecut=20, kgrid=(4, 4, 4), architecture)

ρ = guess_density(basis, magnetic_moments)
# TODO Right now guess generation on the GPU does not work
basis_cpu = PlaneWaveBasis(model; basis.Ecut, basis.kgrid)
ρ_guess = guess_density(basis_cpu, magnetic_moments)
ρ = DFTK.to_device(architecture, ρ_guess)
# ρ = guess_density(basis, magnetic_moments)

# TODO Bump tolerance a bit here ... still leads to NaNs unfortunately
self_consistent_field(basis; ρ, tol=1e-8, mixing=KerkerMixing(),
self_consistent_field(basis; ρ, tol=1e-7, mixing=KerkerMixing(),
solver=scf_damping_solver(1.0))
end

scfres_cpu = run_problem(; architecture=DFTK.CPU())
scfres_gpu = run_problem(; architecture=DFTK.GPU(CuArray))
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-8
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-7
@test abs(scfres_cpu.energies.total - scfres_gpu.energies.total) < 1e-7
@test norm(scfres_cpu.ρ - Array(scfres_gpu.ρ)) < 1e-6
end


# TODO Fix guess density generation
# TODO Float32
# TODO meta GGA
# TODO Aluminium with LdosMixing
Expand Down
2 changes: 1 addition & 1 deletion test/silicon_redHF.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
include("run_scf_and_compare.jl")
include("testcases.jl")
using DoubleFloats
using GenericLinearAlgebra
using DoubleFloats

# Silicon redHF (without xc) is a metal, so we add a bit of temperature to it
function run_silicon_redHF(T; Ecut=5, grid_size=15, spin_polarization=:none, kwargs...)
Expand Down

0 comments on commit 6a8c25c

Please sign in to comment.