Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize run_spin_excitation! for GPU #462

Merged
merged 4 commits into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion KomaMRICore/src/simulation/SimMethods/Bloch/BlochGPU.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
include("KernelFunctions.jl")

"""These properties are redundant with seq.Δt and seq.ADC, but it is much faster
to calculate them on the CPU before the simulation is run."""
struct SeqBlockProperties{T<:Real}
Expand Down Expand Up @@ -64,6 +66,10 @@ end
struct BlochGPUPrealloc{T} <: PreallocResult{T}
seq_properties::AbstractVector{SeqBlockProperties{T}}
Bz::AbstractMatrix{T}
B::AbstractMatrix{T}
φ::AbstractMatrix{T}
ΔT1::AbstractMatrix{T}
ΔT2::AbstractMatrix{T}
Δϕ::AbstractMatrix{T}
ϕ::AbstractArray{T}
Mxy::AbstractMatrix{Complex{T}}
Expand All @@ -77,6 +83,10 @@ function prealloc_block(p::BlochGPUPrealloc{T}, i::Integer) where {T<:Real}
return BlochGPUPrealloc(
[seq_block],
view(p.Bz,:,1:seq_block.length),
view(p.B,:,1:seq_block.length),
view(p.φ,:,1:seq_block.length-1),
view(p.ΔT1,:,1:seq_block.length-1),
view(p.ΔT2,:,1:seq_block.length-1),
view(p.Δϕ,:,1:seq_block.length-1),
seq_block.nADC > 0 ? view(p.ϕ,:,1:seq_block.length-1) : view(p.ϕ,:,1),
view(p.Mxy,:,1:seq_block.nADC),
Expand All @@ -91,6 +101,10 @@ function prealloc(sim_method::Bloch, backend::KA.GPU, obj::Phantom{T}, M::Mag{T}
return BlochGPUPrealloc(
precalc.seq_properties,
KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, T, (size(obj.x, 1), max_block_length-1)),
KA.zeros(backend, Complex{T}, (size(obj.x, 1), max_block_length)),
Expand Down Expand Up @@ -150,4 +164,34 @@ function run_spin_precession!(
M.xy .= M.xy .* exp.(-seq_block.dur ./ p.T2) .* _cis.(pre.ϕ[:,end])

return nothing
end
end

function run_spin_excitation!(
p::Phantom{T},
seq::DiscreteSequence{T},
sig::AbstractArray{Complex{T}},
M::Mag{T},
sim_method::Bloch,
backend::KA.GPU,
pre::BlochGPUPrealloc
) where {T<:Real}
#Motion
x, y, z = get_spin_coords(p.motion, p.x, p.y, p.z, seq.t')

#Effective Field
pre.Bz .= (x .* seq.Gx' .+ y .* seq.Gy' .+ z .* seq.Gz') .+ pre.ΔBz .- seq.Δf' ./ T(γ) # ΔB_0 = (B_0 - ω_rf/γ), Need to add a component here to model scanner's dB0(x,y,z)
pre.B .= sqrt.(abs.(seq.B1') .^ 2 .+ abs.(pre.Bz) .^ 2)

#Spinor Rotation
pre.φ .= T(-π .* γ) .* (pre.B[:,1:end-1] .* seq.Δt') # TODO: Use trapezoidal integration here (?), this is just Forward Euler

#Relaxation
pre.ΔT1 .= exp.(-seq.Δt' ./ p.T1)
pre.ΔT2 .= exp.(-seq.Δt' ./ p.T2)

#Excitation
apply_excitation!(backend, 512)(M.xy, M.z, pre.φ, seq.B1, pre.Bz, pre.B, pre.ΔT1, pre.ΔT2, p.ρ, ndrange=size(M.xy,1))
KA.synchronize(backend)

return nothing
end
61 changes: 61 additions & 0 deletions KomaMRICore/src/simulation/SimMethods/Bloch/KernelFunctions.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
using KernelAbstractions: @kernel, @Const, @index, @uniform, @groupsize, @localmem

## COV_EXCL_START

@kernel function apply_excitation!(Mxy, Mz, @Const(φ), @Const(B1), @Const(Bz), @Const(B), @Const(ΔT1), @Const(ΔT2), @Const(ρ))
i_g = @index(Global)
i_l = @index(Local)

@uniform T = eltype(φ)
@uniform N = @groupsize()[1]
@uniform N_Δt = size(φ, 2)

s_α_r = @localmem T (N,)
s_α_i = @localmem T (N,)
s_β_i = @localmem T (N,)
s_β_r = @localmem T (N,)
s_Mxy_r = @localmem T (N,)
s_Mxy_i = @localmem T (N,)
s_Mxy_new_r = @localmem T (N,)
s_Mxy_new_i = @localmem T (N,)
s_Mz = @localmem T (N,)
s_Mz_new = @localmem T (N,)
s_ρ = @localmem T (N,)

@inbounds s_Mxy_r[i_l] = real(Mxy[i_g])
@inbounds s_Mxy_i[i_l] = imag(Mxy[i_g])
@inbounds s_Mz[i_l] = Mz[i_g]
@inbounds s_ρ[i_l] = ρ[i_g]

@inbounds for t = 1 : N_Δt
sin_φ = sin(φ[i_g, t]) #TO-DO: use sincos once oneAPI releases version with https://github.com/JuliaGPU/oneAPI.jl/commit/260a4dda0ea223dbf0893de7b4a13d994ae27bd1
cos_φ = cos(φ[i_g, t])
s_α_r[i_l] = cos_φ
if (iszero(B[i_g, t]))
s_α_i[i_l] = -(Bz[i_g, t] / (B[i_g, t] + eps(T))) * sin_φ
s_β_r[i_l] = (imag(B1[t]) / (B[i_g, t] + eps(T))) * sin_φ
s_β_i[i_l] = -(real(B1[t]) / (B[i_g, t] + eps(T))) * sin_φ
else
s_α_i[i_l] = -(Bz[i_g, t] / B[i_g, t]) * sin_φ
s_β_r[i_l] = (imag(B1[t]) / B[i_g, t]) * sin_φ
s_β_i[i_l] = -(real(B1[t]) / B[i_g, t]) * sin_φ
end
s_Mxy_new_r[i_l] = 2 * (s_Mxy_i[i_l] * (s_α_r[i_l] * s_α_i[i_l] - s_β_r[i_l] * s_β_i[i_l]) +
s_Mz[i_l] * (s_α_i[i_l] * s_β_i[i_l] + s_α_r[i_l] * s_β_r[i_l])) +
s_Mxy_r[i_l] * (s_α_r[i_l]^2 - s_α_i[i_l]^2 - s_β_r[i_l]^2 + s_β_i[i_l]^2)
s_Mxy_new_i[i_l] = -2 * (s_Mxy_r[i_l] * (s_α_r[i_l] * s_α_i[i_l] + s_β_r[i_l] * s_β_i[i_l]) -
s_Mz[i_l] * (s_α_r[i_l] * s_β_i[i_l] - s_α_i[i_l] * s_β_r[i_l])) +
s_Mxy_i[i_l] * (s_α_r[i_l]^2 - s_α_i[i_l]^2 + s_β_r[i_l]^2 - s_β_i[i_l]^2)
s_Mz_new[i_l] = s_Mz[i_l] * (s_α_r[i_l]^2 + s_α_i[i_l]^2 - s_β_r[i_l]^2 - s_β_i[i_l]^2) -
2 * (s_Mxy_r[i_l] * (s_α_r[i_l] * s_β_r[i_l] - s_α_i[i_l] * s_β_i[i_l]) +
s_Mxy_i[i_l] * (s_α_r[i_l] * s_β_i[i_l] + s_α_i[i_l] * s_β_r[i_l]))
s_Mxy_r[i_l] = s_Mxy_new_r[i_l] * ΔT2[i_g, t]
s_Mxy_i[i_l] = s_Mxy_new_i[i_l] * ΔT2[i_g, t]
s_Mz[i_l] = s_Mz_new[i_l] * ΔT1[i_g, t] + s_ρ[i_l] * (1 - ΔT1[i_g, t])
end

@inbounds Mxy[i_g] = s_Mxy_r[i_l] + 1im * s_Mxy_i[i_l]
@inbounds Mz[i_g] = s_Mz[i_l]
end

## COV_EXCL_STOP
4 changes: 1 addition & 3 deletions KomaMRICore/src/simulation/SimulatorCore.jl
Original file line number Diff line number Diff line change
Expand Up @@ -332,9 +332,7 @@ function simulate(
sim_params = default_sim_params(sim_params)
#Warn if user is trying to run on CPU without enabling multi-threading
if (!sim_params["gpu"] && Threads.nthreads() == 1)
@info """
Simulation will be run on the CPU with only 1 thread. To
enable multi-threading, start julia with --threads=auto.
@info """Simulation will be run on the CPU with only 1 thread. To enable multi-threading, start julia with --threads=auto
""" maxlog=1
end
# Simulation init
Expand Down