From 7484966842d7844d51194460dd516d31d3ed97e4 Mon Sep 17 00:00:00 2001 From: Augustin Bussy Date: Thu, 15 May 2025 17:45:45 +0200 Subject: [PATCH 1/3] GPU optimized symmetry operations --- src/DFTK.jl | 1 + src/gpu/gpu_arrays.jl | 8 -------- src/gpu/symmetry.jl | 47 +++++++++++++++++++++++++++++++++++++++++++ src/symmetry.jl | 4 ++-- 4 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 src/gpu/symmetry.jl diff --git a/src/DFTK.jl b/src/DFTK.jl index ab698b2691..63ffabf8f3 100644 --- a/src/DFTK.jl +++ b/src/DFTK.jl @@ -234,6 +234,7 @@ include("workarounds/dummy_inplace_fft.jl") include("workarounds/forwarddiff_rules.jl") # Optimized generic GPU functions and GPU workarounds +include("gpu/symmetry.jl") include("gpu/linalg.jl") include("gpu/gpu_arrays.jl") diff --git a/src/gpu/gpu_arrays.jl b/src/gpu/gpu_arrays.jl index 6f71320372..a465cbc7a3 100644 --- a/src/gpu/gpu_arrays.jl +++ b/src/gpu/gpu_arrays.jl @@ -5,14 +5,6 @@ using Preferences # https://github.com/JuliaGPU/CUDA.jl/issues/1565 LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal, y::AbstractGPUArray) = x' * (D * y) -function lowpass_for_symmetry!(ρ::AbstractGPUArray, basis; symmetries=basis.symmetries) - all(isone, symmetries) && return ρ - # lowpass_for_symmetry! currently uses scalar indexing, so we have to do this very ugly - # thing for cases where ρ sits on a device (e.g. GPU) - ρ_CPU = lowpass_for_symmetry!(to_cpu(ρ), basis; symmetries) - ρ .= to_device(basis.architecture, ρ_CPU) -end - for fun in (:potential_terms, :kernel_terms) @eval function DftFunctionals.$fun(fun::DispatchFunctional, ρ::AT, args...) where {AT <: AbstractGPUArray{Float64}} diff --git a/src/gpu/symmetry.jl b/src/gpu/symmetry.jl new file mode 100644 index 0000000000..0fb2c43ad0 --- /dev/null +++ b/src/gpu/symmetry.jl @@ -0,0 +1,47 @@ +using GPUArraysCore + +function accumulate_over_symmetries!(ρaccu::AbstractGPUArray, ρin::AbstractGPUArray, + basis::PlaneWaveBasis{T}, symmetries) where {T} + if all(isone, symmetries) + ρaccu .+= ρin + return ρaccu + end + + Gs = vec(G_vectors(basis)) + ρtmp = similar(ρaccu) + fft_size = basis.fft_size + + for symop in symmetries + if isone(symop) + ρaccu .+= ρin + continue + end + + invS = Mat3{Int}(inv(symop.S)) + map!(ρtmp, Gs) do G + idx = index_G_vectors(fft_size, invS * G) + isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, symop.τ))) * ρin[idx] + end + ρaccu .+= ρtmp + end + ρaccu +end + +function lowpass_for_symmetry!(ρ::AbstractGPUArray, basis::PlaneWaveBasis{T}; + symmetries=basis.symmetries) where {T} + all(isone, symmetries) && return ρ + + Gs = vec(G_vectors(basis)) + fft_size = basis.fft_size + ρtmp = similar(ρ) + + for symop in symmetries + isone(symop) && continue + map!(ρtmp, ρ, Gs) do ρ_i, G + idx = index_G_vectors(fft_size, symop.S * G) + isnothing(idx) ? zero(complex(T)) : ρ_i + end + ρ .= ρtmp + end + ρ +end \ No newline at end of file diff --git a/src/symmetry.jl b/src/symmetry.jl index 16c0621e17..91556b75fe 100644 --- a/src/symmetry.jl +++ b/src/symmetry.jl @@ -320,7 +320,7 @@ Symmetrize a density by applying all the basis (by default) symmetries and formi """ @views @timing function symmetrize_ρ(basis, ρ::AbstractArray{T}; symmetries=basis.symmetries, do_lowpass=true) where {T} - ρin_fourier = to_cpu(fft(basis, ρ)) + ρin_fourier = fft(basis, ρ) ρout_fourier = zero(ρin_fourier) for σ = 1:size(ρ, 4) accumulate_over_symmetries!(ρout_fourier[:, :, :, σ], @@ -328,7 +328,7 @@ Symmetrize a density by applying all the basis (by default) symmetries and formi do_lowpass && lowpass_for_symmetry!(ρout_fourier[:, :, :, σ], basis; symmetries) end inv_fft = T <: Real ? irfft : ifft - inv_fft(basis, to_device(basis.architecture, ρout_fourier) ./ length(symmetries)) + inv_fft(basis, ρout_fourier ./ length(symmetries)) end """ From 0649c55331bc8ca859094629b913a701a4d16717 Mon Sep 17 00:00:00 2001 From: Augustin Bussy Date: Fri, 16 May 2025 17:31:00 +0200 Subject: [PATCH 2/3] Changed loop ordering --- src/gpu/symmetry.jl | 53 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/src/gpu/symmetry.jl b/src/gpu/symmetry.jl index 0fb2c43ad0..4929106e09 100644 --- a/src/gpu/symmetry.jl +++ b/src/gpu/symmetry.jl @@ -1,47 +1,46 @@ using GPUArraysCore -function accumulate_over_symmetries!(ρaccu::AbstractGPUArray, ρin::AbstractGPUArray, - basis::PlaneWaveBasis{T}, symmetries) where {T} - if all(isone, symmetries) - ρaccu .+= ρin - return ρaccu - end - - Gs = vec(G_vectors(basis)) - ρtmp = similar(ρaccu) +function accumulate_over_symmetries!(ρaccu::AbstractArray, ρin::AbstractArray, + basis::PlaneWaveBasis{T}, symmetries) where {T} + Gs = reshape(G_vectors(basis), size(ρaccu)) fft_size = basis.fft_size - for symop in symmetries - if isone(symop) - ρaccu .+= ρin - continue - end + symm_invS = to_device(basis.architecture, [Mat3{Int}(inv(symop.S)) for symop in symmetries]) + symm_τ = to_device(basis.architecture, [symop.τ for symop in symmetries]) + n_symm = length(symmetries) - invS = Mat3{Int}(inv(symop.S)) - map!(ρtmp, Gs) do G + map!(ρaccu, Gs) do G + acc = zero(complex(T)) + # Explicit loop over indicies because AMDGPU does not support zip() in map! + for i_symm in 1:n_symm + invS = symm_invS[i_symm] + τ = symm_τ[i_symm] idx = index_G_vectors(fft_size, invS * G) - isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, symop.τ))) * ρin[idx] + acc += isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, τ))) * ρin[idx] end - ρaccu .+= ρtmp + acc end ρaccu end - + function lowpass_for_symmetry!(ρ::AbstractGPUArray, basis::PlaneWaveBasis{T}; symmetries=basis.symmetries) where {T} all(isone, symmetries) && return ρ - Gs = vec(G_vectors(basis)) + Gs = reshape(G_vectors(basis), size(ρ)) fft_size = basis.fft_size ρtmp = similar(ρ) - for symop in symmetries - isone(symop) && continue - map!(ρtmp, ρ, Gs) do ρ_i, G - idx = index_G_vectors(fft_size, symop.S * G) - isnothing(idx) ? zero(complex(T)) : ρ_i + symm_S = to_device(basis.architecture, [symop.S for symop in symmetries]) + + map!(ρtmp, ρ, Gs) do ρ_i, G + acc = ρ_i + for S in symm_S + idx = index_G_vectors(fft_size, S * G) + acc *= isnothing(idx) ? zero(complex(T)) : one(complex(T)) end - ρ .= ρtmp + acc end + ρ .= ρtmp ρ -end \ No newline at end of file +end From c3972a656d53fa1ed864af3f4f2fb1fa5639ed30 Mon Sep 17 00:00:00 2001 From: Augustin Bussy Date: Wed, 28 May 2025 13:42:48 +0200 Subject: [PATCH 3/3] Unify CPU and GPU symmetry code --- src/DFTK.jl | 1 - src/gpu/symmetry.jl | 46 ------------------------ src/symmetry.jl | 85 ++++++++++++++++++++++++++------------------ src/terms/hartree.jl | 7 ++-- 4 files changed, 54 insertions(+), 85 deletions(-) delete mode 100644 src/gpu/symmetry.jl diff --git a/src/DFTK.jl b/src/DFTK.jl index 63ffabf8f3..ab698b2691 100644 --- a/src/DFTK.jl +++ b/src/DFTK.jl @@ -234,7 +234,6 @@ include("workarounds/dummy_inplace_fft.jl") include("workarounds/forwarddiff_rules.jl") # Optimized generic GPU functions and GPU workarounds -include("gpu/symmetry.jl") include("gpu/linalg.jl") include("gpu/gpu_arrays.jl") diff --git a/src/gpu/symmetry.jl b/src/gpu/symmetry.jl deleted file mode 100644 index 4929106e09..0000000000 --- a/src/gpu/symmetry.jl +++ /dev/null @@ -1,46 +0,0 @@ -using GPUArraysCore - -function accumulate_over_symmetries!(ρaccu::AbstractArray, ρin::AbstractArray, - basis::PlaneWaveBasis{T}, symmetries) where {T} - Gs = reshape(G_vectors(basis), size(ρaccu)) - fft_size = basis.fft_size - - symm_invS = to_device(basis.architecture, [Mat3{Int}(inv(symop.S)) for symop in symmetries]) - symm_τ = to_device(basis.architecture, [symop.τ for symop in symmetries]) - n_symm = length(symmetries) - - map!(ρaccu, Gs) do G - acc = zero(complex(T)) - # Explicit loop over indicies because AMDGPU does not support zip() in map! - for i_symm in 1:n_symm - invS = symm_invS[i_symm] - τ = symm_τ[i_symm] - idx = index_G_vectors(fft_size, invS * G) - acc += isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, τ))) * ρin[idx] - end - acc - end - ρaccu -end - -function lowpass_for_symmetry!(ρ::AbstractGPUArray, basis::PlaneWaveBasis{T}; - symmetries=basis.symmetries) where {T} - all(isone, symmetries) && return ρ - - Gs = reshape(G_vectors(basis), size(ρ)) - fft_size = basis.fft_size - ρtmp = similar(ρ) - - symm_S = to_device(basis.architecture, [symop.S for symop in symmetries]) - - map!(ρtmp, ρ, Gs) do ρ_i, G - acc = ρ_i - for S in symm_S - idx = index_G_vectors(fft_size, S * G) - acc *= isnothing(idx) ? zero(complex(T)) : one(complex(T)) - end - acc - end - ρ .= ρtmp - ρ -end diff --git a/src/symmetry.jl b/src/symmetry.jl index 91556b75fe..6fd72dafc9 100644 --- a/src/symmetry.jl +++ b/src/symmetry.jl @@ -269,49 +269,66 @@ function apply_symop(symop::SymOp, basis, ρin; kwargs...) end # Accumulates the symmetrized versions of the density ρin into ρout (in Fourier space). -# No normalization is performed +# No normalization is performed. This function is optimized for CPU and GPU. function accumulate_over_symmetries!(ρaccu, ρin, basis::PlaneWaveBasis{T}, symmetries) where {T} - for symop in symmetries - # Common special case, where ρin does not need to be processed - if isone(symop) - ρaccu .+= ρin - continue - end - - # Transform ρin -> to the partial density at S * k. - # - # Since the eigenfunctions of the Hamiltonian at k and Sk satisfy - # u_{Sk}(x) = u_{k}(S^{-1} (x - τ)) - # with Fourier transform - # ̂u_{Sk}(G) = e^{-i G \cdot τ} ̂u_k(S^{-1} G) - # equivalently - # ρ ̂_{Sk}(G) = e^{-i G \cdot τ} ̂ρ_k(S^{-1} G) - invS = Mat3{Int}(inv(symop.S)) - for (ig, G) in enumerate(G_vectors_generator(basis.fft_size)) - igired = index_G_vectors(basis, invS * G) - isnothing(igired) && continue - - if iszero(symop.τ) - @inbounds ρaccu[ig] += ρin[igired] - else - factor = cis2pi(-T(dot(G, symop.τ))) - @inbounds ρaccu[ig] += factor * ρin[igired] - end + # For each G vector and symmetry S: + # Transform ρin -> to the partial density at S * k. + # + # Since the eigenfunctions of the Hamiltonian at k and Sk satisfy + # u_{Sk}(x) = u_{k}(S^{-1} (x - τ)) + # with Fourier transform + # ̂u_{Sk}(G) = e^{-i G \cdot τ} ̂u_k(S^{-1} G) + # equivalently + # ρ ̂_{Sk}(G) = e^{-i G \cdot τ} ̂ρ_k(S^{-1} G) + Gs = reshape(G_vectors(basis), size(ρaccu)) + fft_size = basis.fft_size + + # Need to transfer symmetry data to the GPU as isbit data + symm_invS = to_device(basis.architecture, [Mat3{Int}(inv(symop.S)) for symop in symmetries]) + symm_τ = to_device(basis.architecture, [symop.τ for symop in symmetries]) + n_symm = length(symmetries) + + # Looping over symmetries inside of map! on G vectors allow for a single GPU kernel launch + map!(ρaccu, Gs) do G + acc = zero(complex(T)) + # Explicit loop over indicies because AMDGPU does not support zip() in map! + for i_symm in 1:n_symm + invS = symm_invS[i_symm] + τ = symm_τ[i_symm] + idx = index_G_vectors(fft_size, invS * G) + acc += isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, τ))) * ρin[idx] end - end # symop + acc + end ρaccu end # Low-pass filters ρ (in Fourier) so that symmetry operations acting on it stay in the grid +# This function is optimized for CPU and GPU. function lowpass_for_symmetry!(ρ::AbstractArray, basis; symmetries=basis.symmetries) - for symop in symmetries - isone(symop) && continue - for (ig, G) in enumerate(G_vectors_generator(basis.fft_size)) - if index_G_vectors(basis, symop.S * G) === nothing - ρ[ig] = 0 - end + all(isone, symmetries) && return ρ + + Gs = reshape(G_vectors(basis), size(ρ)) + fft_size = basis.fft_size + ρtmp = similar(ρ) + + symm_S = to_device(basis.architecture, [symop.S for symop in symmetries]) + + # Perfrorms the following loop in an optimized way for CPU and GPU: + # for (ig, G) in enumerate(G_vectors(basis)) + # if index_G_vectors(basis, symop.S * G) === nothing + # ρ[ig] = 0 + # end + # end + map!(ρtmp, ρ, Gs) do ρ_i, G + acc = ρ_i + for S in symm_S + idx = index_G_vectors(fft_size, S * G) + acc *= isnothing(idx) ? 0 : 1 end + acc end + ρ .= ρtmp ρ end diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl index bed22ab97b..b14a5715bd 100644 --- a/src/terms/hartree.jl +++ b/src/terms/hartree.jl @@ -28,19 +28,18 @@ struct TermHartree <: TermNonlinear end function compute_poisson_green_coeffs(basis::PlaneWaveBasis{T}, scaling_factor; q=zero(Vec3{T})) where {T} - model = basis.model + recip_lattice = basis.model.recip_lattice # Solving the Poisson equation ΔV = -4π ρ in Fourier space # is multiplying elementwise by 4π / |G|^2. - poisson_green_coeffs = 4T(π) ./ [sum(abs2, model.recip_lattice * (G + q)) - for G in to_cpu(G_vectors(basis))] + poisson_green_coeffs = map(G -> 4T(π) / sum(abs2, recip_lattice * (G + q)), + G_vectors(basis)) if iszero(q) # Compensating charge background => Zero DC. GPUArraysCore.@allowscalar poisson_green_coeffs[1] = 0 # Symmetrize Fourier coeffs to have real iFFT. enforce_real!(poisson_green_coeffs, basis) end - poisson_green_coeffs = to_device(basis.architecture, poisson_green_coeffs) scaling_factor .* poisson_green_coeffs end function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}