From 7484966842d7844d51194460dd516d31d3ed97e4 Mon Sep 17 00:00:00 2001
From: Augustin Bussy <augustin.bussy@cscs.ch>
Date: Thu, 15 May 2025 17:45:45 +0200
Subject: [PATCH 1/3] GPU optimized symmetry operations

---
 src/DFTK.jl           |  1 +
 src/gpu/gpu_arrays.jl |  8 --------
 src/gpu/symmetry.jl   | 47 +++++++++++++++++++++++++++++++++++++++++++
 src/symmetry.jl       |  4 ++--
 4 files changed, 50 insertions(+), 10 deletions(-)
 create mode 100644 src/gpu/symmetry.jl

diff --git a/src/DFTK.jl b/src/DFTK.jl
index ab698b2691..63ffabf8f3 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -234,6 +234,7 @@ include("workarounds/dummy_inplace_fft.jl")
 include("workarounds/forwarddiff_rules.jl")
 
 # Optimized generic GPU functions and GPU workarounds
+include("gpu/symmetry.jl")
 include("gpu/linalg.jl")
 include("gpu/gpu_arrays.jl")
 
diff --git a/src/gpu/gpu_arrays.jl b/src/gpu/gpu_arrays.jl
index 6f71320372..a465cbc7a3 100644
--- a/src/gpu/gpu_arrays.jl
+++ b/src/gpu/gpu_arrays.jl
@@ -5,14 +5,6 @@ using Preferences
 # https://github.com/JuliaGPU/CUDA.jl/issues/1565
 LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal, y::AbstractGPUArray) = x' * (D * y)
 
-function lowpass_for_symmetry!(ρ::AbstractGPUArray, basis; symmetries=basis.symmetries)
-    all(isone, symmetries) && return ρ
-    # lowpass_for_symmetry! currently uses scalar indexing, so we have to do this very ugly
-    # thing for cases where ρ sits on a device (e.g. GPU)
-    ρ_CPU = lowpass_for_symmetry!(to_cpu(ρ), basis; symmetries)
-    ρ .= to_device(basis.architecture, ρ_CPU)
-end
-
 for fun in (:potential_terms, :kernel_terms)
     @eval function DftFunctionals.$fun(fun::DispatchFunctional, ρ::AT,
                                        args...) where {AT <: AbstractGPUArray{Float64}}
diff --git a/src/gpu/symmetry.jl b/src/gpu/symmetry.jl
new file mode 100644
index 0000000000..0fb2c43ad0
--- /dev/null
+++ b/src/gpu/symmetry.jl
@@ -0,0 +1,47 @@
+using GPUArraysCore
+
+function accumulate_over_symmetries!(ρaccu::AbstractGPUArray, ρin::AbstractGPUArray,
+                                     basis::PlaneWaveBasis{T}, symmetries) where {T}
+    if all(isone, symmetries)
+        ρaccu .+= ρin
+        return ρaccu
+    end
+
+    Gs = vec(G_vectors(basis))
+    ρtmp = similar(ρaccu)
+    fft_size = basis.fft_size
+
+    for symop in symmetries
+        if isone(symop)
+            ρaccu .+= ρin
+            continue
+        end
+
+        invS = Mat3{Int}(inv(symop.S))
+        map!(ρtmp, Gs) do G
+            idx = index_G_vectors(fft_size, invS * G)
+            isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, symop.τ))) * ρin[idx]
+        end
+        ρaccu .+= ρtmp
+    end
+    ρaccu
+end
+        
+function lowpass_for_symmetry!(ρ::AbstractGPUArray, basis::PlaneWaveBasis{T};
+                               symmetries=basis.symmetries) where {T}
+    all(isone, symmetries) && return ρ
+
+    Gs = vec(G_vectors(basis))
+    fft_size = basis.fft_size
+    ρtmp = similar(ρ)
+
+    for symop in symmetries
+        isone(symop) && continue
+        map!(ρtmp, ρ, Gs) do ρ_i, G
+            idx = index_G_vectors(fft_size, symop.S * G)
+            isnothing(idx) ? zero(complex(T)) : ρ_i
+        end
+        ρ .= ρtmp
+    end
+    ρ
+end
\ No newline at end of file
diff --git a/src/symmetry.jl b/src/symmetry.jl
index 16c0621e17..91556b75fe 100644
--- a/src/symmetry.jl
+++ b/src/symmetry.jl
@@ -320,7 +320,7 @@ Symmetrize a density by applying all the basis (by default) symmetries and formi
 """
 @views @timing function symmetrize_ρ(basis, ρ::AbstractArray{T};
                                      symmetries=basis.symmetries, do_lowpass=true) where {T}
-    ρin_fourier  = to_cpu(fft(basis, ρ))
+    ρin_fourier  = fft(basis, ρ)
     ρout_fourier = zero(ρin_fourier)
     for σ = 1:size(ρ, 4)
         accumulate_over_symmetries!(ρout_fourier[:, :, :, σ],
@@ -328,7 +328,7 @@ Symmetrize a density by applying all the basis (by default) symmetries and formi
         do_lowpass && lowpass_for_symmetry!(ρout_fourier[:, :, :, σ], basis; symmetries)
     end
     inv_fft = T <: Real ? irfft : ifft
-    inv_fft(basis, to_device(basis.architecture, ρout_fourier) ./ length(symmetries))
+    inv_fft(basis, ρout_fourier ./ length(symmetries))
 end
 
 """

From 0649c55331bc8ca859094629b913a701a4d16717 Mon Sep 17 00:00:00 2001
From: Augustin Bussy <augustin.bussy@cscs.ch>
Date: Fri, 16 May 2025 17:31:00 +0200
Subject: [PATCH 2/3] Changed loop ordering

---
 src/gpu/symmetry.jl | 53 ++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/src/gpu/symmetry.jl b/src/gpu/symmetry.jl
index 0fb2c43ad0..4929106e09 100644
--- a/src/gpu/symmetry.jl
+++ b/src/gpu/symmetry.jl
@@ -1,47 +1,46 @@
 using GPUArraysCore
 
-function accumulate_over_symmetries!(ρaccu::AbstractGPUArray, ρin::AbstractGPUArray,
-                                     basis::PlaneWaveBasis{T}, symmetries) where {T}
-    if all(isone, symmetries)
-        ρaccu .+= ρin
-        return ρaccu
-    end
-
-    Gs = vec(G_vectors(basis))
-    ρtmp = similar(ρaccu)
+function accumulate_over_symmetries!(ρaccu::AbstractArray, ρin::AbstractArray,
+    basis::PlaneWaveBasis{T}, symmetries) where {T}
+    Gs = reshape(G_vectors(basis), size(ρaccu))
     fft_size = basis.fft_size
 
-    for symop in symmetries
-        if isone(symop)
-            ρaccu .+= ρin
-            continue
-        end
+    symm_invS = to_device(basis.architecture, [Mat3{Int}(inv(symop.S)) for symop in symmetries])
+    symm_τ = to_device(basis.architecture, [symop.τ for symop in symmetries])
+    n_symm = length(symmetries)
 
-        invS = Mat3{Int}(inv(symop.S))
-        map!(ρtmp, Gs) do G
+    map!(ρaccu, Gs) do G
+        acc = zero(complex(T))
+        # Explicit loop over indicies because AMDGPU does not support zip() in map!
+        for i_symm in 1:n_symm
+            invS = symm_invS[i_symm]
+            τ = symm_τ[i_symm]
             idx = index_G_vectors(fft_size, invS * G)
-            isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, symop.τ))) * ρin[idx]
+            acc += isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, τ))) * ρin[idx]
         end
-        ρaccu .+= ρtmp
+        acc
     end
     ρaccu
 end
-        
+
 function lowpass_for_symmetry!(ρ::AbstractGPUArray, basis::PlaneWaveBasis{T};
                                symmetries=basis.symmetries) where {T}
     all(isone, symmetries) && return ρ
 
-    Gs = vec(G_vectors(basis))
+    Gs = reshape(G_vectors(basis), size(ρ))
     fft_size = basis.fft_size
     ρtmp = similar(ρ)
 
-    for symop in symmetries
-        isone(symop) && continue
-        map!(ρtmp, ρ, Gs) do ρ_i, G
-            idx = index_G_vectors(fft_size, symop.S * G)
-            isnothing(idx) ? zero(complex(T)) : ρ_i
+    symm_S = to_device(basis.architecture, [symop.S for symop in symmetries])
+
+    map!(ρtmp, ρ, Gs) do ρ_i, G
+        acc = ρ_i
+	    for S in symm_S
+            idx = index_G_vectors(fft_size, S * G)
+            acc *= isnothing(idx) ? zero(complex(T)) : one(complex(T))
         end
-        ρ .= ρtmp
+        acc
     end
+    ρ .= ρtmp
     ρ
-end
\ No newline at end of file
+end

From c3972a656d53fa1ed864af3f4f2fb1fa5639ed30 Mon Sep 17 00:00:00 2001
From: Augustin Bussy <augustin.bussy@cscs.ch>
Date: Wed, 28 May 2025 13:42:48 +0200
Subject: [PATCH 3/3] Unify CPU and GPU symmetry code

---
 src/DFTK.jl          |  1 -
 src/gpu/symmetry.jl  | 46 ------------------------
 src/symmetry.jl      | 85 ++++++++++++++++++++++++++------------------
 src/terms/hartree.jl |  7 ++--
 4 files changed, 54 insertions(+), 85 deletions(-)
 delete mode 100644 src/gpu/symmetry.jl

diff --git a/src/DFTK.jl b/src/DFTK.jl
index 63ffabf8f3..ab698b2691 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -234,7 +234,6 @@ include("workarounds/dummy_inplace_fft.jl")
 include("workarounds/forwarddiff_rules.jl")
 
 # Optimized generic GPU functions and GPU workarounds
-include("gpu/symmetry.jl")
 include("gpu/linalg.jl")
 include("gpu/gpu_arrays.jl")
 
diff --git a/src/gpu/symmetry.jl b/src/gpu/symmetry.jl
deleted file mode 100644
index 4929106e09..0000000000
--- a/src/gpu/symmetry.jl
+++ /dev/null
@@ -1,46 +0,0 @@
-using GPUArraysCore
-
-function accumulate_over_symmetries!(ρaccu::AbstractArray, ρin::AbstractArray,
-    basis::PlaneWaveBasis{T}, symmetries) where {T}
-    Gs = reshape(G_vectors(basis), size(ρaccu))
-    fft_size = basis.fft_size
-
-    symm_invS = to_device(basis.architecture, [Mat3{Int}(inv(symop.S)) for symop in symmetries])
-    symm_τ = to_device(basis.architecture, [symop.τ for symop in symmetries])
-    n_symm = length(symmetries)
-
-    map!(ρaccu, Gs) do G
-        acc = zero(complex(T))
-        # Explicit loop over indicies because AMDGPU does not support zip() in map!
-        for i_symm in 1:n_symm
-            invS = symm_invS[i_symm]
-            τ = symm_τ[i_symm]
-            idx = index_G_vectors(fft_size, invS * G)
-            acc += isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, τ))) * ρin[idx]
-        end
-        acc
-    end
-    ρaccu
-end
-
-function lowpass_for_symmetry!(ρ::AbstractGPUArray, basis::PlaneWaveBasis{T};
-                               symmetries=basis.symmetries) where {T}
-    all(isone, symmetries) && return ρ
-
-    Gs = reshape(G_vectors(basis), size(ρ))
-    fft_size = basis.fft_size
-    ρtmp = similar(ρ)
-
-    symm_S = to_device(basis.architecture, [symop.S for symop in symmetries])
-
-    map!(ρtmp, ρ, Gs) do ρ_i, G
-        acc = ρ_i
-	    for S in symm_S
-            idx = index_G_vectors(fft_size, S * G)
-            acc *= isnothing(idx) ? zero(complex(T)) : one(complex(T))
-        end
-        acc
-    end
-    ρ .= ρtmp
-    ρ
-end
diff --git a/src/symmetry.jl b/src/symmetry.jl
index 91556b75fe..6fd72dafc9 100644
--- a/src/symmetry.jl
+++ b/src/symmetry.jl
@@ -269,49 +269,66 @@ function apply_symop(symop::SymOp, basis, ρin; kwargs...)
 end
 
 # Accumulates the symmetrized versions of the density ρin into ρout (in Fourier space).
-# No normalization is performed
+# No normalization is performed. This function is optimized for CPU and GPU.
 function accumulate_over_symmetries!(ρaccu, ρin, basis::PlaneWaveBasis{T}, symmetries) where {T}
-    for symop in symmetries
-        # Common special case, where ρin does not need to be processed
-        if isone(symop)
-            ρaccu .+= ρin
-            continue
-        end
-
-        # Transform ρin -> to the partial density at S * k.
-        #
-        # Since the eigenfunctions of the Hamiltonian at k and Sk satisfy
-        #      u_{Sk}(x) = u_{k}(S^{-1} (x - τ))
-        # with Fourier transform
-        #      ̂u_{Sk}(G) = e^{-i G \cdot τ} ̂u_k(S^{-1} G)
-        # equivalently
-        #     ρ ̂_{Sk}(G) = e^{-i G \cdot τ} ̂ρ_k(S^{-1} G)
-        invS = Mat3{Int}(inv(symop.S))
-        for (ig, G) in enumerate(G_vectors_generator(basis.fft_size))
-            igired = index_G_vectors(basis, invS * G)
-            isnothing(igired) && continue
-
-            if iszero(symop.τ)
-                @inbounds ρaccu[ig] += ρin[igired]
-            else
-                factor = cis2pi(-T(dot(G, symop.τ)))
-                @inbounds ρaccu[ig] += factor * ρin[igired]
-            end
+    # For each G vector and symmetry S:
+    # Transform ρin -> to the partial density at S * k.
+    #
+    # Since the eigenfunctions of the Hamiltonian at k and Sk satisfy
+    #      u_{Sk}(x) = u_{k}(S^{-1} (x - τ))
+    # with Fourier transform
+    #      ̂u_{Sk}(G) = e^{-i G \cdot τ} ̂u_k(S^{-1} G)
+    # equivalently
+    #     ρ ̂_{Sk}(G) = e^{-i G \cdot τ} ̂ρ_k(S^{-1} G)
+    Gs = reshape(G_vectors(basis), size(ρaccu))
+    fft_size = basis.fft_size
+
+    # Need to transfer  symmetry data to the GPU as isbit data
+    symm_invS = to_device(basis.architecture, [Mat3{Int}(inv(symop.S)) for symop in symmetries])
+    symm_τ = to_device(basis.architecture, [symop.τ for symop in symmetries])
+    n_symm = length(symmetries)
+
+    # Looping over symmetries inside of map! on G vectors allow for a single GPU kernel launch
+    map!(ρaccu, Gs) do G
+        acc = zero(complex(T))
+        # Explicit loop over indicies because AMDGPU does not support zip() in map!
+        for i_symm in 1:n_symm
+            invS = symm_invS[i_symm]
+            τ = symm_τ[i_symm]
+            idx = index_G_vectors(fft_size, invS * G)
+            acc += isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, τ))) * ρin[idx]
         end
-    end  # symop
+        acc
+    end
     ρaccu
 end
 
 # Low-pass filters ρ (in Fourier) so that symmetry operations acting on it stay in the grid
+# This function is optimized for CPU and GPU.
 function lowpass_for_symmetry!(ρ::AbstractArray, basis; symmetries=basis.symmetries)
-    for symop in symmetries
-        isone(symop) && continue
-        for (ig, G) in enumerate(G_vectors_generator(basis.fft_size))
-            if index_G_vectors(basis, symop.S * G) === nothing
-                ρ[ig] = 0
-            end
+    all(isone, symmetries) && return ρ
+
+    Gs = reshape(G_vectors(basis), size(ρ))
+    fft_size = basis.fft_size
+    ρtmp = similar(ρ)
+
+    symm_S = to_device(basis.architecture, [symop.S for symop in symmetries])
+
+    # Perfrorms the following loop in an optimized way for CPU and GPU:
+    # for (ig, G) in enumerate(G_vectors(basis))
+    #     if index_G_vectors(basis, symop.S * G) === nothing
+    #         ρ[ig] = 0
+    #     end
+    # end
+    map!(ρtmp, ρ, Gs) do ρ_i, G
+        acc = ρ_i
+        for S in symm_S
+            idx = index_G_vectors(fft_size, S * G)
+            acc *= isnothing(idx) ? 0 : 1
         end
+        acc
     end
+    ρ .= ρtmp
     ρ
 end
 
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index bed22ab97b..b14a5715bd 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -28,19 +28,18 @@ struct TermHartree <: TermNonlinear
 end
 function compute_poisson_green_coeffs(basis::PlaneWaveBasis{T}, scaling_factor;
                                       q=zero(Vec3{T})) where {T}
-    model = basis.model
+    recip_lattice = basis.model.recip_lattice
 
     # Solving the Poisson equation ΔV = -4π ρ in Fourier space
     # is multiplying elementwise by 4π / |G|^2.
-    poisson_green_coeffs = 4T(π) ./ [sum(abs2, model.recip_lattice * (G + q))
-                                     for G in to_cpu(G_vectors(basis))]
+    poisson_green_coeffs = map(G -> 4T(π) / sum(abs2, recip_lattice * (G + q)),
+                               G_vectors(basis))
     if iszero(q)
         # Compensating charge background => Zero DC.
         GPUArraysCore.@allowscalar poisson_green_coeffs[1] = 0
         # Symmetrize Fourier coeffs to have real iFFT.
         enforce_real!(poisson_green_coeffs, basis)
     end
-    poisson_green_coeffs = to_device(basis.architecture, poisson_green_coeffs)
     scaling_factor .* poisson_green_coeffs
 end
 function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}