diff --git a/Project.toml b/Project.toml
index 27715904b8..26c2f924b7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,12 +8,14 @@ AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
 Brillouin = "23470ee3-d0df-4052-8b1a-8cbd6363e7f0"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 DftFunctionals = "6bd331d2-b28d-4fd3-880e-1a1c7f37947f"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 InteratomicPotentials = "a9efe35a-c65d-452d-b8a8-82646cd5cb04"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
@@ -33,6 +35,7 @@ Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
diff --git a/examples/gpu.jl b/examples/gpu.jl
new file mode 100644
index 0000000000..f59c12d12b
--- /dev/null
+++ b/examples/gpu.jl
@@ -0,0 +1,26 @@
+using DFTK
+using CUDA
+using MKL
+setup_threading(n_blas=1)
+
+a = 10.263141334305942  # Lattice constant in Bohr
+lattice = a / 2 .* [[0 1 1.]; [1 0 1.]; [1 1 0.]]
+Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+atoms = [Si, Si]
+positions = [ones(3)/8, -ones(3)/8];
+terms_LDA = [Kinetic(), AtomicLocal(), AtomicNonlocal()]
+
+# Set up a model with kinetic and atomic terms only and discretize it
+# using a single k-point and an `Ecut` of 30 Hartree.
+mod = Model(lattice, atoms, positions; terms=terms_LDA, symmetries=false)
+basis = PlaneWaveBasis(mod; Ecut=30, kgrid=(1, 1, 1))
+basis_gpu = PlaneWaveBasis(mod; Ecut=30, kgrid=(1, 1, 1), array_type=CuArray)
+
+
+DFTK.reset_timer!(DFTK.timer)
+scfres = self_consistent_field(basis; solver=scf_damping_solver(1.0), is_converged=DFTK.ScfConvergenceDensity(1e-3))
+println(DFTK.timer)
+
+DFTK.reset_timer!(DFTK.timer)
+scfres_gpu = self_consistent_field(basis_gpu; solver=scf_damping_solver(1.0), is_converged=DFTK.ScfConvergenceDensity(1e-3))
+println(DFTK.timer)
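# Illustrative addition one could append to examples/gpu.jl (not part of the patch): a quick
# CPU-vs-GPU consistency check. Field names (`energies.total`, `ρ`) follow the NamedTuple
# returned by self_consistent_field; agreement is only expected up to the SCF tolerance.
println(abs(scfres.energies.total - scfres_gpu.energies.total))
println(maximum(abs, scfres.ρ .- Array(scfres_gpu.ρ)))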
diff --git a/src/DFTK.jl b/src/DFTK.jl
index a3550f76fb..82b4d72b35 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -13,6 +13,10 @@ using spglib_jll
 using Unitful
 using UnitfulAtomic
 using ForwardDiff
+using AbstractFFTs
+using GPUArrays
+using CUDA
+using Random
 using ChainRulesCore
 
 export Vec3
diff --git a/src/Model.jl b/src/Model.jl
index 6f9e0c90cd..9878841c27 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -263,13 +263,48 @@ Examples of covectors are forces.
 Reciprocal vectors are a special case: they are covectors, but conventionally have an
 additional factor of 2π in their definition, so they transform rather with 2π times the
 inverse lattice transpose: q_cart = 2π lattice' \ q_red = recip_lattice * q_red.
+
+The trans_mat functions return the transition matrices required to perform such a change of basis.
 =#
-vector_red_to_cart(model::Model, rred) = model.lattice * rred
-vector_cart_to_red(model::Model, rcart) = model.inv_lattice * rcart
-covector_red_to_cart(model::Model, fred) = model.inv_lattice' * fred
-covector_cart_to_red(model::Model, fcart) = model.lattice' * fcart
-recip_vector_red_to_cart(model::Model, qred) = model.recip_lattice * qred
-recip_vector_cart_to_red(model::Model, qcart) = model.inv_recip_lattice * qcart
+
+trans_mat_vector_red_to_cart(model::Model) = model.lattice
+trans_mat_vector_cart_to_red(model::Model) = model.inv_lattice
+trans_mat_covector_red_to_cart(model::Model) = model.inv_lattice'
+trans_mat_covector_cart_to_red(model::Model) = model.lattice'
+trans_mat_recip_vector_red_to_cart(model::Model) = model.recip_lattice
+trans_mat_recip_vector_cart_to_red(model::Model) = model.inv_recip_lattice
+
+fun_mat_list = (:vector_red_to_cart,
+                :vector_cart_to_red,
+                :covector_red_to_cart,
+                :covector_cart_to_red,
+                :recip_vector_red_to_cart,
+                :recip_vector_cart_to_red
+)
+
+for fun1 in fun_mat_list
+    #=
+    The following functions compute the change of basis for a given vector. To do so,
+    they call the trans_mat functions to get the corresponding transition matrix.
+    These functions can be broadcast over an Array of vectors; however, they are
+    not GPU compatible, as they require the model, which is not isbits.
+    =#
+    @eval $fun1(model::Model, vec) = $(Symbol("trans_mat_"*string(fun1)))(model::Model) * vec
+    #=
+    The following functions take an AbstractArray of vectors and compute the change of basis
+    for every vector in the AbstractArray: they return an AbstractArray of the same type
+    and size as the input, but containing the vectors in the new basis.
+    These functions are GPU compatible (i.e. the AbstractArray can be a GPUArray), since
+    they use a map and the transition matrices are static arrays.
+    =#
+    @eval function $(Symbol("map_"*string(fun1)))(model::Model, A::AbstractArray{AT}) where {AT <: Vec3}
+        trans_matrix = $(Symbol("trans_mat_"*string(fun1)))(model)
+        in_new_basis = map(A) do Ai
+            trans_matrix * Ai
+        end
+        in_new_basis
+    end
+end
 
 #=
 Transformations on vectors and covectors are matrices and comatrices.
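# For illustration (not part of the patch): for fun1 = :vector_red_to_cart the loop above
# generates, roughly, the following two methods. The first closes over the model (not isbits,
# hence not GPU-safe); the map_ variant only captures the static transition matrix and
# therefore also works on GPU arrays of Vec3:
#
#     vector_red_to_cart(model::Model, vec) = trans_mat_vector_red_to_cart(model) * vec
#     function map_vector_red_to_cart(model::Model, A::AbstractArray{<:Vec3})
#         trans_matrix = trans_mat_vector_red_to_cart(model)
#         map(Ai -> trans_matrix * Ai, A)
#     end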
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 43cc911a5f..0c7fe3bbca 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -39,7 +39,7 @@ Normalization conventions:
 `G_to_r` and `r_to_G` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real}
+struct PlaneWaveBasis{T, VT, AT, GT, RT} <: AbstractBasis{T} where {VT <: Real, GT <: AT, RT <: AT, AT <: AbstractArray}
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
@@ -67,8 +67,8 @@ struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real}
     G_to_r_normalization::T  # G_to_r = G_to_r_normalization * BFFT
 
     # "cubic" basis in reciprocal and real space, on which potentials and densities are stored
-    G_vectors::Array{Vec3{Int}, 3}
-    r_vectors::Array{Vec3{VT }, 3}
+    G_vectors::GT
+    r_vectors::RT
 
     ## MPI-local information of the kpoints this processor treats
     # Irreducible kpoints. In the case of collinear spin,
@@ -148,7 +148,7 @@ end
 # and are stored in PlaneWaveBasis for easy reconstruction.
 function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
                         kcoords, kweights, kgrid, kshift,
-                        symmetries_respect_rgrid, comm_kpts) where {T <: Real}
+                        symmetries_respect_rgrid, comm_kpts, array_type = Array) where {T <: Real}
     # Validate fft_size
     if variational
         max_E = sum(abs2, model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
@@ -191,7 +191,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     kweights_global = kweights
 
     # Setup FFT plans
-    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(T, fft_size)
+    Gs = G_vectors(fft_size, array_type)
+    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(similar(Gs, T), fft_size)
 
     # Normalization constants
     # r_to_G = r_to_G_normalization * FFT
@@ -244,22 +245,25 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     end
     @assert mpi_sum(sum(kweights_thisproc), comm_kpts) ≈ model.n_spin_components
     @assert length(kpoints) == length(kweights_thisproc)
+    Threads.nthreads() != 1 && Gs isa AbstractGPUArray && error("Can't mix multi-threading and GPU computations yet.")
     VT = value_type(T)
     dvol = model.unit_cell_volume ./ prod(fft_size)
     r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3) for i = 1:N1, j = 1:N2, k = 1:N3]
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
-    basis = PlaneWaveBasis{T,value_type(T)}(
+    RT = array_type{Vec3{VT }, 3}
+    GT = array_type{Vec3{Int }, 3}
+
+    basis = PlaneWaveBasis{T,value_type(T), array_type, GT, RT}(
         model, fft_size, dvol,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
         r_to_G_normalization, G_to_r_normalization,
-        G_vectors(fft_size), r_vectors,
+        Gs, r_vectors,
         kpoints, kweights_thisproc, kgrid, kshift,
         kcoords_global, kweights_global, comm_kpts, krange_thisproc, krange_allprocs,
         symmetries, symmetries_respect_rgrid, terms)
-
     # Instantiate the terms with the basis
     for (it, t) in enumerate(model.term_types)
         term_name = string(nameof(typeof(t)))
@@ -277,7 +281,7 @@ end
                         variational=true, fft_size=nothing, kgrid=nothing, kshift=nothing,
                         symmetries_respect_rgrid=isnothing(fft_size),
-                        comm_kpts=MPI.COMM_WORLD) where {T <: Real}
+                        comm_kpts=MPI.COMM_WORLD, array_type = Array) where {T <: Real}
     if isnothing(fft_size)
         @assert variational
         if symmetries_respect_rgrid
@@ -295,7 +299,7 @@ end
         fft_size = compute_fft_size(model, Ecut, kcoords; factors)
     end
     PlaneWaveBasis(model, Ecut, fft_size, variational, kcoords, kweights,
-                   kgrid, kshift, symmetries_respect_rgrid, comm_kpts)
+                   kgrid, kshift, symmetries_respect_rgrid, comm_kpts, array_type)
 end
 
 @doc raw"""
@@ -322,7 +326,7 @@ Creates a new basis identical to `basis`, but with a custom set of kpoints
     PlaneWaveBasis(basis.model, basis.Ecut,
                    basis.fft_size, basis.variational,
                    kcoords, kweights, kgrid, kshift,
-                   basis.symmetries_respect_rgrid, basis.comm_kpts)
+                   basis.symmetries_respect_rgrid, basis.comm_kpts, array_type(basis))
 end
 
 """
@@ -331,13 +335,15 @@ end
 
 The wave vectors `G` in reduced (integer) coordinates for a cubic basis set of given sizes.
 """
-function G_vectors(fft_size::Union{Tuple,AbstractVector})
+
+function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type = Array)
     # Note that a collect(G_vectors_generator(fft_size)) is 100-fold slower
     # than this implementation, hence the code duplication.
    start = .- cld.(fft_size .- 1, 2)
    stop  = fld.(fft_size .- 1, 2)
    axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
-    [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
+    Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
+    convert(array_type, Gs)  # Offload to GPU if needed.
 end
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
     # The generator version is used mainly in symmetry.jl for lowpass_for_symmetry! and
@@ -358,6 +364,12 @@ or a ``k``-point `kpt`.
 G_vectors(basis::PlaneWaveBasis) = basis.G_vectors
 G_vectors(::PlaneWaveBasis, kpt::Kpoint) = kpt.G_vectors
 
+"""
+Return the type of array used for computations (Array if on CPU, CuArray,
+ROCArray... if on GPU).
+"""
+array_type(basis::PlaneWaveBasis{T,VT,AT}) where {T, VT, AT} = AT
+
 @doc raw"""
     G_vectors_cart(basis::PlaneWaveBasis)
 
 The list of ``G`` vectors of a given `basis` or `kpt`, in cartesian coordinates.
 """
-G_vectors_cart(basis::PlaneWaveBasis) = recip_vector_red_to_cart.(basis.model, G_vectors(basis))
+G_vectors_cart(basis::PlaneWaveBasis) = map_recip_vector_red_to_cart(basis.model, G_vectors(basis))
 function G_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint)
     recip_vector_red_to_cart.(basis.model, G_vectors(basis, kpt))
 end
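# Usage sketch (not part of the patch): with the new `array_type` argument the cubic grids
# live on the device, and `array_type(basis)` lets downstream code allocate matching storage.
# Assumes a functional CUDA setup and `mod` as defined in examples/gpu.jl above.
basis_gpu = PlaneWaveBasis(mod; Ecut=30, kgrid=(1, 1, 1), array_type=CuArray)
G_vectors(basis_gpu) isa CuArray                                         # grid stored on the GPU
ρ0 = convert(DFTK.array_type(basis_gpu), zeros(Float64, basis_gpu.fft_size))  # matching device array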
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index a0f7508339..3936c5bb25 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,2 +1,3 @@
 # Orthonormalize
-ortho_qr(φk) = Matrix(qr(φk).Q)
+ortho_qr(φk::AbstractArray) = Matrix(qr(φk).Q)  # LinearAlgebra.QRCompactWYQ -> Matrix
+ortho_qr(φk::CuArray) = CuArray(qr(φk).Q)       # CUDA.CUSOLVER.CuQRPackedQ -> CuArray
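# Quick check sketch (not part of the patch): both methods above return a matrix of the same
# storage type as their input, with orthonormal columns (the GPU line assumes a functional CUDA setup).
using LinearAlgebra, CUDA
Q = DFTK.ortho_qr(randn(ComplexF64, 20, 4));  @assert Q isa Matrix && Q'Q ≈ I
Qg = DFTK.ortho_qr(CuArray(randn(ComplexF64, 20, 4)));  @assert Qg isa CuArray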
diff --git a/src/densities.jl b/src/densities.jl
index 570e852937..4855bce9c3 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -26,18 +26,18 @@ grid `basis`, where the individual k-points are occupied according to `occupation`.
     chunk_length = cld(length(ik_n), Threads.nthreads())
 
     # chunk-local variables
-    ρ_chunklocal = Array{T,4}[zeros(T, basis.fft_size..., basis.model.n_spin_components)
-                              for _ = 1:Threads.nthreads()]
-    ψnk_real_chunklocal = Array{complex(T),3}[zeros(complex(T), basis.fft_size)
-                                              for _ = 1:Threads.nthreads()]
+    ρ_chunklocal = [convert(array_type(basis), zeros(T, basis.fft_size..., basis.model.n_spin_components))
+                    for _ = 1:Threads.nthreads()]
+    ψnk_real_chunklocal = [convert(array_type(basis), zeros(complex(T), basis.fft_size))
+                           for _ = 1:Threads.nthreads()]
 
     @sync for (ichunk, chunk) in enumerate(Iterators.partition(ik_n, chunk_length))
         Threads.@spawn for (ik, n) in chunk  # spawn a task per chunk
+            kpt = basis.kpoints[ik]
             ψnk_real = ψnk_real_chunklocal[ichunk]
             ρ_loc = ρ_chunklocal[ichunk]
-            kpt = basis.kpoints[ik]
-
-            G_to_r!(ψnk_real, basis, kpt, ψ[ik][:, n])
+            G_to_r!(ψnk_real, basis, kpt, ψ[ik][:, n])
             ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* abs2.(ψnk_real)
 
         end
     end
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index 38a555f262..ee46fa1199 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -54,7 +54,10 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
     end
 
     # Transform results into a nicer datastructure
-    (λ=[real.(res.λ) for res in results],
+    # TODO: keep λ on the GPU? Careful then, as self_consistent_field's eigenvalues
+    # will be a CuArray -> due to the Smearing.occupation function, occupation will also
+    # be a CuArray, so no scalar indexing (in ene_ops, in compute_density...)
+    (λ=[Array(real.(res.λ)) for res in results],
      X=[res.X for res in results],
      residual_norms=[res.residual_norms for res in results],
      iterations=[res.iterations for res in results],
diff --git a/src/eigen/diag_lobpcg_hyper.jl b/src/eigen/diag_lobpcg_hyper.jl
index 832eaf2cee..8548d79c58 100644
--- a/src/eigen/diag_lobpcg_hyper.jl
+++ b/src/eigen/diag_lobpcg_hyper.jl
@@ -9,6 +9,7 @@ function lobpcg_hyper(A, X0; maxiter=100, prec=nothing,
     result = LOBPCG(A, X0, I, prec, tol, maxiter;
                     n_conv_check=n_conv_check, kwargs...)
     n_conv_check === nothing && (n_conv_check = size(X0, 2))
+
     converged = maximum(result.residual_norms[1:n_conv_check]) < tol
     iterations = size(result.residual_history, 2) - 1
diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index 205943017d..702acbd2c5 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -43,17 +43,83 @@ vprintln(args...) = nothing
 
 using LinearAlgebra
-using BlockArrays # used for the `mortar` command which makes block matrices
+import Base: *
+include("../workarounds/gpu_arrays.jl")
 
-# when X or Y are BlockArrays, this makes the return value be a proper array (not a BlockArray)
-function array_mul(X::AbstractArray{T}, Y) where T
-    Z = Array{T}(undef, size(X, 1), size(Y, 2))
-    mul!(Z, X, Y)
-end
+# For now, BlockMatrix can store arrays of different types (for example, an element
+# of type view and one of type Matrix). Maybe for performance reasons it should only
+# store arrays of the same type?
+
+struct BlockMatrix
+    blocks::Tuple
+    size::Tuple{Int64,Int64}
+end
+
+"""
+Build a BlockMatrix containing the given arrays, from left to right.
+This function fails (for now) if the arrays do not all have the same "height"
+(i.e. their size[1] must match).
+"""
+function make_block_vector(arrays::AbstractArray...)
+    length(arrays) == 0 && error("Empty BlockMatrix is not currently implemented")
+    n_ref = size(arrays[1])[1]
+    m = 0
+    for array in arrays
+        n_i, m_i = size(array)
+        n_ref != n_i && error("The given arrays do not have matching 'height': cannot build a BlockMatrix out of them.")
+        m += m_i
+    end
+    BlockMatrix(arrays, (n_ref, m))
+end
+
+"""
+Given A and B as two BlockMatrices [A1, A2, A3] and [B1, B2, B3], form the matrix
+A'B (which is not a BlockMatrix). block_overlap also has fallback methods for plain arrays.
+block_overlap always computes an adjoint product, i.e. A' * B.
+"""
+@views function block_overlap(A::BlockMatrix, B::BlockMatrix)
+    rows = A.size[2]
+    cols = B.size[2]
+    ret = similar(A.blocks[1], rows, cols)
+
+    orow = 0  # row offset
+    for (iA, blA) in enumerate(A.blocks)
+        ocol = 0  # column offset
+        for (iB, blB) in enumerate(B.blocks)
+            ret[orow .+ (1:size(blA, 2)), ocol .+ (1:size(blB, 2))] = blA' * blB
+            ocol += size(blB, 2)
+        end
+        orow += size(blA, 2)
+    end
+    ret
+end
+
+block_overlap(blocksA::BlockMatrix, B) = block_overlap(blocksA, make_block_vector(B))
+block_overlap(A, B) = A' * B  # Default fallback method. Note the adjoint.
+
+"""
+Given A as a BlockMatrix [A1, A2, A3] and B a Matrix, compute the matrix-matrix product
+A * B, avoiding a concatenation of the blocks into a dense array.
+""" +@views function *(Ablock::BlockMatrix, B) + res = Ablock.blocks[1] * B[1:size(Ablock.blocks[1], 2), :] # First multiplication + offset = size(Ablock.blocks[1], 2) + for block in Ablock.blocks[2:end] + mul!(res, block, B[offset .+ (1:size(block, 2)), :], 1, 1) + offset += size(block, 2) + end + res +end + +function LinearAlgebra.mul!(res,A::BlockMatrix,B::AbstractArray,α,β) + # Has slightly better performances than a naive res = α*A*B - β*res + mul!(res, A*B, I, α, β) end # Perform a Rayleigh-Ritz for the N first eigenvectors. -@timing function rayleigh_ritz(X, AX, N) - F = eigen(Hermitian(array_mul(X', AX))) +@timing function rayleigh_ritz(X::BlockMatrix, AX::BlockMatrix, N) + F = eigen(Hermitian(block_overlap(X, AX))) # block_overlap(X,AX) is an AbstractArray, not a BlockMatrix F.vectors[:,1:N], F.values[1:N] end @@ -170,16 +236,16 @@ end niter = 1 ninners = zeros(Int,0) while true - BYX = BY'X - # XXX the one(T) instead of plain old 1 is because of https://github.com/JuliaArrays/BlockArrays.jl/issues/176 + BYX = block_overlap(BY,X) # = BY' X mul!(X, Y, BYX, -one(T), one(T)) # X -= Y*BY'X # If the orthogonalization has produced results below 2eps, we drop them # This is to be able to orthogonalize eg [1;0] against [e^iθ;0], # as can happen in extreme cases in the ortho!(cP, cX) dropped = drop!(X) if dropped != [] - @views mul!(X[:, dropped], Y, BY' * (X[:, dropped]), -one(T), one(T)) # X -= Y*BY'X + X[:, dropped] .-= Y * block_overlap(BY,X[:, dropped]) #X = X - Y'*BY*X end + if norm(BYX) < tol && niter > 1 push!(ninners, 0) break @@ -215,11 +281,10 @@ function final_retval(X, AX, resid_history, niter, n_matvec) residuals = AX .- X*Diagonal(λ) (λ=λ, X=X, residual_norms=[norm(residuals[:, i]) for i in 1:size(residuals, 2)], - residual_history=resid_history[:, 1:niter+1], - n_matvec=n_matvec) + residual_history=resid_history[:, 1:niter+1], n_matvec=n_matvec) + #λ doesn't have to be on the GPU, but X does (ψ should always be on GPU throughout the code). end - ### The algorithm is Xn+1 = rayleigh_ritz(hcat(Xn, A*Xn, Xn-Xn-1)) ### We follow the strategy of Hetmaniuk and Lehoucq, and maintain a B-orthonormal basis Y = (X,R,P) ### After each rayleigh_ritz step, the B-orthonormal X and P are deduced by an orthogonal rotation from Y @@ -230,12 +295,13 @@ end miniter=1, ortho_tol=2eps(real(eltype(X))), n_conv_check=nothing, display_progress=false) N, M = size(X) + # If N is too small, we will likely get in trouble error_message(verb) = "The eigenproblem is too small, and the iterative " * - "eigensolver $verb fail; increase the number of " * - "degrees of freedom, or use a dense eigensolver." - N > 3M || error(error_message("will")) - N >= 3M+5 || @warn error_message("might") + "eigensolver $verb fail; increase the number of " * + "degrees of freedom, or use a dense eigensolver." 
+ N > 3M || error(error_message("will")) + N >= 3M+5 || @warn error_message("might") n_conv_check === nothing && (n_conv_check = M) resid_history = zeros(real(eltype(X)), M, maxiter+1) @@ -271,6 +337,7 @@ end nlocked = 0 niter = 0 # the first iteration is fake λs = @views [(X[:,n]'*AX[:,n]) / (X[:,n]'BX[:,n]) for n=1:M] + λs = oftype(X[:,1], λs) #Offload to GPU if needed new_X = X new_AX = AX new_BX = BX @@ -286,13 +353,13 @@ end # Form Rayleigh-Ritz subspace if niter > 1 - Y = mortar((X, R, P)) - AY = mortar((AX, AR, AP)) - BY = mortar((BX, BR, BP)) # data shared with (X, R, P) in non-general case + Y = make_block_vector(X, R, P) + AY = make_block_vector(AX, AR, AP) + BY = make_block_vector(BX, BR, BP) # data shared with (X, R, P) in non-general case else - Y = mortar((X, R)) - AY = mortar((AX, AR)) - BY = mortar((BX, BR)) # data shared with (X, R) in non-general case + Y = make_block_vector(X, R) + AY = make_block_vector(AX, AR) + BY = make_block_vector(BX, BR) # data shared with (X, R) in non-general case end cX, λs = rayleigh_ritz(Y, AY, M-nlocked) @@ -300,9 +367,9 @@ end # wait on updating P because we have to know which vectors # to lock (and therefore the residuals) before computing P # only for the unlocked vectors. This results in better convergence. - new_X = array_mul(Y, cX) - new_AX = array_mul(AY, cX) # no accuracy loss, since cX orthogonal - new_BX = (B == I) ? new_X : array_mul(BY, cX) + new_X = Y * cX + new_AX = AY * cX # no accuracy loss, since cX orthogonal + new_BX = (B == I) ? new_X : BY * cX end ### Compute new residuals @@ -356,20 +423,23 @@ end # orthogonalization, see Hetmaniuk & Lehoucq, and Duersch et. al. # cP = copy(cX) # cP[Xn_indices,:] .= 0 - e = zeros(eltype(X), size(cX, 1), M - prev_nlocked) - for i in 1:length(Xn_indices) - e[Xn_indices[i], i] = 1 - end + + lenXn = length(Xn_indices) + e = zero(similar(X, size(cX, 1), M - prev_nlocked)) + lower_diag = one(similar(X, lenXn, lenXn)) + #e has zeros everywhere except on one of its lower diagonal + e[Xn_indices[1] : last(Xn_indices), 1 : lenXn] = lower_diag + cP = cX .- e cP = cP[:, Xn_indices] # orthogonalize against all Xn (including newly locked) ortho!(cP, cX, cX, tol=ortho_tol) # Get new P - new_P = array_mul( Y, cP) - new_AP = array_mul(AY, cP) + new_P = Y * cP + new_AP = AY * cP if B != I - new_BP = array_mul(BY, cP) + new_BP = BY * cP else new_BP = new_P end @@ -414,8 +484,8 @@ end # Orthogonalize R wrt all X, newly active P if niter > 0 - Z = mortar((full_X, P)) - BZ = mortar((full_BX, BP)) # data shared with (full_X, P) in non-general case + Z = make_block_vector(full_X, P) + BZ = make_block_vector(full_BX, BP) # data shared with (full_X, P) in non-general case else Z = full_X BZ = full_BX diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl index 4c26f39618..6e37d54ae4 100644 --- a/src/eigen/preconditioners.jl +++ b/src/eigen/preconditioners.jl @@ -27,7 +27,7 @@ PreconditionerNone(basis, kpt) = I mutable struct PreconditionerTPA{T <: Real} basis::PlaneWaveBasis kpt::Kpoint - kin::Vector{T} # kinetic energy of every G + kin::AbstractVector{T} # kinetic energy of every G mean_kin::Union{Nothing, Vector{T}} # mean kinetic energy of every band default_shift::T # if mean_kin is not set by `precondprep!`, this will be used for the shift end @@ -37,6 +37,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift= isempty(kinetic_term) && error("Preconditioner should be disabled when no Kinetic term is used.") scaling = only(kinetic_term).scaling_factor kin = 
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index 4c26f39618..6e37d54ae4 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -27,7 +27,7 @@ PreconditionerNone(basis, kpt) = I
 mutable struct PreconditionerTPA{T <: Real}
     basis::PlaneWaveBasis
     kpt::Kpoint
-    kin::Vector{T}  # kinetic energy of every G
+    kin::AbstractVector{T}  # kinetic energy of every G
     mean_kin::Union{Nothing, Vector{T}}  # mean kinetic energy of every band
     default_shift::T  # if mean_kin is not set by `precondprep!`, this will be used for the shift
 end
@@ -37,6 +37,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=
     isempty(kinetic_term) && error("Preconditioner should be disabled when no Kinetic term is used.")
     scaling = only(kinetic_term).scaling_factor
     kin = Vector{T}([scaling * sum(abs2, q) for q in Gplusk_vectors_cart(basis, kpt)] ./ 2)
+    kin = convert(array_type(basis), kin)
     PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift)
 end
@@ -68,4 +69,5 @@ end
 function precondprep!(P::PreconditionerTPA, X)
     P.mean_kin = [real(dot(x, Diagonal(P.kin), x)) for x in eachcol(X)]
+    P.mean_kin = convert(array_type(P.basis), P.mean_kin)
 end
diff --git a/src/fft.jl b/src/fft.jl
index 8d2b73105d..09f1c16bd3 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -1,4 +1,7 @@
 import FFTW
+import CUDA
+import GPUArrays
+import AbstractFFTs
 
 #
 # Perform (i)FFTs.
@@ -253,15 +256,15 @@ _fftw_flags(::Type{Float64}) = FFTW.MEASURE
 Plan a FFT of type `T` and size `fft_size`, spending some time on finding an optimal algorithm.
 (Inplace, out-of-place) x (forward, backward) FFT plans are returned.
 """
-function build_fft_plans(T::Union{Type{Float32}, Type{Float64}}, fft_size)
-    tmp = Array{Complex{T}}(undef, fft_size...)
-    ipFFT = FFTW.plan_fft!(tmp, flags=_fftw_flags(T))
-    opFFT = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
+# The FFTW flags were removed since CUDA's plan_fft does not take flags. If this becomes a performance issue, we should check the array type and call either FFTW.plan_fft(tmp, flags=...) or CUDA.plan_fft(tmp).
+function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union{Float32,Float64}}
+    tmp = similar(array_type, Complex{T}, fft_size...)
+    ipFFT = AbstractFFTs.plan_fft!(tmp)
+    opFFT = AbstractFFTs.plan_fft(tmp)
     # backward by inverting and stripping off normalizations
     ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
 end
-
 # TODO Some grid sizes are broken in the generic FFT implementation
 # in FourierTransforms, for more details see workarounds/fft_generic.jl
 default_primes(::Type{Float32}) = (2, 3, 5)
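# Usage sketch (not part of the patch): the plan backend now follows the array type of the
# template argument rather than a scalar type, so the same call covers CPU and GPU
# (the second line assumes a functional CUDA setup).
plans_cpu = DFTK.build_fft_plans(Array{Float64}(undef, 0), (24, 24, 24))     # FFTW plans
plans_gpu = DFTK.build_fft_plans(CuArray{Float64}(undef, 0), (24, 24, 24))   # CUFFT plans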
diff --git a/src/guess_density.jl b/src/guess_density.jl
index a7458030be..796a7e82d4 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -66,7 +66,7 @@ function _guess_spin_density(basis::PlaneWaveBasis{T}, atoms, positions, magneti
         @warn("Returning zero spin density guess, because no initial magnetization has " *
               "been specified in any of the given elements / atoms. Your SCF will likely " *
               "not converge to a spin-broken solution.")
-        return zeros(T, basis.fft_size)
+        return convert(array_type(basis), zeros(T, basis.fft_size))
     end
     @assert length(magmoms) == length(atoms) == length(positions)
@@ -93,23 +93,27 @@ which follow the functional form
 and are placed at `position` (in fractional coordinates).
 """
 function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
-    ρ = zeros(complex(T), basis.fft_size)
-    isempty(gaussians) && return G_to_r(basis, ρ)
+    isempty(gaussians) && return G_to_r(basis, similar(G_vectors(basis), complex(T), basis.fft_size))
+
+    # These copies are required so that recip_lattice and gaussians are isbits (GPU compatibility)
+    recip_lattice = basis.model.recip_lattice
+    gaussians = SVector{size(gaussians)[1]}(gaussians)
 
     # Fill ρ with the (unnormalized) Fourier transform, i.e. ∫ e^{-iGx} f(x) dx,
     # where f(x) is a weighted gaussian
-    #
-    # is formed from a superposition of atomic densities, each scaled by a prefactor
-    for (iG, G) in enumerate(G_vectors(basis))
-        Gsq = sum(abs2, basis.model.recip_lattice * G)
+    function build_ρ(G)
+        Gsq = sum(abs2, recip_lattice * G)
+        res = zero(complex(T))
         for (coeff, decay_length, r) in gaussians
             form_factor::T = exp(-Gsq * T(decay_length)^2)
-            ρ[iG] += T(coeff) * form_factor * cis2pi(-dot(G, r))
+            res += T(coeff) * form_factor * cis2pi(-dot(G, r))
         end
+        res
     end
+    # Can't use map! here since we convert an array of Vec3 into an array of complex numbers
+    ρ = map(build_ρ, basis.G_vectors) / sqrt(basis.model.unit_cell_volume)
 
     # projection in the normalized plane wave basis
-    G_to_r(basis, ρ / sqrt(basis.model.unit_cell_volume))
+    G_to_r(basis, ρ)
 end
diff --git a/src/interpolation_transfer.jl b/src/interpolation_transfer.jl
index 91aea2b884..7643952d5d 100644
--- a/src/interpolation_transfer.jl
+++ b/src/interpolation_transfer.jl
@@ -81,6 +81,7 @@ function interpolate_kpoint(data_in::AbstractVecOrMat,
     n_bands = size(data_in, 2)
     n_Gk_out = length(G_vectors(basis_out, kpoint_out))
     data_out = similar(data_in, n_Gk_out, n_bands) .= 0
+    # TODO: use a map, otherwise this is not GPU compatible (scalar indexing)
     for iin in 1:size(data_in, 1)
         idx_fft = kpoint_in.mapping[iin]
         idx_fft in keys(kpoint_out.mapping_inv) || continue
diff --git a/src/orbitals.jl b/src/orbitals.jl
index 35104c3320..22ce9c2e51 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -51,6 +51,9 @@ function unsafe_unpack_ψ(x, sizes_ψ)
 end
 unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ))
 
+using Random
 function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
-    ortho_qr(randn(Complex{T}, length(G_vectors(basis, kpt)), howmany))
-end
+    orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
+    randn!(TaskLocalRNG(), orbitals)  # Force the use of GPUArrays.jl's random function when on the GPU
+    ortho_qr(orbitals)
+end
\ No newline at end of file
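# Quick check sketch (not part of the patch): with a basis built with array_type=CuArray
# (as in examples/gpu.jl), the random orbitals are generated directly on the device and come
# back with orthonormal columns thanks to ortho_qr. Assumes a functional CUDA setup.
ψk = DFTK.random_orbitals(basis_gpu, basis_gpu.kpoints[1], 4)
@assert ψk isa CuArray              # storage follows basis.G_vectors
@assert Array(ψk' * ψk) ≈ I         # columns orthonormalized by ortho_qr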
diff --git a/src/terms/Hamiltonian.jl b/src/terms/Hamiltonian.jl
index 4784bf1f9e..9f73ddc98e 100644
--- a/src/terms/Hamiltonian.jl
+++ b/src/terms/Hamiltonian.jl
@@ -51,7 +51,7 @@ function HamiltonianBlock(basis, kpoint, operators, scratch=ham_allocate_scratch
     end
 end
 function ham_allocate_scratch_(basis::PlaneWaveBasis{T}) where {T}
-    (ψ_reals=[zeros(complex(T), basis.fft_size...) for _ = 1:Threads.nthreads()], )
+    (ψ_reals=[convert(array_type(basis), zeros(complex(T), basis.fft_size...)) for _ = 1:Threads.nthreads()], )
 end
 
 Base.:*(H::HamiltonianBlock, ψ) = mul!(similar(ψ), H, ψ)
@@ -89,9 +89,8 @@ Base.:*(H::Hamiltonian, ψ) = mul!(deepcopy(ψ), H, ψ)
     T = eltype(H.basis)
     n_bands = size(ψ, 2)
     Hψ_fourier = similar(Hψ[:, 1])
-    ψ_real = zeros(complex(T), H.basis.fft_size...)
-    Hψ_real = zeros(complex(T), H.basis.fft_size...)
-
+    ψ_real = similar(ψ, complex(T), H.basis.fft_size...)
+    Hψ_real = similar(Hψ, complex(T), H.basis.fft_size...)
     # take ψi, IFFT it to ψ_real, apply each term to Hψ_fourier and Hψ_real, and add it to Hψ
     for iband = 1:n_bands
         Hψ_real .= 0
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 07dc61548a..9c72e2d58d 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -31,13 +31,17 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where T
     # Solving the Poisson equation ΔV = -4π ρ in Fourier space
     # is multiplying elementwise by 4π / |G|^2.
-    poisson_green_coeffs = 4T(π) ./ [sum(abs2, G) for G in G_vectors_cart(basis)]
+    poisson_green_coeffs = map(G_vectors_cart(basis)) do G
+        4T(π) / sum(abs2, G)
+    end
     if !isempty(model.atoms)
         # Assume positive charge from nuclei is exactly compensated by the electrons
         sum_charges = sum(charge_ionic, model.atoms)
         @assert sum_charges == model.n_electrons
     end
-    poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
+    # Hackish GPU-compatible way to set poisson_green_coeffs[1] = 0
+    # (compensating charge background => zero DC component).
+    poisson_green_coeffs[1:1, 1:1, 1:1] .= zero(similar(G_vectors(basis), T, 1, 1, 1))
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)
 end
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index b5d757f24a..0d200f78a5 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -16,9 +16,10 @@ struct TermKinetic <: Term
     kinetic_energies::Vector{<:AbstractVector}  # kinetic energy 1/2|G+k|^2 for every kpoint
 end
 function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
-    kinetic_energies = [[T(scaling_factor) * sum(abs2, Gk) / 2
-                         for Gk in Gplusk_vectors_cart(basis, kpt)]
-                        for kpt in basis.kpoints]
+    kinetic_energies = [convert(array_type(basis),
+                                [T(scaling_factor) * sum(abs2, Gk) / 2
+                                 for Gk in Gplusk_vectors_cart(basis, kpt)])
+                        for kpt in basis.kpoints]
     TermKinetic(T(scaling_factor), kinetic_energies)
 end
diff --git a/src/terms/local.jl b/src/terms/local.jl
index 678adf2197..9867eeef1e 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -74,7 +74,8 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     # positions, this involves a form factor (`local_potential_fourier`)
     # and a structure factor e^{-i G·r}
-    pot_fourier = map(G_vectors(basis)) do G
+    # This operation needs to be done only once, so let's try to make it happen on the CPU (else we need to isbitsify the pseudopotentials)
+    pot_fourier = map(Array(G_vectors(basis))) do G
         pot = sum(model.atom_groups) do group
             element = model.atoms[first(group)]
             form_factor::T = local_potential_fourier(element, norm(model.recip_lattice * G))
         end
         pot / sqrt(model.unit_cell_volume)
     end
-
-    pot_real = G_to_r(basis, pot_fourier)
+    # If needed, send the atomic local potential to the GPU.
+    pot_real = G_to_r(basis, convert(array_type(basis), pot_fourier))
     TermAtomicLocal(pot_real)
 end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 8f6cbc96fa..47ab910b52 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -15,7 +15,7 @@ function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T}
     isempty(psp_groups) && return TermNoop()
     ops = map(basis.kpoints) do kpt
         P = build_projection_vectors_(basis, kpt, psps, psp_positions)
-        D = build_projection_coefficients_(T, psps, psp_positions)
+        D = build_projection_coefficients_(T, psps, psp_positions, array_type = array_type(basis))
         NonlocalOperator(basis, kpt, P, D)
     end
     TermAtomicNonlocal(ops)
@@ -31,6 +31,8 @@ end
     isnothing(ψ) && return (E=T(Inf), ops=term.ops)
 
     E = zero(T)
+    occ = [convert(array_type(basis), oc) for oc in occ]
+
     for (ik, kpt) in enumerate(basis.kpoints)
         Pψ = term.ops[ik].P' * ψ[ik]  # nproj x nband
         band_enes = dropdims(sum(real.(conj.(Pψ) .* (term.ops[ik].D * Pψ)), dims=1), dims=1)
@@ -90,7 +92,7 @@ end
 # The ordering of the projector indices is (A,l,m,i), where A is running over all
 # atoms, l, m are AM quantum numbers and i is running over all projectors for a
 # given l. The matrix is block-diagonal with non-zeros only if A, l and m agree.
-function build_projection_coefficients_(T, psps, psp_positions)
+function build_projection_coefficients_(T, psps, psp_positions; array_type = Array)
     # TODO In the current version the proj_coeffs still has a lot of zeros.
     # One could improve this by storing the blocks as a list or in a
     # BlockDiagonal data structure
@@ -106,7 +108,7 @@ function build_projection_coefficients_(T, psps, psp_positions)
     end  # psp, r
     @assert count == n_proj
 
-    proj_coeffs
+    convert(array_type, proj_coeffs)
 end
 
 # Builds the projection coefficient matrix for a single atom
@@ -170,7 +172,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
         end
     end
     @assert offset == n_proj
-    proj_vectors
+    convert(array_type(basis), proj_vectors)
 end
 
 """
diff --git a/src/workarounds/gpu_arrays.jl b/src/workarounds/gpu_arrays.jl
new file mode 100644
index 0000000000..cb3e43fd09
--- /dev/null
+++ b/src/workarounds/gpu_arrays.jl
@@ -0,0 +1,16 @@
+# TODO: remove this when it is implemented in GPUArrays
+import LinearAlgebra.dot, LinearAlgebra.eigen, LinearAlgebra.RealHermSymComplexHerm
+using LinearAlgebra
+using GPUArrays
+using CUDA
+
+LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal, y::AbstractGPUArray) = x' * (D * y)
+
+function LinearAlgebra.eigen(A::RealHermSymComplexHerm{T,AT}) where {T, AT <: CuArray}
+    if eltype(A) <: Complex
+        vals, vects = CUDA.CUSOLVER.heevd!('V', 'U', A.data)
+    else
+        vals, vects = CUDA.CUSOLVER.syevd!('V', 'U', A.data)
+    end
+    (vectors = vects, values = vals)
+end
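# Minimal sketch of what the workaround above enables (not part of the patch; assumes a
# functional CUDA setup): eigen on a Hermitian CuArray now dispatches to CUSOLVER and returns
# a (vectors, values) NamedTuple rather than a LinearAlgebra.Eigen object.
using LinearAlgebra, CUDA
A = CuArray(randn(ComplexF64, 8, 8))
H = Hermitian(A + A')
res = eigen(H)            # calls CUDA.CUSOLVER.heevd! under the hood
res.values, res.vectors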