diff --git a/Project.toml b/Project.toml
index 27715904b8..26c2f924b7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -8,12 +8,14 @@ AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
 AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
 BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
 Brillouin = "23470ee3-d0df-4052-8b1a-8cbd6363e7f0"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 DftFunctionals = "6bd331d2-b28d-4fd3-880e-1a1c7f37947f"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 InteratomicPotentials = "a9efe35a-c65d-452d-b8a8-82646cd5cb04"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
@@ -33,6 +35,7 @@ Primes = "27ebfcd6-29c5-5fa9-bf4b-fb8fc14df3ae"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
diff --git a/examples/gpu.jl b/examples/gpu.jl
new file mode 100644
index 0000000000..f59c12d12b
--- /dev/null
+++ b/examples/gpu.jl
@@ -0,0 +1,26 @@
+using DFTK
+using CUDA
+using MKL
+setup_threading(n_blas=1)
+
+a = 10.263141334305942  # Lattice constant in Bohr
+lattice = a / 2 .* [[0 1 1.]; [1 0 1.]; [1 1 0.]]
+Si = ElementPsp(:Si, psp=load_psp("hgh/lda/Si-q4"))
+atoms = [Si, Si]
+positions = [ones(3)/8, -ones(3)/8];
+terms_LDA = [Kinetic(), AtomicLocal(), AtomicNonlocal()]
+
+# Set up a model with kinetic and atomic terms only and discretize it
+# using a single k-point and an `Ecut` of 30 Hartree.
+mod = Model(lattice, atoms, positions; terms=terms_LDA, symmetries=false)
+basis = PlaneWaveBasis(mod; Ecut=30, kgrid=(1, 1, 1))
+basis_gpu = PlaneWaveBasis(mod; Ecut=30, kgrid=(1, 1, 1), array_type=CuArray)
+
+
+DFTK.reset_timer!(DFTK.timer)
+scfres = self_consistent_field(basis; solver=scf_damping_solver(1.0), is_converged=DFTK.ScfConvergenceDensity(1e-3))
+println(DFTK.timer)
+
+DFTK.reset_timer!(DFTK.timer)
+scfres_gpu = self_consistent_field(basis_gpu; solver=scf_damping_solver(1.0), is_converged=DFTK.ScfConvergenceDensity(1e-3))
+println(DFTK.timer)
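# Illustrative addition one could append to examples/gpu.jl (not part of the patch): a quick
# CPU-vs-GPU consistency check. Field names (`energies.total`, `ρ`) follow the NamedTuple
# returned by self_consistent_field; agreement is only expected up to the SCF tolerance.
println(abs(scfres.energies.total - scfres_gpu.energies.total))
println(maximum(abs, scfres.ρ .- Array(scfres_gpu.ρ)))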
diff --git a/src/DFTK.jl b/src/DFTK.jl
index a3550f76fb..82b4d72b35 100644
--- a/src/DFTK.jl
+++ b/src/DFTK.jl
@@ -13,6 +13,10 @@ using spglib_jll
 using Unitful
 using UnitfulAtomic
 using ForwardDiff
+using AbstractFFTs
+using GPUArrays
+using CUDA
+using Random
 using ChainRulesCore
 
 export Vec3
diff --git a/src/Model.jl b/src/Model.jl
index 6f9e0c90cd..9878841c27 100644
--- a/src/Model.jl
+++ b/src/Model.jl
@@ -263,13 +263,48 @@ Examples of covectors are forces.
 Reciprocal vectors are a special case: they are covectors, but conventionally have an
 additional factor of 2π in their definition, so they transform rather with 2π times the
 inverse lattice transpose: q_cart = 2π lattice' \ q_red = recip_lattice * q_red.
+
+The trans_mat functions return the transition matrices required to perform such a change of basis.
 =#
-vector_red_to_cart(model::Model, rred) = model.lattice * rred
-vector_cart_to_red(model::Model, rcart) = model.inv_lattice * rcart
-covector_red_to_cart(model::Model, fred) = model.inv_lattice' * fred
-covector_cart_to_red(model::Model, fcart) = model.lattice' * fcart
-recip_vector_red_to_cart(model::Model, qred) = model.recip_lattice * qred
-recip_vector_cart_to_red(model::Model, qcart) = model.inv_recip_lattice * qcart
+
+trans_mat_vector_red_to_cart(model::Model) = model.lattice
+trans_mat_vector_cart_to_red(model::Model) = model.inv_lattice
+trans_mat_covector_red_to_cart(model::Model) = model.inv_lattice'
+trans_mat_covector_cart_to_red(model::Model) = model.lattice'
+trans_mat_recip_vector_red_to_cart(model::Model) = model.recip_lattice
+trans_mat_recip_vector_cart_to_red(model::Model) = model.inv_recip_lattice
+
+fun_mat_list = (:vector_red_to_cart,
+                :vector_cart_to_red,
+                :covector_red_to_cart,
+                :covector_cart_to_red,
+                :recip_vector_red_to_cart,
+                :recip_vector_cart_to_red
+)
+
+for fun1 in fun_mat_list
+    #=
+    The following functions compute the change of basis for a given vector. To do so,
+    they call the trans_mat functions to get the corresponding transition matrix.
+    These functions can be broadcast over an Array of vectors; however, they are
+    not GPU compatible, as they require the model, which is not isbits.
+    =#
+    @eval $fun1(model::Model, vec) = $(Symbol("trans_mat_"*string(fun1)))(model::Model) * vec
+    #=
+    The following functions take an AbstractArray of vectors and compute the change of basis
+    for every vector in the AbstractArray: they return an AbstractArray of the same type
+    and size as the input, but containing the vectors in the new basis.
+    These functions are GPU compatible (i.e. the AbstractArray can be a GPUArray), since
+    they use a map and the transition matrices are static arrays.
+    =#
+    @eval function $(Symbol("map_"*string(fun1)))(model::Model, A::AbstractArray{AT}) where {AT <: Vec3}
+        trans_matrix = $(Symbol("trans_mat_"*string(fun1)))(model)
+        in_new_basis = map(A) do Ai
+            trans_matrix * Ai
+        end
+        in_new_basis
+    end
+end
 
 #=
 Transformations on vectors and covectors are matrices and comatrices.
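# For illustration (not part of the patch): for fun1 = :vector_red_to_cart the loop above
# generates, roughly, the following two methods. The first closes over the model (not isbits,
# hence not GPU-safe); the map_ variant only captures the static transition matrix and
# therefore also works on GPU arrays of Vec3:
#
#     vector_red_to_cart(model::Model, vec) = trans_mat_vector_red_to_cart(model) * vec
#     function map_vector_red_to_cart(model::Model, A::AbstractArray{<:Vec3})
#         trans_matrix = trans_mat_vector_red_to_cart(model)
#         map(Ai -> trans_matrix * Ai, A)
#     end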
diff --git a/src/PlaneWaveBasis.jl b/src/PlaneWaveBasis.jl
index 43cc911a5f..0c7fe3bbca 100644
--- a/src/PlaneWaveBasis.jl
+++ b/src/PlaneWaveBasis.jl
@@ -39,7 +39,7 @@ Normalization conventions:
 `G_to_r` and `r_to_G` convert between these representations.
 """
-struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real}
+struct PlaneWaveBasis{T, VT, AT, GT, RT} <: AbstractBasis{T} where {VT <: Real, GT <: AT, RT <: AT, AT <: AbstractArray}
     # T is the default type to express data, VT the corresponding bare value type (i.e. not dual)
     model::Model{T, VT}
@@ -67,8 +67,8 @@ struct PlaneWaveBasis{T, VT} <: AbstractBasis{T} where {VT <: Real}
     G_to_r_normalization::T  # G_to_r = G_to_r_normalization * BFFT
 
     # "cubic" basis in reciprocal and real space, on which potentials and densities are stored
-    G_vectors::Array{Vec3{Int}, 3}
-    r_vectors::Array{Vec3{VT }, 3}
+    G_vectors::GT
+    r_vectors::RT
 
     ## MPI-local information of the kpoints this processor treats
     # Irreducible kpoints. In the case of collinear spin,
@@ -148,7 +148,7 @@ end
 # and are stored in PlaneWaveBasis for easy reconstruction.
 function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
                         kcoords, kweights, kgrid, kshift,
-                        symmetries_respect_rgrid, comm_kpts) where {T <: Real}
+                        symmetries_respect_rgrid, comm_kpts, array_type = Array) where {T <: Real}
     # Validate fft_size
     if variational
         max_E = sum(abs2, model.recip_lattice * floor.(Int, Vec3(fft_size) ./ 2)) / 2
@@ -191,7 +191,8 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     kweights_global = kweights
 
     # Setup FFT plans
-    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(T, fft_size)
+    Gs = G_vectors(fft_size, array_type)
+    (ipFFT, opFFT, ipBFFT, opBFFT) = build_fft_plans(similar(Gs, T), fft_size)
 
     # Normalization constants
     # r_to_G = r_to_G_normalization * FFT
@@ -244,22 +245,25 @@ function PlaneWaveBasis(model::Model{T}, Ecut::Number, fft_size, variational,
     end
     @assert mpi_sum(sum(kweights_thisproc), comm_kpts) ≈ model.n_spin_components
     @assert length(kpoints) == length(kweights_thisproc)
+    Threads.nthreads() != 1 && Gs isa AbstractGPUArray && error("Can't mix multi-threading and GPU computations yet.")
     VT = value_type(T)
     dvol = model.unit_cell_volume ./ prod(fft_size)
     r_vectors = [Vec3{VT}(VT(i-1) / N1, VT(j-1) / N2, VT(k-1) / N3) for i = 1:N1, j = 1:N2, k = 1:N3]
     terms = Vector{Any}(undef, length(model.term_types))  # Dummy terms array, filled below
 
-    basis = PlaneWaveBasis{T,value_type(T)}(
+    RT = array_type{Vec3{VT }, 3}
+    GT = array_type{Vec3{Int }, 3}
+
+    basis = PlaneWaveBasis{T,value_type(T), array_type, GT, RT}(
         model, fft_size, dvol,
         Ecut, variational,
         opFFT, ipFFT, opBFFT, ipBFFT,
         r_to_G_normalization, G_to_r_normalization,
-        G_vectors(fft_size), r_vectors,
+        Gs, r_vectors,
         kpoints, kweights_thisproc, kgrid, kshift,
         kcoords_global, kweights_global, comm_kpts, krange_thisproc, krange_allprocs,
         symmetries, symmetries_respect_rgrid, terms)
-
     # Instantiate the terms with the basis
     for (it, t) in enumerate(model.term_types)
         term_name = string(nameof(typeof(t)))
@@ -277,7 +281,7 @@ end
                         variational=true, fft_size=nothing, kgrid=nothing, kshift=nothing,
                         symmetries_respect_rgrid=isnothing(fft_size),
-                        comm_kpts=MPI.COMM_WORLD) where {T <: Real}
+                        comm_kpts=MPI.COMM_WORLD, array_type = Array) where {T <: Real}
     if isnothing(fft_size)
         @assert variational
         if symmetries_respect_rgrid
@@ -295,7 +299,7 @@ end
         fft_size = compute_fft_size(model, Ecut, kcoords; factors)
     end
     PlaneWaveBasis(model, Ecut, fft_size, variational, kcoords, kweights,
-                   kgrid, kshift, symmetries_respect_rgrid, comm_kpts)
+                   kgrid, kshift, symmetries_respect_rgrid, comm_kpts, array_type)
 end
 
 @doc raw"""
@@ -322,7 +326,7 @@ Creates a new basis identical to `basis`, but with a custom set of kpoints
     PlaneWaveBasis(basis.model, basis.Ecut,
                    basis.fft_size, basis.variational,
                    kcoords, kweights, kgrid, kshift,
-                   basis.symmetries_respect_rgrid, basis.comm_kpts)
+                   basis.symmetries_respect_rgrid, basis.comm_kpts, array_type(basis))
 end
 
 """
@@ -331,13 +335,15 @@ end
 
 The wave vectors `G` in reduced (integer) coordinates for a cubic basis set of given sizes.
 """
-function G_vectors(fft_size::Union{Tuple,AbstractVector})
+
+function G_vectors(fft_size::Union{Tuple,AbstractVector}, array_type = Array)
     # Note that a collect(G_vectors_generator(fft_size)) is 100-fold slower
     # than this implementation, hence the code duplication.
    start = .- cld.(fft_size .- 1, 2)
    stop  = fld.(fft_size .- 1, 2)
    axes  = [[collect(0:stop[i]); collect(start[i]:-1)] for i in 1:3]
-    [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
+    Gs = [Vec3{Int}(i, j, k) for i in axes[1], j in axes[2], k in axes[3]]
+    convert(array_type, Gs)  # Offload to GPU if needed.
 end
 function G_vectors_generator(fft_size::Union{Tuple,AbstractVector})
     # The generator version is used mainly in symmetry.jl for lowpass_for_symmetry! and
@@ -358,6 +364,12 @@ or a ``k``-point `kpt`.
 G_vectors(basis::PlaneWaveBasis) = basis.G_vectors
 G_vectors(::PlaneWaveBasis, kpt::Kpoint) = kpt.G_vectors
 
+"""
+Return the type of array used for computations (Array if on CPU, CuArray,
+ROCArray... if on GPU).
+"""
+array_type(basis::PlaneWaveBasis{T,VT,AT}) where {T, VT, AT} = AT
+
 @doc raw"""
     G_vectors_cart(basis::PlaneWaveBasis)
 
 The list of ``G`` vectors of a given `basis` or `kpt`, in cartesian coordinates.
 """
-G_vectors_cart(basis::PlaneWaveBasis) = recip_vector_red_to_cart.(basis.model, G_vectors(basis))
+G_vectors_cart(basis::PlaneWaveBasis) = map_recip_vector_red_to_cart(basis.model, G_vectors(basis))
 function G_vectors_cart(basis::PlaneWaveBasis, kpt::Kpoint)
     recip_vector_red_to_cart.(basis.model, G_vectors(basis, kpt))
 end
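# Usage sketch (not part of the patch): with the new `array_type` argument the cubic grids
# live on the device, and `array_type(basis)` lets downstream code allocate matching storage.
# Assumes a functional CUDA setup and `mod` as defined in examples/gpu.jl above.
basis_gpu = PlaneWaveBasis(mod; Ecut=30, kgrid=(1, 1, 1), array_type=CuArray)
G_vectors(basis_gpu) isa CuArray                                         # grid stored on the GPU
ρ0 = convert(DFTK.array_type(basis_gpu), zeros(Float64, basis_gpu.fft_size))  # matching device array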
diff --git a/src/common/ortho.jl b/src/common/ortho.jl
index a0f7508339..3936c5bb25 100644
--- a/src/common/ortho.jl
+++ b/src/common/ortho.jl
@@ -1,2 +1,3 @@
 # Orthonormalize
-ortho_qr(φk) = Matrix(qr(φk).Q)
+ortho_qr(φk::AbstractArray) = Matrix(qr(φk).Q)  # LinearAlgebra.QRCompactWYQ -> Matrix
+ortho_qr(φk::CuArray) = CuArray(qr(φk).Q)       # CUDA.CUSOLVER.CuQRPackedQ -> CuArray
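# Quick check sketch (not part of the patch): both methods above return a matrix of the same
# storage type as their input, with orthonormal columns (the GPU line assumes a functional CUDA setup).
using LinearAlgebra, CUDA
Q = DFTK.ortho_qr(randn(ComplexF64, 20, 4));  @assert Q isa Matrix && Q'Q ≈ I
Qg = DFTK.ortho_qr(CuArray(randn(ComplexF64, 20, 4)));  @assert Qg isa CuArray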
diff --git a/src/densities.jl b/src/densities.jl
index 570e852937..4855bce9c3 100644
--- a/src/densities.jl
+++ b/src/densities.jl
@@ -26,18 +26,18 @@ grid `basis`, where the individual k-points are occupied according to `occupation`.
     chunk_length = cld(length(ik_n), Threads.nthreads())
 
     # chunk-local variables
-    ρ_chunklocal = Array{T,4}[zeros(T, basis.fft_size..., basis.model.n_spin_components)
-                              for _ = 1:Threads.nthreads()]
-    ψnk_real_chunklocal = Array{complex(T),3}[zeros(complex(T), basis.fft_size)
-                                              for _ = 1:Threads.nthreads()]
+    ρ_chunklocal = [convert(array_type(basis), zeros(T, basis.fft_size..., basis.model.n_spin_components))
+                    for _ = 1:Threads.nthreads()]
+    ψnk_real_chunklocal = [convert(array_type(basis), zeros(complex(T), basis.fft_size))
+                           for _ = 1:Threads.nthreads()]
 
     @sync for (ichunk, chunk) in enumerate(Iterators.partition(ik_n, chunk_length))
         Threads.@spawn for (ik, n) in chunk  # spawn a task per chunk
+            kpt = basis.kpoints[ik]
             ψnk_real = ψnk_real_chunklocal[ichunk]
             ρ_loc = ρ_chunklocal[ichunk]
-            kpt = basis.kpoints[ik]
-
-            G_to_r!(ψnk_real, basis, kpt, ψ[ik][:, n])
+            G_to_r!(ψnk_real, basis, kpt, ψ[ik][:, n])
             ρ_loc[:, :, :, kpt.spin] .+= occupation[ik][n] .* basis.kweights[ik] .* abs2.(ψnk_real)
 
         end
     end
diff --git a/src/eigen/diag.jl b/src/eigen/diag.jl
index 38a555f262..ee46fa1199 100644
--- a/src/eigen/diag.jl
+++ b/src/eigen/diag.jl
@@ -54,7 +54,10 @@ function diagonalize_all_kblocks(eigensolver, ham::Hamiltonian, nev_per_kpoint::
     end
 
     # Transform results into a nicer datastructure
-    (λ=[real.(res.λ) for res in results],
+    # TODO: keep λ on the GPU? Careful then, as self_consistent_field's eigenvalues
+    # will be a CuArray -> due to the Smearing.occupation function, occupation will also
+    # be a CuArray, so no scalar indexing (in ene_ops, in compute_density...)
+    (λ=[Array(real.(res.λ)) for res in results],
      X=[res.X for res in results],
      residual_norms=[res.residual_norms for res in results],
      iterations=[res.iterations for res in results],
diff --git a/src/eigen/diag_lobpcg_hyper.jl b/src/eigen/diag_lobpcg_hyper.jl
index 832eaf2cee..8548d79c58 100644
--- a/src/eigen/diag_lobpcg_hyper.jl
+++ b/src/eigen/diag_lobpcg_hyper.jl
@@ -9,6 +9,7 @@ function lobpcg_hyper(A, X0; maxiter=100, prec=nothing,
     result = LOBPCG(A, X0, I, prec, tol, maxiter;
                     n_conv_check=n_conv_check, kwargs...)
     n_conv_check === nothing && (n_conv_check = size(X0, 2))
+
     converged = maximum(result.residual_norms[1:n_conv_check]) < tol
     iterations = size(result.residual_history, 2) - 1
diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
index 205943017d..702acbd2c5 100644
--- a/src/eigen/lobpcg_hyper_impl.jl
+++ b/src/eigen/lobpcg_hyper_impl.jl
@@ -43,17 +43,83 @@ vprintln(args...) = nothing
 
 using LinearAlgebra
-using BlockArrays # used for the `mortar` command which makes block matrices
+import Base: *
+include("../workarounds/gpu_arrays.jl")
 
-# when X or Y are BlockArrays, this makes the return value be a proper array (not a BlockArray)
-function array_mul(X::AbstractArray{T}, Y) where T
-    Z = Array{T}(undef, size(X, 1), size(Y, 2))
-    mul!(Z, X, Y)
-end
+# For now, BlockMatrix can store arrays of different types (for example, an element
+# of type view and one of type Matrix). Maybe for performance reasons it should only
+# store arrays of the same type?
+
+struct BlockMatrix
+    blocks::Tuple
+    size::Tuple{Int64,Int64}
+end
+
+"""
+Build a BlockMatrix containing the given arrays, from left to right.
+This function fails (for now) if the arrays do not all have the same "height"
+(i.e. their size[1] must match).
+"""
+function make_block_vector(arrays::AbstractArray...)
+    length(arrays) == 0 && error("Empty BlockMatrix is not currently implemented")
+    n_ref = size(arrays[1])[1]
+    m = 0
+    for array in arrays
+        n_i, m_i = size(array)
+        n_ref != n_i && error("The given arrays do not have matching 'height': cannot build a BlockMatrix out of them.")
+        m += m_i
+    end
+    BlockMatrix(arrays, (n_ref, m))
+end
+
+"""
+Given A and B as two BlockMatrices [A1, A2, A3] and [B1, B2, B3], form the matrix
+A'B (which is not a BlockMatrix). block_overlap also has fallback methods for plain arrays.
+block_overlap always computes an adjoint product, i.e. A' * B.
+"""
+@views function block_overlap(A::BlockMatrix, B::BlockMatrix)
+    rows = A.size[2]
+    cols = B.size[2]
+    ret = similar(A.blocks[1], rows, cols)
+
+    orow = 0  # row offset
+    for (iA, blA) in enumerate(A.blocks)
+        ocol = 0  # column offset
+        for (iB, blB) in enumerate(B.blocks)
+            ret[orow .+ (1:size(blA, 2)), ocol .+ (1:size(blB, 2))] = blA' * blB
+            ocol += size(blB, 2)
+        end
+        orow += size(blA, 2)
+    end
+    ret
+end
+
+block_overlap(blocksA::BlockMatrix, B) = block_overlap(blocksA, make_block_vector(B))
+block_overlap(A, B) = A' * B  # Default fallback method. Note the adjoint.
+
+"""
+Given A as a BlockMatrix [A1, A2, A3] and B a Matrix, compute the matrix-matrix product
+A * B, avoiding a concatenation of the blocks into a dense array.
+""" +@views function *(Ablock::BlockMatrix, B) + res = Ablock.blocks[1] * B[1:size(Ablock.blocks[1], 2), :] # First multiplication + offset = size(Ablock.blocks[1], 2) + for block in Ablock.blocks[2:end] + mul!(res, block, B[offset .+ (1:size(block, 2)), :], 1, 1) + offset += size(block, 2) + end + res +end + +function LinearAlgebra.mul!(res,A::BlockMatrix,B::AbstractArray,α,β) + # Has slightly better performances than a naive res = α*A*B - β*res + mul!(res, A*B, I, α, β) end # Perform a Rayleigh-Ritz for the N first eigenvectors. -@timing function rayleigh_ritz(X, AX, N) - F = eigen(Hermitian(array_mul(X', AX))) +@timing function rayleigh_ritz(X::BlockMatrix, AX::BlockMatrix, N) + F = eigen(Hermitian(block_overlap(X, AX))) # block_overlap(X,AX) is an AbstractArray, not a BlockMatrix F.vectors[:,1:N], F.values[1:N] end @@ -170,16 +236,16 @@ end niter = 1 ninners = zeros(Int,0) while true - BYX = BY'X - # XXX the one(T) instead of plain old 1 is because of https://github.com/JuliaArrays/BlockArrays.jl/issues/176 + BYX = block_overlap(BY,X) # = BY' X mul!(X, Y, BYX, -one(T), one(T)) # X -= Y*BY'X # If the orthogonalization has produced results below 2eps, we drop them # This is to be able to orthogonalize eg [1;0] against [e^iθ;0], # as can happen in extreme cases in the ortho!(cP, cX) dropped = drop!(X) if dropped != [] - @views mul!(X[:, dropped], Y, BY' * (X[:, dropped]), -one(T), one(T)) # X -= Y*BY'X + X[:, dropped] .-= Y * block_overlap(BY,X[:, dropped]) #X = X - Y'*BY*X end + if norm(BYX) < tol && niter > 1 push!(ninners, 0) break @@ -215,11 +281,10 @@ function final_retval(X, AX, resid_history, niter, n_matvec) residuals = AX .- X*Diagonal(λ) (λ=λ, X=X, residual_norms=[norm(residuals[:, i]) for i in 1:size(residuals, 2)], - residual_history=resid_history[:, 1:niter+1], - n_matvec=n_matvec) + residual_history=resid_history[:, 1:niter+1], n_matvec=n_matvec) + #λ doesn't have to be on the GPU, but X does (ψ should always be on GPU throughout the code). end - ### The algorithm is Xn+1 = rayleigh_ritz(hcat(Xn, A*Xn, Xn-Xn-1)) ### We follow the strategy of Hetmaniuk and Lehoucq, and maintain a B-orthonormal basis Y = (X,R,P) ### After each rayleigh_ritz step, the B-orthonormal X and P are deduced by an orthogonal rotation from Y @@ -230,12 +295,13 @@ end miniter=1, ortho_tol=2eps(real(eltype(X))), n_conv_check=nothing, display_progress=false) N, M = size(X) + # If N is too small, we will likely get in trouble error_message(verb) = "The eigenproblem is too small, and the iterative " * - "eigensolver $verb fail; increase the number of " * - "degrees of freedom, or use a dense eigensolver." - N > 3M || error(error_message("will")) - N >= 3M+5 || @warn error_message("might") + "eigensolver $verb fail; increase the number of " * + "degrees of freedom, or use a dense eigensolver." 
+ N > 3M || error(error_message("will")) + N >= 3M+5 || @warn error_message("might") n_conv_check === nothing && (n_conv_check = M) resid_history = zeros(real(eltype(X)), M, maxiter+1) @@ -271,6 +337,7 @@ end nlocked = 0 niter = 0 # the first iteration is fake λs = @views [(X[:,n]'*AX[:,n]) / (X[:,n]'BX[:,n]) for n=1:M] + λs = oftype(X[:,1], λs) #Offload to GPU if needed new_X = X new_AX = AX new_BX = BX @@ -286,13 +353,13 @@ end # Form Rayleigh-Ritz subspace if niter > 1 - Y = mortar((X, R, P)) - AY = mortar((AX, AR, AP)) - BY = mortar((BX, BR, BP)) # data shared with (X, R, P) in non-general case + Y = make_block_vector(X, R, P) + AY = make_block_vector(AX, AR, AP) + BY = make_block_vector(BX, BR, BP) # data shared with (X, R, P) in non-general case else - Y = mortar((X, R)) - AY = mortar((AX, AR)) - BY = mortar((BX, BR)) # data shared with (X, R) in non-general case + Y = make_block_vector(X, R) + AY = make_block_vector(AX, AR) + BY = make_block_vector(BX, BR) # data shared with (X, R) in non-general case end cX, λs = rayleigh_ritz(Y, AY, M-nlocked) @@ -300,9 +367,9 @@ end # wait on updating P because we have to know which vectors # to lock (and therefore the residuals) before computing P # only for the unlocked vectors. This results in better convergence. - new_X = array_mul(Y, cX) - new_AX = array_mul(AY, cX) # no accuracy loss, since cX orthogonal - new_BX = (B == I) ? new_X : array_mul(BY, cX) + new_X = Y * cX + new_AX = AY * cX # no accuracy loss, since cX orthogonal + new_BX = (B == I) ? new_X : BY * cX end ### Compute new residuals @@ -356,20 +423,23 @@ end # orthogonalization, see Hetmaniuk & Lehoucq, and Duersch et. al. # cP = copy(cX) # cP[Xn_indices,:] .= 0 - e = zeros(eltype(X), size(cX, 1), M - prev_nlocked) - for i in 1:length(Xn_indices) - e[Xn_indices[i], i] = 1 - end + + lenXn = length(Xn_indices) + e = zero(similar(X, size(cX, 1), M - prev_nlocked)) + lower_diag = one(similar(X, lenXn, lenXn)) + #e has zeros everywhere except on one of its lower diagonal + e[Xn_indices[1] : last(Xn_indices), 1 : lenXn] = lower_diag + cP = cX .- e cP = cP[:, Xn_indices] # orthogonalize against all Xn (including newly locked) ortho!(cP, cX, cX, tol=ortho_tol) # Get new P - new_P = array_mul( Y, cP) - new_AP = array_mul(AY, cP) + new_P = Y * cP + new_AP = AY * cP if B != I - new_BP = array_mul(BY, cP) + new_BP = BY * cP else new_BP = new_P end @@ -414,8 +484,8 @@ end # Orthogonalize R wrt all X, newly active P if niter > 0 - Z = mortar((full_X, P)) - BZ = mortar((full_BX, BP)) # data shared with (full_X, P) in non-general case + Z = make_block_vector(full_X, P) + BZ = make_block_vector(full_BX, BP) # data shared with (full_X, P) in non-general case else Z = full_X BZ = full_BX diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl index 4c26f39618..6e37d54ae4 100644 --- a/src/eigen/preconditioners.jl +++ b/src/eigen/preconditioners.jl @@ -27,7 +27,7 @@ PreconditionerNone(basis, kpt) = I mutable struct PreconditionerTPA{T <: Real} basis::PlaneWaveBasis kpt::Kpoint - kin::Vector{T} # kinetic energy of every G + kin::AbstractVector{T} # kinetic energy of every G mean_kin::Union{Nothing, Vector{T}} # mean kinetic energy of every band default_shift::T # if mean_kin is not set by `precondprep!`, this will be used for the shift end @@ -37,6 +37,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift= isempty(kinetic_term) && error("Preconditioner should be disabled when no Kinetic term is used.") scaling = only(kinetic_term).scaling_factor kin = 
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
index 4c26f39618..6e37d54ae4 100644
--- a/src/eigen/preconditioners.jl
+++ b/src/eigen/preconditioners.jl
@@ -27,7 +27,7 @@ PreconditionerNone(basis, kpt) = I
 mutable struct PreconditionerTPA{T <: Real}
     basis::PlaneWaveBasis
     kpt::Kpoint
-    kin::Vector{T}  # kinetic energy of every G
+    kin::AbstractVector{T}  # kinetic energy of every G
     mean_kin::Union{Nothing, Vector{T}}  # mean kinetic energy of every band
     default_shift::T  # if mean_kin is not set by `precondprep!`, this will be used for the shift
 end
@@ -37,6 +37,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=
     isempty(kinetic_term) && error("Preconditioner should be disabled when no Kinetic term is used.")
     scaling = only(kinetic_term).scaling_factor
     kin = Vector{T}([scaling * sum(abs2, q) for q in Gplusk_vectors_cart(basis, kpt)] ./ 2)
+    kin = convert(array_type(basis), kin)
     PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift)
 end
@@ -68,4 +69,5 @@ end
 function precondprep!(P::PreconditionerTPA, X)
     P.mean_kin = [real(dot(x, Diagonal(P.kin), x)) for x in eachcol(X)]
+    P.mean_kin = convert(array_type(P.basis), P.mean_kin)
 end
diff --git a/src/fft.jl b/src/fft.jl
index 8d2b73105d..09f1c16bd3 100644
--- a/src/fft.jl
+++ b/src/fft.jl
@@ -1,4 +1,7 @@
 import FFTW
+import CUDA
+import GPUArrays
+import AbstractFFTs
 
 #
 # Perform (i)FFTs.
@@ -253,15 +256,15 @@ _fftw_flags(::Type{Float64}) = FFTW.MEASURE
 Plan a FFT of type `T` and size `fft_size`, spending some time on finding an optimal algorithm.
 (Inplace, out-of-place) x (forward, backward) FFT plans are returned.
 """
-function build_fft_plans(T::Union{Type{Float32}, Type{Float64}}, fft_size)
-    tmp = Array{Complex{T}}(undef, fft_size...)
-    ipFFT = FFTW.plan_fft!(tmp, flags=_fftw_flags(T))
-    opFFT = FFTW.plan_fft(tmp, flags=_fftw_flags(T))
+# The FFTW flags were removed since CUDA's plan_fft does not take flags. If this becomes a performance issue, we should check the array type and call either FFTW.plan_fft(tmp, flags=...) or CUDA.plan_fft(tmp).
+function build_fft_plans(array_type::AbstractArray{T}, fft_size) where {T<:Union{Float32,Float64}}
+    tmp = similar(array_type, Complex{T}, fft_size...)
+    ipFFT = AbstractFFTs.plan_fft!(tmp)
+    opFFT = AbstractFFTs.plan_fft(tmp)
     # backward by inverting and stripping off normalizations
     ipFFT, opFFT, inv(ipFFT).p, inv(opFFT).p
 end
-
 # TODO Some grid sizes are broken in the generic FFT implementation
 # in FourierTransforms, for more details see workarounds/fft_generic.jl
 default_primes(::Type{Float32}) = (2, 3, 5)
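# Usage sketch (not part of the patch): the plan backend now follows the array type of the
# template argument rather than a scalar type, so the same call covers CPU and GPU
# (the second line assumes a functional CUDA setup).
plans_cpu = DFTK.build_fft_plans(Array{Float64}(undef, 0), (24, 24, 24))     # FFTW plans
plans_gpu = DFTK.build_fft_plans(CuArray{Float64}(undef, 0), (24, 24, 24))   # CUFFT plans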
diff --git a/src/guess_density.jl b/src/guess_density.jl
index a7458030be..796a7e82d4 100644
--- a/src/guess_density.jl
+++ b/src/guess_density.jl
@@ -66,7 +66,7 @@ function _guess_spin_density(basis::PlaneWaveBasis{T}, atoms, positions, magneti
         @warn("Returning zero spin density guess, because no initial magnetization has " *
               "been specified in any of the given elements / atoms. Your SCF will likely " *
               "not converge to a spin-broken solution.")
-        return zeros(T, basis.fft_size)
+        return convert(array_type(basis), zeros(T, basis.fft_size))
     end
     @assert length(magmoms) == length(atoms) == length(positions)
@@ -93,23 +93,27 @@ which follow the functional form
 and are placed at `position` (in fractional coordinates).
 """
 function gaussian_superposition(basis::PlaneWaveBasis{T}, gaussians) where {T}
-    ρ = zeros(complex(T), basis.fft_size)
-    isempty(gaussians) && return G_to_r(basis, ρ)
+    isempty(gaussians) && return G_to_r(basis, similar(G_vectors(basis), complex(T), basis.fft_size))
+
+    # These copies are required so that recip_lattice and gaussians are isbits (GPU compatibility)
+    recip_lattice = basis.model.recip_lattice
+    gaussians = SVector{size(gaussians)[1]}(gaussians)
 
     # Fill ρ with the (unnormalized) Fourier transform, i.e. ∫ e^{-iGx} f(x) dx,
     # where f(x) is a weighted gaussian
-    #
-    # is formed from a superposition of atomic densities, each scaled by a prefactor
-    for (iG, G) in enumerate(G_vectors(basis))
-        Gsq = sum(abs2, basis.model.recip_lattice * G)
+    function build_ρ(G)
+        Gsq = sum(abs2, recip_lattice * G)
+        res = zero(complex(T))
         for (coeff, decay_length, r) in gaussians
             form_factor::T = exp(-Gsq * T(decay_length)^2)
-            ρ[iG] += T(coeff) * form_factor * cis2pi(-dot(G, r))
+            res += T(coeff) * form_factor * cis2pi(-dot(G, r))
         end
+        res
     end
+    # Can't use map! here since we convert an array of Vec3 into an array of complex numbers
+    ρ = map(build_ρ, basis.G_vectors) / sqrt(basis.model.unit_cell_volume)
 
     # projection in the normalized plane wave basis
-    G_to_r(basis, ρ / sqrt(basis.model.unit_cell_volume))
+    G_to_r(basis, ρ)
 end
diff --git a/src/interpolation_transfer.jl b/src/interpolation_transfer.jl
index 91aea2b884..7643952d5d 100644
--- a/src/interpolation_transfer.jl
+++ b/src/interpolation_transfer.jl
@@ -81,6 +81,7 @@ function interpolate_kpoint(data_in::AbstractVecOrMat,
     n_bands = size(data_in, 2)
     n_Gk_out = length(G_vectors(basis_out, kpoint_out))
     data_out = similar(data_in, n_Gk_out, n_bands) .= 0
+    # TODO: use a map, otherwise this is not GPU compatible (scalar indexing)
     for iin in 1:size(data_in, 1)
         idx_fft = kpoint_in.mapping[iin]
         idx_fft in keys(kpoint_out.mapping_inv) || continue
diff --git a/src/orbitals.jl b/src/orbitals.jl
index 35104c3320..22ce9c2e51 100644
--- a/src/orbitals.jl
+++ b/src/orbitals.jl
@@ -51,6 +51,9 @@ function unsafe_unpack_ψ(x, sizes_ψ)
 end
 unpack_ψ(x, sizes_ψ) = deepcopy(unsafe_unpack_ψ(x, sizes_ψ))
 
+using Random
 function random_orbitals(basis::PlaneWaveBasis{T}, kpt::Kpoint, howmany) where {T}
-    ortho_qr(randn(Complex{T}, length(G_vectors(basis, kpt)), howmany))
-end
+    orbitals = similar(basis.G_vectors, Complex{T}, length(G_vectors(basis, kpt)), howmany)
+    randn!(TaskLocalRNG(), orbitals)  # Force the use of GPUArrays.jl's random function when on the GPU
+    ortho_qr(orbitals)
+end
\ No newline at end of file
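# Quick check sketch (not part of the patch): with a basis built with array_type=CuArray
# (as in examples/gpu.jl), the random orbitals are generated directly on the device and come
# back with orthonormal columns thanks to ortho_qr. Assumes a functional CUDA setup.
ψk = DFTK.random_orbitals(basis_gpu, basis_gpu.kpoints[1], 4)
@assert ψk isa CuArray              # storage follows basis.G_vectors
@assert Array(ψk' * ψk) ≈ I         # columns orthonormalized by ortho_qr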
diff --git a/src/terms/Hamiltonian.jl b/src/terms/Hamiltonian.jl
index 4784bf1f9e..9f73ddc98e 100644
--- a/src/terms/Hamiltonian.jl
+++ b/src/terms/Hamiltonian.jl
@@ -51,7 +51,7 @@ function HamiltonianBlock(basis, kpoint, operators, scratch=ham_allocate_scratch
     end
 end
 function ham_allocate_scratch_(basis::PlaneWaveBasis{T}) where {T}
-    (ψ_reals=[zeros(complex(T), basis.fft_size...) for _ = 1:Threads.nthreads()], )
+    (ψ_reals=[convert(array_type(basis), zeros(complex(T), basis.fft_size...)) for _ = 1:Threads.nthreads()], )
 end
 
 Base.:*(H::HamiltonianBlock, ψ) = mul!(similar(ψ), H, ψ)
@@ -89,9 +89,8 @@ Base.:*(H::Hamiltonian, ψ) = mul!(deepcopy(ψ), H, ψ)
     T = eltype(H.basis)
     n_bands = size(ψ, 2)
     Hψ_fourier = similar(Hψ[:, 1])
-    ψ_real = zeros(complex(T), H.basis.fft_size...)
-    Hψ_real = zeros(complex(T), H.basis.fft_size...)
-
+    ψ_real = similar(ψ, complex(T), H.basis.fft_size...)
+    Hψ_real = similar(Hψ, complex(T), H.basis.fft_size...)
     # take ψi, IFFT it to ψ_real, apply each term to Hψ_fourier and Hψ_real, and add it to Hψ
     for iband = 1:n_bands
         Hψ_real .= 0
diff --git a/src/terms/hartree.jl b/src/terms/hartree.jl
index 07dc61548a..9c72e2d58d 100644
--- a/src/terms/hartree.jl
+++ b/src/terms/hartree.jl
@@ -31,13 +31,17 @@ function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where T
     # Solving the Poisson equation ΔV = -4π ρ in Fourier space
     # is multiplying elementwise by 4π / |G|^2.
-    poisson_green_coeffs = 4T(π) ./ [sum(abs2, G) for G in G_vectors_cart(basis)]
+    poisson_green_coeffs = map(G_vectors_cart(basis)) do G
+        4T(π) / sum(abs2, G)
+    end
     if !isempty(model.atoms)
         # Assume positive charge from nuclei is exactly compensated by the electrons
         sum_charges = sum(charge_ionic, model.atoms)
         @assert sum_charges == model.n_electrons
     end
-    poisson_green_coeffs[1] = 0  # Compensating charge background => Zero DC
+    # Hackish GPU-compatible way to set poisson_green_coeffs[1] = 0
+    # (compensating charge background => zero DC component).
+    poisson_green_coeffs[1:1, 1:1, 1:1] .= zero(similar(G_vectors(basis), T, 1, 1, 1))
     TermHartree(T(scaling_factor), T(scaling_factor) .* poisson_green_coeffs)
 end
diff --git a/src/terms/kinetic.jl b/src/terms/kinetic.jl
index b5d757f24a..0d200f78a5 100644
--- a/src/terms/kinetic.jl
+++ b/src/terms/kinetic.jl
@@ -16,9 +16,10 @@ struct TermKinetic <: Term
     kinetic_energies::Vector{<:AbstractVector}  # kinetic energy 1/2|G+k|^2 for every kpoint
 end
 function TermKinetic(basis::PlaneWaveBasis{T}, scaling_factor) where {T}
-    kinetic_energies = [[T(scaling_factor) * sum(abs2, Gk) / 2
-                         for Gk in Gplusk_vectors_cart(basis, kpt)]
-                        for kpt in basis.kpoints]
+    kinetic_energies = [convert(array_type(basis),
+                                [T(scaling_factor) * sum(abs2, Gk) / 2
+                                 for Gk in Gplusk_vectors_cart(basis, kpt)])
+                        for kpt in basis.kpoints]
     TermKinetic(T(scaling_factor), kinetic_energies)
 end
diff --git a/src/terms/local.jl b/src/terms/local.jl
index 678adf2197..9867eeef1e 100644
--- a/src/terms/local.jl
+++ b/src/terms/local.jl
@@ -74,7 +74,8 @@ function (::AtomicLocal)(basis::PlaneWaveBasis{T}) where {T}
     # positions, this involves a form factor (`local_potential_fourier`)
     # and a structure factor e^{-i G·r}
-    pot_fourier = map(G_vectors(basis)) do G
+    # This operation needs to be done only once, so let's try to make it happen on the CPU (else we need to isbitsify the pseudopotentials)
+    pot_fourier = map(Array(G_vectors(basis))) do G
         pot = sum(model.atom_groups) do group
             element = model.atoms[first(group)]
             form_factor::T = local_potential_fourier(element, norm(model.recip_lattice * G))
         end
         pot / sqrt(model.unit_cell_volume)
     end
-
-    pot_real = G_to_r(basis, pot_fourier)
+    # If needed, send the atomic local potential to the GPU.
+    pot_real = G_to_r(basis, convert(array_type(basis), pot_fourier))
     TermAtomicLocal(pot_real)
 end
diff --git a/src/terms/nonlocal.jl b/src/terms/nonlocal.jl
index 8f6cbc96fa..47ab910b52 100644
--- a/src/terms/nonlocal.jl
+++ b/src/terms/nonlocal.jl
@@ -15,7 +15,7 @@ function (::AtomicNonlocal)(basis::PlaneWaveBasis{T}) where {T}
     isempty(psp_groups) && return TermNoop()
     ops = map(basis.kpoints) do kpt
         P = build_projection_vectors_(basis, kpt, psps, psp_positions)
-        D = build_projection_coefficients_(T, psps, psp_positions)
+        D = build_projection_coefficients_(T, psps, psp_positions, array_type = array_type(basis))
         NonlocalOperator(basis, kpt, P, D)
     end
     TermAtomicNonlocal(ops)
@@ -31,6 +31,8 @@ end
     isnothing(ψ) && return (E=T(Inf), ops=term.ops)
 
     E = zero(T)
+    occ = [convert(array_type(basis), oc) for oc in occ]
+
     for (ik, kpt) in enumerate(basis.kpoints)
         Pψ = term.ops[ik].P' * ψ[ik]  # nproj x nband
         band_enes = dropdims(sum(real.(conj.(Pψ) .* (term.ops[ik].D * Pψ)), dims=1), dims=1)
@@ -90,7 +92,7 @@ end
 # The ordering of the projector indices is (A,l,m,i), where A is running over all
 # atoms, l, m are AM quantum numbers and i is running over all projectors for a
 # given l. The matrix is block-diagonal with non-zeros only if A, l and m agree.
-function build_projection_coefficients_(T, psps, psp_positions)
+function build_projection_coefficients_(T, psps, psp_positions; array_type = Array)
     # TODO In the current version the proj_coeffs still has a lot of zeros.
     # One could improve this by storing the blocks as a list or in a
     # BlockDiagonal data structure
@@ -106,7 +108,7 @@ function build_projection_coefficients_(T, psps, psp_positions)
     end  # psp, r
     @assert count == n_proj
 
-    proj_coeffs
+    convert(array_type, proj_coeffs)
 end
 
 # Builds the projection coefficient matrix for a single atom
@@ -170,7 +172,7 @@ function build_projection_vectors_(basis::PlaneWaveBasis{T}, kpt::Kpoint,
         end
     end
     @assert offset == n_proj
-    proj_vectors
+    convert(array_type(basis), proj_vectors)
 end
 
 """
diff --git a/src/workarounds/gpu_arrays.jl b/src/workarounds/gpu_arrays.jl
new file mode 100644
index 0000000000..cb3e43fd09
--- /dev/null
+++ b/src/workarounds/gpu_arrays.jl
@@ -0,0 +1,16 @@
+# TODO: remove this when it is implemented in GPUArrays
+import LinearAlgebra.dot, LinearAlgebra.eigen, LinearAlgebra.RealHermSymComplexHerm
+using LinearAlgebra
+using GPUArrays
+using CUDA
+
+LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal, y::AbstractGPUArray) = x' * (D * y)
+
+function LinearAlgebra.eigen(A::RealHermSymComplexHerm{T,AT}) where {T, AT <: CuArray}
+    if eltype(A) <: Complex
+        vals, vects = CUDA.CUSOLVER.heevd!('V', 'U', A.data)
+    else
+        vals, vects = CUDA.CUSOLVER.syevd!('V', 'U', A.data)
+    end
+    (vectors = vects, values = vals)
+end
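# Minimal sketch of what the workaround above enables (not part of the patch; assumes a
# functional CUDA setup): eigen on a Hermitian CuArray now dispatches to CUSOLVER and returns
# a (vectors, values) NamedTuple rather than a LinearAlgebra.Eigen object.
using LinearAlgebra, CUDA
A = CuArray(randn(ComplexF64, 8, 8))
H = Hermitian(A + A')
res = eigen(H)            # calls CUDA.CUSOLVER.heevd! under the hood
res.values, res.vectors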