Commit 5e679d8

Add semi-fast second and slow third approach
1 parent 234dbb5 commit 5e679d8

File tree: 1 file changed (+241, -46 lines)

ext/PointNeighborsCUDAExt.jl

Lines changed: 241 additions & 46 deletions
@@ -8,6 +8,7 @@ using UnsafeAtomics: UnsafeAtomics
 using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index
 using PointNeighbors.Adapt: Adapt
 using Base: @propagate_inbounds
+using PointNeighbors.BenchmarkTools

 const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory}

@@ -67,61 +68,72 @@ end
     CUDA.device!(0)
 end

-function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32)
-    CUDA.LLVM.Interop.@asmcall(
-        "atom.sys.global.add.u32 \$0, [\$1], \$2;",
-        "=r,l,r,~{memory}",
-        true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32},
-        ptr, val
-    )
-end
+############################################################################################
+# First approach: Use actual system atomics on unified memory
+############################################################################################
+# function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32)
+#     CUDA.LLVM.Interop.@asmcall(
+#         "atom.sys.global.add.u32 \$0, [\$1], \$2;",
+#         "=r,l,r,~{memory}",
+#         true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32},
+#         ptr, val
+#     )
+# end

-@inline function pushat_atomic_system!(vov, i, value)
-    (; backend, lengths) = vov
+# @inline function pushat_atomic_system!(vov, i, value)
+#     (; backend, lengths) = vov

-    @boundscheck checkbounds(vov, i)
+#     @boundscheck checkbounds(vov, i)

-    # Increment the column length with an atomic add to avoid race conditions.
-    # Store the new value since it might be changed immediately afterwards by another
-    # thread.
-    # new_length = Atomix.@atomic lengths[i] += 1
-    new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1
+#     # Increment the column length with an atomic add to avoid race conditions.
+#     # Store the new value since it might be changed immediately afterwards by another
+#     # thread.
+#     # new_length = Atomix.@atomic lengths[i] += 1
+#     new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1

-    # We can write here without race conditions, since the atomic add guarantees
-    # that `new_length` is different for each thread.
-    backend[new_length, i] = value
+#     # We can write here without race conditions, since the atomic add guarantees
+#     # that `new_length` is different for each thread.
+#     backend[new_length, i] = value

-    return vov
-end
+#     return vov
+# end

-const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}}
+# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}}

-function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
-    (; cell_list) = neighborhood_search
+# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
+#     (; cell_list) = neighborhood_search

-    cell_list.cells.lengths .= 0
+#     cell_list.cells.lengths .= 0

-    if neighborhood_search.search_radius < eps()
-        # Cannot initialize with zero search radius.
-        # This is used in TrixiParticles when a neighborhood search is not used.
-        return neighborhood_search
-    end
+#     if neighborhood_search.search_radius < eps()
+#         # Cannot initialize with zero search radius.
+#         # This is used in TrixiParticles when a neighborhood search is not used.
+#         return neighborhood_search
+#     end

-    # Faster on a single GPU
-    @threaded CUDABackend() for point in axes(y, 2)
-        # Get cell index of the point's cell
-        point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point)
-        cell = PointNeighbors.cell_coords(point_coords, neighborhood_search)
+#     # Faster on a single GPU
+#     # CUDA.NVTX.@range "threaded loop" begin
+#     @threaded y for point in axes(y, 2)
+#         # Get cell index of the point's cell
+#         point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point)
+#         cell = PointNeighbors.cell_coords(point_coords, neighborhood_search)

-        # Add point to corresponding cell
-        pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point)
-    end
+#         # Add point to corresponding cell
+#         pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point)
+#     end
+#     # end

-    return neighborhood_search
-end
+#     return neighborhood_search
+# end
+
+# function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch,
+#                                      y::AbstractMatrix; parallelization_backend = y)
+#     PointNeighbors.initialize_grid!(neighborhood_search, y)
+# end

-# This might be slightly faster, but causes problems with synchronization in the interaction
-# kernel because we are carrying around device memory.
+############################################################################################
+# Second approach: Use a device array to update and copy to unified memory
+############################################################################################
 # struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}}
 #     vov_unified :: VU
 #     vov_device :: VD
@@ -133,9 +145,8 @@ end

 # # Adapt.@adapt_structure(MultiGPUVectorOfVectors)

-# function Adapt.adapt_structure(to, vov::MultiGPUVectorOfVectors)
-#     @info "" to
-#     return MultiGPUVectorOfVectors(Adapt.adapt(to, vov.vov_unified), Adapt.adapt(to, vov.vov_device))
+# function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
+#     return Adapt.adapt(to, vov.vov_unified)
 # end

 # @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i)
@@ -182,7 +193,7 @@ end

 #         # The atomic version of `pushat!` uses atomics to avoid race conditions when
 #         # `pushat!` is used in a parallel loop.
-#         @inbounds pushat_atomic!(vov_device, cell_index(cell_list, cell), point)
+#         @inbounds PointNeighbors.pushat_atomic!(vov_device, cell_index(cell_list, cell), point)
 #     end

 #     # Copy vector of vectors to unified memory
@@ -203,4 +214,188 @@ end
 #     PointNeighbors.initialize_grid!(neighborhood_search, y)
 # end

+############################################################################################
+# Third approach: Like second approach, but a device array for each GPU.
+#
+# This should have the advantage that each GPU updates the part of the grid that should
+# already be in its memory and we can avoid moving everything to the first GPU.
+############################################################################################
+KernelAbstractions.@kernel function generic_kernel2(f, gpu)
+    i = KernelAbstractions.@index(Global)
+    @inline f(i, gpu)
+end
+
+@inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T}
+    # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
+    # and index with `iterator[eachindex(iterator)[i]]`.
+    # Note that this only works with vector-like iterators that support arbitrary indexing.
+    indices = eachindex(iterator)
+
+    # Skip empty loops
+    length(indices) == 0 && return
+
+    # Partition `ndrange` to the GPUs
+    n_gpus = length(CUDA.devices())
+    indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
+    @assert length(indices_split) <= n_gpus
+
+    backend = CUDABackend()
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Launch kernel on each device
+    for (i, indices_) in enumerate(indices_split)
+        # Select the correct device for this partition
+        CUDA.device!(i - 1)
+
+        # Call the generic kernel, which only calls a function with the global GPU index
+        generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu
+            @inbounds @inline f(iterator[indices_[j]], vov_per_gpu)
+        end
+    end
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Select first device again
+    CUDA.device!(0)
+end
+
+struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
+    vov_unified::VOV
+    vov_per_gpu::V
+end
+
+function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu)
+    MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu)
+end
+
+# Adapt.@adapt_structure(MultiGPUVectorOfVectors)
+
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
+    return Adapt.adapt(to, vov.vov_unified)
+end
+
+function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
+    max_inner_length, max_outer_length = size(vov.backend)
+
+    n_gpus = length(CUDA.devices())
+    vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus]
+
+    vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
+    vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus)
+    for vov__ in vov_per_gpu_
+        vov__.length_[] = vov.length_[]
+    end
+
+    return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_)
+end
+
+const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
+
+function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
+    (; cell_list) = neighborhood_search
+    (; cells) = cell_list
+    (; vov_unified, vov_per_gpu) = cells
+
+    for vov_ in vov_per_gpu
+        vov_.lengths .= 0
+    end
+
+    if neighborhood_search.search_radius < eps()
+        # Cannot initialize with zero search radius.
+        # This is used in TrixiParticles when a neighborhood search is not used.
+        return neighborhood_search
+    end
+
+    # Fill cell lists per GPU
+    parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov
+        # Get cell index of the point's cell
+        point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+        cell = cell_coords(point_coords, neighborhood_search)
+
+        # Add point to corresponding cell
+        @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)
+
+        # The atomic version of `pushat!` uses atomics to avoid race conditions when
+        # `pushat!` is used in a parallel loop.
+        @inbounds PointNeighbors.pushat_atomic!(vov, cell_index(cell_list, cell), point)
+    end
+
+    lengths = ntuple(gpu -> vov_per_gpu[gpu].lengths, length(vov_per_gpu))
+    CUDA.synchronize()
+    offsets_ = cu(cumsum(lengths), unified = true)
+    CUDA.synchronize()
+    vov_unified.lengths .= offsets_[end]
+    CUDA.synchronize()
+    offsets = offsets_ .- lengths
+    nonempty = ntuple(gpu -> findall(x -> x > 0, lengths[gpu]), length(vov_per_gpu))
+
+    n_gpus = length(CUDA.devices())
+    backend = CUDABackend()
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Launch kernel on each device
+    for (gpu, indices_) in enumerate(nonempty)
+        # Select the correct device for this partition
+        CUDA.device!(gpu - 1)
+
+        # Call the generic kernel, which only calls a function with the global GPU index
+        generic_kernel2(backend)((vov_per_gpu[gpu], offsets[gpu]), ndrange = length(indices_)) do j, (vov, offset_gpu)
+            cell = @inbounds indices_[j]
+            offset = offset_gpu[cell]
+            # offset = offsets[gpu][cell]
+
+            points = vov[cell]
+            # points = vov_per_gpu[gpu][cell]
+            for i in eachindex(points)
+                vov_unified.backend[offset + i, cell] = points[i]
+            end
+        end
+    end
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Select first device again
+    CUDA.device!(0)
+
+    # # Merge cell lists
+    # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu
+    #     offset = offsets[gpu][cell]
+
+    #     points = vov_per_gpu[gpu][cell]
+    #     for i in eachindex(points)
+    #         vov_unified.backend[offset + i, cell] = points[i]
+    #     end
+    # end
+
+    return neighborhood_search
+end
+
+function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
+                                x::AbstractMatrix, y::AbstractMatrix;
+                                points_moving = (true, true), parallelization_backend = x)
+    # The coordinates of the first set of points are irrelevant for this NHS.
+    # Only update when the second set is moving.
+    points_moving[2] || return neighborhood_search
+
+    PointNeighbors.initialize_grid!(neighborhood_search, y)
+end
+
 end # module
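
A minimal standalone sketch (not part of this commit) of the partition-and-launch pattern that `parallel_foreach2` implements above: split an index range into one contiguous chunk per visible GPU, launch a KernelAbstractions kernel on each device, and synchronize all devices before the results are used. The kernel `fill_kernel` and the helper `fill_per_gpu` are hypothetical names introduced only for this illustration.

using CUDA
using KernelAbstractions

# Hypothetical kernel: each work-item records the global index it handles
@kernel function fill_kernel(out, first_index)
    i = @index(Global)
    @inbounds out[i] = first_index + i - 1
end

# Hypothetical helper that mirrors the chunking scheme of `parallel_foreach2`
function fill_per_gpu(n)
    n_gpus = length(CUDA.devices())

    # One contiguous chunk of 1:n per GPU
    chunks = collect(Iterators.partition(1:n, ceil(Int, n / n_gpus)))
    backend = CUDABackend()

    # Launch one kernel per device, each writing into its own device array
    outputs = map(enumerate(chunks)) do (i, chunk)
        CUDA.device!(i - 1)
        out = CUDA.zeros(Int, length(chunk))
        fill_kernel(backend)(out, first(chunk), ndrange = length(chunk))
        out
    end

    # Wait for all devices before returning the per-GPU results
    for i in eachindex(chunks)
        CUDA.device!(i - 1)
        KernelAbstractions.synchronize(backend)
    end
    CUDA.device!(0)

    return outputs
end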

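The merge step of the third approach relies on the running sum in `offsets_ = cu(cumsum(lengths), unified = true)` and `offsets = offsets_ .- lengths`: for every cell, each GPU starts writing its points into the unified cell list right after the points contributed by the GPUs before it. A CPU-only toy example with made-up per-GPU cell counts (plain `Vector`s in place of `CuArray`s) to illustrate the arithmetic:

# Points that each GPU collected per cell, for three cells (made-up numbers)
lengths = ([2, 0, 1],   # GPU 1
           [1, 3, 0],   # GPU 2
           [0, 1, 2])   # GPU 3

# Running sum over the GPUs, cell by cell
cumulative = cumsum(lengths)
# cumulative == ([2, 0, 1], [3, 3, 1], [3, 4, 3])

# Each GPU writes behind the points contributed by the previous GPUs
offsets = cumulative .- lengths
# offsets == ([0, 0, 0], [2, 0, 1], [3, 3, 1])

# The per-cell lengths of the merged, unified cell list are the last running sum
total_lengths = cumulative[end]
# total_lengths == [3, 4, 3]

# Example: GPU 2 writes its 3 points for cell 2 at rows offsets[2][2] + 1 to offsets[2][2] + 3,
# i.e. rows 1:3 of that cell's column in the unified backend, since GPU 1 has no points there.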