@@ -8,6 +8,7 @@ using UnsafeAtomics: UnsafeAtomics
using PointNeighbors.KernelAbstractions: KernelAbstractions, @kernel, @index
using PointNeighbors.Adapt: Adapt
using Base: @propagate_inbounds
+using PointNeighbors.BenchmarkTools

const UnifiedCuArray = CuArray{<:Any, <:Any, CUDA.UnifiedMemory}

    CUDA.device!(0)
end

-function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32)
-    CUDA.LLVM.Interop.@asmcall(
-        "atom.sys.global.add.u32 \$0, [\$1], \$2;",
-        "=r,l,r,~{memory}",
-        true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32},
-        ptr, val
-    )
-end
+############################################################################################
+# First approach: Use actual system atomics on unified memory
+############################################################################################
+# function atomic_system_add(ptr::CUDA.LLVMPtr{Int32, CUDA.AS.Global}, val::Int32)
+#     CUDA.LLVM.Interop.@asmcall(
+#         "atom.sys.global.add.u32 \$0, [\$1], \$2;",
+#         "=r,l,r,~{memory}",
+#         true, Int32, Tuple{CUDA.LLVMPtr{Int32, CUDA.AS.Global}, Int32},
+#         ptr, val
+#     )
+# end

-@inline function pushat_atomic_system!(vov, i, value)
-    (; backend, lengths) = vov
+# @inline function pushat_atomic_system!(vov, i, value)
+#     (; backend, lengths) = vov

-    @boundscheck checkbounds(vov, i)
+#     @boundscheck checkbounds(vov, i)

-    # Increment the column length with an atomic add to avoid race conditions.
-    # Store the new value since it might be changed immediately afterwards by another
-    # thread.
-    # new_length = Atomix.@atomic lengths[i] += 1
-    new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1
+#     # Increment the column length with an atomic add to avoid race conditions.
+#     # Store the new value since it might be changed immediately afterwards by another
+#     # thread.
+#     # new_length = Atomix.@atomic lengths[i] += 1
+#     new_length = atomic_system_add(pointer(lengths, i), Int32(1)) + 1

-    # We can write here without race conditions, since the atomic add guarantees
-    # that `new_length` is different for each thread.
-    backend[new_length, i] = value
+#     # We can write here without race conditions, since the atomic add guarantees
+#     # that `new_length` is different for each thread.
+#     backend[new_length, i] = value

-    return vov
-end
+#     return vov
+# end

-const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}}
+# const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:DynamicVectorOfVectors{<:Any, <:CuArray{<:Any, <:Any, CUDA.UnifiedMemory}}}}

-function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
-    (; cell_list) = neighborhood_search
+# function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
+#     (; cell_list) = neighborhood_search

-    cell_list.cells.lengths .= 0
+#     cell_list.cells.lengths .= 0

-    if neighborhood_search.search_radius < eps()
-        # Cannot initialize with zero search radius.
-        # This is used in TrixiParticles when a neighborhood search is not used.
-        return neighborhood_search
-    end
+#     if neighborhood_search.search_radius < eps()
+#         # Cannot initialize with zero search radius.
+#         # This is used in TrixiParticles when a neighborhood search is not used.
+#         return neighborhood_search
+#     end

-    # Faster on a single GPU
-    @threaded CUDABackend() for point in axes(y, 2)
-        # Get cell index of the point's cell
-        point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point)
-        cell = PointNeighbors.cell_coords(point_coords, neighborhood_search)
+#     # Faster on a single GPU
+#     # CUDA.NVTX.@range "threaded loop" begin
+#     @threaded y for point in axes(y, 2)
+#         # Get cell index of the point's cell
+#         point_coords = PointNeighbors.extract_svector(y, Val(ndims(neighborhood_search)), point)
+#         cell = PointNeighbors.cell_coords(point_coords, neighborhood_search)

-        # Add point to corresponding cell
-        pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point)
-    end
+#         # Add point to corresponding cell
+#         pushat_atomic_system!(cell_list.cells, PointNeighbors.cell_index(cell_list, cell), point)
+#     end
+#     # end

-    return neighborhood_search
-end
+#     return neighborhood_search
+# end

-# This might be slightly faster, but causes problems with synchronization in the interaction
-# kernel because we are carrying around device memory.
+# function PointNeighbors.update_grid!(neighborhood_search::MultiGPUNeighborhoodSearch,
+#                                      y::AbstractMatrix; parallelization_backend = y)
+#     PointNeighbors.initialize_grid!(neighborhood_search, y)
+# end
+
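A minimal sketch (illustration only, not part of this patch) of the lock-free push pattern
that `pushat_atomic_system!` relies on, written with plain CPU threads: an atomic counter
hands every thread a unique slot, so the subsequent write needs no further synchronization.
The helper name `demo_atomic_push!` is made up here; `Threads.atomic_add!` returns the old
value, hence the `+ 1`.

using Base.Threads

function demo_atomic_push!(slots, counter, value)
    new_length = Threads.atomic_add!(counter, 1) + 1  # reserve a unique slot
    slots[new_length] = value                         # race-free write into that slot
    return slots
end

slots = zeros(Int, 100)
counter = Threads.Atomic{Int}(0)
Threads.@threads for value in 1:100
    demo_atomic_push!(slots, counter, value)
end
@assert sort(slots) == 1:100
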
+############################################################################################
+# Second approach: Use a device array to update and copy to unified memory
+############################################################################################
# struct MultiGPUVectorOfVectors{T, VU, VD} <: AbstractVector{Array{T, 1}}
#     vov_unified :: VU
#     vov_device :: VD

# # Adapt.@adapt_structure(MultiGPUVectorOfVectors)

-# function Adapt.adapt_structure(to, vov::MultiGPUVectorOfVectors)
-#     @info "" to
-#     return MultiGPUVectorOfVectors(Adapt.adapt(to, vov.vov_unified), Adapt.adapt(to, vov.vov_device))
+# function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
+#     return Adapt.adapt(to, vov.vov_unified)
# end

# @propagate_inbounds function Base.getindex(vov::MultiGPUVectorOfVectors, i)

#         # The atomic version of `pushat!` uses atomics to avoid race conditions when
#         # `pushat!` is used in a parallel loop.
-#         @inbounds pushat_atomic!(vov_device, cell_index(cell_list, cell), point)
+#         @inbounds PointNeighbors.pushat_atomic!(vov_device, cell_index(cell_list, cell), point)
#     end

# # Copy vector of vectors to unified memory
@@ -203,4 +214,191 @@ end
#     PointNeighbors.initialize_grid!(neighborhood_search, y)
# end

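The gist of the second approach (illustrative sketch, not from this patch), assuming a
CUDA.jl version where the memory kind is chosen via the third type parameter, as in the
`UnifiedCuArray` alias above: build the data in ordinary device memory on one GPU, then
copy it into a unified-memory buffer that every GPU can read.

using CUDA

device_buffer = CuArray(Int32.(1:1000))    # filled on a single GPU
unified_buffer = CuArray{Int32, 1, CUDA.UnifiedMemory}(undef, length(device_buffer))
copyto!(unified_buffer, device_buffer)     # now readable from all GPUs
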
+############################################################################################
+# Third approach: Like the second approach, but with a device array for each GPU.
+#
+# This should have the advantage that each GPU updates the part of the grid that should
+# already be in its memory, and we can avoid moving everything to the first GPU.
+############################################################################################
+KernelAbstractions.@kernel function generic_kernel2(f, gpu)
+    i = KernelAbstractions.@index(Global)
+    @inline f(i, gpu)
+end
+
+@inline function parallel_foreach2(f::T, iterator, vov_per_gpu) where {T}
+    # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
+    # and index with `iterator[eachindex(iterator)[i]]`.
+    # Note that this only works with vector-like iterators that support arbitrary indexing.
+    indices = eachindex(iterator)
+
+    # Skip empty loops
+    length(indices) == 0 && return
+
+    # Partition `ndrange` among the GPUs
+    n_gpus = length(CUDA.devices())
+    indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
+    @assert length(indices_split) <= n_gpus
+
+    backend = CUDABackend()
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Launch a kernel on each device
+    for (i, indices_) in enumerate(indices_split)
+        # Select the correct device for this partition
+        CUDA.device!(i - 1)
+
+        # Call the generic kernel, which only calls a function with the global thread index
+        generic_kernel2(backend)(vov_per_gpu[i], ndrange = length(indices_)) do j, vov_per_gpu
+            @inbounds @inline f(iterator[indices_[j]], vov_per_gpu)
+        end
+    end
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Select the first device again
+    CUDA.device!(0)
+end
+
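To make the partitioning in `parallel_foreach2` concrete (illustrative sketch, not part of
this patch): with 10 points and 3 GPUs, `Iterators.partition` with a chunk size of
`ceil(Int, 10 / 3) = 4` yields at most `n_gpus` chunks, which is exactly what the `@assert`
above checks.

indices = 1:10
n_gpus = 3
indices_split = Iterators.partition(indices, ceil(Int, length(indices) / n_gpus))
@assert collect(indices_split) == [1:4, 5:8, 9:10]
@assert length(indices_split) <= n_gpus
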
+struct MultiGPUVectorOfVectors{T, VOV, V} <: AbstractVector{Array{T, 1}}
+    vov_unified :: VOV
+    vov_per_gpu :: V
+end
+
+function MultiGPUVectorOfVectors(vov_unified, vov_per_gpu)
+    MultiGPUVectorOfVectors{eltype(vov_unified.backend), typeof(vov_unified), typeof(vov_per_gpu)}(vov_unified, vov_per_gpu)
+end
+
+# Adapt.@adapt_structure(MultiGPUVectorOfVectors)
+
+function Adapt.adapt_structure(to::CUDA.KernelAdaptor, vov::MultiGPUVectorOfVectors)
+    return Adapt.adapt(to, vov.vov_unified)
+end
+
+function Adapt.adapt_structure(to::CUDAMultiGPUBackend, vov::DynamicVectorOfVectors{T}) where {T}
+    max_inner_length, max_outer_length = size(vov.backend)
+
+    n_gpus = length(CUDA.devices())
+    vov_per_gpu = [DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) for _ in 1:n_gpus]
+
+    vov_unified = DynamicVectorOfVectors(Adapt.adapt(to, vov.backend), Adapt.adapt(to, vov.length_), Adapt.adapt(to, vov.lengths))
+    vov_per_gpu_ = ntuple(i -> Adapt.adapt(CuArray, vov_per_gpu[i]), n_gpus)
+    for vov__ in vov_per_gpu_
+        vov__.length_[] = vov.length_[]
+    end
+
+    return MultiGPUVectorOfVectors{T, typeof(vov_unified), typeof(vov_per_gpu_)}(vov_unified, vov_per_gpu_)
+end
+
+const MultiGPUNeighborhoodSearch = GridNeighborhoodSearch{<:Any, <:Any, <:FullGridCellList{<:MultiGPUVectorOfVectors}}
+
+KernelAbstractions.@kernel function kernel3(vov, vov_unified, previous_lengths, ::Val{gpu}, ::Val{islast}) where {gpu, islast}
+    cell = KernelAbstractions.@index(Global)
+
+    length = @inbounds vov.lengths[cell]
+    if length > 0 || islast
+        offset = 0
+        for length_ in previous_lengths
+            offset += @inbounds length_[cell]
+        end
+
+        for i in 1:length
+            @inbounds vov_unified.backend[offset + i, cell] = vov.backend[i, cell]
+        end
+
+        if islast
+            @inbounds vov_unified.lengths[cell] = offset + length
+        end
+    end
+end
+
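The merge performed by `kernel3` is a per-cell concatenation (illustrative sketch, not part
of this patch): each GPU writes its points for a cell behind the points of all previous
GPUs, and the last GPU also stores the total length. For a single cell with per-GPU lengths
(2, 0, 3), the offsets and the total work out as follows.

lengths_per_gpu = (2, 0, 3)
offsets = cumsum((0, lengths_per_gpu[1:end - 1]...))   # (0, 2, 2)
total_length = offsets[end] + lengths_per_gpu[end]     # 2 + 3 = 5
@assert offsets == (0, 2, 2) && total_length == 5
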
+function PointNeighbors.initialize_grid!(neighborhood_search::MultiGPUNeighborhoodSearch, y::AbstractMatrix)
+    (; cell_list) = neighborhood_search
+    (; cells) = cell_list
+    (; vov_unified, vov_per_gpu) = cells
+
+    for vov_ in vov_per_gpu
+        vov_.lengths .= 0
+    end
+
+    if neighborhood_search.search_radius < eps()
+        # Cannot initialize with zero search radius.
+        # This is used in TrixiParticles when a neighborhood search is not used.
+        return neighborhood_search
+    end
+
+    # Fill cell lists per GPU
+    parallel_foreach2(axes(y, 2), vov_per_gpu) do point, vov
+        # Get cell index of the point's cell
+        point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+        cell = cell_coords(point_coords, neighborhood_search)
+
+        # Add point to corresponding cell
+        @boundscheck PointNeighbors.check_cell_bounds(cell_list, cell)
+
+        # The atomic version of `pushat!` uses atomics to avoid race conditions when
+        # `pushat!` is used in a parallel loop.
+        @inbounds PointNeighbors.pushat_atomic!(vov, cell_index(cell_list, cell), point)
+    end
+
+    n_gpus = length(CUDA.devices())
+    backend = CUDABackend()
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Launch the merge kernel on each device
+    for gpu in 1:n_gpus
+        # Select the correct device
+        CUDA.device!(gpu - 1)
+
+        previous_lengths = ntuple(i -> vov_per_gpu[i].lengths, gpu - 1)
+
+        # Copy this GPU's part of the cell list into the unified cell list
+        kernel3(backend)(vov_per_gpu[gpu], vov_unified, previous_lengths, Val(gpu), Val(gpu == n_gpus), ndrange = length(vov_per_gpu[gpu]))
+    end
+
+    # Synchronize each device
+    for i in 1:n_gpus
+        CUDA.device!(i - 1)
+        KernelAbstractions.synchronize(backend)
+    end
+
+    # Select the first device again
+    CUDA.device!(0)
+
+    # # Merge cell lists
+    # parallel_foreach2(axes(vov_unified, 2), y, partition = false) do cell, gpu
+    #     offset = offsets[gpu][cell]
+
+    #     points = vov_per_gpu[gpu][cell]
+    #     for i in eachindex(points)
+    #         vov_unified.backend[offset + i, cell] = points[i]
+    #     end
+    # end
+
+    return neighborhood_search
+end
+
+function PointNeighbors.update!(neighborhood_search::MultiGPUNeighborhoodSearch,
+                                x::AbstractMatrix, y::AbstractMatrix;
+                                points_moving = (true, true), parallelization_backend = x)
+    # The coordinates of the first set of points are irrelevant for this NHS.
+    # Only update when the second set is moving.
+    points_moving[2] || return neighborhood_search
+
+    PointNeighbors.initialize_grid!(neighborhood_search, y)
+end
+
end # module