Revert "Adapt to GPUArrays.jl transition to KernelAbstractions.jl. (JuliaGPU#461)"

christiangnrd · christiangnrd · commit 7c2949846612 · 2024-10-18T19:37:29.000-03:00
This reverts commit 711758d.
diff --git a/Project.toml b/Project.toml
@@ -40,7 +40,7 @@ BFloat16s = "0.5"
 CEnum = "0.4, 0.5"
 CodecBzip2 = "0.8"
 ExprTools = "0.1"
-GPUArrays = "11"
+GPUArrays = "10.1"
 GPUCompiler = "0.26, 0.27, 1"
 KernelAbstractions = "0.9.1"
 LLVM = "7.2, 8, 9"
diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
@@ -1,5 +1,59 @@
 ## GPUArrays interfaces
 
+## execution
+
+struct mtlArrayBackend <: AbstractGPUBackend end  # field-less tag the GPUArrays methods here dispatch on
+
+struct mtlKernelContext <: AbstractKernelContext end  # field-less tag; passed as the first kernel argument
+
+@inline function GPUArrays.launch_heuristic(::mtlArrayBackend, f::F, args::Vararg{Any,N};
+                                             elements::Int, elements_per_thread::Int) where {F,N}
+    kernel = @metal launch=false f(mtlKernelContext(), args...)  # launch=false: only the kernel object is needed here
+
+    # The pipeline state automatically computes occupancy stats; its limit caps the threadgroup size
+    threads = min(elements, kernel.pipeline.maxTotalThreadsPerThreadgroup)
+    blocks  = cld(elements, threads)  # NOTE(review): elements == 0 gives threads == 0 and a zero divisor -- confirm callers never pass 0
+
+    return (; threads=Int(threads), blocks=Int(blocks))  # `elements_per_thread` is accepted but not used
+end
+
+function GPUArrays.gpu_call(::mtlArrayBackend, f, args, threads::Int, groups::Int;
+                            name::Union{String,Nothing})
+    @metal threads groups name f(mtlKernelContext(), args...)  # bare threads/groups/name presumably act as same-named launch options -- confirm
+end
+
+
+## on-device
+
+# indexing: each GPUArrays query forwards to the matching `*_1d` function; `ctx` is used only for dispatch
+GPUArrays.blockidx(ctx::mtlKernelContext)     = threadgroup_position_in_grid_1d()
+GPUArrays.blockdim(ctx::mtlKernelContext)     = threads_per_threadgroup_1d()
+GPUArrays.threadidx(ctx::mtlKernelContext)    = thread_position_in_threadgroup_1d()
+GPUArrays.griddim(ctx::mtlKernelContext)      = threadgroups_per_grid_1d()
+GPUArrays.global_index(ctx::mtlKernelContext) = thread_position_in_grid_1d()
+GPUArrays.global_size(ctx::mtlKernelContext)  = threads_per_grid_1d()
+
+# memory
+
+@inline function GPUArrays.LocalMemory(::mtlKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}
+                                      ) where {T, dims, id}
+    ptr = emit_threadgroup_memory(T, Val(prod(dims)))  # request prod(dims) elements of T; `id` is not used
+    MtlDeviceArray(dims, ptr)  # return value: device array of shape `dims` built over `ptr`
+end
+
+# synchronization
+
+@inline GPUArrays.synchronize_threads(::mtlKernelContext) =
+    threadgroup_barrier(MemoryFlagThreadGroup)  # forwards to threadgroup_barrier with the MemoryFlagThreadGroup flag
+
+
+
+#
+# Host abstractions
+#
+
+GPUArrays.backend(::Type{<:MtlArray}) = mtlArrayBackend()  # every MtlArray type maps to the Metal backend tag
+
 const GLOBAL_RNGs = Dict{MTLDevice,GPUArrays.RNG}()
 function GPUArrays.default_rng(::Type{<:MtlArray})
     dev = device()
diff --git a/test/random.jl b/test/random.jl
@@ -246,7 +246,8 @@ const OOPLACE_TUPLES = [[(Metal.rand, rand, T) for T in RAND_TYPES];
             a = f(T, d)
             Metal.seed!(1)
             b = f(T, d)
-            @test Array(a) == Array(b)
+            # TODO: Remove broken parameter once https://github.com/JuliaGPU/GPUArrays.jl/issues/530 is fixed
+            @test Array(a) == Array(b) broken = (T == Float16 && d == (1000,1000))
         end
     end
 end # testset