Add and test shuffle and fill intrinsics

christiangnrd · christiangnrd · commit 6b2880b023e4 · 2025-03-03T14:49:45.000-04:00
diff --git a/src/device/intrinsics/simd.jl b/src/device/intrinsics/simd.jl
@@ -1,5 +1,5 @@
 export simdgroup_load, simdgroup_store, simdgroup_multiply, simdgroup_multiply_accumulate,
-        simd_shuffle_down, simd_shuffle_up
+        simd_shuffle_down, simd_shuffle_up, simd_shuffle_and_fill_down, simd_shuffle_and_fill_up
 
 using Core: LLVMPtr
 
@@ -104,6 +104,15 @@ for (jltype, suffix) in simd_shuffle_map
         @device_function simd_shuffle_up(data::$jltype, delta::Integer) =
             ccall($"extern air.simd_shuffle_up.$suffix",
                 llvmcall, $jltype, ($jltype, Int16), data, delta)
+
+        # Binds `air.simd_shuffle_and_fill_down.$suffix` for each `jltype`. TODO: Emulate or disallow on M1 (Apple7)
+        @device_function simd_shuffle_and_fill_down(data::$jltype, filling_data::$jltype, delta::Integer, modulo::Integer=threads_per_simdgroup()) =
+            ccall($"extern air.simd_shuffle_and_fill_down.$suffix",
+                llvmcall, $jltype, ($jltype, $jltype, Int16, Int16), data, filling_data, delta, modulo)
+        # Binds `air.simd_shuffle_and_fill_up.$suffix`; `modulo` defaults to `threads_per_simdgroup()`.
+        @device_function simd_shuffle_and_fill_up(data::$jltype, filling_data::$jltype, delta::Integer, modulo::Integer=threads_per_simdgroup()) =
+            ccall($"extern air.simd_shuffle_and_fill_up.$suffix",
+                llvmcall, $jltype, ($jltype, $jltype, Int16, Int16), data, filling_data, delta, modulo)
     end
 end
 
@@ -134,3 +143,39 @@ modify the lower `delta` lanes of `data` because it doesn't wrap values around t
 T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8
 """
 simd_shuffle_up
+
+@doc """
+    simd_shuffle_and_fill_down(data::T, filling_data::T, delta::Integer, [modulo::Integer])
+
+Returns `data` or `filling_data` for each vector from the thread whose SIMD lane ID is the
+sum of the caller's SIMD lane ID and `delta`.
+
+If the sum is greater than `modulo`, the function copies values from the lower `delta` lanes of
+`filling_data` into the upper `delta` lanes of `data`.
+
+The value of `delta` needs to be the same for all threads in a SIMD-group.
+
+The `modulo` parameter defines the vector width that splits the SIMD-group into separate vectors
+ and must be 2, 4, 8, 16, or 32.
+
+T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8
+"""
+simd_shuffle_and_fill_down
+
+@doc """
+    simd_shuffle_and_fill_up(data::T, filling_data::T, delta::Integer, [modulo::Integer])
+
+Returns `data` or `filling_data` for each vector from the thread whose SIMD lane ID is the
+difference from the caller's SIMD lane ID minus `delta`.
+
+If the difference is negative, the operation copies values from the upper `delta` lanes of
+`filling_data` to the lower `delta` lanes of `data`.
+
+The value of `delta` needs to be the same for all threads in a SIMD-group.
+
+The `modulo` parameter defines the vector width that splits the SIMD-group into separate vectors
+ and must be 2, 4, 8, 16, or 32.
+
+T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8
+"""
+simd_shuffle_and_fill_up
diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl
@@ -659,6 +659,41 @@ end
     @test sum(a) ≈ b[res_idx]
 end
 
+# `data` and `filling_data` both hold 1:16 and `modulo` is 16, so the result must equal a circular shift by `nshift`.
+@testset "$f($typ)" for typ in [Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8], (f,nshift) in [(simd_shuffle_and_fill_down, -4), (simd_shuffle_and_fill_up, 2)]
+    function kernel_mod(data::MtlDeviceVector{T}, filling_data::MtlDeviceVector{T}) where T
+        idx = thread_position_in_grid_1d()
+        idx_in_simd = thread_index_in_simdgroup() #simd_lane_id
+        simd_idx = simdgroup_index_in_threadgroup() #simd_group_id
+
+        temp_data = MtlThreadGroupArray(T, 16)
+        temp_data[idx] = data[idx]
+        temp_filling_data = MtlThreadGroupArray(T, 16)
+        temp_filling_data[idx] = filling_data[idx]
+        simdgroup_barrier(Metal.MemoryFlagThreadGroup)
+
+        if simd_idx == 1
+            dat_value = temp_data[idx_in_simd]
+            dat_fil_value = temp_filling_data[idx_in_simd]
+            value = f(dat_value, dat_fil_value, abs(nshift), length(data))  # delta = |nshift|, modulo = 16
+            data[idx] = value
+        end
+        return
+    end
+
+    dev_a = Metal.zeros(typ, 16; storage=Metal.SharedStorage)
+    dev_b = Metal.zeros(typ, 16; storage=Metal.SharedStorage)
+    # Host views of the device buffers. NOTE(review): presumably `unsafe_wrap` does not root `dev_a`/`dev_b`, hence `GC.@preserve` below - confirm.
+    a = unsafe_wrap(Array{typ}, dev_a, 16)
+    b = unsafe_wrap(Array{typ}, dev_b, 16)
+
+    a .= 1:16
+    b .= 1:16
+
+    Metal.@sync @metal threads=16 kernel_mod(dev_a, dev_b)
+    GC.@preserve dev_a dev_b @test a == circshift(b, nshift)
+end
+
 @testset "matrix functions" begin
     @testset "load_store($typ)" for typ in [Float16, Float32]
         function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T},