|
1 | 1 | export simdgroup_load, simdgroup_store, simdgroup_multiply, simdgroup_multiply_accumulate,
|
2 |
| - simd_shuffle_down, simd_shuffle_up |
| 2 | + simd_shuffle_down, simd_shuffle_up, simd_shuffle_and_fill_down, simd_shuffle_and_fill_up |
3 | 3 |
|
4 | 4 | using Core: LLVMPtr
|
5 | 5 |
|
@@ -104,6 +104,15 @@ for (jltype, suffix) in simd_shuffle_map
|
104 | 104 | @device_function simd_shuffle_up(data::$jltype, delta::Integer) =
|
105 | 105 | ccall($"extern air.simd_shuffle_up.$suffix",
|
106 | 106 | llvmcall, $jltype, ($jltype, Int16), data, delta)
|
| 107 | + |
| 108 | + # TODO: Emulate or disallow on M1 (Apple7): the shuffle-and-fill functions below are documented as Apple8+ only (M2 and newer) |
| 109 | + @device_function simd_shuffle_and_fill_down(data::$jltype, filling_data::$jltype, delta::Integer, modulo::Integer=threads_per_simdgroup()) = |
| 110 | + ccall($"extern air.simd_shuffle_and_fill_down.$suffix", |
| 111 | + llvmcall, $jltype, ($jltype, $jltype, Int16, Int16), data, filling_data, delta, modulo) |
| 112 | + |
| 113 | + @device_function simd_shuffle_and_fill_up(data::$jltype, filling_data::$jltype, delta::Integer, modulo::Integer=threads_per_simdgroup()) = |
| 114 | + ccall($"extern air.simd_shuffle_and_fill_up.$suffix", |
| 115 | + llvmcall, $jltype, ($jltype, $jltype, Int16, Int16), data, filling_data, delta, modulo) |
107 | 116 | end
|
108 | 117 | end
|
109 | 118 |
|
@@ -134,3 +143,45 @@ modify the lower `delta` lanes of `data` because it doesn't wrap values around t
|
134 | 143 | T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8
|
135 | 144 | """
|
136 | 145 | simd_shuffle_up
|
| 146 | + |
| 147 | +@doc """ |
| 148 | + simd_shuffle_and_fill_down(data::T, filling_data::T, delta::Integer, [modulo::Integer]) |
| 149 | +
|
| 150 | +Returns `data` or `filling_data` for each vector from the thread whose SIMD lane ID is the |
| 151 | +sum of the caller's SIMD lane ID and `delta`. |
| 152 | +
|
| 153 | +If the sum is greater than `modulo`, the function copies values from the lower `delta` lanes of |
| 154 | +`filling_data` into the upper `delta` lanes of `data`. |
| 155 | +
|
| 156 | +The value of `delta` needs to be the same for all threads in a SIMD-group. |
| 157 | +
|
| 158 | +The `modulo` parameter defines the vector width that splits the SIMD-group into separate vectors |
| 159 | + and must be 2, 4, 8, 16, or 32. |
| 160 | +
|
| 161 | +T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8 |
| 162 | +
|
| 163 | +!!! note |
| 164 | + `simd_shuffle_and_fill_down` is only available on Apple8+ GPUs (M2 and newer) |
| 165 | +""" |
| 166 | +simd_shuffle_and_fill_down |
| 167 | + |
| 168 | +@doc """ |
| 169 | + simd_shuffle_and_fill_up(data::T, filling_data::T, delta::Integer, [modulo::Integer]) |
| 170 | +
|
| 171 | +Returns `data` or `filling_data` for each vector from the thread whose SIMD lane ID is the |
| 172 | +caller's SIMD lane ID minus `delta`. |
| 173 | +
|
| 174 | +If the difference is negative, the operation copies values from the upper `delta` lanes of |
| 175 | +`filling_data` to the lower `delta` lanes of `data`. |
| 176 | +
|
| 177 | +The value of `delta` needs to be the same for all threads in a SIMD-group. |
| 178 | +
|
| 179 | +The `modulo` parameter defines the vector width that splits the SIMD-group into separate vectors |
| 180 | + and must be 2, 4, 8, 16, or 32. |
| 181 | +
|
| 182 | +T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8 |
| 183 | +
|
| 184 | +!!! note |
| 185 | + `simd_shuffle_and_fill_up` is only available on Apple8+ GPUs (M2 and newer) |
| 186 | +""" |
| 187 | +simd_shuffle_and_fill_up |
0 commit comments