
Commit 721ef62

Adapt for Metal
[only benchmarks]

1 parent b9ab40c commit 721ef62

File tree

8 files changed: +139 -194 lines changed

perf/array.jl

Lines changed: 24 additions & 24 deletions
@@ -5,16 +5,16 @@ const n = 1000
 
 # generate some arrays
 cpu_mat = rand(rng, Float32, m, n)
-gpu_mat = CuArray{Float32}(undef, size(cpu_mat))
+gpu_mat = MtlArray{Float32}(undef, size(cpu_mat))
 gpu_vec = reshape(gpu_mat, length(gpu_mat))
 gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
 gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
-gpu_mat_ints = CuArray(rand(rng, Int, m, n))
+gpu_mat_ints = MtlArray(rand(rng, Int, m, n))
 gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
-gpu_mat_bools = CuArray(rand(rng, Bool, m, n))
+gpu_mat_bools = MtlArray(rand(rng, Bool, m, n))
 gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
 
-group["construct"] = @benchmarkable CuArray{Int}(undef, 1)
+group["construct"] = @benchmarkable MtlArray{Int}(undef, 1)
 
 group["copy"] = @async_benchmarkable copy($gpu_mat)
 
@@ -26,7 +26,7 @@ let group = addgroup!(group, "copyto!")
 end
 
 let group = addgroup!(group, "iteration")
-    group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10]
+    group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]
 
     group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
 
@@ -46,12 +46,12 @@ let group = addgroup!(group, "iteration")
     end
 end
 
-let group = addgroup!(group, "reverse")
-    group["1d"] = @async_benchmarkable reverse($gpu_vec)
-    group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
-    group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
-    group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
-end
+# let group = addgroup!(group, "reverse")
+#     group["1d"] = @async_benchmarkable reverse($gpu_vec)
+#     group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
+#     group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
+#     group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
+# end
 
 group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0
 
@@ -77,31 +77,31 @@ end
 
 let group = addgroup!(group, "random")
     let group = addgroup!(group, "rand")
-        group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n)
-        group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n)
+        group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n)
+        group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n)
    end
 
    let group = addgroup!(group, "rand!")
-        group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec)
-        group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints)
+        group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec)
+        group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints)
    end
 
    let group = addgroup!(group, "randn")
-        group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n)
-        #group["Int64"] = @async_benchmarkable CUDA.randn(Int64, m*n)
+        group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n)
+        # group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n)
    end
 
    let group = addgroup!(group, "randn!")
-        group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec)
-        #group["Int64"] = @async_benchmarkable CUDA.randn!($gpu_vec_ints)
+        group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec)
+        # group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints)
    end
 end
 
-let group = addgroup!(group, "sorting")
-    group["1d"] = @async_benchmarkable sort($gpu_vec)
-    group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
-    group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
-end
+# let group = addgroup!(group, "sorting")
+#     group["1d"] = @async_benchmarkable sort($gpu_vec)
+#     group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
+#     group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
+# end
 
 let group = addgroup!(group, "permutedims")
     group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))

perf/byval.jl

Lines changed: 16 additions & 17 deletions
@@ -1,13 +1,12 @@
 module ByVal
 
-using CUDA, BenchmarkTools, Random
-using CUDA: i32
+using Metal, BenchmarkTools, Random
 
 const threads = 256
 
 # simple add matrixes kernel
 function kernel_add_mat(n, x1, x2, y)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     if i <= n
         @inbounds y[i] = x1[i] + x2[i]
     end
@@ -20,8 +19,8 @@ end
 
 # add arrays of matrixes kernel
 function kernel_add_mat_z_slices(n, vararg...)
-    x1, x2, y = get_inputs3(blockIdx().y, vararg...)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    x1, x2, y = get_inputs3(threadgroup_position_in_grid_2d().y, vararg...)
+    i = thread_position_in_grid_1d()
     if i <= n
         @inbounds y[i] = x1[i] + x2[i]
     end
@@ -30,15 +29,15 @@ end
 
 function add_z_slices!(y, x1, x2)
     m1, n1 = size(x1[1]) #get size of first slice
-    blocks = (m1 * n1 + threads - 1) ÷ threads
-    # get length(x1) more blocks than needed to process 1 slice
-    @cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
+    groups = (m1 * n1 + threads - 1) ÷ threads
+    # get length(x1) more groups than needed to process 1 slice
+    @metal groups = groups, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
 end
 
 function add!(y, x1, x2)
     m1, n1 = size(x1)
-    blocks = (m1 * n1 + threads - 1) ÷ threads
-    @cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
+    groups = (m1 * n1 + threads - 1) ÷ threads
+    @metal groups = (groups, 1) threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
 end
 
 function main()
@@ -54,22 +53,22 @@ function main()
     m, n = 3072, 1536 # 256 multiplier
     #m, n = 6007, 3001 # prime numbers to test memory access correctness
 
-    x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
-    x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x1 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
+    x2 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
     y1 = [similar(x1[1]) for i = 1:num_z_slices]
 
     # reference down to bones add on GPU
-    results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
+    results["reference"] = @benchmark Metal.@sync add!($y1[1], $x1[1], $x2[1])
 
     # adding arrays in an array
     for slices = 1:num_z_slices
-        results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
+        results["slices=$slices"] = @benchmark Metal.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
     end
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!.(x1)
-    CUDA.unsafe_free!.(x2)
-    CUDA.unsafe_free!.(y1)
+    Metal.unsafe_free!.(x1)
+    Metal.unsafe_free!.(x2)
+    Metal.unsafe_free!.(y1)
 
     return results
 end
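
Note: the kernel rewrites follow a fixed correspondence between CUDA.jl and Metal.jl indexing intrinsics; Metal.jl's intrinsics are 1-based, so CUDA's -1/+1 arithmetic folds away. A reference sketch compiled from the hunks above (the threadIdx/blockDim rows use the standard Metal.jl 1d names, shown for completeness):

    # CUDA.jl                                           Metal.jl
    # (blockIdx().x-1) * blockDim().x + threadIdx().x   thread_position_in_grid_1d()
    # blockIdx().y                                      threadgroup_position_in_grid_2d().y
    # threadIdx().x                                     thread_position_in_threadgroup_1d()
    # blockDim().x                                      threads_per_threadgroup_1d()
    # @cuda blocks=b threads=t f(args...)               @metal groups=b threads=t f(args...)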

perf/kernel.jl

Lines changed: 19 additions & 17 deletions
@@ -1,33 +1,35 @@
-using CUDA: i32
+# using GPUArrays
 
 group = addgroup!(SUITE, "kernel")
 
-group["launch"] = @benchmarkable @cuda identity(nothing)
+group["launch"] = @benchmarkable @metal identity(nothing)
 
-group["occupancy"] = @benchmarkable begin
-    kernel = @cuda launch=false identity(nothing)
-    launch_configuration(kernel.fun)
-end
+# group["occupancy"] = @benchmarkable begin
+#     kernel = @metal launch=false identity(nothing)
+#     GPUArrays.launch_heuristic(Metal.mtlArrayBackend(), kernel.f; elements=1, elements_per_thread=1)
+#     return
+# end
 
-src = CUDA.rand(Float32, 512, 1000)
+src = Metal.rand(Float32, 512, 1000)
 dest = similar(src)
 function indexing_kernel(dest, src)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     @inbounds dest[i] = src[i]
     return
 end
-group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src)
+group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src)
 
 function checked_indexing_kernel(dest, src)
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
+    i = thread_position_in_grid_1d()
     dest[i] = src[i]
     return
 end
-group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)
+group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src)
 
-function rand_kernel(dest::AbstractArray{T}) where {T}
-    i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
-    dest[i] = rand(T)
-    return
-end
-group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)
+## DELETE
+# function rand_kernel(dest::AbstractArray{T}) where {T}
+#     i = thread_position_in_grid_1d()
+#     dest[i] = Metal.rand(T)
+#     return
+# end
+# group["rand"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $rand_kernel($dest)

perf/latency.jl

Lines changed: 6 additions & 6 deletions
@@ -1,6 +1,6 @@
 module Latency
 
-using CUDA
+using Metal
 using BenchmarkTools
 
 function main()
@@ -11,24 +11,24 @@ function main()
         base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
     end
     # NOTE: we don't use Base.active_project() here because of how CI launches this script,
-    #       starting with --project in the main CUDA.jl project.
+    #       starting with --project in the main Metal.jl project.
 
     # time to precompile the package and its dependencies
     precompile_cmd =
-        `$base_cmd -e "pkg = Base.identify_package(\"CUDA\")
+        `$base_cmd -e "pkg = Base.identify_package(\"Metal\")
                        Base.compilecache(pkg)"`
     results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60
 
     # time to actually import the package
     import_cmd =
-        `$base_cmd -e "using CUDA"`
+        `$base_cmd -e "using Metal"`
     results["import"] = @benchmark run($import_cmd) evals=1 seconds=30
 
     # time to actually compile a kernel
     ttfp_cmd =
-        `$base_cmd -e "using CUDA
+        `$base_cmd -e "using Metal
                        kernel() = return
-                       CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
+                       Metal.code_agx(devnull, kernel, Tuple{}; kernel=true)"`
     results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60
 
     results
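
Note: the time-to-first-kernel probe swaps CUDA.code_ptx for Metal.code_agx, which forces compilation down to native AGX assembly. The same reflection call run interactively (a sketch; stdout instead of devnull so the output is visible):

    using Metal

    kernel() = return

    # compile the empty kernel and print its native code
    Metal.code_agx(stdout, kernel, Tuple{}; kernel=true)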

perf/metal.jl

Lines changed: 3 additions & 11 deletions
@@ -1,14 +1,6 @@
-group = addgroup!(SUITE, "cuda")
+group = addgroup!(SUITE, "metal")
 
 let group = addgroup!(group, "synchronization")
-    let group = addgroup!(group, "stream")
-        group["blocking"] = @benchmarkable synchronize(blocking=true)
-        group["auto"] = @benchmarkable synchronize()
-        group["nonblocking"] = @benchmarkable synchronize(spin=false)
-    end
-    let group = addgroup!(group, "context")
-        group["blocking"] = @benchmarkable device_synchronize(blocking=true)
-        group["auto"] = @benchmarkable device_synchronize()
-        group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
-    end
+    group["stream"] = @benchmarkable synchronize()
+    group["context"] = @benchmarkable device_synchronize()
 end
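
Note: the blocking/auto/nonblocking variants of the CUDA suite collapse to one benchmark each, since Metal.jl's synchronize() and device_synchronize() do not take the blocking/spin keywords. For orientation (a sketch; assumes the default task-local queue semantics):

    using Metal

    a = Metal.rand(Float32, 1024)
    b = a .+ 1f0          # kernels are enqueued asynchronously

    synchronize()         # wait for work on the current queue
    device_synchronize()  # wait for all outstanding work on the device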

perf/metaldevrt.jl

Lines changed: 13 additions & 13 deletions
@@ -1,42 +1,42 @@
-module cudadevrt
+module metaldevrt
 
-using CUDA, BenchmarkTools, Random
+using Metal, BenchmarkTools, Random
 
 const threads = 256
 #simple add matrix and vector kernel
 function kernel_add_mat_vec(m, x1, x2, y)
     # one block per column
-    offset = (blockIdx().x-1) * m
-    @inbounds xtmp = x2[blockIdx().x]
-    for i = threadIdx().x : blockDim().x : m
+    offset = (threadgroup_position_in_grid_2d().x-1) * m
+    @inbounds xtmp = x2[threadgroup_position_in_grid_2d().x]
+    for i = thread_position_in_threadgroup_2d().x : threadgroups_per_grid_2d().x : m
         @inbounds y[offset + i] = x1[offset + i] + xtmp
     end
     return
 end
 
 function add!(y, x1, x2)
     m, n = size(x1)
-    @cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
+    @metal groups = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
 end
 
 function main()
     Random.seed!(1)
     m, n = 3072, 1536 # 256 multiplier
-    x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5))
-    x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
+    x1 = mtl(randn(Float32, (m, n)) .+ Float32(0.5))
+    x2 = mtl(randn(Float32, (1, n)) .+ Float32(0.5))
     y1 = similar(x1)
 
-    results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)
+    results = @benchmark Metal.@sync add!($y1, $x1, $x2)
 
     # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
-    CUDA.unsafe_free!(x1)
-    CUDA.unsafe_free!(x2)
-    CUDA.unsafe_free!(y1)
+    Metal.unsafe_free!(x1)
+    Metal.unsafe_free!(x2)
+    Metal.unsafe_free!(y1)
 
     return results
 end
 
 end
 
-cudadevrt.main()
+metaldevrt.main()
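
Note: one mapping in this file is worth flagging. CUDA's loop stride blockDim().x (threads per block) corresponds to threads_per_threadgroup_2d().x in Metal.jl, while the ported loop strides by threadgroups_per_grid_2d().x, the number of groups. A transliteration that keeps the original stride would read (a sketch under that assumption, not the committed code):

    using Metal

    # one threadgroup per column; its threads stride down the rows
    function kernel_add_mat_vec(m, x1, x2, y)
        col = threadgroup_position_in_grid_1d()
        offset = (col - 1) * m
        @inbounds xtmp = x2[col]
        for i = thread_position_in_threadgroup_1d() : threads_per_threadgroup_1d() : m
            @inbounds y[offset + i] = x1[offset + i] + xtmp
        end
        return
    end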
