Skip to content

Commit 8652754

Browse files
Add Benchmarking CI (#420)
1 parent 5b983a5 commit 8652754

File tree

12 files changed

+796
-4
lines changed

12 files changed

+796
-4
lines changed

.buildkite/pipeline.yml

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,12 @@ steps:
1919
queue: "juliaecosystem"
2020
os: "macos"
2121
arch: "aarch64"
22-
if: build.message !~ /\[skip tests\]/
22+
if: |
23+
build.message =~ /\[only tests\]/ ||
24+
build.message =~ /\[only julia\]/ ||
25+
build.message !~ /\[only/ &&
26+
build.message !~ /\[skip tests\]/ &&
27+
build.message !~ /\[skip julia\]/
2328
timeout_in_minutes: 60
2429
matrix:
2530
setup:
@@ -46,7 +51,12 @@ steps:
4651
queue: "juliaecosystem"
4752
os: "macos"
4853
arch: "aarch64"
49-
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
54+
if: |
55+
build.message =~ /\[only tests\]/ ||
56+
build.message =~ /\[only special\]/ ||
57+
build.message !~ /\[only/ && !build.pull_request.draft &&
58+
build.message !~ /\[skip tests\]/ &&
59+
build.message !~ /\[skip special\]/
5060
timeout_in_minutes: 60
5161
matrix:
5262
setup:
@@ -75,7 +85,12 @@ steps:
7585
queue: "juliaecosystem"
7686
os: "macos"
7787
arch: "aarch64"
78-
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
88+
if: |
89+
build.message =~ /\[only tests\]/ ||
90+
build.message =~ /\[only special\]/ ||
91+
build.message !~ /\[only/ && !build.pull_request.draft &&
92+
build.message !~ /\[skip tests\]/ &&
93+
build.message !~ /\[skip special\]/
7994
timeout_in_minutes: 60
8095
- label: "Opaque pointers"
8196
plugins:
@@ -95,5 +110,44 @@ steps:
95110
queue: "juliaecosystem"
96111
os: "macos"
97112
arch: "aarch64"
98-
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
113+
if: |
114+
build.message =~ /\[only tests\]/ ||
115+
build.message =~ /\[only special\]/ ||
116+
build.message !~ /\[only/ && !build.pull_request.draft &&
117+
build.message !~ /\[skip tests\]/ &&
118+
build.message !~ /\[skip special\]/
99119
timeout_in_minutes: 60
120+
121+
# we want to benchmark every commit on the main branch, even if it failed CI
# (that requires `continue_on_failure: true` on the wait step below, which is currently disabled)
122+
- wait: ~
123+
# continue_on_failure: true
124+
125+
- group: ":racehorse: Benchmarks"
126+
steps:
127+
- label: "Benchmarks"
128+
plugins:
129+
- JuliaCI/julia#v1:
130+
version: "1.10"
131+
command: |
132+
julia --project=perf -e '
133+
using Pkg
134+
135+
println("--- :julia: Instantiating project")
136+
Pkg.develop([PackageSpec(path=pwd())])
137+
Pkg.instantiate()
138+
push!(LOAD_PATH, @__DIR__)
139+
140+
println("+++ :julia: Benchmarking")
141+
include("perf/runbenchmarks.jl")'
142+
artifact_paths:
143+
- "benchmarkresults.json"
144+
agents:
145+
queue: "juliaecosystem"
146+
os: "macos"
147+
arch: "aarch64"
148+
macos_version: "15.0"
149+
if: |
150+
build.message =~ /\[only benchmarks\]/ ||
151+
build.message !~ /\[only/ && !build.pull_request.draft &&
152+
build.message !~ /\[skip benchmarks\]/
153+
timeout_in_minutes: 30

.github/workflows/Benchmark.yml

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
name: Benchmarks
2+
permissions:
3+
contents: write # contents permission to update benchmark contents in gh-pages branch
4+
statuses: read
5+
deployments: write # deployments permission to deploy GitHub pages website
6+
pull-requests: write
7+
8+
on:
9+
pull_request:
10+
branches:
11+
- main
12+
paths:
13+
- "src/**/*"
14+
- "ext/**/*"
15+
- "perf/**/*"
16+
- ".buildkite/**/*"
17+
- "Project.toml"
18+
- ".github/workflows/Benchmark.yml"
19+
push:
20+
branches:
21+
- main
22+
paths:
23+
- "src/**/*"
24+
- "ext/**/*"
25+
- "perf/**/*" # benchmark sources live in perf/ (same path filter as the pull_request trigger)
26+
- ".buildkite/**/*"
27+
- "Project.toml"
28+
- ".github/workflows/Benchmark.yml"
29+
30+
jobs:
31+
benchmark:
32+
if: ${{ contains(github.event.head_commit.message, '[only benchmarks]') || !contains(github.event.head_commit.message, '[only') && !contains(github.event.head_commit.message, '[skip benchmarks]') && github.event.pull_request.draft == false }}
33+
runs-on: ubuntu-latest
34+
steps:
35+
- uses: actions/checkout@v4
36+
- name: Download Buildkite Artifacts
37+
id: download
38+
uses: EnricoMi/download-buildkite-artifact-action@v1
39+
with:
40+
buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
41+
ignore_build_states: blocked,canceled,skipped,not_run,failed
42+
ignore_job_states: timed_out,failed
43+
output_path: artifacts
44+
45+
- name: Locate Benchmarks Artifact
46+
id: locate
47+
if: ${{ steps.download.outputs.download-state == 'success' }}
48+
run: echo "path=$(find artifacts -type f -name benchmarkresults.json 2>/dev/null | head -n 1)" >> "$GITHUB_OUTPUT" # keep only the first match so the output stays a single line
49+
50+
- name: Upload Benchmark Results
51+
if: ${{ steps.locate.outputs.path != '' }}
52+
uses: benchmark-action/github-action-benchmark@v1
53+
with:
54+
name: Metal Benchmarks
55+
tool: "julia"
56+
output-file-path: ${{ steps.locate.outputs.path }}
57+
benchmark-data-dir-path: ""
58+
github-token: ${{ secrets.GITHUB_TOKEN }}
59+
comment-always: true
60+
summary-always: true
61+
alert-threshold: "150%"
62+
fail-on-alert: false
63+
auto-push: ${{ github.event_name != 'pull_request' }}

perf/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
results.json
2+
reference.json

perf/Project.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[deps]
2+
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
3+
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
4+
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
5+
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
6+
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
7+
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

perf/array.jl

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
group = addgroup!(SUITE, "array")
2+
3+
const m = 512
4+
const n = 1000
5+
6+
# generate some arrays
7+
cpu_mat = rand(rng, Float32, m, n)
8+
gpu_mat = MtlArray{Float32}(undef, size(cpu_mat))
9+
gpu_vec = reshape(gpu_mat, length(gpu_mat))
10+
gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
11+
gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
12+
gpu_mat_ints = MtlArray(rand(rng, Int, m, n))
13+
gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
14+
gpu_mat_bools = MtlArray(rand(rng, Bool, m, n))
15+
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
16+
17+
group["construct"] = @benchmarkable MtlArray{Int}(undef, 1)
18+
19+
group["copy"] = @async_benchmarkable copy($gpu_mat)
20+
21+
gpu_mat2 = copy(gpu_mat)
22+
let group = addgroup!(group, "copyto!")
23+
group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
24+
group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
25+
group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
26+
end
27+
28+
let group = addgroup!(group, "iteration")
29+
group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]
30+
31+
group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
32+
33+
let group = addgroup!(group, "findall")
34+
group["bool"] = @benchmarkable findall($gpu_vec_bools)
35+
group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
36+
end
37+
38+
let group = addgroup!(group, "findfirst")
39+
group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
40+
group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
41+
end
42+
43+
let group = addgroup!(group, "findmin") # findmax
44+
group["1d"] = @async_benchmarkable findmin($gpu_vec)
45+
group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
46+
end
47+
end
48+
49+
# let group = addgroup!(group, "reverse")
50+
# group["1d"] = @async_benchmarkable reverse($gpu_vec)
51+
# group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
52+
# group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
53+
# group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
54+
# end
55+
56+
group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0
57+
58+
# no need to test the in-place version separately: the allocating version benchmarked here performs the same operation (plus an allocation)
59+
let group = addgroup!(group, "accumulate")
60+
group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
61+
group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
62+
end
63+
64+
let group = addgroup!(group, "reductions")
65+
let group = addgroup!(group, "reduce")
66+
group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
67+
group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
68+
end
69+
70+
let group = addgroup!(group, "mapreduce")
71+
group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
72+
group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
73+
end
74+
75+
# used by sum, prod, minimum, maximum, all, any, count
76+
end
77+
78+
let group = addgroup!(group, "random")
79+
let group = addgroup!(group, "rand")
80+
group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n)
81+
group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n)
82+
end
83+
84+
let group = addgroup!(group, "rand!")
85+
group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec)
86+
group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints)
87+
end
88+
89+
let group = addgroup!(group, "randn")
90+
group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n)
91+
# group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n)
92+
end
93+
94+
let group = addgroup!(group, "randn!")
95+
group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec)
96+
# group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints)
97+
end
98+
end
99+
100+
# let group = addgroup!(group, "sorting")
101+
# group["1d"] = @async_benchmarkable sort($gpu_vec)
102+
# group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
103+
# group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
104+
# end
105+
106+
let group = addgroup!(group, "permutedims")
107+
group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
108+
group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
109+
group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
110+
end

perf/byval.jl

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
module ByVal
2+
3+
using Metal, BenchmarkTools, Random
4+
5+
const threads = 256
6+
7+
# simple kernel that adds two matrices element-wise
8+
function kernel_add_mat(n, x1, x2, y)
9+
i = thread_position_in_grid_1d()
10+
if i <= n
11+
@inbounds y[i] = x1[i] + x2[i]
12+
end
13+
return
14+
end
15+
16+
@inline get_inputs3(indx_y, a, b, c) = (a, b, c)
17+
@inline get_inputs3(indx_y, a1, a2, b1, b2, c1, c2) = indx_y == 1 ? (a1, b1, c1) : (a2, b2, c2)
18+
@inline get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3) = indx_y == 1 ? (a1, b1, c1) : indx_y == 2 ? (a2, b2, c2) : (a3, b3, c3)
19+
20+
# kernel that adds arrays of matrices; the slice is selected by the threadgroup's y position
21+
function kernel_add_mat_z_slices(n, vararg...)
22+
x1, x2, y = get_inputs3(threadgroup_position_in_grid_2d().y, vararg...)
23+
i = thread_position_in_grid_1d()
24+
if i <= n
25+
@inbounds y[i] = x1[i] + x2[i]
26+
end
27+
return
28+
end
29+
30+
function add_z_slices!(y, x1, x2)
31+
m1, n1 = size(x1[1]) #get size of first slice
32+
groups = (m1 * n1 + threads - 1) ÷ threads
33+
# launch length(x1) times as many groups as are needed to process one slice
34+
@metal groups = groups, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
35+
end
36+
37+
function add!(y, x1, x2)
38+
m1, n1 = size(x1)
39+
groups = (m1 * n1 + threads - 1) ÷ threads
40+
@metal groups = (groups, 1) threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
41+
end
42+
43+
function main()
44+
results = BenchmarkGroup()
45+
46+
num_z_slices = 3
47+
Random.seed!(1)
48+
49+
#m, n = 7, 5 # tiny to measure overhead
50+
#m, n = 521, 111
51+
#m, n = 1521, 1111
52+
#m, n = 3001, 1511 # prime numbers to test memory access correctness
53+
m, n = 3072, 1536 # 256 multiplier
54+
#m, n = 6007, 3001 # prime numbers to test memory access correctness
55+
56+
x1 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
57+
x2 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
58+
y1 = [similar(x1[1]) for i = 1:num_z_slices]
59+
60+
# bare-bones reference: a plain add on the GPU
61+
results["reference"] = @benchmark Metal.@sync add!($y1[1], $x1[1], $x2[1])
62+
63+
# adding arrays in an array
64+
for slices = 1:num_z_slices
65+
results["slices=$slices"] = @benchmark Metal.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
66+
end
67+
68+
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
69+
Metal.unsafe_free!.(x1)
70+
Metal.unsafe_free!.(x2)
71+
Metal.unsafe_free!.(y1)
72+
73+
return results
74+
end
75+
76+
end
77+
78+
ByVal.main()

perf/kernel.jl

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# using GPUArrays
2+
3+
group = addgroup!(SUITE, "kernel")
4+
5+
group["launch"] = @benchmarkable @metal identity(nothing)
6+
7+
# group["occupancy"] = @benchmarkable begin
8+
# kernel = @metal launch=false identity(nothing)
9+
# GPUArrays.launch_heuristic(Metal.mtlArrayBackend(), kernel.f; elements=1, elements_per_thread=1)
10+
# return
11+
# end
12+
13+
src = Metal.rand(Float32, 512, 1000)
14+
dest = similar(src)
15+
function indexing_kernel(dest, src)
16+
i = thread_position_in_grid_1d()
17+
@inbounds dest[i] = src[i]
18+
return
19+
end
20+
group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src)
21+
22+
function checked_indexing_kernel(dest, src)
23+
i = thread_position_in_grid_1d()
24+
dest[i] = src[i]
25+
return
26+
end
27+
group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src)
28+
29+
## TODO: delete the commented-out rand kernel benchmark below
30+
# function rand_kernel(dest::AbstractArray{T}) where {T}
31+
# i = thread_position_in_grid_1d()
32+
# dest[i] = Metal.rand(T)
33+
# return
34+
# end
35+
# group["rand"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $rand_kernel($dest)

0 commit comments

Comments
 (0)