JuliaGPU · christiangnrd · Sep 16, 2024 · Sep 16, 2024 · Sep 17, 2024 · Sep 17, 2024
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -19,7 +19,12 @@ steps:
           queue: "juliaecosystem"
           os: "macos"
           arch: "aarch64"
-        if: build.message !~ /\[skip tests\]/
+        if: |
+          build.message =~ /\[only tests\]/ ||
+          build.message =~ /\[only julia\]/ ||
+          build.message !~ /\[only/ && !build.pull_request.draft &&
+            build.message !~ /\[skip tests\]/ &&
+            build.message !~ /\[skip julia\]/
         timeout_in_minutes: 60
         matrix:
           setup:
@@ -46,7 +51,12 @@ steps:
           queue: "juliaecosystem"
           os: "macos"
           arch: "aarch64"
-        if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
+        if: |
+          build.message =~ /\[only tests\]/ ||
+          build.message =~ /\[only special\]/ ||
+          build.message !~ /\[only/ && !build.pull_request.draft &&
+            build.message !~ /\[skip tests\]/ &&
+            build.message !~ /\[skip special\]/
         timeout_in_minutes: 60
         matrix:
           setup:
@@ -75,7 +85,12 @@ steps:
           queue: "juliaecosystem"
           os: "macos"
           arch: "aarch64"
-        if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
+        if: |
+          build.message =~ /\[only tests\]/ ||
+          build.message =~ /\[only special\]/ ||
+          build.message !~ /\[only/ && !build.pull_request.draft &&
+            build.message !~ /\[skip tests\]/ &&
+            build.message !~ /\[skip special\]/
         timeout_in_minutes: 60
       - label: "Opaque pointers"
         plugins:
@@ -95,5 +110,114 @@ steps:
           queue: "juliaecosystem"
           os: "macos"
           arch: "aarch64"
-        if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
+        if: |
+          build.message =~ /\[only tests\]/ ||
+          build.message =~ /\[only special\]/ ||
+          build.message !~ /\[only/ && !build.pull_request.draft &&
+            build.message !~ /\[skip tests\]/ &&
+            build.message !~ /\[skip special\]/
         timeout_in_minutes: 60
+
+  # we want to benchmark every commit on the main branch, even if it failed CI (needs continue_on_failure below, currently disabled)
+  - wait: ~
+    # continue_on_failure: true
+
+  - group: ":racehorse: Benchmarks" # benchmark steps, gated by the [only ...] / [skip benchmarks] commit-message tags
+    steps:
+      - label: "Metal: Run Benchmarks"
+        plugins:
+          - JuliaCI/julia#v1:
+              version: "1.10"
+        command: |
+            julia --project=perf -e '
+              using Pkg
+
+              println("--- :julia: Instantiating project")
+              Pkg.develop([PackageSpec(path=pwd())])
+              Pkg.instantiate()
+              push!(LOAD_PATH, @__DIR__)
+
+              println("+++ :julia: Benchmarking")
+              include("perf/runbenchmarks.jl")'
+        artifact_paths: # looked up by name in .github/workflows/Benchmark.yml
+          - "benchmarkresults.json"
+        agents:
+          queue: "juliaecosystem"
+          os: "macos"
+          arch: "aarch64"
+        if: |
+          build.message =~ /\[only benchmarks\]/ ||
+          build.message !~ /\[only/ && !build.pull_request.draft &&
+            build.message !~ /\[skip benchmarks\]/
+        timeout_in_minutes: 30
+      # - label: "Benchmarks (dry run)"
+      #   plugins:
+      #     - JuliaCI/julia#v1:
+      #         version: "1.10"
+      #   command: |
+      #     julia --project -e '
+      #       using Pkg
+
+      #       println("--- :julia: Instantiating project")
+      #       Pkg.resolve()
+      #       Pkg.instantiate()
+      #       Pkg.activate("perf")
+      #       Pkg.resolve()
+      #       Pkg.instantiate()
+      #       push!(LOAD_PATH, @__DIR__)
+
+      #       println("+++ :julia: Benchmarking")
+      #       include("perf/runbenchmarks.jl")'
+      #   artifact_paths:
+      #     - "results.json"
+      #   agents:
+      #     queue: "juliaecosystem"
+      #     os: "macos"
+      #     arch: "aarch64"
+      #   if: |
+      #     build.message =~ /\[only benchmarks\]/ ||
+      #     build.message !~ /\[only/ && !build.pull_request.draft &&
+      #       build.message !~ /\[skip benchmarks\]/
+      #   timeout_in_minutes: 30
+
+      # if we will submit results, use the benchmark queue so that we will
+      # be running on the same system each time
+      # - label: "Benchmarks on Julia {{matrix.julia}}"
+      #   plugins:
+      #     - JuliaCI/julia#v1:
+      #         version: "{{matrix.julia}}"
+      #   env:
+      #     BENCHMARKS: "true"
+      #     CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME"
+      #     CODESPEED_BRANCH: "$BUILDKITE_BRANCH"
+      #     CODESPEED_COMMIT: "$BUILDKITE_COMMIT"
+      #     CODESPEED_EXECUTABLE: "Julia {{matrix.julia}}"
+      #   command: |
+      #     julia --project -e '
+      #       using Pkg
+      #       ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"]
+
+      #       println("--- :julia: Instantiating project")
+      #       Pkg.resolve()
+      #       Pkg.instantiate()
+      #       Pkg.activate("perf")
+      #       Pkg.resolve()
+      #       Pkg.instantiate()
+      #       push!(LOAD_PATH, @__DIR__)
+
+      #       println("+++ :julia: Benchmarking")
+      #       include("perf/runbenchmarks.jl")'
+      #   agents:
+      #     queue: "benchmark"
+      #     gpu: "rtx2070"
+      #     cuda: "*"
+      #   if: |
+      #     build.branch =~ /^master$$/ && build.message =~ /\[only benchmarks\]/ ||
+      #     build.branch =~ /^master$$/ && build.message !~ /\[only/ &&
+      #       build.message !~ /\[skip benchmarks\]/
+      #   matrix:
+      #     setup:
+      #       julia:
+      #         - "1.10"
+      #         - "1.11"
+      #   timeout_in_minutes: 30
diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml
@@ -0,0 +1,63 @@
+name: Benchmarks
+permissions:
+  contents: write # contents permission to update benchmark contents in gh-pages branch
+  statuses: read # NOTE(review): presumably lets the artifact download step find the Buildkite build via commit statuses — confirm
+  deployments: write # deployments permission to deploy GitHub pages website
+  pull-requests: write # NOTE(review): presumably needed for the benchmark action's comments (comment-always) — confirm
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths: # only run when files that can affect benchmark results change
+      - "src/**/*"
+      - "ext/**/*"
+      - "perf/**/*"
+      - ".buildkite/**/*"
+      - "Project.toml"
+      - ".github/workflows/Benchmark.yml"
+  push:
+    branches:
+      - main
+    paths: # same filter as pull_request; the benchmarks live in perf/
+      - "src/**/*"
+      - "ext/**/*"
+      - "perf/**/*"
+      - ".buildkite/**/*"
+      - "Project.toml"
+      - ".github/workflows/Benchmark.yml"
+
+jobs:
+  benchmark: # downloads the Buildkite benchmark artifact and passes it to github-action-benchmark
+    if: ${{ !contains(github.event.head_commit.message, '[skip benchmarks]') }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Download Buildkite Artifacts
+        id: download
+        uses: EnricoMi/download-buildkite-artifact-action@v1
+        with:
+          buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
+          ignore_build_states: blocked,canceled,skipped,not_run,failed
+          ignore_job_states: timed_out,failed
+          output_path: artifacts # directory searched by the locate step
+
+      - name: Locate Benchmarks Artifact
+        id: locate # outputs.path = first benchmarkresults.json under artifacts/ (empty when none); one match keeps the output single-line
+        if: ${{ steps.download.outputs.download-state == 'success' }}
+        run: echo "path=$(find artifacts -type f -name benchmarkresults.json -print -quit 2>/dev/null)" >> "$GITHUB_OUTPUT"
+
+      - name: Upload Benchmark Results
+        if: ${{ steps.locate.outputs.path != '' }}
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: Metal Benchmarks
+          tool: "julia"
+          output-file-path: ${{ steps.locate.outputs.path }}
+          benchmark-data-dir-path: ""
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          comment-always: true
+          summary-always: true
+          alert-threshold: "150%"
+          fail-on-alert: false
+          auto-push: ${{ github.event_name != 'pull_request' }} # push results only for non-pull_request events
diff --git a/perf/.gitignore b/perf/.gitignore
@@ -0,0 +1,2 @@
+results.json
+reference.json
diff --git a/perf/Project.toml b/perf/Project.toml
@@ -0,0 +1,7 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
+JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
diff --git a/perf/array.jl b/perf/array.jl
@@ -0,0 +1,110 @@
+group = addgroup!(SUITE, "array") # "array" benchmark group; SUITE is defined outside this file
+
+const m = 512 # rows of the test matrices
+const n = 1000 # columns of the test matrices
+
+# generate some arrays: one CPU matrix plus GPU arrays in several shapes and element types (rng is defined outside this file)
+cpu_mat = rand(rng, Float32, m, n)
+gpu_mat = MtlArray{Float32}(undef, size(cpu_mat)) # allocated with undef, so contents are uninitialized
+gpu_vec = reshape(gpu_mat, length(gpu_mat)) # gpu_mat reshaped to 1-d
+gpu_arr_3d = reshape(gpu_mat, (m, 40, 25)) # 40 * 25 == n
+gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10)) # 10 * 10 * 10 == n
+gpu_mat_ints = MtlArray(rand(rng, Int, m, n))
+gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) # gpu_mat_ints reshaped to 1-d
+gpu_mat_bools = MtlArray(rand(rng, Bool, m, n))
+gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) # gpu_mat_bools reshaped to 1-d
+
+group["construct"] = @benchmarkable MtlArray{Int}(undef, 1) # allocation of a 1-element array
+
+group["copy"] = @async_benchmarkable copy($gpu_mat) # out-of-place copy of the matrix
+
+gpu_mat2 = copy(gpu_mat) # separate destination for the gpu_to_gpu case
+let copy_grp = addgroup!(group, "copyto!")
+    copy_grp["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
+    copy_grp["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
+    copy_grp["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
+end
+
+let iter_grp = addgroup!(group, "iteration") # indexing and search benchmarks
+    iter_grp["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]
+
+    iter_grp["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
+
+    let findall_grp = addgroup!(iter_grp, "findall")
+        findall_grp["bool"] = @benchmarkable findall($gpu_vec_bools)
+        findall_grp["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
+    end
+
+    let findfirst_grp = addgroup!(iter_grp, "findfirst")
+        findfirst_grp["bool"] = @benchmarkable findfirst($gpu_vec_bools)
+        findfirst_grp["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
+    end
+
+    let findmin_grp = addgroup!(iter_grp, "findmin") # NOTE(review): presumably stands in for findmax as well — confirm
+        findmin_grp["1d"] = @async_benchmarkable findmin($gpu_vec)
+        findmin_grp["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
+    end
+end
+
+# let group = addgroup!(group, "reverse")
+#     group["1d"] = @async_benchmarkable reverse($gpu_vec)
+#     group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
+#     group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
+#     group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
+# end
+
+group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 # in-place fill with a Float32 zero
+
+# only the out-of-place version is benchmarked; presumably it performs the same operation as accumulate! plus an output allocation
+let accum_grp = addgroup!(group, "accumulate")
+    accum_grp["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
+    accum_grp["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
+end
+
+let reductions_grp = addgroup!(group, "reductions") # full (1d) and dims=1 (2d) reductions
+    let reduce_grp = addgroup!(reductions_grp, "reduce")
+        reduce_grp["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
+        reduce_grp["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
+    end
+
+    let mapreduce_grp = addgroup!(reductions_grp, "mapreduce")
+        mapreduce_grp["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
+        mapreduce_grp["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
+    end
+
+    # these are used by sum, prod, minimum, maximum, all, any, count
+end
+
+let random_grp = addgroup!(group, "random") # m*n equals length(gpu_vec), so all variants produce the same element count
+    let rand_grp = addgroup!(random_grp, "rand")
+        rand_grp["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n)
+        rand_grp["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n)
+    end
+
+    let rand_inplace_grp = addgroup!(random_grp, "rand!")
+        rand_inplace_grp["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec)
+        rand_inplace_grp["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints)
+    end
+
+    let randn_grp = addgroup!(random_grp, "randn")
+        randn_grp["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n)
+        # randn_grp["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n)
+    end
+
+    let randn_inplace_grp = addgroup!(random_grp, "randn!")
+        randn_inplace_grp["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec)
+        # randn_inplace_grp["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints)
+    end
+end
+
+# let group = addgroup!(group, "sorting")
+#     group["1d"] = @async_benchmarkable sort($gpu_vec)
+#     group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
+#     group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
+# end
+
+let perm_grp = addgroup!(group, "permutedims") # one case per array rank (2-d, 3-d, 4-d)
+    perm_grp["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
+    perm_grp["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
+    perm_grp["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
+end