Skip to content

[DO NOT MERGE] Test accumulate acceleratedkernel #590

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from

Conversation

christiangnrd
Copy link
Member

@christiangnrd christiangnrd commented May 19, 2025

I'm getting errors locally

Copy link
Contributor

github-actions bot commented May 19, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic main) to apply these changes.

Click here to view the suggested changes.
diff --git a/perf/array.jl b/perf/array.jl
index f3ef0adf..e2d2b823 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -10,7 +10,7 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
     gpu_vec = reshape(gpu_mat, length(gpu_mat))
     gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
     gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
-    gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, -10:10, m, n))
+    gpu_mat_ints = MtlMatrix{Int, S}(rand(rng, -10:10, m, n))
     gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
     gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n))
     gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
@@ -60,13 +60,13 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
     let group = addgroup!(group, "accumulate")
         let group = addgroup!(group, "Float32")
             group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
-            group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
-            group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=2)
+            group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims = 1)
+            group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims = 2)
         end
         let group = addgroup!(group, "Int64")
             group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec_ints)
-            group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=1)
-            group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=2)
+            group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims = 1)
+            group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims = 2)
         end
     end
 
@@ -74,26 +74,26 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
         let group = addgroup!(group, "reduce")
             let group = addgroup!(group, "Float32")
                 group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
-                group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
-                group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=2)
+                group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims = 1)
+                group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims = 2)
             end
             let group = addgroup!(group, "Int64")
                 group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec_ints)
-                group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=1)
-                group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=2)
+                group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims = 1)
+                group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims = 2)
             end
         end
 
         let group = addgroup!(group, "mapreduce")
             let group = addgroup!(group, "Float32")
-                group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
-                group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
-                group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=2)
+                group["1d"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_vec)
+                group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat; dims = 1)
+                group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat; dims = 2)
             end
             let group = addgroup!(group, "Int64")
-                group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec_ints)
-                group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
-                group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
+                group["1d"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_vec_ints)
+                group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat_ints; dims = 1)
+                group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat_ints; dims = 2)
             end
         end
 
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 7af43a09..db6fa99a 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -170,17 +170,17 @@ end
 ## Base interface
 
 Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlVector, dims::Nothing, init::Nothing) =
-    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = AK.neutral_element(op, eltype(output)))
 
 Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Nothing) =
-    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = AK.neutral_element(op, eltype(output)))
 Base._accumulate!(op, output::WrappedMtlArray, input::MtlVector, dims::Nothing, init::Some) =
-    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = something(init))
 
 Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Some) =
-    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = something(init))
 
-Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init=AK.neutral_element(op, eltype(result)))
+Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init = AK.neutral_element(op, eltype(result)))
 
 # default behavior unless dims are specified by the user
 function Base.accumulate(op, A::WrappedMtlArray;
@@ -199,5 +199,5 @@ function Base.accumulate(op, A::WrappedMtlArray;
     else
         throw(ArgumentError("accumulate does not support the keyword arguments $(setdiff(keys(nt), (:init,)))"))
     end
-    AK.accumulate!(op, out, A, MetalBackend(); dims, init)
+    return AK.accumulate!(op, out, A, MetalBackend(); dims, init)
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 59e5f8db..a6eb3ab3 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -142,12 +142,16 @@ end
 
 ## COV_EXCL_STOP
 
-Base.mapreduce(f, op, A::WrappedMtlArray;
-            dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
-            # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
-Base.mapreduce(f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle};
-            dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
-            # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+Base.mapreduce(
+    f, op, A::WrappedMtlArray;
+    dims = :, init = nothing
+) = _mapreduce(f, op, A, init, dims)
+# dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+Base.mapreduce(
+    f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle};
+    dims = :, init = nothing
+) = _mapreduce(f, op, A, init, dims)
+# dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
 
 # "Borrowed" from GPUArrays
 @inline function _init_value(f, op, init, As...)
@@ -164,7 +168,7 @@ end
 
 function _mapreduce(f, op, A, init, dims::Union{Nothing, Integer})
     init_val = _init_value(f, op, init, A)
-    AK.mapreduce(f, op, A; init=init_val, neutral=init_val, dims)
+    return AK.mapreduce(f, op, A; init = init_val, neutral = init_val, dims)
 end
 _mapreduce(f, op, A, init, ::Colon) = _mapreduce(f, op, A, init, nothing)
 _mapreduce(f, op, A, init, dims) = GPUArrays._mapreduce(f, op, A; dims, init)

@christiangnrd christiangnrd marked this pull request as draft May 20, 2025 02:24
@maleadt
Copy link
Member

maleadt commented May 20, 2025

Why not implement this at the GPUArrays.jl level? Or is this purely for debugging?

@christiangnrd
Copy link
Member Author

@maleadt purely for debugging. I’ll add “do not merge” to the title

@christiangnrd christiangnrd changed the title Test accumulate acceleratedkernel [DO NOT MERGE] Test accumulate acceleratedkernel May 20, 2025
@christiangnrd
Copy link
Member Author

The goal behind this was to get an idea of what the benchmark results look like, but there seem to be strange failures that I haven't yet determined if they're a Metal issue or an AcceleratedKernels issue.

@christiangnrd christiangnrd force-pushed the testak branch 2 times, most recently from a3a50f3 to 98dddc9 Compare May 21, 2025 01:27
@christiangnrd christiangnrd changed the base branch from main to tests May 21, 2025 01:30
@christiangnrd christiangnrd changed the base branch from tests to main May 21, 2025 01:32
@christiangnrd christiangnrd marked this pull request as ready for review May 21, 2025 01:33
@christiangnrd
Copy link
Member Author

Just for benchmarks, this still shouldn't be merged

@christiangnrd christiangnrd force-pushed the testak branch 2 times, most recently from 69e30e0 to e38640b Compare May 21, 2025 13:14
Copy link
Contributor

@github-actions github-actions bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Metal Benchmarks

Benchmark suite Current: e1935ed Previous: 4abe75f Ratio
private array/construct 23500 ns 26364.666666666668 ns 0.89
private array/broadcast 460562.5 ns 466750 ns 0.99
private array/random/randn/Float32 935166 ns 843083 ns 1.11
private array/random/randn!/Float32 598083 ns 633208 ns 0.94
private array/random/rand!/Int64 556895.5 ns 560750 ns 0.99
private array/random/rand!/Float32 565833 ns 594542 ns 0.95
private array/random/rand/Int64 865959 ns 767854 ns 1.13
private array/random/rand/Float32 798625 ns 621500 ns 1.28
private array/copyto!/gpu_to_gpu 578250 ns 629625 ns 0.92
private array/copyto!/cpu_to_gpu 633000 ns 655708.5 ns 0.97
private array/copyto!/gpu_to_cpu 666645.5 ns 821708 ns 0.81
private array/accumulate/Int64/1d 2000520.5 ns
private array/accumulate/Int64/dims=1 2176625 ns
private array/accumulate/Int64/dims=2 3040000 ns
private array/accumulate/Float32/1d 1774916 ns
private array/accumulate/Float32/dims=1 1962042 ns
private array/accumulate/Float32/dims=2 2351812.5 ns
private array/iteration/findall/int 1912167 ns 1848833 ns 1.03
private array/iteration/findall/bool 1670270.5 ns 1610708.5 ns 1.04
private array/iteration/findfirst/int 1830083 ns 1716416 ns 1.07
private array/iteration/findfirst/bool 1740187 ns 1681917 ns 1.03
private array/iteration/scalar 2474083 ns 3851541.5 ns 0.64
private array/iteration/logical 3091209 ns 3013084 ns 1.03
private array/iteration/findmin/1d 1897792 ns 1793417 ns 1.06
private array/iteration/findmin/2d 1434417 ns 1369667 ns 1.05
private array/reductions/reduce/Int64/1d 995500 ns
private array/reductions/reduce/Int64/dims=1 836416.5 ns
private array/reductions/reduce/Int64/dims=2 712729.5 ns
private array/reductions/reduce/Float32/1d 1133834 ns
private array/reductions/reduce/Float32/dims=1 825375.5 ns
private array/reductions/reduce/Float32/dims=2 743125 ns
private array/reductions/mapreduce/Int64/1d 1002250 ns
private array/reductions/mapreduce/Int64/dims=1 833417 ns
private array/reductions/mapreduce/Int64/dims=2 700062.5 ns
private array/reductions/mapreduce/Float32/1d 1124167 ns
private array/reductions/mapreduce/Float32/dims=1 813437.5 ns
private array/reductions/mapreduce/Float32/dims=2 745354.5 ns
private array/permutedims/4d 2673084 ns 2524541 ns 1.06
private array/permutedims/2d 1090146 ns 1040583 ns 1.05
private array/permutedims/3d 1805542 ns 1604084 ns 1.13
private array/copy 827292 ns 627437.5 ns 1.32
latency/precompile 9972765625 ns 9812875250 ns 1.02
latency/ttfp 4152996666 ns 4071015166 ns 1.02
latency/import 1306905250 ns 1279968583 ns 1.02
integration/metaldevrt 770750 ns 706083 ns 1.09
integration/byval/slices=1 1667250 ns 1645500 ns 1.01
integration/byval/slices=3 19804125 ns 10475417 ns 1.89
integration/byval/reference 1664208 ns 1618417 ns 1.03
integration/byval/slices=2 2840000 ns 2694041.5 ns 1.05
kernel/indexing 473500 ns 458250 ns 1.03
kernel/indexing_checked 467292 ns 462875 ns 1.01
kernel/launch 9923.666666666666 ns 8167 ns 1.22
metal/synchronization/stream 15042 ns 14958 ns 1.01
metal/synchronization/context 16084 ns 15208 ns 1.06
shared array/construct 23558.4 ns 23701.416666666664 ns 0.99
shared array/broadcast 463166.5 ns 467292 ns 0.99
shared array/random/randn/Float32 911624.5 ns 789125 ns 1.16
shared array/random/randn!/Float32 598208 ns 634708 ns 0.94
shared array/random/rand!/Int64 560417 ns 561209 ns 1.00
shared array/random/rand!/Float32 563000 ns 599792 ns 0.94
shared array/random/rand/Int64 864313 ns 807979 ns 1.07
shared array/random/rand/Float32 833417 ns 634062.5 ns 1.31
shared array/copyto!/gpu_to_gpu 83083 ns 79666 ns 1.04
shared array/copyto!/cpu_to_gpu 79042 ns 83792 ns 0.94
shared array/copyto!/gpu_to_cpu 79167 ns 83375 ns 0.95
shared array/accumulate/Int64/1d 2300958 ns
shared array/accumulate/Int64/dims=1 2963333 ns
shared array/accumulate/Int64/dims=2 3742875 ns
shared array/accumulate/Float32/1d 1777437.5 ns
shared array/accumulate/Float32/dims=1 2137375 ns
shared array/accumulate/Float32/dims=2 2496250 ns
shared array/iteration/findall/int 1954375.5 ns 1869000 ns 1.05
shared array/iteration/findall/bool 1695833.5 ns 1607125 ns 1.06
shared array/iteration/findfirst/int 1510645.5 ns 1430125 ns 1.06
shared array/iteration/findfirst/bool 1456125 ns 1386250 ns 1.05
shared array/iteration/scalar 161792 ns 160291.5 ns 1.01
shared array/iteration/logical 3178375 ns 2911209 ns 1.09
shared array/iteration/findmin/1d 1566958.5 ns 1472625 ns 1.06
shared array/iteration/findmin/2d 1448750 ns 1382417 ns 1.05
shared array/reductions/reduce/Int64/1d 1006541.5 ns
shared array/reductions/reduce/Int64/dims=1 838646 ns
shared array/reductions/reduce/Int64/dims=2 717500 ns
shared array/reductions/reduce/Float32/1d 1122875 ns
shared array/reductions/reduce/Float32/dims=1 840500 ns
shared array/reductions/reduce/Float32/dims=2 747667 ns
shared array/reductions/mapreduce/Int64/1d 1021292 ns
shared array/reductions/mapreduce/Int64/dims=1 837958 ns
shared array/reductions/mapreduce/Int64/dims=2 697709 ns
shared array/reductions/mapreduce/Float32/1d 1120375 ns
shared array/reductions/mapreduce/Float32/dims=1 837291 ns
shared array/reductions/mapreduce/Float32/dims=2 757083 ns
shared array/permutedims/4d 2645646 ns 2528916 ns 1.05
shared array/permutedims/2d 1115500 ns 1043958 ns 1.07
shared array/permutedims/3d 1823833 ns 1589500 ns 1.15
shared array/copy 211333 ns 247500 ns 0.85

This comment was automatically generated by workflow using github-action-benchmark.

Copy link

codecov bot commented May 27, 2025

Codecov Report

Attention: Patch coverage is 66.66667% with 3 lines in your changes missing coverage. Please review.

Project coverage is 79.61%. Comparing base (4abe75f) to head (4a8c4e1).

Files with missing lines Patch % Lines
src/accumulate.jl 66.66% 3 Missing ⚠️
Additional details and impacted files
@@            Coverage Diff             @@
##             main     #590      +/-   ##
==========================================
- Coverage   80.66%   79.61%   -1.06%     
==========================================
  Files          61       61              
  Lines        2690     2688       -2     
==========================================
- Hits         2170     2140      -30     
- Misses        520      548      +28     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

@christiangnrd christiangnrd force-pushed the testak branch 2 times, most recently from 9449e39 to 8da0d16 Compare June 16, 2025 18:30
[only benchmarks]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants