-
Notifications
You must be signed in to change notification settings - Fork 46
[DO NOT MERGE] Test accumulate acceleratedkernel #590
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/perf/array.jl b/perf/array.jl
index f3ef0adf..e2d2b823 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -10,7 +10,7 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
gpu_vec = reshape(gpu_mat, length(gpu_mat))
gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
- gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, -10:10, m, n))
+ gpu_mat_ints = MtlMatrix{Int, S}(rand(rng, -10:10, m, n))
gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n))
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
@@ -60,13 +60,13 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
let group = addgroup!(group, "accumulate")
let group = addgroup!(group, "Float32")
group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
- group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims = 2)
end
let group = addgroup!(group, "Int64")
group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec_ints)
- group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims = 2)
end
end
@@ -74,26 +74,26 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
let group = addgroup!(group, "reduce")
let group = addgroup!(group, "Float32")
group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
- group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims = 2)
end
let group = addgroup!(group, "Int64")
group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec_ints)
- group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims = 2)
end
end
let group = addgroup!(group, "mapreduce")
let group = addgroup!(group, "Float32")
- group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
- group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=2)
+ group["1d"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_vec)
+ group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat; dims = 2)
end
let group = addgroup!(group, "Int64")
- group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec_ints)
- group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
+ group["1d"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_vec_ints)
+ group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat_ints; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat_ints; dims = 2)
end
end
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 7af43a09..db6fa99a 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -170,17 +170,17 @@ end
## Base interface
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlVector, dims::Nothing, init::Nothing) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = AK.neutral_element(op, eltype(output)))
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Nothing) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = AK.neutral_element(op, eltype(output)))
Base._accumulate!(op, output::WrappedMtlArray, input::MtlVector, dims::Nothing, init::Some) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = something(init))
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Some) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = something(init))
-Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init=AK.neutral_element(op, eltype(result)))
+Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init = AK.neutral_element(op, eltype(result)))
# default behavior unless dims are specified by the user
function Base.accumulate(op, A::WrappedMtlArray;
@@ -199,5 +199,5 @@ function Base.accumulate(op, A::WrappedMtlArray;
else
throw(ArgumentError("accumulate does not support the keyword arguments $(setdiff(keys(nt), (:init,)))"))
end
- AK.accumulate!(op, out, A, MetalBackend(); dims, init)
+ return AK.accumulate!(op, out, A, MetalBackend(); dims, init)
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 59e5f8db..a6eb3ab3 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -142,12 +142,16 @@ end
## COV_EXCL_STOP
-Base.mapreduce(f, op, A::WrappedMtlArray;
- dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
- # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
-Base.mapreduce(f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle};
- dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
- # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+Base.mapreduce(
+ f, op, A::WrappedMtlArray;
+ dims = :, init = nothing
+) = _mapreduce(f, op, A, init, dims)
+# dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+Base.mapreduce(
+ f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle};
+ dims = :, init = nothing
+) = _mapreduce(f, op, A, init, dims)
+# dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
# "Borrowed" from GPUArrays
@inline function _init_value(f, op, init, As...)
@@ -164,7 +168,7 @@ end
function _mapreduce(f, op, A, init, dims::Union{Nothing, Integer})
init_val = _init_value(f, op, init, A)
- AK.mapreduce(f, op, A; init=init_val, neutral=init_val, dims)
+ return AK.mapreduce(f, op, A; init = init_val, neutral = init_val, dims)
end
_mapreduce(f, op, A, init, ::Colon) = _mapreduce(f, op, A, init, nothing)
_mapreduce(f, op, A, init, dims) = GPUArrays._mapreduce(f, op, A; dims, init)
Why not implement this at the GPUArrays.jl level? Or is this purely for debugging? |
@maleadt purely for debugging. I’ll add do not merge in the title |
The goal behind this was to get an idea of what the benchmark results look like, but there seem to be strange failures that I haven't yet determined if they're a Metal issue or an AcceleratedKernels issue. |
a3a50f3
to
98dddc9
Compare
Just for benchmarks, this still shouldn't be merged |
69e30e0
to
e38640b
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Metal Benchmarks
Benchmark suite | Current: e1935ed | Previous: 4abe75f | Ratio |
---|---|---|---|
private array/construct |
23500 ns |
26364.666666666668 ns |
0.89 |
private array/broadcast |
460562.5 ns |
466750 ns |
0.99 |
private array/random/randn/Float32 |
935166 ns |
843083 ns |
1.11 |
private array/random/randn!/Float32 |
598083 ns |
633208 ns |
0.94 |
private array/random/rand!/Int64 |
556895.5 ns |
560750 ns |
0.99 |
private array/random/rand!/Float32 |
565833 ns |
594542 ns |
0.95 |
private array/random/rand/Int64 |
865959 ns |
767854 ns |
1.13 |
private array/random/rand/Float32 |
798625 ns |
621500 ns |
1.28 |
private array/copyto!/gpu_to_gpu |
578250 ns |
629625 ns |
0.92 |
private array/copyto!/cpu_to_gpu |
633000 ns |
655708.5 ns |
0.97 |
private array/copyto!/gpu_to_cpu |
666645.5 ns |
821708 ns |
0.81 |
private array/accumulate/Int64/1d |
2000520.5 ns |
||
private array/accumulate/Int64/dims=1 |
2176625 ns |
||
private array/accumulate/Int64/dims=2 |
3040000 ns |
||
private array/accumulate/Float32/1d |
1774916 ns |
||
private array/accumulate/Float32/dims=1 |
1962042 ns |
||
private array/accumulate/Float32/dims=2 |
2351812.5 ns |
||
private array/iteration/findall/int |
1912167 ns |
1848833 ns |
1.03 |
private array/iteration/findall/bool |
1670270.5 ns |
1610708.5 ns |
1.04 |
private array/iteration/findfirst/int |
1830083 ns |
1716416 ns |
1.07 |
private array/iteration/findfirst/bool |
1740187 ns |
1681917 ns |
1.03 |
private array/iteration/scalar |
2474083 ns |
3851541.5 ns |
0.64 |
private array/iteration/logical |
3091209 ns |
3013084 ns |
1.03 |
private array/iteration/findmin/1d |
1897792 ns |
1793417 ns |
1.06 |
private array/iteration/findmin/2d |
1434417 ns |
1369667 ns |
1.05 |
private array/reductions/reduce/Int64/1d |
995500 ns |
||
private array/reductions/reduce/Int64/dims=1 |
836416.5 ns |
||
private array/reductions/reduce/Int64/dims=2 |
712729.5 ns |
||
private array/reductions/reduce/Float32/1d |
1133834 ns |
||
private array/reductions/reduce/Float32/dims=1 |
825375.5 ns |
||
private array/reductions/reduce/Float32/dims=2 |
743125 ns |
||
private array/reductions/mapreduce/Int64/1d |
1002250 ns |
||
private array/reductions/mapreduce/Int64/dims=1 |
833417 ns |
||
private array/reductions/mapreduce/Int64/dims=2 |
700062.5 ns |
||
private array/reductions/mapreduce/Float32/1d |
1124167 ns |
||
private array/reductions/mapreduce/Float32/dims=1 |
813437.5 ns |
||
private array/reductions/mapreduce/Float32/dims=2 |
745354.5 ns |
||
private array/permutedims/4d |
2673084 ns |
2524541 ns |
1.06 |
private array/permutedims/2d |
1090146 ns |
1040583 ns |
1.05 |
private array/permutedims/3d |
1805542 ns |
1604084 ns |
1.13 |
private array/copy |
827292 ns |
627437.5 ns |
1.32 |
latency/precompile |
9972765625 ns |
9812875250 ns |
1.02 |
latency/ttfp |
4152996666 ns |
4071015166 ns |
1.02 |
latency/import |
1306905250 ns |
1279968583 ns |
1.02 |
integration/metaldevrt |
770750 ns |
706083 ns |
1.09 |
integration/byval/slices=1 |
1667250 ns |
1645500 ns |
1.01 |
integration/byval/slices=3 |
19804125 ns |
10475417 ns |
1.89 |
integration/byval/reference |
1664208 ns |
1618417 ns |
1.03 |
integration/byval/slices=2 |
2840000 ns |
2694041.5 ns |
1.05 |
kernel/indexing |
473500 ns |
458250 ns |
1.03 |
kernel/indexing_checked |
467292 ns |
462875 ns |
1.01 |
kernel/launch |
9923.666666666666 ns |
8167 ns |
1.22 |
metal/synchronization/stream |
15042 ns |
14958 ns |
1.01 |
metal/synchronization/context |
16084 ns |
15208 ns |
1.06 |
shared array/construct |
23558.4 ns |
23701.416666666664 ns |
0.99 |
shared array/broadcast |
463166.5 ns |
467292 ns |
0.99 |
shared array/random/randn/Float32 |
911624.5 ns |
789125 ns |
1.16 |
shared array/random/randn!/Float32 |
598208 ns |
634708 ns |
0.94 |
shared array/random/rand!/Int64 |
560417 ns |
561209 ns |
1.00 |
shared array/random/rand!/Float32 |
563000 ns |
599792 ns |
0.94 |
shared array/random/rand/Int64 |
864313 ns |
807979 ns |
1.07 |
shared array/random/rand/Float32 |
833417 ns |
634062.5 ns |
1.31 |
shared array/copyto!/gpu_to_gpu |
83083 ns |
79666 ns |
1.04 |
shared array/copyto!/cpu_to_gpu |
79042 ns |
83792 ns |
0.94 |
shared array/copyto!/gpu_to_cpu |
79167 ns |
83375 ns |
0.95 |
shared array/accumulate/Int64/1d |
2300958 ns |
||
shared array/accumulate/Int64/dims=1 |
2963333 ns |
||
shared array/accumulate/Int64/dims=2 |
3742875 ns |
||
shared array/accumulate/Float32/1d |
1777437.5 ns |
||
shared array/accumulate/Float32/dims=1 |
2137375 ns |
||
shared array/accumulate/Float32/dims=2 |
2496250 ns |
||
shared array/iteration/findall/int |
1954375.5 ns |
1869000 ns |
1.05 |
shared array/iteration/findall/bool |
1695833.5 ns |
1607125 ns |
1.06 |
shared array/iteration/findfirst/int |
1510645.5 ns |
1430125 ns |
1.06 |
shared array/iteration/findfirst/bool |
1456125 ns |
1386250 ns |
1.05 |
shared array/iteration/scalar |
161792 ns |
160291.5 ns |
1.01 |
shared array/iteration/logical |
3178375 ns |
2911209 ns |
1.09 |
shared array/iteration/findmin/1d |
1566958.5 ns |
1472625 ns |
1.06 |
shared array/iteration/findmin/2d |
1448750 ns |
1382417 ns |
1.05 |
shared array/reductions/reduce/Int64/1d |
1006541.5 ns |
||
shared array/reductions/reduce/Int64/dims=1 |
838646 ns |
||
shared array/reductions/reduce/Int64/dims=2 |
717500 ns |
||
shared array/reductions/reduce/Float32/1d |
1122875 ns |
||
shared array/reductions/reduce/Float32/dims=1 |
840500 ns |
||
shared array/reductions/reduce/Float32/dims=2 |
747667 ns |
||
shared array/reductions/mapreduce/Int64/1d |
1021292 ns |
||
shared array/reductions/mapreduce/Int64/dims=1 |
837958 ns |
||
shared array/reductions/mapreduce/Int64/dims=2 |
697709 ns |
||
shared array/reductions/mapreduce/Float32/1d |
1120375 ns |
||
shared array/reductions/mapreduce/Float32/dims=1 |
837291 ns |
||
shared array/reductions/mapreduce/Float32/dims=2 |
757083 ns |
||
shared array/permutedims/4d |
2645646 ns |
2528916 ns |
1.05 |
shared array/permutedims/2d |
1115500 ns |
1043958 ns |
1.07 |
shared array/permutedims/3d |
1823833 ns |
1589500 ns |
1.15 |
shared array/copy |
211333 ns |
247500 ns |
0.85 |
This comment was automatically generated by workflow using github-action-benchmark.
Codecov Report — Attention: Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## main #590 +/- ##
==========================================
- Coverage 80.66% 79.61% -1.06%
==========================================
Files 61 61
Lines 2690 2688 -2
==========================================
- Hits 2170 2140 -30
- Misses 520 548 +28
☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
9449e39
to
8da0d16
Compare
[only benchmarks]
I'm getting errors locally