-
Notifications
You must be signed in to change notification settings - Fork 44
[DO NOT MERGE] Test accumulate acceleratedkernel #590
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/src/accumulate.jl b/src/accumulate.jl
index dee031fb..781864ce 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -170,17 +170,17 @@ end
## Base interface
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlVector, dims::Nothing, init::Nothing) =
- @inline AK.accumulate!(op, output, input; dims, init=AK.neutral_element(op, eltype(output)), alg=AK.ScanPrefixes())
+ @inline AK.accumulate!(op, output, input; dims, init = AK.neutral_element(op, eltype(output)), alg = AK.ScanPrefixes())
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Nothing) =
- @inline AK.accumulate!(op, output, input; dims, init=AK.neutral_element(op, eltype(output)), alg=AK.ScanPrefixes())
+ @inline AK.accumulate!(op, output, input; dims, init = AK.neutral_element(op, eltype(output)), alg = AK.ScanPrefixes())
Base._accumulate!(op, output::WrappedMtlArray, input::MtlVector, dims::Nothing, init::Some) =
- @inline AK.accumulate!(op, output, input; dims, init=something(init), alg=AK.ScanPrefixes())
+ @inline AK.accumulate!(op, output, input; dims, init = something(init), alg = AK.ScanPrefixes())
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Some) =
- @inline AK.accumulate!(op, output, input; dims, init=something(init), alg=AK.ScanPrefixes())
+ @inline AK.accumulate!(op, output, input; dims, init = something(init), alg = AK.ScanPrefixes())
-Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v; init=AK.neutral_element(op, eltype(result)), alg=AK.ScanPrefixes())
+Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v; init = AK.neutral_element(op, eltype(result)), alg = AK.ScanPrefixes())
# default behavior unless dims are specified by the user
function Base.accumulate(op, A::WrappedMtlArray;
@@ -188,7 +188,7 @@ function Base.accumulate(op, A::WrappedMtlArray;
nt = values(kw)
if dims === nothing && !(A isa AbstractVector)
# This branch takes care of the cases not handled by `_accumulate!`.
- return reshape(AK.accumulate(op, A[:]; init = (:init in keys(kw) ? nt.init : AK.neutral_element(op, eltype(A))), alg=AK.ScanPrefixes()), size(A))
+ return reshape(AK.accumulate(op, A[:]; init = (:init in keys(kw) ? nt.init : AK.neutral_element(op, eltype(A))), alg = AK.ScanPrefixes()), size(A))
end
if isempty(kw)
out = similar(A, Base.promote_op(op, eltype(A), eltype(A))) |
Why not implement this at the GPUArrays.jl level? Or is this purely for debugging? |
@maleadt Purely for debugging. I’ll add “do not merge” to the title. |
The goal behind this was to get an idea of what the benchmark results look like, but there seem to be strange failures, and I haven't yet determined whether they're a Metal issue or an AcceleratedKernels issue. |
a3a50f3
to
98dddc9
Compare
This is just for benchmarks; it still shouldn't be merged. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Metal Benchmarks
Benchmark suite | Current: e38640b | Previous: caf2996 | Ratio |
---|---|---|---|
private array/construct |
24791.666666666668 ns |
25048.666666666668 ns |
0.99 |
private array/broadcast |
462292 ns |
457958 ns |
1.01 |
private array/random/randn/Float32 |
829000 ns |
800854.5 ns |
1.04 |
private array/random/randn!/Float32 |
630875 ns |
641125 ns |
0.98 |
private array/random/rand!/Int64 |
581166 ns |
568834 ns |
1.02 |
private array/random/rand!/Float32 |
596459 ns |
597333 ns |
1.00 |
private array/random/rand/Int64 |
772687.5 ns |
774854.5 ns |
1.00 |
private array/random/rand/Float32 |
620812.5 ns |
631333 ns |
0.98 |
private array/copyto!/gpu_to_gpu |
664458 ns |
660958 ns |
1.01 |
private array/copyto!/cpu_to_gpu |
623875 ns |
832125 ns |
0.75 |
private array/copyto!/gpu_to_cpu |
823875 ns |
590416 ns |
1.40 |
private array/accumulate/1d |
1577458 ns |
1352208 ns |
1.17 |
private array/accumulate/2d |
1725958.5 ns |
1397271 ns |
1.24 |
private array/iteration/findall/int |
1681750 ns |
1811500 ns |
0.93 |
private array/iteration/findall/bool |
1473708 ns |
1579583 ns |
0.93 |
private array/iteration/findfirst/int |
1701417 ns |
1738479.5 ns |
0.98 |
private array/iteration/findfirst/bool |
1658042 ns |
1671583.5 ns |
0.99 |
private array/iteration/scalar |
3600209 ns |
3925542 ns |
0.92 |
private array/iteration/logical |
2860896 ns |
3003542 ns |
0.95 |
private array/iteration/findmin/1d |
1749042 ns |
1771542 ns |
0.99 |
private array/iteration/findmin/2d |
1354167 ns |
1358104 ns |
1.00 |
private array/reductions/reduce/1d |
1057375 ns |
1034333 ns |
1.02 |
private array/reductions/reduce/2d |
658958 ns |
661771 ns |
1.00 |
private array/reductions/mapreduce/1d |
1042791 ns |
1042688 ns |
1.00 |
private array/reductions/mapreduce/2d |
661604.5 ns |
656417 ns |
1.01 |
private array/permutedims/4d |
2529354 ns |
2535708 ns |
1.00 |
private array/permutedims/2d |
1024166.5 ns |
1024729.5 ns |
1.00 |
private array/permutedims/3d |
1579562.5 ns |
1604959 ns |
0.98 |
private array/copy |
561083 ns |
573375 ns |
0.98 |
latency/precompile |
9974760250 ns |
9802163250 ns |
1.02 |
latency/ttfp |
4176340083 ns |
4108894000 ns |
1.02 |
latency/import |
1402819458.5 ns |
1286845416.5 ns |
1.09 |
integration/metaldevrt |
726625 ns |
708167 ns |
1.03 |
integration/byval/slices=1 |
1579854 ns |
1642916 ns |
0.96 |
integration/byval/slices=3 |
8492729 ns |
9533624.5 ns |
0.89 |
integration/byval/reference |
1613187 ns |
1619479 ns |
1.00 |
integration/byval/slices=2 |
2662416.5 ns |
2720083.5 ns |
0.98 |
kernel/indexing |
459500 ns |
461167 ns |
1.00 |
kernel/indexing_checked |
466291.5 ns |
459625 ns |
1.01 |
kernel/launch |
8250 ns |
7958 ns |
1.04 |
metal/synchronization/stream |
14750 ns |
14750 ns |
1 |
metal/synchronization/context |
15208 ns |
15250 ns |
1.00 |
shared array/construct |
24267.333333333332 ns |
23722.25 ns |
1.02 |
shared array/broadcast |
462084 ns |
462042 ns |
1.00 |
shared array/random/randn/Float32 |
855750 ns |
809042 ns |
1.06 |
shared array/random/randn!/Float32 |
647166 ns |
641042 ns |
1.01 |
shared array/random/rand!/Int64 |
570479 ns |
572917 ns |
1.00 |
shared array/random/rand!/Float32 |
600416 ns |
603417 ns |
1.00 |
shared array/random/rand/Int64 |
766541.5 ns |
792625 ns |
0.97 |
shared array/random/rand/Float32 |
645875 ns |
625166 ns |
1.03 |
shared array/copyto!/gpu_to_gpu |
80875 ns |
89375 ns |
0.90 |
shared array/copyto!/cpu_to_gpu |
83542 ns |
82834 ns |
1.01 |
shared array/copyto!/gpu_to_cpu |
84125 ns |
78708 ns |
1.07 |
shared array/accumulate/1d |
1560959 ns |
1358229 ns |
1.15 |
shared array/accumulate/2d |
1870458.5 ns |
1393083.5 ns |
1.34 |
shared array/iteration/findall/int |
1719167 ns |
1846500.5 ns |
0.93 |
shared array/iteration/findall/bool |
1471958 ns |
1593458 ns |
0.92 |
shared array/iteration/findfirst/int |
1399250 ns |
1406500 ns |
0.99 |
shared array/iteration/findfirst/bool |
1378875 ns |
1369834 ns |
1.01 |
shared array/iteration/scalar |
159541 ns |
160875 ns |
0.99 |
shared array/iteration/logical |
2647333 ns |
2872291.5 ns |
0.92 |
shared array/iteration/findmin/1d |
1465709 ns |
1514166 ns |
0.97 |
shared array/iteration/findmin/2d |
1365541 ns |
1377125 ns |
0.99 |
shared array/reductions/reduce/1d |
737000 ns |
725229 ns |
1.02 |
shared array/reductions/reduce/2d |
664896 ns |
664500 ns |
1.00 |
shared array/reductions/mapreduce/1d |
737396 ns |
739395.5 ns |
1.00 |
shared array/reductions/mapreduce/2d |
675416 ns |
666562.5 ns |
1.01 |
shared array/permutedims/4d |
2558375 ns |
2514583 ns |
1.02 |
shared array/permutedims/2d |
1024916.5 ns |
1027667 ns |
1.00 |
shared array/permutedims/3d |
1575834 ns |
1579792 ns |
1.00 |
shared array/copy |
249625 ns |
250375 ns |
1.00 |
This comment was automatically generated by workflow using github-action-benchmark.
I'm getting errors locally