-
Notifications
You must be signed in to change notification settings - Fork 252
Benchmark reverse on bigger arrays #2833
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/perf/array.jl b/perf/array.jl
index 30348a512..7adb375db 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -55,11 +55,11 @@ let group = addgroup!(group, "reverse")
group["1d"] = @async_benchmarkable reverse($gpu_vec)
group["1dL"] = @async_benchmarkable reverse($gpu_vec_long)
group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
- group["2dL"] = @async_benchmarkable reverse($gpu_mat_long; dims=1)
+ group["2dL"] = @async_benchmarkable reverse($gpu_mat_long; dims = 1)
group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
group["1dL_inplace"] = @async_benchmarkable reverse!($gpu_vec_long)
group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
- group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=2)
+ group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims = 2)
end
group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 |
Head branch was pushed to by a user without write access
391638b
to
33f73e0
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CUDA.jl Benchmarks
Benchmark suite | Current: 7c005df | Previous: d670186 | Ratio |
---|---|---|---|
latency/precompile |
43572403994.5 ns |
43647228816 ns |
1.00 |
latency/ttfp |
7274860573 ns |
7277435679 ns |
1.00 |
latency/import |
3835703622.5 ns |
3852618873.5 ns |
1.00 |
integration/volumerhs |
9609940 ns |
9627931.5 ns |
1.00 |
integration/byval/slices=1 |
147028 ns |
147110 ns |
1.00 |
integration/byval/slices=3 |
426147 ns |
426020 ns |
1.00 |
integration/byval/reference |
145137 ns |
145041 ns |
1.00 |
integration/byval/slices=2 |
286450 ns |
286551.5 ns |
1.00 |
integration/cudadevrt |
103552 ns |
103546 ns |
1.00 |
kernel/indexing |
14270 ns |
14322 ns |
1.00 |
kernel/indexing_checked |
14989 ns |
14915 ns |
1.00 |
kernel/occupancy |
688.0526315789474 ns |
667.2075471698113 ns |
1.03 |
kernel/launch |
2132.95 ns |
2223.222222222222 ns |
0.96 |
kernel/rand |
14752 ns |
18438 ns |
0.80 |
array/reverse/1d |
19648 ns |
19972 ns |
0.98 |
array/reverse/2dL_inplace |
66921 ns |
||
array/reverse/1dL |
69924 ns |
||
array/reverse/2d |
22043 ns |
24087.5 ns |
0.92 |
array/reverse/1d_inplace |
9561 ns |
10265 ns |
0.93 |
array/reverse/2d_inplace |
11078 ns |
11666 ns |
0.95 |
array/reverse/2dL |
74018 ns |
||
array/reverse/1dL_inplace |
66771 ns |
||
array/copy |
20334 ns |
21022 ns |
0.97 |
array/iteration/findall/int |
157708 ns |
157451 ns |
1.00 |
array/iteration/findall/bool |
139787 ns |
138964 ns |
1.01 |
array/iteration/findfirst/int |
2161061.5 ns |
2145633 ns |
1.01 |
array/iteration/findfirst/bool |
2143779 ns |
2125537.5 ns |
1.01 |
array/iteration/scalar |
72525 ns |
72116 ns |
1.01 |
array/iteration/logical |
236269.5 ns |
235859 ns |
1.00 |
array/iteration/findmin/1d |
258429 ns |
258194 ns |
1.00 |
array/iteration/findmin/2d |
96303 ns |
96186 ns |
1.00 |
array/reductions/reduce/Int64/1d |
147342 ns |
147739.5 ns |
1.00 |
array/reductions/reduce/Int64/dims=1 |
43938 ns |
44174 ns |
0.99 |
array/reductions/reduce/Int64/dims=2 |
61319 ns |
61550 ns |
1.00 |
array/reductions/reduce/Int64/dims=1L |
88919 ns |
89011.5 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
654799 ns |
657807 ns |
1.00 |
array/reductions/reduce/Float32/1d |
103721 ns |
104011 ns |
1.00 |
array/reductions/reduce/Float32/dims=1 |
40824 ns |
40950 ns |
1.00 |
array/reductions/reduce/Float32/dims=2 |
59404 ns |
59626 ns |
1.00 |
array/reductions/reduce/Float32/dims=1L |
52297 ns |
52408 ns |
1.00 |
array/reductions/reduce/Float32/dims=2L |
544052 ns |
547881 ns |
0.99 |
array/reductions/mapreduce/Int64/1d |
148541 ns |
149756 ns |
0.99 |
array/reductions/mapreduce/Int64/dims=1 |
43984 ns |
44269 ns |
0.99 |
array/reductions/mapreduce/Int64/dims=2 |
61397 ns |
61933 ns |
0.99 |
array/reductions/mapreduce/Int64/dims=1L |
88787 ns |
89034 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2L |
680736 ns |
685908.5 ns |
0.99 |
array/reductions/mapreduce/Float32/1d |
104121 ns |
105015.5 ns |
0.99 |
array/reductions/mapreduce/Float32/dims=1 |
40907 ns |
40993 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2 |
59419 ns |
59573 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=1L |
52511 ns |
52874 ns |
0.99 |
array/reductions/mapreduce/Float32/dims=2L |
546452 ns |
550366 ns |
0.99 |
array/broadcast |
20216 ns |
20326 ns |
0.99 |
array/copyto!/gpu_to_gpu |
12817 ns |
12862 ns |
1.00 |
array/copyto!/cpu_to_gpu |
216578 ns |
214253 ns |
1.01 |
array/copyto!/gpu_to_cpu |
286120.5 ns |
286370.5 ns |
1.00 |
array/accumulate/Int64/1d |
124945 ns |
124932 ns |
1.00 |
array/accumulate/Int64/dims=1 |
83680 ns |
83506 ns |
1.00 |
array/accumulate/Int64/dims=2 |
157660.5 ns |
157937 ns |
1.00 |
array/accumulate/Int64/dims=1L |
1710260 ns |
1720004 ns |
0.99 |
array/accumulate/Int64/dims=2L |
966715 ns |
968081 ns |
1.00 |
array/accumulate/Float32/1d |
109842 ns |
109339 ns |
1.00 |
array/accumulate/Float32/dims=1 |
80526 ns |
80455 ns |
1.00 |
array/accumulate/Float32/dims=2 |
147358 ns |
147609.5 ns |
1.00 |
array/accumulate/Float32/dims=1L |
1618698.5 ns |
1618340 ns |
1.00 |
array/accumulate/Float32/dims=2L |
698592 ns |
700504 ns |
1.00 |
array/construct |
1616.4 ns |
1612.7 ns |
1.00 |
array/random/randn/Float32 |
44402 ns |
44195.5 ns |
1.00 |
array/random/randn!/Float32 |
24940 ns |
24926 ns |
1.00 |
array/random/rand!/Int64 |
27447 ns |
27742 ns |
0.99 |
array/random/rand!/Float32 |
8834.333333333334 ns |
8671.666666666666 ns |
1.02 |
array/random/rand/Int64 |
30044 ns |
30303 ns |
0.99 |
array/random/rand/Float32 |
12934.5 ns |
13171 ns |
0.98 |
array/permutedims/4d |
59971 ns |
60857.5 ns |
0.99 |
array/permutedims/2d |
53804 ns |
54404 ns |
0.99 |
array/permutedims/3d |
54872 ns |
55128 ns |
1.00 |
array/sorting/1d |
2757614 ns |
2756220.5 ns |
1.00 |
array/sorting/by |
3344832.5 ns |
3354879 ns |
1.00 |
array/sorting/2d |
1081338 ns |
1084703.5 ns |
1.00 |
cuda/synchronization/stream/auto |
1004.6923076923077 ns |
1064.1 ns |
0.94 |
cuda/synchronization/stream/nonblocking |
7875.1 ns |
7654.4 ns |
1.03 |
cuda/synchronization/stream/blocking |
798.6210526315789 ns |
827.0243902439024 ns |
0.97 |
cuda/synchronization/context/auto |
1170 ns |
1151.2 ns |
1.02 |
cuda/synchronization/context/nonblocking |
7330.2 ns |
7325.5 ns |
1.00 |
cuda/synchronization/context/blocking |
899.3617021276596 ns |
916.8333333333334 ns |
0.98 |
This comment was automatically generated by workflow using github-action-benchmark.
Test failures seem unrelated?
33f73e0
to
7c005df
Compare
Yeah, that's #2885
Currently part of #2832 but this can (and should) be merged separately beforehand.