Skip to content

Commit b2f2516

Browse files
committed
switch to scalar loads
1 parent 0cc3dd6 commit b2f2516

File tree

3 files changed

+2076
-1170
lines changed

3 files changed

+2076
-1170
lines changed

dev/aarch64_clean/src/ntt.S

Lines changed: 57 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,24 @@
2626
#include "../../../common.h"
2727
#if defined(MLD_ARITH_BACKEND_AARCH64)
2828

29+
// vins: insert a general-purpose register into one 64-bit lane of a vector.
//   vec_out - vector register name WITHOUT an arrangement suffix; the macro
//             appends `.d[lane]` itself
//   gpr_in  - source general-purpose register
//   lane    - index of the 64-bit (.d) lane to write
// Expands to a single `ins` instruction.
.macro vins vec_out, gpr_in, lane
30+
ins \vec_out\().d[\lane], \gpr_in
31+
.endm
32+
33+
// ldr_vo: fill a vector from memory at base+offset using two scalar loads.
//   vec    - destination vector register name (no arrangement suffix)
//   base   - base address register; no writeback is used, so it is unchanged
//   offset - immediate byte offset; substituted textually into `#\offset` and
//            `#(\offset+8)`
// The value loaded from base+offset goes to .d lane 0 and the value loaded
// from base+offset+8 goes to .d lane 1.
// Clobbers: xtmp0, xtmp1 (used as the intermediate scalar registers).
.macro ldr_vo vec, base, offset
34+
ldr xtmp0, [\base, #\offset]
35+
ldr xtmp1, [\base, #(\offset+8)]
36+
vins \vec, xtmp0, 0
37+
vins \vec, xtmp1, 1
38+
.endm
39+
40+
// ldr_vi: fill a vector from memory at base using two scalar loads, then
// leave base advanced by inc.
//   vec  - destination vector register name (no arrangement suffix)
//   base - base address register; post-incremented by inc by the first load
//   inc  - immediate byte increment applied to base
// The first load uses post-index addressing, so base has already moved by
// inc when the second load executes; the second load therefore uses the
// offset (-inc+8) to reach the 8 bytes that follow the first load's address.
// NOTE(review): inc is substituted textually into `-\inc+8`; an argument such
// as `a+b` without parentheses would expand to `-a+b+8`. Pass a single
// literal or a parenthesized expression.
// Clobbers: xtmp0, xtmp1 (used as the intermediate scalar registers).
.macro ldr_vi vec, base, inc
41+
ldr xtmp0, [\base], #\inc
42+
ldr xtmp1, [\base, #(-\inc+8)]
43+
vins \vec, xtmp0, 0
44+
vins \vec, xtmp1, 1
45+
.endm
46+
2947
.macro mulmodq dst, src, const, idx0, idx1
3048
sqrdmulh t2.4s, \src\().4s, \const\().s[\idx1\()]
3149
mul \dst\().4s, \src\().4s, \const\().s[\idx0\()]
@@ -51,35 +69,35 @@
5169
.endm
5270

5371
// load_roots_123: load root0..root3 from four consecutive 16-byte slots at
// r012345_ptr and advance r012345_ptr by 64 bytes.
// The first load post-increments the pointer; the remaining three address
// their slots relative to the advanced pointer (-64 + 16, -64 + 32, -64 + 48).
// This diff view shows both versions: the lines whose gutter ends in `-` are
// the removed `ldr q_rootN` loads, the lines whose gutter ends in `+` are the
// replacement ldr_vi / ldr_vo macro invocations.
.macro load_roots_123
54-
ldr q_root0, [r012345_ptr], #64
55-
ldr q_root1, [r012345_ptr, #(-64 + 16)]
56-
ldr q_root2, [r012345_ptr, #(-64 + 32)]
57-
ldr q_root3, [r012345_ptr, #(-64 + 48)]
72+
ldr_vi root0, r012345_ptr, 64
73+
ldr_vo root1, r012345_ptr, (-64 + 16)
74+
ldr_vo root2, r012345_ptr, (-64 + 32)
75+
ldr_vo root3, r012345_ptr, (-64 + 48)
5876
.endm
5977

6078
// load_roots_456: load root0..root3 from four consecutive 16-byte slots at
// r012345_ptr and advance r012345_ptr by 64 bytes.
// The first load post-increments the pointer; the remaining three address
// their slots relative to the advanced pointer (-64 + 16, -64 + 32, -64 + 48).
// This diff view shows both versions: the lines whose gutter ends in `-` are
// the removed `ldr q_rootN` loads, the lines whose gutter ends in `+` are the
// replacement ldr_vi / ldr_vo macro invocations.
.macro load_roots_456
61-
ldr q_root0, [r012345_ptr], #64
62-
ldr q_root1, [r012345_ptr, #(-64 + 16)]
63-
ldr q_root2, [r012345_ptr, #(-64 + 32)]
64-
ldr q_root3, [r012345_ptr, #(-64 + 48)]
79+
ldr_vi root0, r012345_ptr, 64
80+
ldr_vo root1, r012345_ptr, (-64 + 16)
81+
ldr_vo root2, r012345_ptr, (-64 + 32)
82+
ldr_vo root3, r012345_ptr, (-64 + 48)
6583
.endm
6684

6785
// load_roots_78_part1: load root0, root0_tw, root1, root1_tw, root2, root2_tw
// from 16-byte slots 0..5 at r67_ptr and advance r67_ptr by 12*16 bytes.
// The first load post-increments the pointer by the full 12-slot stride; the
// other five address their slots relative to the advanced pointer
// (-12*16 + k*16 for k = 1..5).
// This diff view shows both versions: the lines whose gutter ends in `-` are
// the removed `ldr q_...` loads, the lines whose gutter ends in `+` are the
// replacement ldr_vi / ldr_vo macro invocations.
.macro load_roots_78_part1
68-
ldr q_root0, [r67_ptr], #(12*16)
69-
ldr q_root0_tw, [r67_ptr, #(-12*16 + 1*16)]
70-
ldr q_root1, [r67_ptr, #(-12*16 + 2*16)]
71-
ldr q_root1_tw, [r67_ptr, #(-12*16 + 3*16)]
72-
ldr q_root2, [r67_ptr, #(-12*16 + 4*16)]
73-
ldr q_root2_tw, [r67_ptr, #(-12*16 + 5*16)]
86+
ldr_vi root0, r67_ptr, (12*16)
87+
ldr_vo root0_tw, r67_ptr, (-12*16 + 1*16)
88+
ldr_vo root1, r67_ptr, (-12*16 + 2*16)
89+
ldr_vo root1_tw, r67_ptr, (-12*16 + 3*16)
90+
ldr_vo root2, r67_ptr, (-12*16 + 4*16)
91+
ldr_vo root2_tw, r67_ptr, (-12*16 + 5*16)
7492
.endm
7593

7694
// load_roots_78_part2: load root0, root0_tw, root1, root1_tw, root2, root2_tw
// from 16-byte slots 6..11 of a 12-slot group, addressed as
// r67_ptr + (-12*16 + k*16) for k = 6..11. No load here uses writeback, so
// r67_ptr is left unchanged.
// NOTE(review): the negative base offset suggests r67_ptr is expected to have
// been advanced by 12*16 beforehand (as load_roots_78_part1 does) - confirm
// against the call sites.
// This diff view shows both versions: the lines whose gutter ends in `-` are
// the removed `ldr q_...` loads, the lines whose gutter ends in `+` are the
// replacement ldr_vo macro invocations.
.macro load_roots_78_part2
77-
ldr q_root0, [r67_ptr, #(-12*16 + 6*16)]
78-
ldr q_root0_tw, [r67_ptr, #(-12*16 + 7*16)]
79-
ldr q_root1, [r67_ptr, #(-12*16 + 8*16)]
80-
ldr q_root1_tw, [r67_ptr, #(-12*16 + 9*16)]
81-
ldr q_root2, [r67_ptr, #(-12*16 + 10*16)]
82-
ldr q_root2_tw, [r67_ptr, #(-12*16 + 11*16)]
95+
ldr_vo root0, r67_ptr, (-12*16 + 6*16)
96+
ldr_vo root0_tw, r67_ptr, (-12*16 + 7*16)
97+
ldr_vo root1, r67_ptr, (-12*16 + 8*16)
98+
ldr_vo root1_tw, r67_ptr, (-12*16 + 9*16)
99+
ldr_vo root2, r67_ptr, (-12*16 + 10*16)
100+
ldr_vo root2_tw, r67_ptr, (-12*16 + 11*16)
83101
.endm
84102

85103
.macro transpose4 data0, data1, data2, data3
@@ -129,6 +147,9 @@
129147
xtmp .req x6
130148
wtmp .req w6
131149

150+
xtmp0 .req x6
151+
xtmp1 .req x7
152+
132153
data0 .req v9
133154
data1 .req v10
134155
data2 .req v11
@@ -193,14 +214,14 @@ MLD_ASM_FN_SYMBOL(ntt_asm)
193214

194215
.p2align 2
195216
layer123_start:
196-
ldr q_data0, [in, #(0*(1024/8))]
197-
ldr q_data1, [in, #(1*(1024/8))]
198-
ldr q_data2, [in, #(2*(1024/8))]
199-
ldr q_data3, [in, #(3*(1024/8))]
200-
ldr q_data4, [in, #(4*(1024/8))]
201-
ldr q_data5, [in, #(5*(1024/8))]
202-
ldr q_data6, [in, #(6*(1024/8))]
203-
ldr q_data7, [in, #(7*(1024/8))]
217+
ldr_vo data0, in, (0*(1024/8))
218+
ldr_vo data1, in, (1*(1024/8))
219+
ldr_vo data2, in, (2*(1024/8))
220+
ldr_vo data3, in, (3*(1024/8))
221+
ldr_vo data4, in, (4*(1024/8))
222+
ldr_vo data5, in, (5*(1024/8))
223+
ldr_vo data6, in, (6*(1024/8))
224+
ldr_vo data7, in, (7*(1024/8))
204225

205226
ct_butterfly data0, data4, root0, 0, 1
206227
ct_butterfly data1, data5, root0, 0, 1
@@ -245,14 +266,14 @@ layer123_start:
245266

246267
.p2align 2
247268
layer45678_start:
248-
ldr q_data0, [in, #(64 + 16*0)]
249-
ldr q_data1, [in, #(64 + 16*1)]
250-
ldr q_data2, [in, #(64 + 16*2)]
251-
ldr q_data3, [in, #(64 + 16*3)]
252-
ldr q_data4, [inpp, #(64 + 16*0)]
253-
ldr q_data5, [inpp, #(64 + 16*1)]
254-
ldr q_data6, [inpp, #(64 + 16*2)]
255-
ldr q_data7, [inpp, #(64 + 16*3)]
269+
ldr_vo data0, in, (64 + 16*0)
270+
ldr_vo data1, in, (64 + 16*1)
271+
ldr_vo data2, in, (64 + 16*2)
272+
ldr_vo data3, in, (64 + 16*3)
273+
ldr_vo data4, inpp, (64 + 16*0)
274+
ldr_vo data5, inpp, (64 + 16*1)
275+
ldr_vo data6, inpp, (64 + 16*2)
276+
ldr_vo data7, inpp, (64 + 16*3)
256277

257278
add in, in, #64
258279
add inpp, inpp, #64

mldsa/native/aarch64/src/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ SLOTHY_FLAGS=-c sw_pipelining.enabled=true \
2020
-c sw_pipelining.allow_post \
2121
-c variable_size \
2222
-c constraints.stalls_first_attempt=64 \
23-
-c timeout=120 \
23+
-c timeout=300 \
2424
$(SLOTHY_EXTRA_FLAGS)
2525

2626
# For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30.

0 commit comments

Comments
 (0)