Skip to content

Commit 2efa129

Browse files
committed
Run AArch64 INTT through SLOTHY
Signed-off-by: Matthias J. Kannwischer <[email protected]>
1 parent d4b509b commit 2efa129

File tree

3 files changed

+1550
-103
lines changed

3 files changed

+1550
-103
lines changed

dev/aarch64_clean/src/intt.S

+330
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,330 @@
1+
/* Copyright (c) 2022 Arm Limited
2+
* Copyright (c) 2022 Hanno Becker
3+
* Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
4+
* Copyright (c) The mldsa-native project authors
5+
* SPDX-License-Identifier: MIT
6+
*
7+
* Permission is hereby granted, free of charge, to any person obtaining a copy
8+
* of this software and associated documentation files (the "Software"), to deal
9+
* in the Software without restriction, including without limitation the rights
10+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
* copies of the Software, and to permit persons to whom the Software is
12+
* furnished to do so, subject to the following conditions:
13+
*
14+
* The above copyright notice and this permission notice shall be included in all
15+
* copies or substantial portions of the Software.
16+
*
17+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23+
* SOFTWARE.
24+
*/
25+
26+
#include "../../../common.h"
27+
#if defined(MLD_ARITH_BACKEND_AARCH64)
28+
29+
// mulmodq dst, src, const, idx0, idx1
//
// Multiply every 32-bit lane of src by one scalar twiddle and reduce with
// the modulus held in lane 0 of the modulus alias:
//
//     dst = src * const.s[idx0] - sqrdmulh(src, const.s[idx1]) * modulus.s[0]
//
// const.s[idx0] is the multiplier, const.s[idx1] its companion value for the
// high-half product (presumably round(multiplier * 2^31 / q); confirm against
// the twiddle table generator).
//
// Clobbers: t2.  Neither dst nor src may be t2.  The instruction order is
// significant: the high-half product is taken before dst is written.
.macro mulmodq dst, src, const, idx0, idx1
        sqrdmulh t2.4s,      \src\().4s, \const\().s[\idx1\()]   // rounded high half of 2 * src * companion
        mul      \dst\().4s, \src\().4s, \const\().s[\idx0\()]   // low 32 bits of src * multiplier
        mls      \dst\().4s, t2.4s,      modulus.s[0]            // subtract (high half) * q
.endm
34+
35+
// mulmod dst, src, const, const_twisted
//
// Lane-wise variant of the modular multiplication: lane i of src is
// multiplied by lane i of const, with lane i of const_twisted as the
// companion value for the high-half product (presumably
// round(const * 2^31 / q); confirm against the table generator):
//
//     dst = src * const - sqrdmulh(src, const_twisted) * modulus.s[0]
//
// Clobbers: t2.  dst may be the same register as src (the high-half product
// is taken first), but neither may be t2.
.macro mulmod dst, src, const, const_twisted
        sqrdmulh t2.4s,      \src\().4s, \const_twisted\().4s    // rounded high half of 2 * src * companion
        mul      \dst\().4s, \src\().4s, \const\().4s            // low 32 bits of src * const
        mls      \dst\().4s, t2.4s,      modulus.s[0]            // subtract (high half) * q
.endm
40+
41+
42+
// gs_butterfly a, b, root, idx0, idx1
//
// Gentleman-Sande butterfly with a scalar twiddle taken from lanes
// idx0 / idx1 of root:
//
//     (a, b)  <-  (a + b, (a - b) * root.s[idx0]  reduced via mulmodq)
//
// Clobbers: tmp (holds the difference) and t2 (through mulmodq).
.macro gs_butterfly a, b, root, idx0, idx1
        sub     tmp.4s,    \a\().4s, \b\().4s    // difference, taken before a is overwritten
        add     \a\().4s,  \a\().4s, \b\().4s    // sum stays in a
        mulmodq \b, tmp, \root, \idx0, \idx1     // b = difference * twiddle
.endm
47+
48+
// gs_butterfly_v a, b, root, root_twisted
//
// Gentleman-Sande butterfly with a separate twiddle per lane:
//
//     (a, b)  <-  (a + b, (a - b) * root  reduced via mulmod)
//
// root holds the four multipliers, root_twisted their companion values.
// Clobbers: tmp (holds the difference) and t2 (through mulmod).
.macro gs_butterfly_v a, b, root, root_twisted
        sub     tmp.4s,    \a\().4s, \b\().4s    // difference, taken before a is overwritten
        add     \a\().4s,  \a\().4s, \b\().4s    // sum stays in a
        mulmod  \b, tmp, \root, \root_twisted    // b = difference * twiddle
.endm
53+
54+
// mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
//
// Multiply four vectors by the constant held in the ninv / ninv_tw aliases:
// dstK = mulmod(srcK, ninv, ninv_tw) for K = 0..3.
// Each dstK may be the same register as its srcK.
// Clobbers: t2 (through mulmod).
.macro mul_ninv dst0, dst1, dst2, dst3, src0, src1, src2, src3
        mulmod  \dst0, \src0, ninv, ninv_tw
        mulmod  \dst1, \src1, ninv, ninv_tw
        mulmod  \dst2, \src2, ninv, ninv_tw
        mulmod  \dst3, \src3, ninv, ninv_tw
.endm
60+
61+
// load_roots_123
//
// Load the next four 16-byte twiddle vectors from r123456_ptr into
// root0..root3 and advance r123456_ptr by 64 bytes.
.macro load_roots_123
        ldr     q_root1, [r123456_ptr, #16]
        ldr     q_root2, [r123456_ptr, #32]
        ldr     q_root3, [r123456_ptr, #48]
        ldr     q_root0, [r123456_ptr], #64      // last load also bumps the pointer
.endm
67+
68+
// load_roots_456
//
// Load the next four 16-byte twiddle vectors from r123456_ptr into
// root0..root3 and advance r123456_ptr by 64 bytes.
.macro load_roots_456
        ldr     q_root1, [r123456_ptr, #16]
        ldr     q_root2, [r123456_ptr, #32]
        ldr     q_root3, [r123456_ptr, #48]
        ldr     q_root0, [r123456_ptr], #64      // last load also bumps the pointer
.endm
74+
75+
// load_roots_78_part1
//
// Load vectors 0..5 of the current 12-vector (192-byte) twiddle group at
// r78_ptr into root0, root0_tw, root1, root1_tw, root2, root2_tw (in table
// order), then advance r78_ptr past the whole group.  Vectors 6..11 of the
// same group are fetched afterwards by load_roots_78_part2.
.macro load_roots_78_part1
        ldr     q_root0_tw, [r78_ptr, #(1*16)]
        ldr     q_root1,    [r78_ptr, #(2*16)]
        ldr     q_root1_tw, [r78_ptr, #(3*16)]
        ldr     q_root2,    [r78_ptr, #(4*16)]
        ldr     q_root2_tw, [r78_ptr, #(5*16)]
        ldr     q_root0,    [r78_ptr], #(12*16)  // vector 0; also skips to the next group
.endm
83+
84+
// load_roots_78_part2
//
// Load vectors 6..11 of a 12-vector twiddle group into root0, root0_tw,
// root1, root1_tw, root2, root2_tw.  Offsets are relative to a pointer that
// already points one full group (12*16 bytes) past the group start, i.e.
// this must run after load_roots_78_part1.  r78_ptr is not modified.
.macro load_roots_78_part2
        ldr     q_root0,    [r78_ptr, #(( 6 - 12)*16)]
        ldr     q_root0_tw, [r78_ptr, #(( 7 - 12)*16)]
        ldr     q_root1,    [r78_ptr, #(( 8 - 12)*16)]
        ldr     q_root1_tw, [r78_ptr, #(( 9 - 12)*16)]
        ldr     q_root2,    [r78_ptr, #((10 - 12)*16)]
        ldr     q_root2_tw, [r78_ptr, #((11 - 12)*16)]
.endm
92+
93+
// transpose4 data0, data1, data2, data3
//
// In-place transpose of the 4x4 matrix of 32-bit words whose rows are the
// four argument vectors: afterwards lane j of argument K holds what was
// lane K of argument j.
// Clobbers: t0, t1, t2, t3 (the arguments must not be any of these).
.macro transpose4 data0, data1, data2, data3
        // Step 1: interleave 32-bit lanes of row pairs (0,1) and (2,3).
        trn1    t0.4s, \data0\().4s, \data1\().4s
        trn2    t1.4s, \data0\().4s, \data1\().4s
        trn1    t2.4s, \data2\().4s, \data3\().4s
        trn2    t3.4s, \data2\().4s, \data3\().4s

        // Step 2: combine 64-bit halves to form the transposed rows.
        trn1    \data0\().2d, t0.2d, t2.2d
        trn1    \data1\().2d, t1.2d, t3.2d
        trn2    \data2\().2d, t0.2d, t2.2d
        trn2    \data3\().2d, t1.2d, t3.2d
.endm
104+
105+
// save_vregs
//
// Reserve 64 bytes of stack and store d8-d15 there (d8/d9 at the lowest
// address, d14/d15 at the highest).  Only the low 64 bits of v8-v15 are
// saved, which is the part AAPCS64 requires a callee to preserve.
.macro save_vregs
        stp     d8,  d9,  [sp, #-(16*4)]!        // allocate the frame and fill slot 0
        stp     d10, d11, [sp, #16*1]
        stp     d12, d13, [sp, #16*2]
        stp     d14, d15, [sp, #16*3]
.endm
112+
113+
// restore_vregs
//
// Reload d8-d15 from the 64-byte frame written by save_vregs and release
// the frame.
.macro restore_vregs
        ldp     d10, d11, [sp, #16*1]
        ldp     d12, d13, [sp, #16*2]
        ldp     d14, d15, [sp, #16*3]
        ldp     d8,  d9,  [sp], #(16*4)          // slot 0; also frees the frame
.endm
120+
121+
// push_stack
//
// Function prologue: saves the callee-saved SIMD registers via save_vregs.
// No general-purpose registers are saved.
.macro push_stack
        save_vregs
.endm
124+
125+
// pop_stack
//
// Function epilogue: undoes push_stack by restoring the callee-saved SIMD
// registers via restore_vregs.
.macro pop_stack
        restore_vregs
.endm
128+
129+
// ---------------------------------------------------------------------------
// intt_asm -- in-place inverse NTT (AArch64 Neon) on 256 32-bit coefficients
//
// Arguments (AAPCS64):
//   x0  in           coefficient array (1024 bytes), transformed in place
//   x1  r78_ptr      twiddle table for layers 8 and 7; twelve 16-byte vectors
//                    are consumed per iteration of the first loop
//   x2  r123456_ptr  twiddle table for layers 6..1; four 16-byte vectors are
//                    consumed per iteration of the first loop (layers 6,5,4)
//                    and four more once before the second loop (layers 3,2,1)
//
// Structure:
//   layer45678_start  8 iterations, 32 consecutive coefficients each
//   layer123_start    8 iterations, 8 vectors at a stride of 128 bytes each
//   Both labels are passed by name to slothy-cli (-l option); keep the names.
//
// Clobbers: x3-x6, v0-v6, v16, v24-v28, condition flags.
//   v8-v15 are used too; d8-d15 are saved and restored through
//   push_stack / pop_stack (64 bytes of stack).
// ---------------------------------------------------------------------------
.text
.global MLD_ASM_NAMESPACE(intt_asm)
.balign 4
MLD_ASM_FN_SYMBOL(intt_asm)
        push_stack

        // ---- Register aliases: arguments and scalar temporaries ----
        in          .req x0
        r78_ptr     .req x1
        r123456_ptr .req x2

        inp         .req x3             // running pointer, first  64 bytes of a 128-byte block
        inpp        .req x4             // running pointer, second 64 bytes of a 128-byte block
        count       .req x5             // loop counter
        xtmp        .req x6
        wtmp        .req w6             // scratch for building constants

        // ---- Register aliases: coefficient vectors (v9..v16 are consecutive, as ld4 requires) ----
        data0       .req v9
        data1       .req v10
        data2       .req v11
        data3       .req v12
        data4       .req v13
        data5       .req v14
        data6       .req v15
        data7       .req v16

        q_data0     .req q9
        q_data1     .req q10
        q_data2     .req q11
        q_data3     .req q12
        q_data4     .req q13
        q_data5     .req q14
        q_data6     .req q15
        q_data7     .req q16

        // ---- Register aliases: twiddles and their companion values ----
        root0       .req v0
        root1       .req v1
        root2       .req v2
        root3       .req v3

        q_root0     .req q0
        q_root1     .req q1
        q_root2     .req q2
        q_root3     .req q3

        root0_tw    .req v4
        root1_tw    .req v5
        root2_tw    .req v6
        root3_tw    .req v7
        q_root0_tw  .req q4
        q_root1_tw  .req q5
        q_root2_tw  .req q6
        q_root3_tw  .req q7

        // ---- Register aliases: temporaries used by the macros ----
        tmp         .req v24
        t0          .req v25
        t1          .req v26
        t2          .req v27
        t3          .req v28

        modulus     .req v8
        q_modulus   .req q8

        mov     inp, in
        add     inpp, inp, #64
        mov     count, #8

        // load q = 8380417
        movz    wtmp, #57345
        movk    wtmp, #127, lsl #16
        dup     modulus.4s, wtmp

        .p2align 2
layer45678_start:
        // ld4 de-interleaves 16 consecutive words: lane j of dataK is word 4*j + K.
        ld4     {data0.4S, data1.4S, data2.4S, data3.4S}, [inp]
        ld4     {data4.4S, data5.4S, data6.4S, data7.4S}, [inpp]

        load_roots_78_part1

        // Layer 8 Part 1
        gs_butterfly_v data0, data1, root1, root1_tw
        gs_butterfly_v data2, data3, root2, root2_tw
        // Layer 7 Part 1
        gs_butterfly_v data0, data2, root0, root0_tw
        gs_butterfly_v data1, data3, root0, root0_tw

        load_roots_78_part2

        // Layer 8 Part 2
        gs_butterfly_v data4, data5, root1, root1_tw
        gs_butterfly_v data6, data7, root2, root2_tw
        // Layer 7 Part 2
        gs_butterfly_v data4, data6, root0, root0_tw
        gs_butterfly_v data5, data7, root0, root0_tw

        // Undo the ld4 de-interleaving: after the transposes each register
        // holds four consecutive coefficients again.
        transpose4 data0, data1, data2, data3
        transpose4 data4, data5, data6, data7

        load_roots_456

        // Layer 6
        gs_butterfly data0, data1, root1, 2, 3
        gs_butterfly data2, data3, root2, 0, 1
        gs_butterfly data4, data5, root2, 2, 3
        gs_butterfly data6, data7, root3, 0, 1

        // Layer 5
        gs_butterfly data0, data2, root0, 2, 3
        gs_butterfly data1, data3, root0, 2, 3
        gs_butterfly data4, data6, root1, 0, 1
        gs_butterfly data5, data7, root1, 0, 1

        // Layer 4
        gs_butterfly data0, data4, root0, 0, 1
        gs_butterfly data1, data5, root0, 0, 1
        gs_butterfly data2, data6, root0, 0, 1
        gs_butterfly data3, data7, root0, 0, 1

        // Write the block back with plain vector stores.
        str     q_data0, [inp], #(16*4)
        str     q_data1, [inp, #(-16*4 + 1*16)]
        str     q_data2, [inp, #(-16*4 + 2*16)]
        str     q_data3, [inp, #(-16*4 + 3*16)]

        str     q_data4, [inpp], #(16*4)
        str     q_data5, [inpp, #(-16*4 + 1*16)]
        str     q_data6, [inpp, #(-16*4 + 2*16)]
        str     q_data7, [inpp, #(-16*4 + 3*16)]

        // Each pointer covers only half of a 128-byte block, so skip the
        // half owned by the other pointer.
        add     inp, inp, #64
        add     inpp, inpp, #64

        subs    count, count, #1
        cbnz    count, layer45678_start

        // ninv / ninv_tw reuse v25 / v26: t0 and t1 are only needed by
        // transpose4, which is not used from here on.
        ninv        .req v25
        ninv_tw     .req v26

        mov     count, #8

        // load ninv
        mov     wtmp, #16382            // 2^(32 - 8) mod Q
        dup     ninv.4s, wtmp

        // load ninv_tw = 4197891
        movz    wtmp, #3587
        movk    wtmp, #64, lsl #16
        dup     ninv_tw.4s, wtmp

        // The twiddles for layers 3, 2, 1 are loop-invariant: load them once.
        load_roots_123

        .p2align 2
layer123_start:
        // Eight vectors, one from each 128-byte eighth of the array.
        ldr     q_data0, [in, #(0*(1024/8))]
        ldr     q_data1, [in, #(1*(1024/8))]
        ldr     q_data2, [in, #(2*(1024/8))]
        ldr     q_data3, [in, #(3*(1024/8))]
        ldr     q_data4, [in, #(4*(1024/8))]
        ldr     q_data5, [in, #(5*(1024/8))]
        ldr     q_data6, [in, #(6*(1024/8))]
        ldr     q_data7, [in, #(7*(1024/8))]

        // Layer 3
        gs_butterfly data0, data1, root1, 2, 3
        gs_butterfly data2, data3, root2, 0, 1
        gs_butterfly data4, data5, root2, 2, 3
        gs_butterfly data6, data7, root3, 0, 1

        // Layer 2
        gs_butterfly data0, data2, root0, 2, 3
        gs_butterfly data1, data3, root0, 2, 3
        gs_butterfly data4, data6, root1, 0, 1
        gs_butterfly data5, data7, root1, 0, 1

        // Layer 1
        // root0[0] includes ninv, manually computed.
        gs_butterfly data0, data4, root0, 0, 1
        gs_butterfly data1, data5, root0, 0, 1
        gs_butterfly data2, data6, root0, 0, 1
        gs_butterfly data3, data7, root0, 0, 1

        // data4..data7 went through the last twiddle multiplication and are
        // therefore already scaled; store them as they are.
        str     q_data4, [in, #(4*(1024/8))]
        str     q_data5, [in, #(5*(1024/8))]
        str     q_data6, [in, #(6*(1024/8))]
        str     q_data7, [in, #(7*(1024/8))]

        // Scale half the coeffs by 1/n; for the other half, the scaling has
        // been merged into the multiplication with the twiddle factor on the
        // last layer.
        mul_ninv data0, data1, data2, data3, data0, data1, data2, data3

        // The first store also advances in to the next column of vectors.
        str     q_data0, [in], #(16)
        str     q_data1, [in, #(-16 + 1*(1024/8))]
        str     q_data2, [in, #(-16 + 2*(1024/8))]
        str     q_data3, [in, #(-16 + 3*(1024/8))]

        subs    count, count, #1
        cbnz    count, layer123_start

        pop_stack
        ret

        // Release every register alias so that none of these (often very
        // generic) names leaks into code assembled after this function.
        .unreq in
        .unreq r78_ptr
        .unreq r123456_ptr
        .unreq inp
        .unreq inpp
        .unreq count
        .unreq xtmp
        .unreq wtmp
        .unreq data0
        .unreq data1
        .unreq data2
        .unreq data3
        .unreq data4
        .unreq data5
        .unreq data6
        .unreq data7
        .unreq q_data0
        .unreq q_data1
        .unreq q_data2
        .unreq q_data3
        .unreq q_data4
        .unreq q_data5
        .unreq q_data6
        .unreq q_data7
        .unreq root0
        .unreq root1
        .unreq root2
        .unreq root3
        .unreq q_root0
        .unreq q_root1
        .unreq q_root2
        .unreq q_root3
        .unreq root0_tw
        .unreq root1_tw
        .unreq root2_tw
        .unreq root3_tw
        .unreq q_root0_tw
        .unreq q_root1_tw
        .unreq q_root2_tw
        .unreq q_root3_tw
        .unreq tmp
        .unreq t0
        .unreq t1
        .unreq t2
        .unreq t3
        .unreq modulus
        .unreq q_modulus
        .unreq ninv
        .unreq ninv_tw
330+
#endif

mldsa/native/aarch64/src/Makefile

+4-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ SLOTHY_FLAGS=-c sw_pipelining.enabled=true \
2020
-c sw_pipelining.allow_post \
2121
-c variable_size \
2222
-c constraints.stalls_first_attempt=64 \
23-
-c timeout=120 \
2423
$(SLOTHY_EXTRA_FLAGS)
2524

2625
# For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30.
@@ -38,5 +37,9 @@ all: ntt.S
3837
# Regenerate the optimized forward NTT from its clean source with SLOTHY.
# The two -l options name the loops that SLOTHY optimizes.
ntt.S: ../../../../dev/aarch64_clean/src/ntt.S
	slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l layer123_start -l layer45678_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)

# Same for the inverse NTT; its loops carry the same two labels.
intt.S: ../../../../dev/aarch64_clean/src/intt.S
	slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l layer123_start -l layer45678_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)

# Build intt.S as part of the default target as well. Prerequisites given in
# several rules for the same target accumulate, so this extends `all`.
all: intt.S
42+
43+
4144
# Delete every *.S file in this directory. The leading '-' makes make carry
# on even if the removal command reports an error.
clean:
	-$(RM) -rf *.S

0 commit comments

Comments
 (0)