|
26 | 26 | #include "../../../common.h"
|
27 | 27 | #if defined(MLD_ARITH_BACKEND_AARCH64)
|
28 | 28 |
|
// Helper: insert a 64-bit GPR into one d-lane of a SIMD register.
//   \vec_out : destination vector register (only lane \lane is written)
//   \gpr_in  : X register holding the 64-bit value
//   \lane    : d-lane index (callers below use 0 or 1)
// Used by ldr_vo/ldr_vi to assemble a 128-bit vector from two GPR loads.
| 29 | +.macro vins vec_out, gpr_in, lane |
| 30 | + ins \vec_out\().d[\lane], \gpr_in |
| 31 | +.endm |
| 32 | + |
// Load 16 bytes from [\base + \offset] into vector \vec; \base is unchanged.
// Implemented as two 64-bit GPR loads plus lane inserts rather than a single
// Q-form ldr — NOTE(review): presumably a microarchitectural tuning choice
// introduced by this patch; confirm against the commit rationale.
// Clobbers xtmp0 and xtmp1.
| 33 | +.macro ldr_vo vec, base, offset |
| 34 | + ldr xtmp0, [\base, #\offset] |
| 35 | + ldr xtmp1, [\base, #(\offset+8)] |
| 36 | + vins \vec, xtmp0, 0 |
| 37 | + vins \vec, xtmp1, 1 |
| 38 | +.endm |
| 39 | + |
// Load 16 bytes from [\base] into vector \vec and leave \base advanced by
// \inc bytes.  The first ldr post-indexes \base by \inc, so the second load
// reads at (-\inc + 8) relative to the updated base, i.e. old base + 8.
// Clobbers xtmp0 and xtmp1.
| 40 | +.macro ldr_vi vec, base, inc |
| 41 | + ldr xtmp0, [\base], #\inc |
| 42 | + ldr xtmp1, [\base, #(-\inc+8)] |
| 43 | + vins \vec, xtmp0, 0 |
| 44 | + vins \vec, xtmp1, 1 |
| 45 | +.endm |
| 46 | + |
29 | 47 | .macro mulmodq dst, src, const, idx0, idx1
|
30 | 48 | sqrdmulh t2.4s, \src\().4s, \const\().s[\idx1\()]
|
31 | 49 | mul \dst\().4s, \src\().4s, \const\().s[\idx0\()]
|
|
51 | 69 | .endm
|
52 | 70 |
|
// Load four 16-byte root (twiddle) vectors from r012345_ptr and advance the
// pointer by 64 bytes.  The macro name suggests these serve NTT layers 1-3 —
// TODO(review): confirm against the root table layout.
// root0 uses the pointer-advancing load; root1..root3 then use offsets
// relative to the already-advanced pointer (hence the -64 correction).
// Clobbers xtmp0 and xtmp1 (via ldr_vi/ldr_vo).
53 | 71 | .macro load_roots_123
|
54 |
| - ldr q_root0, [r012345_ptr], #64 |
55 |
| - ldr q_root1, [r012345_ptr, #(-64 + 16)] |
56 |
| - ldr q_root2, [r012345_ptr, #(-64 + 32)] |
57 |
| - ldr q_root3, [r012345_ptr, #(-64 + 48)] |
| 72 | + ldr_vi root0, r012345_ptr, 64 |
| 73 | + ldr_vo root1, r012345_ptr, (-64 + 16) |
| 74 | + ldr_vo root2, r012345_ptr, (-64 + 32) |
| 75 | + ldr_vo root3, r012345_ptr, (-64 + 48) |
58 | 76 | .endm
|
59 | 77 |
|
// Load four 16-byte root vectors from r012345_ptr and advance the pointer by
// 64 bytes.  NOTE(review): the body is identical to load_roots_123; the two
// macros are presumably kept separate for call-site symmetry — verify.
// Clobbers xtmp0 and xtmp1 (via ldr_vi/ldr_vo).
60 | 78 | .macro load_roots_456
|
61 |
| - ldr q_root0, [r012345_ptr], #64 |
62 |
| - ldr q_root1, [r012345_ptr, #(-64 + 16)] |
63 |
| - ldr q_root2, [r012345_ptr, #(-64 + 32)] |
64 |
| - ldr q_root3, [r012345_ptr, #(-64 + 48)] |
| 79 | + ldr_vi root0, r012345_ptr, 64 |
| 80 | + ldr_vo root1, r012345_ptr, (-64 + 16) |
| 81 | + ldr_vo root2, r012345_ptr, (-64 + 32) |
| 82 | + ldr_vo root3, r012345_ptr, (-64 + 48) |
65 | 83 | .endm
|
66 | 84 |
|
// First half of the layer-7/8 root loads: fetch six 16-byte vectors
// (three root/root_tw pairs, table slots 0..5) from r67_ptr and advance the
// pointer past the whole 12-vector group (12*16 bytes).  The _tw registers
// presumably hold precomputed Montgomery/Barrett twists of the adjacent
// roots — TODO(review): confirm against the table generator.
// Clobbers xtmp0 and xtmp1 (via ldr_vi/ldr_vo).
67 | 85 | .macro load_roots_78_part1
|
68 |
| - ldr q_root0, [r67_ptr], #(12*16) |
69 |
| - ldr q_root0_tw, [r67_ptr, #(-12*16 + 1*16)] |
70 |
| - ldr q_root1, [r67_ptr, #(-12*16 + 2*16)] |
71 |
| - ldr q_root1_tw, [r67_ptr, #(-12*16 + 3*16)] |
72 |
| - ldr q_root2, [r67_ptr, #(-12*16 + 4*16)] |
73 |
| - ldr q_root2_tw, [r67_ptr, #(-12*16 + 5*16)] |
| 86 | + ldr_vi root0, r67_ptr, (12*16) |
| 87 | + ldr_vo root0_tw, r67_ptr, (-12*16 + 1*16) |
| 88 | + ldr_vo root1, r67_ptr, (-12*16 + 2*16) |
| 89 | + ldr_vo root1_tw, r67_ptr, (-12*16 + 3*16) |
| 90 | + ldr_vo root2, r67_ptr, (-12*16 + 4*16) |
| 91 | + ldr_vo root2_tw, r67_ptr, (-12*16 + 5*16) |
74 | 92 | .endm
|
75 | 93 |
|
// Second half of the layer-7/8 root loads: fetch table slots 6..11 of the
// 12-vector group.  r67_ptr was already advanced past the group by
// load_roots_78_part1, hence the -12*16 back-correction and no further
// pointer update here.  Reuses registers root0..root2_tw, so part1's values
// must be consumed before this runs.
// Clobbers xtmp0 and xtmp1 (via ldr_vo).
76 | 94 | .macro load_roots_78_part2
|
77 |
| - ldr q_root0, [r67_ptr, #(-12*16 + 6*16)] |
78 |
| - ldr q_root0_tw, [r67_ptr, #(-12*16 + 7*16)] |
79 |
| - ldr q_root1, [r67_ptr, #(-12*16 + 8*16)] |
80 |
| - ldr q_root1_tw, [r67_ptr, #(-12*16 + 9*16)] |
81 |
| - ldr q_root2, [r67_ptr, #(-12*16 + 10*16)] |
82 |
| - ldr q_root2_tw, [r67_ptr, #(-12*16 + 11*16)] |
| 95 | + ldr_vo root0, r67_ptr, (-12*16 + 6*16) |
| 96 | + ldr_vo root0_tw, r67_ptr, (-12*16 + 7*16) |
| 97 | + ldr_vo root1, r67_ptr, (-12*16 + 8*16) |
| 98 | + ldr_vo root1_tw, r67_ptr, (-12*16 + 9*16) |
| 99 | + ldr_vo root2, r67_ptr, (-12*16 + 10*16) |
| 100 | + ldr_vo root2_tw, r67_ptr, (-12*16 + 11*16) |
83 | 101 | .endm
|
84 | 102 |
|
85 | 103 | .macro transpose4 data0, data1, data2, data3
|
|
// Register aliases (.req) for the NTT routine's scratch and data registers.
// NOTE(review): xtmp0 aliases the same physical register as xtmp (both x6),
// so any use of ldr_vo/ldr_vi clobbers xtmp — verify no code path keeps
// xtmp/wtmp live across those macros.
129 | 147 | xtmp .req x6
|
130 | 148 | wtmp .req w6
|
131 | 149 |
|
// Scratch GPR pair consumed by the ldr_vo/ldr_vi load macros above.
| 150 | + xtmp0 .req x6 |
| 151 | + xtmp1 .req x7 |
| 152 | + |
// Vector registers holding the coefficient data being transformed.
132 | 153 | data0 .req v9
|
133 | 154 | data1 .req v10
|
134 | 155 | data2 .req v11
|
@@ -193,14 +214,14 @@ MLD_ASM_FN_SYMBOL(ntt_asm)
|
193 | 214 |
|
194 | 215 | .p2align 2
|
195 | 216 | layer123_start:
|
196 |
| - ldr q_data0, [in, #(0*(1024/8))] |
197 |
| - ldr q_data1, [in, #(1*(1024/8))] |
198 |
| - ldr q_data2, [in, #(2*(1024/8))] |
199 |
| - ldr q_data3, [in, #(3*(1024/8))] |
200 |
| - ldr q_data4, [in, #(4*(1024/8))] |
201 |
| - ldr q_data5, [in, #(5*(1024/8))] |
202 |
| - ldr q_data6, [in, #(6*(1024/8))] |
203 |
| - ldr q_data7, [in, #(7*(1024/8))] |
| 217 | + ldr_vo data0, in, (0*(1024/8)) |
| 218 | + ldr_vo data1, in, (1*(1024/8)) |
| 219 | + ldr_vo data2, in, (2*(1024/8)) |
| 220 | + ldr_vo data3, in, (3*(1024/8)) |
| 221 | + ldr_vo data4, in, (4*(1024/8)) |
| 222 | + ldr_vo data5, in, (5*(1024/8)) |
| 223 | + ldr_vo data6, in, (6*(1024/8)) |
| 224 | + ldr_vo data7, in, (7*(1024/8)) |
204 | 225 |
|
205 | 226 | ct_butterfly data0, data4, root0, 0, 1
|
206 | 227 | ct_butterfly data1, data5, root0, 0, 1
|
@@ -245,14 +266,14 @@ layer123_start:
|
245 | 266 |
|
246 | 267 | .p2align 2
|
247 | 268 | layer45678_start:
|
248 |
| - ldr q_data0, [in, #(64 + 16*0)] |
249 |
| - ldr q_data1, [in, #(64 + 16*1)] |
250 |
| - ldr q_data2, [in, #(64 + 16*2)] |
251 |
| - ldr q_data3, [in, #(64 + 16*3)] |
252 |
| - ldr q_data4, [inpp, #(64 + 16*0)] |
253 |
| - ldr q_data5, [inpp, #(64 + 16*1)] |
254 |
| - ldr q_data6, [inpp, #(64 + 16*2)] |
255 |
| - ldr q_data7, [inpp, #(64 + 16*3)] |
| 269 | + ldr_vo data0, in, (64 + 16*0) |
| 270 | + ldr_vo data1, in, (64 + 16*1) |
| 271 | + ldr_vo data2, in, (64 + 16*2) |
| 272 | + ldr_vo data3, in, (64 + 16*3) |
| 273 | + ldr_vo data4, inpp, (64 + 16*0) |
| 274 | + ldr_vo data5, inpp, (64 + 16*1) |
| 275 | + ldr_vo data6, inpp, (64 + 16*2) |
| 276 | + ldr_vo data7, inpp, (64 + 16*3) |
256 | 277 |
|
257 | 278 | add in, in, #64
|
258 | 279 | add inpp, inpp, #64
|
|
0 commit comments