#define ntt_layer7_butterfly MLKEM_NAMESPACE(ntt_layer7_butterfly)
#define ntt_layer7 MLKEM_NAMESPACE(ntt_layer7)
+ #define gs_butterfly_reduce MLKEM_NAMESPACE(gs_butterfly_reduce)
+ #define gs_butterfly_defer MLKEM_NAMESPACE(gs_butterfly_defer)
#define invntt_layer7_invert_butterfly \
  MLKEM_NAMESPACE(invntt_layer7_invert_butterfly)
#define invntt_layer7_invert MLKEM_NAMESPACE(invntt_layer7_invert)
/* ct_butterfly() performs a single CT Butterfly step */
/* in polynomial denoted by r, using the coefficients */
- /* index by coeff1_index and coeff2_index, and the */
+ /* indexed by coeff1_index and coeff2_index, and the */
/* given value of zeta. */
/* */
/* NOTE that this function is marked INLINE for */
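
Note: the body of ct_butterfly() sits outside this hunk. For contrast with the GS helpers added below, here is a minimal sketch of the Cooley-Tukey step the comment describes, assuming (as elsewhere in this file) that fqmul() is Montgomery multiplication modulo MLKEM_Q; the real definition earlier in the file is authoritative:

static INLINE void ct_butterfly(int16_t r[MLKEM_N],
                                const int coeff1_index,
                                const int coeff2_index,
                                const int16_t zeta)
{
  /* t = c2 * zeta * R^-1 mod q, with R = 2^16 */
  const int16_t t = fqmul(r[coeff2_index], zeta);
  r[coeff2_index] = r[coeff1_index] - t; /* c2' = c1 - t */
  r[coeff1_index] = r[coeff1_index] + t; /* c1' = c1 + t */
}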
@@ -445,6 +447,37 @@ STATIC_ASSERT(INVNTT_BOUND_REF <= INVNTT_BOUND, invntt_bound)
/* Used to invert and reduce coefficients in the Inverse NTT. */
#define MONT_F 1441

+ /* gs_butterfly_reduce() performs a single GS Butterfly */
+ /* step in polynomial denoted by r, using the */
+ /* coefficients indexed by coeff1_index and coeff2_index */
+ /* and the given value of zeta. */
+ /* */
+ /* Like ct_butterfly(), this function is inlined */
+ /* for both compilation and proof. */
+ static INLINE void gs_butterfly_reduce(int16_t r[MLKEM_N],
+                                        const int coeff1_index,
+                                        const int coeff2_index,
+                                        const int16_t zeta)
+ {
+   const int16_t t1 = r[coeff1_index];
+   const int16_t t2 = r[coeff2_index];
+   r[coeff1_index] = barrett_reduce(t1 + t2);
+   r[coeff2_index] = fqmul((t2 - t1), zeta);
+ }
+
+ /* As gs_butterfly_reduce(), but does not reduce the */
+ /* coefficient denoted by coeff1_index */
+ static INLINE void gs_butterfly_defer(int16_t r[MLKEM_N],
+                                       const int coeff1_index,
+                                       const int coeff2_index,
+                                       const int16_t zeta)
+ {
+   const int16_t t1 = r[coeff1_index];
+   const int16_t t2 = r[coeff2_index];
+   r[coeff1_index] = t1 + t2;
+   r[coeff2_index] = fqmul((t2 - t1), zeta);
+ }
+
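
Note: both helpers compute the same Gentleman-Sande butterfly. Writing a = r[coeff1_index] and b = r[coeff2_index], and assuming (as in the reference implementation) that fqmul(x, y) returns x * y * R^-1 mod MLKEM_Q with R = 2^16 and that the zeta tables are premultiplied by R, the step is

  r[coeff1_index] <- a + b           (passed through barrett_reduce() only in the _reduce variant)
  r[coeff2_index] <- (b - a) * zeta  (mod MLKEM_Q)

The two variants exist purely for bounds bookkeeping: deferring the Barrett reduction saves work whenever a later layer reduces the coefficient anyway, at the cost of letting the sum grow by one extra addend per deferred layer.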
static INLINE void invntt_layer7_invert_butterfly(int16_t r[MLKEM_N],
                                                  int zeta_index, int start)
__contract__(
@@ -467,17 +500,14 @@ __contract__(
/* Invert and reduce all coefficients here the first time they */
/* are read. This is efficient, and also means we can accept */
/* any int16_t value for all coefficients as input. */
- const int16_t c0 = fqmul(r[ci0], MONT_F);
- const int16_t c1 = fqmul(r[ci1], MONT_F);
- const int16_t c2 = fqmul(r[ci2], MONT_F);
- const int16_t c3 = fqmul(r[ci3], MONT_F);
+ r[ci0] = fqmul(r[ci0], MONT_F);
+ r[ci1] = fqmul(r[ci1], MONT_F);
+ r[ci2] = fqmul(r[ci2], MONT_F);
+ r[ci3] = fqmul(r[ci3], MONT_F);

/* Reduce all coefficients here to meet the precondition of Layer 6 */
- r[ci0] = barrett_reduce(c0 + c2);
- r[ci2] = fqmul((c2 - c0), zeta);
-
- r[ci1] = barrett_reduce(c1 + c3);
- r[ci3] = fqmul((c3 - c1), zeta);
+ gs_butterfly_reduce(r, ci0, ci2, zeta);
+ gs_butterfly_reduce(r, ci1, ci3, zeta);
}
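
Note: a worked check of why the single fqmul() by MONT_F above both normalises and rescales, assuming fqmul(x, y) = x * y * R^-1 mod q with R = 2^16 and q = MLKEM_Q = 3329:

  1441 * 128 = 184448 = 55 * 3329 + 1353   and   R^2 mod 3329 = 1353,

so MONT_F = R^2 / 128 (mod q), and therefore

  fqmul(x, MONT_F) = x * (R^2 / 128) * R^-1 = x * R / 128  (mod q),

i.e. one Montgomery multiplication per coefficient folds in the 2^-7 scaling demanded by the seven-layer inverse NTT.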
static void invntt_layer7_invert(int16_t r[MLKEM_N])
@@ -521,28 +551,13 @@ __contract__(
const int ci5 = ci0 + 5;
const int ci6 = ci0 + 6;
const int ci7 = ci0 + 7;
- const int16_t c0 = r[ci0];
- const int16_t c1 = r[ci1];
- const int16_t c2 = r[ci2];
- const int16_t c3 = r[ci3];
- const int16_t c4 = r[ci4];
- const int16_t c5 = r[ci5];
- const int16_t c6 = r[ci6];
- const int16_t c7 = r[ci7];

/* Defer reduction of coefficients 0, 1, 2, and 3 here so they */
/* are bounded to NTT_BOUND2 after Layer6 */
- r[ci0] = c0 + c4;
- r[ci4] = fqmul((c4 - c0), zeta);
-
- r[ci1] = c1 + c5;
- r[ci5] = fqmul((c5 - c1), zeta);
-
- r[ci2] = c2 + c6;
- r[ci6] = fqmul((c6 - c2), zeta);
-
- r[ci3] = c3 + c7;
- r[ci7] = fqmul((c7 - c3), zeta);
+ gs_butterfly_defer(r, ci0, ci4, zeta);
+ gs_butterfly_defer(r, ci1, ci5, zeta);
+ gs_butterfly_defer(r, ci2, ci6, zeta);
+ gs_butterfly_defer(r, ci3, ci7, zeta);
}

static void invntt_layer6(int16_t r[MLKEM_N])
@@ -603,36 +618,13 @@ __contract__(
const int ci16 = ci0 + 16;
const int ci24 = ci0 + 24;

- /* Layer 5 */
- {
-   const int16_t c0 = r[ci0];
-   const int16_t c8 = r[ci8];
-   const int16_t c16 = r[ci16];
-   const int16_t c24 = r[ci24];
-
-   /* Defer reduction of coeffs 0 and 16 here */
-   r[ci0] = c0 + c8;
-   r[ci8] = fqmul(c8 - c0, l5zeta2);
-
-   r[ci16] = c16 + c24;
-   r[ci24] = fqmul(c24 - c16, l5zeta1);
- }
-
- /* Layer 4 */
- {
-   const int16_t c0 = r[ci0];
-   const int16_t c8 = r[ci8];
-   const int16_t c16 = r[ci16];
-   const int16_t c24 = r[ci24];
-
-   /* In layer 4, reduce all coefficients to be in NTT_BOUND1 */
-   /* to meet the pre-condition of Layer321 */
-   r[ci0] = barrett_reduce(c0 + c16);
-   r[ci16] = fqmul(c16 - c0, l4zeta);
-
-   r[ci8] = barrett_reduce(c8 + c24);
-   r[ci24] = fqmul(c24 - c8, l4zeta);
- }
+ /* Layer 5 - Defer reduction of coeffs 0 and 16 here */
+ gs_butterfly_defer(r, ci0, ci8, l5zeta2);
+ gs_butterfly_defer(r, ci16, ci24, l5zeta1);
+ /* Layer 4 - reduce all coefficients to be in NTT_BOUND1 */
+ /* to meet the pre-condition of Layer321 */
+ gs_butterfly_reduce(r, ci0, ci16, l4zeta);
+ gs_butterfly_reduce(r, ci8, ci24, l4zeta);
}
}
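
Note: the merged Layer 5/Layer 4 pass shows the intended pairing of the two helpers. A sketch of the bound bookkeeping, assuming coefficients enter Layer 5 with magnitude below some bound B and that barrett_reduce() returns a centred representative, as in the reference implementation:

  after Layer 5 (defer):  |r[ci0]|, |r[ci16]| < 2 * B      (sums left unreduced)
                          |r[ci8]|, |r[ci24]| < MLKEM_Q    (fqmul() output bound)
  after Layer 4 (reduce): |r[ci0]|, |r[ci8]| <= (MLKEM_Q - 1) / 2

so one deferred layer sandwiched between reducing layers stays well inside int16_t; the exact constants NTT_BOUND1 and NTT_BOUND2 are defined earlier in the file.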
@@ -688,76 +680,20 @@ __contract__(
const int ci224 = j + 224;

/* Layer 3 */
- {
-   const int16_t c0 = r[ci0];
-   const int16_t c32 = r[ci32];
-   const int16_t c64 = r[ci64];
-   const int16_t c96 = r[ci96];
-   const int16_t c128 = r[ci128];
-   const int16_t c160 = r[ci160];
-   const int16_t c192 = r[ci192];
-   const int16_t c224 = r[ci224];
-
-   r[ci0] = c0 + c32;
-   r[ci32] = fqmul(c32 - c0, l3zeta7);
-
-   r[ci64] = c64 + c96;
-   r[ci96] = fqmul(c96 - c64, l3zeta6);
-
-   r[ci128] = c128 + c160;
-   r[ci160] = fqmul(c160 - c128, l3zeta5);
-
-   r[ci192] = c192 + c224;
-   r[ci224] = fqmul(c224 - c192, l3zeta4);
- }
-
+ gs_butterfly_defer(r, ci0, ci32, l3zeta7);
+ gs_butterfly_defer(r, ci64, ci96, l3zeta6);
+ gs_butterfly_defer(r, ci128, ci160, l3zeta5);
+ gs_butterfly_defer(r, ci192, ci224, l3zeta4);

/* Layer 2 */
- {
-   const int16_t c0 = r[ci0];
-   const int16_t c32 = r[ci32];
-   const int16_t c64 = r[ci64];
-   const int16_t c96 = r[ci96];
-   const int16_t c128 = r[ci128];
-   const int16_t c160 = r[ci160];
-   const int16_t c192 = r[ci192];
-   const int16_t c224 = r[ci224];
-
-   r[ci0] = c0 + c64;
-   r[ci64] = fqmul(c64 - c0, l2zeta3);
-
-   r[ci32] = c32 + c96;
-   r[ci96] = fqmul(c96 - c32, l2zeta3);
-
-   r[ci128] = c128 + c192;
-   r[ci192] = fqmul(c192 - c128, l2zeta2);
-
-   r[ci160] = c160 + c224;
-   r[ci224] = fqmul(c224 - c160, l2zeta2);
- }
-
+ gs_butterfly_defer(r, ci0, ci64, l2zeta3);
+ gs_butterfly_defer(r, ci32, ci96, l2zeta3);
+ gs_butterfly_defer(r, ci128, ci192, l2zeta2);
+ gs_butterfly_defer(r, ci160, ci224, l2zeta2);

/* Layer 1 */
- {
-   const int16_t c0 = r[ci0];
-   const int16_t c32 = r[ci32];
-   const int16_t c64 = r[ci64];
-   const int16_t c96 = r[ci96];
-   const int16_t c128 = r[ci128];
-   const int16_t c160 = r[ci160];
-   const int16_t c192 = r[ci192];
-   const int16_t c224 = r[ci224];
-
-   r[ci0] = c0 + c128;
-   r[ci128] = fqmul(c128 - c0, l1zeta1);
-
-   r[ci32] = c32 + c160;
-   r[ci160] = fqmul(c160 - c32, l1zeta1);
-
-   r[ci64] = c64 + c192;
-   r[ci192] = fqmul(c192 - c64, l1zeta1);
-
-   r[ci96] = c96 + c224;
-   r[ci224] = fqmul(c224 - c96, l1zeta1);
- }
+ gs_butterfly_defer(r, ci0, ci128, l1zeta1);
+ gs_butterfly_defer(r, ci32, ci160, l1zeta1);
+ gs_butterfly_defer(r, ci64, ci192, l1zeta1);
+ gs_butterfly_defer(r, ci96, ci224, l1zeta1);
}
}
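
Note: the zeta naming makes the structure of these merged layers visible. In this inverse NTT, layer L pairs coefficients at distance 2^(8-L) and uses 2^(L-1) distinct twiddle factors:

  L = 3: distance 32,  four zetas  (l3zeta4 .. l3zeta7)
  L = 2: distance 64,  two zetas   (l2zeta2 and l2zeta3, each used twice)
  L = 1: distance 128, one zeta    (l1zeta1, used four times)

Every butterfly in this final pass is deferred, consistent with the strategy of reducing only where a bound would otherwise be at risk.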