#define fqmul MLK_NAMESPACE(fqmul)
#define barrett_reduce MLK_NAMESPACE(barrett_reduce)
#define scalar_signed_to_unsigned_q MLK_NAMESPACE(scalar_signed_to_unsigned_q)
- #define ntt_butterfly_block MLK_NAMESPACE(ntt_butterfly_block)
- #define ntt_layer MLK_NAMESPACE(ntt_layer)
+ #define ntt_1_layer MLK_NAMESPACE(ntt_1_layer)
+ #define ntt_2_layers MLK_NAMESPACE(ntt_2_layers)
+ #define ntt_3_layers MLK_NAMESPACE(ntt_3_layers)
#define invntt_layer MLK_NAMESPACE(invntt_layer)
/* End of static namespacing */
@@ -261,102 +262,133 @@ void poly_mulcache_compute(poly_mulcache *x, const poly *a)
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */

#if !defined(MLK_USE_NATIVE_NTT)
- /*
-  * Computes a block CT butterflies with a fixed twiddle factor,
-  * using Montgomery multiplication.
-  * Parameters:
-  * - r:     Pointer to base of polynomial (_not_ the base of butterfly block)
-  * - root:  Twiddle factor to use for the butterfly. This must be in
-  *          Montgomery form and signed canonical.
-  * - start: Offset to the beginning of the butterfly block
-  * - len:   Index difference between coefficients subject to a butterfly
-  * - bound: Ghost variable describing coefficient bound: Prior to `start`,
-  *          coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
-  *          they must be bound by `bound`.
-  *          When this function returns, output coefficients in the index range
-  *          [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
-  * Example:
-  * - start=8, len=4
-  *   This would compute the following four butterflies
-  *          8 -- 12
-  *          9 -- 13
-  *         10 -- 14
-  *         11 -- 15
-  * - start=4, len=2
-  *   This would compute the following two butterflies
-  *          4 -- 6
-  *          5 -- 7
-  */
- static void ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
-                                 unsigned start, unsigned len, int bound)
- __contract__(
-   requires(start < MLKEM_N)
-   requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
-   requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
-   requires(-MLKEM_Q_HALF < zeta && zeta < MLKEM_Q_HALF)
-   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
-   requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
-   requires(array_abs_bound(r, start, MLKEM_N, bound))
-   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
-   ensures(array_abs_bound(r, 0, start + 2 * len, bound + MLKEM_Q))
-   ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
+
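+ /* Single Cooley-Tukey butterfly on the coefficient pair
+  * (r[coeff1_index], r[coeff2_index]), using fqmul (Montgomery
+  * multiplication) for the twiddle factor:
+  * (a, b) |-> (a + zeta*b, a - zeta*b). */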
+ static MLK_INLINE void ct_butterfly(int16_t r[MLKEM_N],
+                                     const unsigned coeff1_index,
+                                     const unsigned coeff2_index,
+                                     const int16_t zeta)
{
-   /* `bound` is a ghost variable only needed in the CBMC specification */
-   unsigned j;
-   ((void)bound);
-   for (j = start; j < start + len; j++)
-   __loop__(
-     invariant(start <= j && j <= start + len)
-     /*
-      * Coefficients are updated in strided pairs, so the bounds for the
-      * intermediate states alternate twice between the old and new bound
-      */
-     invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q))
-     invariant(array_abs_bound(r, j, start + len, bound))
-     invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q))
-     invariant(array_abs_bound(r, j + len, MLKEM_N, bound)))
-   {
-     int16_t t;
-     t = fqmul(r[j + len], zeta);
-     r[j + len] = r[j] - t;
-     r[j] = r[j] + t;
-   }
+   int16_t t1 = r[coeff1_index];
+   int16_t t2 = fqmul(r[coeff2_index], zeta);
+   r[coeff1_index] = t1 + t2;
+   r[coeff2_index] = t1 - t2;
}

- /*
-  * Compute one layer of forward NTT
-  * Parameters:
-  * - r:     Pointer to base of polynomial
-  * - len:   Stride of butterflies in this layer.
-  * - layer: Ghost variable indicating which layer is being applied.
-  *          Must match `len` via `len == MLKEM_N >> layer`.
-  * Note: `len` could be dropped and computed in the function, but
-  *       we are following the structure of the reference NTT from the
-  *       official Kyber implementation here, merely adding `layer` as
-  *       a ghost variable for the specifications.
-  */
- static void ntt_layer(int16_t r[MLKEM_N], unsigned len, unsigned layer)
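+ /* Compute one layer of the forward NTT: for each block of
+  * 2*len = 2*(MLKEM_N >> layer) coefficients, apply ct_butterfly to the
+  * pairs (j, j + len) with the block's twiddle factor zetas[k].
+  * Coefficient bounds grow from layer*MLKEM_Q to (layer+1)*MLKEM_Q,
+  * as reflected in the contract below. */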
+ static void ntt_1_layer(int16_t r[MLKEM_N], unsigned layer)
__contract__(
  requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
- requires(1 <= layer && layer <= 7 && len == (MLKEM_N >> layer))
+ requires(1 <= layer && layer <= 7)
  requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
  assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
  ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
{
+   const unsigned len = MLKEM_N >> layer;
  unsigned start, k;
-   /* `layer` is a ghost variable only needed in the CBMC specification */
-   ((void)layer);
-   /* Twiddle factors for layer n start at index 2^(layer-1) */
-   k = MLKEM_N / (2 * len);
+   /* Twiddle factors for layer n start at index 2 ** (layer-1) */
+   k = 1 << (layer - 1);
  for (start = 0; start < MLKEM_N; start += 2 * len)
  __loop__(
    invariant(start < MLKEM_N + 2 * len)
    invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
    invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
    invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
  {
-     int16_t zeta = zetas[k++];
-     ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
+     const int16_t zeta = zetas[k++];
+     unsigned j;
+     for (j = 0; j < len; j++)
+     {
+       ct_butterfly(r, j + start, j + start + len, zeta);
+     }
  }
+ }
+
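+ /* Compute two merged layers of the forward NTT, `layer` and `layer + 1`.
+  * Each block of 2*len coefficients is taken through both layers in a single
+  * pass: butterflies at distance len with zetas[k], then butterflies at
+  * distance len/2 with zetas[2k] and zetas[2k+1]. Coefficient bounds grow by
+  * 2*MLKEM_Q, as reflected in the contract below. */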
+ static void ntt_2_layers(int16_t r[MLKEM_N], unsigned layer)
+ __contract__(
+   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+   requires(1 <= layer && layer <= 6)
+   requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 2) * MLKEM_Q)))
+ {
+   const unsigned len = MLKEM_N >> layer;
+   unsigned start, k;
+   /* Twiddle factors for layer n start at index 2 ** (layer-1) */
+   k = 1 << (layer - 1);
+   for (start = 0; start < MLKEM_N; start += 2 * len)
+   {
+     unsigned j;
+     const int16_t this_layer_zeta = zetas[k];
+     const int16_t next_layer_zeta1 = zetas[k * 2];
+     const int16_t next_layer_zeta2 = zetas[k * 2 + 1];
+     k++;
+
+     for (j = 0; j < len / 2; j++)
+     {
+       const unsigned ci0 = j + start;
+       const unsigned ci1 = ci0 + len / 2;
+       const unsigned ci2 = ci1 + len / 2;
+       const unsigned ci3 = ci2 + len / 2;
+
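+       /* Layer `layer`: butterflies at distance len across the two halves of
+        * the block, then layer `layer + 1`: butterflies at distance len/2
+        * within each half, each half with its own twiddle factor. */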
+       ct_butterfly(r, ci0, ci2, this_layer_zeta);
+       ct_butterfly(r, ci1, ci3, this_layer_zeta);
+       ct_butterfly(r, ci0, ci1, next_layer_zeta1);
+       ct_butterfly(r, ci2, ci3, next_layer_zeta2);
+     }
+   }
+ }
+
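+ /* Compute three merged layers of the forward NTT, `layer`, `layer + 1` and
+  * `layer + 2`. Each block of 2*len coefficients is taken through all three
+  * layers in a single pass, eight coefficients per inner iteration:
+  * butterflies at distance len, then len/2, then len/4, using the twiddle
+  * factors at indices k, 2k, 2k+1 and 4k..4k+3. Coefficient bounds grow by
+  * 3*MLKEM_Q, as reflected in the contract below. */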
+ static void ntt_3_layers(int16_t r[MLKEM_N], unsigned layer)
+ __contract__(
+   requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
+   requires(1 <= layer && layer <= 5)
+   requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
+   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+   ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 3) * MLKEM_Q)))
+ {
+   const unsigned len = MLKEM_N >> layer;
+   unsigned start, k;
+   /* Twiddle factors for layer n start at index 2 ** (layer-1) */
+   k = 1 << (layer - 1);
+   for (start = 0; start < MLKEM_N; start += 2 * len)
+   {
+     unsigned j;
+     const int16_t first_layer_zeta = zetas[k];
+     const unsigned second_layer_zi1 = k * 2;
+     const unsigned second_layer_zi2 = k * 2 + 1;
+     const int16_t second_layer_zeta1 = zetas[second_layer_zi1];
+     const int16_t second_layer_zeta2 = zetas[second_layer_zi2];
+     const int16_t third_layer_zeta1 = zetas[second_layer_zi1 * 2];
+     const int16_t third_layer_zeta2 = zetas[second_layer_zi1 * 2 + 1];
+     const int16_t third_layer_zeta3 = zetas[second_layer_zi2 * 2];
+     const int16_t third_layer_zeta4 = zetas[second_layer_zi2 * 2 + 1];
+     k++;
+
+     for (j = 0; j < len / 4; j++)
+     {
+       const unsigned ci0 = j + start;
+       const unsigned ci1 = ci0 + len / 4;
+       const unsigned ci2 = ci1 + len / 4;
+       const unsigned ci3 = ci2 + len / 4;
+       const unsigned ci4 = ci3 + len / 4;
+       const unsigned ci5 = ci4 + len / 4;
+       const unsigned ci6 = ci5 + len / 4;
+       const unsigned ci7 = ci6 + len / 4;
+
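+       /* Layer `layer`: butterflies at distance len */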
+       ct_butterfly(r, ci0, ci4, first_layer_zeta);
+       ct_butterfly(r, ci1, ci5, first_layer_zeta);
+       ct_butterfly(r, ci2, ci6, first_layer_zeta);
+       ct_butterfly(r, ci3, ci7, first_layer_zeta);
+
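+       /* Layer `layer + 1`: butterflies at distance len/2, one twiddle
+        * factor per half of the block */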
+       ct_butterfly(r, ci0, ci2, second_layer_zeta1);
+       ct_butterfly(r, ci1, ci3, second_layer_zeta1);
+       ct_butterfly(r, ci4, ci6, second_layer_zeta2);
+       ct_butterfly(r, ci5, ci7, second_layer_zeta2);
+
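+       /* Layer `layer + 2`: butterflies at distance len/4, one twiddle
+        * factor per quarter of the block */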
+       ct_butterfly(r, ci0, ci1, third_layer_zeta1);
+       ct_butterfly(r, ci2, ci3, third_layer_zeta2);
+       ct_butterfly(r, ci4, ci5, third_layer_zeta3);
+       ct_butterfly(r, ci6, ci7, third_layer_zeta4);
+     }
  }
}

@@ -372,18 +404,14 @@ __contract__(
MLK_INTERNAL_API
void poly_ntt(poly *p)
{
-   unsigned len, layer;
  int16_t *r;
  debug_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
  r = p->coeffs;

-   for (len = 128, layer = 1; len >= 2; len >>= 1, layer++)
-   __loop__(
-     invariant(1 <= layer && layer <= 8 && len == (MLKEM_N >> layer))
-     invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
-   {
-     ntt_layer(r, len, layer);
-   }
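+   /* All 7 layers of the forward NTT, merged as 3 + 2 + 1 + 1:
+    * layers 1-3, layers 4-5, layer 6, layer 7. */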
+   ntt_3_layers(r, 1);
+   ntt_2_layers(r, 4);
+   ntt_1_layer(r, 6);
+   ntt_1_layer(r, 7);

  /* Check the stronger bound */
  debug_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
@@ -490,6 +518,7 @@ MLK_EMPTY_CU(poly)
#undef fqmul
#undef barrett_reduce
#undef scalar_signed_to_unsigned_q
- #undef ntt_butterfly_block
- #undef ntt_layer
+ #undef ntt_1_layer
+ #undef ntt_2_layers
+ #undef ntt_3_layers
#undef invntt_layer