CBMC: Refine bounds for input and output of base multiplication

hanno-becker · hanno-becker · commit 1ed3c4ab7c1d · 2025-03-25T04:17:08.000Z
Previously, the base multiplication would assume that one of its inputs
is bounded by 4096 in absolute value, but make no assumptions about the
other input ("b-input" henceforth) and its mulcache.

This commit refines the bounds slightly, as follows:
- The b-input is assumed to be bounded by MLK_NTT_BOUND in absolute value.
  This comes for free since all values for b _are_ results of the NTT.
- The b-cache-input is assumed to be bounded by MLKEM_Q in absolute value.

With those additional bounds in place, it can be shown that the result
of the base multiplication is below INT16_MAX/2 in absolute value.
Accordingly, this can be added as a precondition for the inverse NTT.

Those refined bounds can help in subsequent commits to improve the
reduction strategy inside the inverse NTT.

For the native AVX2 backend, the new output bound for the mulcache
forces an explicit zeroization of the mulcache. This is not ideal
since the cache is in fact entirely unused, but the performance
penalty should be marginal (if the compiler can't eliminate the
zeroization in the first place).

Signed-off-by: Hanno Becker <beckphan@amazon.co.uk>
diff --git a/mlkem/indcpa.c b/mlkem/indcpa.c
@@ -282,9 +282,11 @@ void mlk_gen_matrix(mlk_polyvec *a, const uint8_t seed[MLKEM_SYMBYTES],
  *              - mlk_polyvec a[MLKEM_K]: Input matrix. Must be in NTT domain
  *                  and have coefficients of absolute value < 4096.
  *              - mlk_polyvec *v: Input polynomial vector. Must be in NTT
- *                  domain.
+ *                  domain and have coefficients of absolute value
+ *                  < MLK_NTT_BOUND.
  *              - mlk_polyvec *vc: Mulcache for v, computed via
- *                  mlk_polyvec_mulcache_compute().
+ *                  mlk_polyvec_mulcache_compute(). Must have coefficients
+ *                  of absolute value < MLKEM_Q.
  *
  * Specification: Implements [FIPS 203, Section 2.4.7, Eq (2.12), (2.13)]
  *
@@ -299,13 +301,16 @@ __contract__(
   requires(forall(k0, 0, MLKEM_K,
     forall(k1, 0, MLKEM_K,
       array_bound(a[k0].vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))))
-  assigns(object_whole(out)))
+  requires(forall(k2, 0, MLKEM_K,
+     array_abs_bound(v->vec[k2].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
+  requires(forall(k3, 0, MLKEM_K,
+     array_abs_bound(vc->vec[k3].coeffs, 0, MLKEM_N/2, MLKEM_Q)))
+  assigns(object_whole(out))
+  ensures(forall(k4, 0, MLKEM_K,
+    array_abs_bound(out->vec[k4].coeffs, 0, MLKEM_N, INT16_MAX/2))))
 {
   unsigned i;
   for (i = 0; i < MLKEM_K; i++)
-  __loop__(
-    assigns(i, object_whole(out))
-    invariant(i <= MLKEM_K))
   {
     mlk_polyvec_basemul_acc_montgomery_cached(&out->vec[i], &a[i], v, vc);
   }
diff --git a/mlkem/native/api.h b/mlkem/native/api.h
@@ -140,6 +140,7 @@ __contract__(
 static MLK_INLINE void mlk_intt_native(int16_t p[MLKEM_N])
 __contract__(
   requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
+  requires(array_abs_bound(p, 0, MLKEM_N, INT16_MAX/2))
   assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
   ensures(array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
 );
@@ -205,7 +206,8 @@ static MLK_INLINE void mlk_poly_mulcache_compute_native(
 __contract__(
   requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
   requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
-  assigns(object_whole(cache))
+  assigns(memory_slice(cache, sizeof(int16_t) * MLKEM_N / 2))
+  ensures(array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
 );
 #endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 
@@ -246,10 +248,17 @@ __contract__(
    * requires(array_bound(a, 0, 2 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
    * ```
    */
-  requires(forall(kN, 0, 2,					  \
-              array_bound(&((int16_t(*)[MLKEM_N])(a))[kN][0], 0, MLKEM_N, \
+  requires(forall(k0, 0, 2,					  \
+              array_bound(&((int16_t(*)[MLKEM_N])(a))[k0][0], 0, MLKEM_N, \
 			  0, MLKEM_UINT12_LIMIT)))
+  requires(forall(k1, 0, 2,					  \
+              array_abs_bound(&((int16_t(*)[MLKEM_N])(b))[k1][0], 0, MLKEM_N, \
+			      MLK_NTT_BOUND)))
+  requires(forall(k2, 0, 2,					  \
+              array_abs_bound(&((int16_t(*)[MLKEM_N/2])(b_cache))[k2][0], 0, MLKEM_N/2, \
+			      MLKEM_Q)))
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, INT16_MAX/2))
 );
 #endif /* MLK_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_K == 2 */
 
@@ -289,10 +298,17 @@ __contract__(
    * requires(array_bound(a, 0, 3 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
    * ```
    */
-  requires(forall(kN, 0, 3,					  \
-              array_bound(&((int16_t(*)[MLKEM_N])(a))[kN][0], 0, MLKEM_N, \
+  requires(forall(k0, 0, 3,					  \
+              array_bound(&((int16_t(*)[MLKEM_N])(a))[k0][0], 0, MLKEM_N, \
 			  0, MLKEM_UINT12_LIMIT)))
+  requires(forall(k1, 0, 3,					  \
+              array_abs_bound(&((int16_t(*)[MLKEM_N])(b))[k1][0], 0, MLKEM_N, \
+			      MLK_NTT_BOUND)))
+  requires(forall(k2, 0, 3,					  \
+              array_abs_bound(&((int16_t(*)[MLKEM_N/2])(b_cache))[k2][0], 0, MLKEM_N/2, \
+			      MLKEM_Q)))
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, INT16_MAX/2))
 );
 #endif /* MLK_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_K == 3 */
 
@@ -332,10 +348,17 @@ __contract__(
    * requires(array_bound(a, 0, 4 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
    * ```
    */
-  requires(forall(kN, 0, 4,					  \
-              array_bound(&((int16_t(*)[MLKEM_N])(a))[kN][0], 0, MLKEM_N, \
+  requires(forall(k0, 0, 4,					  \
+              array_bound(&((int16_t(*)[MLKEM_N])(a))[k0][0], 0, MLKEM_N, \
 			  0, MLKEM_UINT12_LIMIT)))
+  requires(forall(k1, 0, 4,					  \
+              array_abs_bound(&((int16_t(*)[MLKEM_N])(b))[k1][0], 0, MLKEM_N, \
+			      MLK_NTT_BOUND)))
+  requires(forall(k2, 0, 4,					  \
+              array_abs_bound(&((int16_t(*)[MLKEM_N/2])(b_cache))[k2][0], 0, MLKEM_N/2, \
+			      MLKEM_Q)))
   assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
+  ensures(array_abs_bound(r, 0, MLKEM_N, INT16_MAX/2))
 );
 #endif /* MLK_MULTILEVEL_BUILD_WITH_SHARED || MLKEM_K == 4 */
 #endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
diff --git a/mlkem/native/x86_64/meta.h b/mlkem/native/x86_64/meta.h
@@ -75,9 +75,10 @@ static MLK_INLINE void mlk_poly_tomont_native(int16_t data[MLKEM_N])
 static MLK_INLINE void mlk_poly_mulcache_compute_native(
     int16_t x[MLKEM_N / 2], const int16_t y[MLKEM_N])
 {
-  /* AVX2 backend does not use mulcache */
+  /* AVX2 backend does not use mulcache, but the contracts require
+   * that its < MLKEM_Q in absolute value, so we just zero it. */
+  memset(x, 0, sizeof(int16_t) * MLKEM_N / 2);
   ((void)y);
-  ((void)x);
 }
 
 #if defined(MLK_MULTILEVEL_BUILD_WITH_SHARED) || MLKEM_K == 2
diff --git a/mlkem/poly.c b/mlkem/poly.c
@@ -259,14 +259,6 @@ void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
     x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], zetas[64 + i]);
     x->coeffs[2 * i + 1] = mlk_fqmul(a->coeffs[4 * i + 3], -zetas[64 + i]);
   }
-
-  /*
-   * This bound is true for the C implementation, but not needed
-   * in the higher level bounds reasoning. It is thus omitted
-   * them from the spec to not unnecessarily constrain native
-   * implementations, but checked here nonetheless.
-   */
-  mlk_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
 }
 #else  /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
 MLK_INTERNAL_API
diff --git a/mlkem/poly.h b/mlkem/poly.h
@@ -79,14 +79,16 @@ static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
  **************************************************/
 static MLK_ALWAYS_INLINE int16_t mlk_montgomery_reduce(int32_t a)
 __contract__(
-    requires(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
-	     a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
-    /* We don't attempt to express an input-dependent output bound
-     * as the post-condition here. There are two call-sites for this
-     * function:
-     * - The base multiplication: Here, we need no output bound.
-     * - mlk_fqmul: Here, we inline this function and prove another spec
-     *          for mlk_fqmul which does have a post-condition bound. */
+    /* This specification is only relevant for Montgomery reduction
+     * during base multiplication, and the input bound is tailored to that.
+     * The output bound, albeit weak, allows one addition/subtraction prior
+     * to risk of overflow; this can be useful for the inverse NTT, for example.
+     *
+     * For the use of montgomery_reduce in fqmul, we inline this
+     * function instead of calling it by contract. */
+    requires(a <= +(4 * 2 * MLKEM_UINT12_LIMIT * MLK_NTT_BOUND) &&
+             a >= -(4 * 2 * MLKEM_UINT12_LIMIT * MLK_NTT_BOUND))
+    ensures(return_value < (INT16_MAX / 2) && return_value > -(INT16_MAX / 2))
 )
 {
   /* check-magic: 62209 == unsigned_mod(pow(MLKEM_Q, -1, 2^16), 2^16) */
@@ -167,17 +169,13 @@ __contract__(
  * - Caches `b_1 * \gamma` in [FIPS 203, Algorithm 12, BaseCaseMultiply, L1]
  *
  ************************************************************/
-/*
- * NOTE: The default C implementation of this function populates
- * the mulcache with values in (-q,q), but this is not needed for the
- * higher level safety proofs, and thus not part of the spec.
- */
 MLK_INTERNAL_API
 void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
 __contract__(
   requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
   requires(memory_no_alias(a, sizeof(mlk_poly)))
-  assigns(object_whole(x))
+  assigns(memory_slice(x, sizeof(mlk_poly_mulcache)))
+  ensures(array_abs_bound(x->coeffs, 0, MLKEM_N/2, MLKEM_Q))
 );
 
 #define mlk_poly_reduce MLK_NAMESPACE(poly_reduce)
@@ -329,6 +327,7 @@ MLK_INTERNAL_API
 void mlk_poly_invntt_tomont(mlk_poly *r)
 __contract__(
   requires(memory_no_alias(r, sizeof(mlk_poly)))
+  requires(array_abs_bound(r->coeffs, 0, MLKEM_N, INT16_MAX/2))
   assigns(memory_slice(r, sizeof(mlk_poly)))
   ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
 );
diff --git a/mlkem/poly_k.c b/mlkem/poly_k.c
@@ -130,26 +130,30 @@ void mlk_polyvec_basemul_acc_montgomery_cached(
 {
   unsigned i;
   mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
+  mlk_assert_abs_bound_2d(b, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
+  mlk_assert_abs_bound_2d(b_cache, MLKEM_K, MLKEM_N / 2, MLKEM_Q);
   for (i = 0; i < MLKEM_N / 2; i++)
-  __loop__(invariant(i <= MLKEM_N / 2))
+  __loop__(
+    invariant(i <= MLKEM_N / 2)
+    invariant(array_abs_bound(r->coeffs, 0, 2 * i, INT16_MAX/2)))
   {
     unsigned k;
-    int32_t t[2] = {0};
+    int32_t t0 = 0, t1 = 0;
     for (k = 0; k < MLKEM_K; k++)
     __loop__(
-      invariant(k <= MLKEM_K &&
-         t[0] <=    (int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768  &&
-         t[0] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
-         t[1] <=   ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
-         t[1] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768)))
+      invariant(k <= MLKEM_K && i <= MLKEM_N / 2 &&
+         t0 <=   ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * MLK_NTT_BOUND) &&
+	 t0 >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * MLK_NTT_BOUND) &&
+         t1 <=   ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * MLK_NTT_BOUND) &&
+         t1 >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * MLK_NTT_BOUND)))
     {
-      t[0] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b_cache->vec[k].coeffs[i];
-      t[0] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i];
-      t[1] += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i + 1];
-      t[1] += (int32_t)a->vec[k].coeffs[2 * i + 1] * b->vec[k].coeffs[2 * i];
+      t0 += (int32_t)a->vec[k].coeffs[2 * i + 1] * b_cache->vec[k].coeffs[i];
+      t0 += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i];
+      t1 += (int32_t)a->vec[k].coeffs[2 * i] * b->vec[k].coeffs[2 * i + 1];
+      t1 += (int32_t)a->vec[k].coeffs[2 * i + 1] * b->vec[k].coeffs[2 * i];
     }
-    r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t[0]);
-    r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t[1]);
+    r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t0);
+    r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t1);
   }
 }
 
diff --git a/mlkem/poly_k.h b/mlkem/poly_k.h
@@ -342,9 +342,11 @@ MLK_INTERNAL_API
 void mlk_polyvec_invntt_tomont(mlk_polyvec *r)
 __contract__(
   requires(memory_no_alias(r, sizeof(mlk_polyvec)))
+  requires(forall(k0, 0, MLKEM_K,
+    array_abs_bound(r->vec[k0].coeffs, 0, MLKEM_N, INT16_MAX/2)))
   assigns(object_whole(r))
-  ensures(forall(j, 0, MLKEM_K,
-  array_abs_bound(r->vec[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
+  ensures(forall(k1, 0, MLKEM_K,
+    array_abs_bound(r->vec[k1].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
 );
 
 #define mlk_polyvec_basemul_acc_montgomery_cached \
@@ -357,7 +359,11 @@ __contract__(
  *
  *              Bounds:
  *              - Every coefficient of a is assumed to be in [0..4095]
- *              - No bounds guarantees for the coefficients in the result.
+ *              - Every coefficient of b is assumed to be bound by
+ *                MLK_NTT_BOUND in absolute value.
+ *              - Every coefficient of b_cache is assumed to be bound by
+ *                MLKEM_Q in absolute value.
+ *              - The output bounds are below INT16_MAX/2 in absolute value.
  *
  * Arguments:   - mlk_poly *r: pointer to output polynomial
  *              - const mlk_polyvec *a: pointer to first input polynomial vector
@@ -384,7 +390,12 @@ __contract__(
   requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
   requires(forall(k1, 0, MLKEM_K,
      array_bound(a->vec[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
-  assigns(object_whole(r))
+  requires(forall(k2, 0, MLKEM_K,
+     array_abs_bound(b->vec[k2].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
+  requires(forall(k3, 0, MLKEM_K,
+     array_abs_bound(b_cache->vec[k3].coeffs, 0, MLKEM_N/2, MLKEM_Q)))
+  assigns(memory_slice(r, sizeof(mlk_poly)))
+  ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, INT16_MAX/2))
 );
 
 #define mlk_polyvec_mulcache_compute MLK_NAMESPACE_K(polyvec_mulcache_compute)
@@ -412,17 +423,14 @@ __contract__(
  * - Caches `b_1 * \gamma` in [FIPS 203, Algorithm 12, BaseCaseMultiply, L1]
  *
  ************************************************************/
-/*
- * NOTE: The default C implementation of this function populates
- * the mulcache with values in (-q,q), but this is not needed for the
- * higher level safety proofs, and thus not part of the spec.
- */
 MLK_INTERNAL_API
 void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache *x, const mlk_polyvec *a)
 __contract__(
   requires(memory_no_alias(x, sizeof(mlk_polyvec_mulcache)))
   requires(memory_no_alias(a, sizeof(mlk_polyvec)))
   assigns(object_whole(x))
+  ensures(forall(k0, 0, MLKEM_K,
+    array_abs_bound(x->vec[k0].coeffs, 0, MLKEM_N/2, MLKEM_Q)))
 );
 
 #define mlk_polyvec_reduce MLK_NAMESPACE_K(polyvec_reduce)
diff --git a/proofs/cbmc/matvec_mul/Makefile b/proofs/cbmc/matvec_mul/Makefile
@@ -13,6 +13,7 @@ DEFINES +=
 INCLUDES +=
 
 REMOVE_FUNCTION_BODY +=
+UNWINDSET += mlk_matvec_mul.0:4
 
 PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
 PROJECT_SOURCES += $(SRCDIR)/mlkem/indcpa.c

Original file line number	Diff line number	Diff line change
`@@ -75,9 +75,10 @@ static MLK_INLINE void mlk_poly_tomont_native(int16_t data[MLKEM_N])`
`75`	`75`	`static MLK_INLINE void mlk_poly_mulcache_compute_native(`
`76`	`76`	`int16_t x[MLKEM_N / 2], const int16_t y[MLKEM_N])`
`77`	`77`	`{`
`78`		`- /* AVX2 backend does not use mulcache */`
	`78`	`+ /* AVX2 backend does not use mulcache, but the contracts require`
	`79`	`+ * that its < MLKEM_Q in absolute value, so we just zero it. */`
	`80`	`+ memset(x, 0, sizeof(int16_t) * MLKEM_N / 2);`
`79`	`81`	`((void)y);`
`80`		`- ((void)x);`
`81`	`82`	`}`
`82`	`83`
`83`	`84`	`#if defined(MLK_MULTILEVEL_BUILD_WITH_SHARED) \|\| MLKEM_K == 2`