RedisAI
diff --git a/‎src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h‎
Lines changed: 1 addition & 0 deletions b/‎src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/VecSim/index_factories/brute_force_factory.cpp‎
Lines changed: 8 additions & 0 deletions b/‎src/VecSim/index_factories/brute_force_factory.cpp‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/VecSim/index_factories/hnsw_factory.cpp‎
Lines changed: 13 additions & 0 deletions b/‎src/VecSim/index_factories/hnsw_factory.cpp‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎src/VecSim/index_factories/tiered_factory.cpp‎
Lines changed: 4 additions & 0 deletions b/‎src/VecSim/index_factories/tiered_factory.cpp‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/VecSim/spaces/IP/IP.cpp‎
Lines changed: 35 additions & 11 deletions b/‎src/VecSim/spaces/IP/IP.cpp‎
Lines changed: 35 additions & 11 deletions
diff --git a/‎src/VecSim/spaces/IP/IP.h‎
Lines changed: 3 additions & 0 deletions b/‎src/VecSim/spaces/IP/IP.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h‎
Lines changed: 3 additions & 3 deletions b/‎src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h‎
Lines changed: 105 additions & 0 deletions b/‎src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h‎
Lines changed: 105 additions & 0 deletions
diff --git a/‎src/VecSim/spaces/IP_space.cpp‎
Lines changed: 58 additions & 0 deletions b/‎src/VecSim/spaces/IP_space.cpp‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎src/VecSim/spaces/IP_space.h‎
Lines changed: 4 additions & 0 deletions b/‎src/VecSim/spaces/IP_space.h‎
Lines changed: 4 additions & 0 deletions
@@ -58,6 +58,7 @@ INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_switchDeleteModes_Test)
 friend class BF16TieredTest;
 friend class FP16TieredTest;
 friend class INT8TieredTest;
+friend class UINT8TieredTest;
 friend class CommonTypeMetricTieredTests_TestDataSizeTieredHNSW_Test;
 
 INDEX_TEST_FRIEND_CLASS(BM_VecSimBasics)
 
@@ -78,6 +78,11 @@ VecSimIndex *NewIndex(const BFParams *bfparams, const AbstractIndexInitParams &a
             abstractInitParams.allocator, bfparams->metric, bfparams->dim, is_normalized);
         return NewIndex_ChooseMultiOrSingle<int8_t, float>(bfparams, abstractInitParams,
                                                            indexComponents);
+    } else if (bfparams->type == VecSimType_UINT8) {
+        IndexComponents<uint8_t, float> indexComponents = CreateIndexComponents<uint8_t, float>(
+            abstractInitParams.allocator, bfparams->metric, bfparams->dim, is_normalized);
+        return NewIndex_ChooseMultiOrSingle<uint8_t, float>(bfparams, abstractInitParams,
+                                                            indexComponents);
     }
 
     // If we got here something is wrong.
@@ -120,6 +125,9 @@ size_t EstimateInitialSize(const BFParams *params, bool is_normalized) {
     } else if (params->type == VecSimType_INT8) {
         est += EstimateComponentsMemory<int8_t, float>(params->metric, is_normalized);
         est += EstimateInitialSize_ChooseMultiOrSingle<int8_t, float>(params->multi);
+    } else if (params->type == VecSimType_UINT8) {
+        est += EstimateComponentsMemory<uint8_t, float>(params->metric, is_normalized);
+        est += EstimateInitialSize_ChooseMultiOrSingle<uint8_t, float>(params->multi);
     } else {
         throw std::invalid_argument("Invalid params->type");
     }
 
@@ -78,6 +78,11 @@ VecSimIndex *NewIndex(const VecSimParams *params, bool is_normalized) {
             abstractInitParams.allocator, hnswParams->metric, hnswParams->dim, is_normalized);
         return NewIndex_ChooseMultiOrSingle<int8_t, float>(hnswParams, abstractInitParams,
                                                            indexComponents);
+    } else if (hnswParams->type == VecSimType_UINT8) {
+        IndexComponents<uint8_t, float> indexComponents = CreateIndexComponents<uint8_t, float>(
+            abstractInitParams.allocator, hnswParams->metric, hnswParams->dim, is_normalized);
+        return NewIndex_ChooseMultiOrSingle<uint8_t, float>(hnswParams, abstractInitParams,
+                                                            indexComponents);
     }
 
     // If we got here something is wrong.
@@ -117,6 +122,9 @@ size_t EstimateInitialSize(const HNSWParams *params, bool is_normalized) {
     } else if (params->type == VecSimType_INT8) {
         est += EstimateComponentsMemory<int8_t, float>(params->metric, is_normalized);
         est += EstimateInitialSize_ChooseMultiOrSingle<int8_t, float>(params->multi);
+    } else if (params->type == VecSimType_UINT8) {
+        est += EstimateComponentsMemory<uint8_t, float>(params->metric, is_normalized);
+        est += EstimateInitialSize_ChooseMultiOrSingle<uint8_t, float>(params->multi);
     } else {
         throw std::invalid_argument("Invalid params->type");
     }
@@ -236,6 +244,11 @@ VecSimIndex *NewIndex(const std::string &location, bool is_normalized) {
             abstractInitParams.allocator, params.metric, abstractInitParams.dim, is_normalized);
         return NewIndex_ChooseMultiOrSingle<int8_t, float>(input, &params, abstractInitParams,
                                                            indexComponents, version);
+    } else if (params.type == VecSimType_UINT8) {
+        IndexComponents<uint8_t, float> indexComponents = CreateIndexComponents<uint8_t, float>(
+            abstractInitParams.allocator, params.metric, abstractInitParams.dim, is_normalized);
+        return NewIndex_ChooseMultiOrSingle<uint8_t, float>(input, &params, abstractInitParams,
+                                                            indexComponents, version);
     } else {
         auto bad_name = VecSimType_ToString(params.type);
         if (bad_name == nullptr) {
 
@@ -85,6 +85,8 @@ inline size_t EstimateInitialSize(const TieredIndexParams *params) {
         est += sizeof(TieredHNSWIndex<float16, float>);
     } else if (hnsw_params.type == VecSimType_INT8) {
         est += sizeof(TieredHNSWIndex<int8_t, float>);
+    } else if (hnsw_params.type == VecSimType_UINT8) {
+        est += sizeof(TieredHNSWIndex<uint8_t, float>);
     } else {
         throw std::invalid_argument("Invalid hnsw_params.type");
     }
@@ -105,6 +107,8 @@ VecSimIndex *NewIndex(const TieredIndexParams *params) {
         return TieredHNSWFactory::NewIndex<float16, float>(params);
     } else if (type == VecSimType_INT8) {
         return TieredHNSWFactory::NewIndex<int8_t, float>(params);
+    } else if (type == VecSimType_UINT8) {
+        return TieredHNSWFactory::NewIndex<uint8_t, float>(params);
     }
     return nullptr; // Invalid type.
 }
 
@@ -67,26 +67,50 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension
     return 1.0f - res;
 }
 
-static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    int8_t *pVect1 = (int8_t *)pVect1v;
-    int8_t *pVect2 = (int8_t *)pVect2v;
-
-    int res = 0;
+// Return type for the inner product functions.
+// The type should be able to hold `dimension * MAX_VAL(int_elem_t) * MAX_VAL(int_elem_t)`.
+// To support dimension up to 2^16, the type must be at least 2 bytes wider than the product of
+// two elements, i.e. than `2 * sizeof(int_elem_t)`. We assert that in the implementation.
+template <typename int_elem_t>
+using ret_t = std::conditional_t<sizeof(int_elem_t) == 1, int, long long>;
+
+/**
+ * Naive (scalar) inner product of two integer vectors.
+ *
+ * @param pVect1    first vector, `dimension` elements of type int_elem_t.
+ * @param pVect2    second vector, `dimension` elements of type int_elem_t.
+ * @param dimension number of elements to multiply and accumulate.
+ * @return the sum of the element-wise products, accumulated in ret_t<int_elem_t>.
+ *
+ * NOTE(review): for uint8_t the accumulator is `int`; 2^16 * 255 * 255 exceeds a 32-bit INT_MAX,
+ * so the "dimension up to 2^16" headroom only holds for int8_t - confirm the supported range.
+ */
+template <typename int_elem_t>
+static inline ret_t<int_elem_t>
+INTEGER_InnerProductImp(const int_elem_t *pVect1, const int_elem_t *pVect2, size_t dimension) {
+    // The accumulator must be at least 2 bytes wider than the product of two elements. Compare
+    // sums instead of subtracting: sizeof() is unsigned, so a difference would wrap around (and
+    // wrongly pass) whenever 2 * sizeof(int_elem_t) exceeds sizeof(ret_t<int_elem_t>).
+    static_assert(sizeof(ret_t<int_elem_t>) >= sizeof(int_elem_t) * 2 + sizeof(uint16_t),
+                  "ret_t must be at least 2 bytes wider than the product of two elements");
+    ret_t<int_elem_t> res = 0;
     for (size_t i = 0; i < dimension; i++) {
         res += pVect1[i] * pVect2[i];
     }
     return res;
 }
 
 float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension);
+    const auto *pVect1 = static_cast<const int8_t *>(pVect1v);
+    const auto *pVect2 = static_cast<const int8_t *>(pVect2v);
+    return 1 - INTEGER_InnerProductImp(pVect1, pVect2, dimension);
 }
 
 float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const int8_t *>(pVect1v);
+    const auto *pVect2 = static_cast<const int8_t *>(pVect2v);
+    // We expect the vectors' norm to be stored at the end of the vector.
+    float norm_v1 = *reinterpret_cast<const float *>(pVect1 + dimension);
+    float norm_v2 = *reinterpret_cast<const float *>(pVect2 + dimension);
+    return 1.0f - float(INTEGER_InnerProductImp(pVect1, pVect2, dimension)) / (norm_v1 * norm_v2);
+}
+
+// Inner-product distance between two uint8 vectors of `dimension` elements: 1 - <v1, v2>.
+// The subtraction is done in integer arithmetic; the result is converted to float on return.
+float UINT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *lhs = static_cast<const uint8_t *>(pVect1v);
+    const uint8_t *rhs = static_cast<const uint8_t *>(pVect2v);
+    const auto dot = INTEGER_InnerProductImp(lhs, rhs, dimension);
+    return 1 - dot;
+}
+
+float UINT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
     // We expect the vectors' norm to be stored at the end of the vector.
-    float norm_v1 =
-        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect1v) + dimension);
-    float norm_v2 =
-        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect2v) + dimension);
-    return 1.0f - float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2);
+    float norm_v1 = *reinterpret_cast<const float *>(pVect1 + dimension);
+    float norm_v2 = *reinterpret_cast<const float *>(pVect2 + dimension);
+    return 1.0f - float(INTEGER_InnerProductImp(pVect1, pVect2, dimension)) / (norm_v1 * norm_v2);
 }
@@ -19,3 +19,6 @@ float BF16_InnerProduct_BigEndian(const void *pVect1v, const void *pVect2v, size
 
 float INT8_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
 float INT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension);
+
+float UINT8_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
+float UINT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension);
@@ -22,7 +22,7 @@ static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &s
     sum = _mm512_dpwssd_epi32(sum, va, vb);
 }
 
-template <unsigned char residual> // 0..64
+template <unsigned char residual> // 0..63
 static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
     int8_t *pVect1 = (int8_t *)pVect1v;
     int8_t *pVect2 = (int8_t *)pVect2v;
@@ -59,13 +59,13 @@ static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v,
     return _mm512_reduce_add_epi32(sum);
 }
 
-template <unsigned char residual> // 0..64
+template <unsigned char residual> // 0..63
 float INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
                                                  size_t dimension) {
 
     return 1 - INT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
 }
-template <unsigned char residual> // 0..64
+template <unsigned char residual> // 0..63
 float INT8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
                                            size_t dimension) {
     float ip = INT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
 
@@ -0,0 +1,105 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/spaces/space_includes.h"
+
+// Consumes one 64-byte chunk from each vector (advancing both pointers by 64) and accumulates the
+// products of the corresponding uint8 elements into the 32-bit lanes of `sum`.
+static inline void InnerProductStep(uint8_t *&pVect1, uint8_t *&pVect2, __m512i &sum) {
+    const __m512i zeros = _mm512_setzero_si512();
+
+    const __m512i chunk1 = _mm512_loadu_epi8(pVect1); // AVX512BW
+    const __m512i chunk2 = _mm512_loadu_epi8(pVect2); // AVX512BW
+    pVect1 += 64;
+    pVect2 += 64;
+
+    // Interleaving with zeros (AVX512BW) widens every uint8 to a 16-bit value whose upper byte is
+    // zero. Both inputs are interleaved the same way, so corresponding elements stay paired.
+    //
+    // _mm512_dpwssd_epi32(src, a, b)
+    // Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding
+    // 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results
+    // with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+    sum = _mm512_dpwssd_epi32(sum, _mm512_unpacklo_epi8(chunk1, zeros),
+                              _mm512_unpacklo_epi8(chunk2, zeros));
+    sum = _mm512_dpwssd_epi32(sum, _mm512_unpackhi_epi8(chunk1, zeros),
+                              _mm512_unpackhi_epi8(chunk2, zeros));
+}
+
+/**
+ * Computes the inner product of two uint8 vectors with AVX512 (F, BW, VL, VNNI) intrinsics.
+ *
+ * @tparam residual  number of leading elements handled before the 64-element main loop, 0..63.
+ *                   The code assumes `dimension - residual` is a multiple of 64, and that it is
+ *                   non-zero when `residual` is 0.
+ * @param pVect1v    first vector, `dimension` uint8 elements.
+ * @param pVect2v    second vector, `dimension` uint8 elements.
+ * @param dimension  number of elements in each vector.
+ * @return the sum of the element-wise products, reduced from the 32-bit accumulator lanes.
+ */
+template <unsigned char residual> // 0..63
+static inline int UINT8_InnerProductImp(const void *pVect1v, const void *pVect2v,
+                                        size_t dimension) {
+    static_assert(residual < 64, "residual must be in the range 0..63");
+
+    // InnerProductStep advances non-const pointers; the vector data itself is only read.
+    uint8_t *pVect1 = const_cast<uint8_t *>(static_cast<const uint8_t *>(pVect1v));
+    uint8_t *pVect2 = const_cast<uint8_t *>(static_cast<const uint8_t *>(pVect2v));
+
+    const uint8_t *pEnd1 = pVect1 + dimension;
+
+    __m512i sum = _mm512_setzero_epi32();
+
+    // Deal with remainder first.
+    if constexpr (residual) {
+        if constexpr (residual < 32) {
+            // Load only the first `residual` bytes (the others are zeroed and contribute nothing)
+            // and widen each uint8 to 16 bits, which fits a single 512-bit register.
+            constexpr __mmask32 mask = (1ULL << residual) - 1;
+            __m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1);
+            __m512i va = _mm512_cvtepu8_epi16(temp_a);
+
+            __m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2);
+            __m512i vb = _mm512_cvtepu8_epi16(temp_b);
+
+            sum = _mm512_dpwssd_epi32(sum, va, vb);
+        } else if constexpr (residual == 32) {
+            // Exactly 32 bytes: a plain 256-bit load, no mask needed.
+            __m256i temp_a = _mm256_loadu_epi8(pVect1);
+            __m512i va = _mm512_cvtepu8_epi16(temp_a);
+
+            __m256i temp_b = _mm256_loadu_epi8(pVect2);
+            __m512i vb = _mm512_cvtepu8_epi16(temp_b);
+
+            sum = _mm512_dpwssd_epi32(sum, va, vb);
+        } else {
+            // 33..63 bytes: masked 512-bit load. The literal must be 64 bits wide on every
+            // platform, hence `1ULL` (`unsigned long` may be only 32 bits).
+            constexpr __mmask64 mask = (1ULL << residual) - 1;
+            __m512i va = _mm512_maskz_loadu_epi8(mask, pVect1);
+            __m512i vb = _mm512_maskz_loadu_epi8(mask, pVect2);
+
+            __m512i va_lo = _mm512_unpacklo_epi8(va, _mm512_setzero_si512());
+            __m512i vb_lo = _mm512_unpacklo_epi8(vb, _mm512_setzero_si512());
+            sum = _mm512_dpwssd_epi32(sum, va_lo, vb_lo);
+
+            __m512i va_hi = _mm512_unpackhi_epi8(va, _mm512_setzero_si512());
+            __m512i vb_hi = _mm512_unpackhi_epi8(vb, _mm512_setzero_si512());
+            sum = _mm512_dpwssd_epi32(sum, va_hi, vb_hi);
+        }
+        pVect1 += residual;
+        pVect2 += residual;
+
+        // We dealt with the residual part.
+        // We are left with some multiple of 64 uint8 elements (might be 0).
+        while (pVect1 < pEnd1) {
+            InnerProductStep(pVect1, pVect2, sum);
+        }
+    } else {
+        // We have no residual, we have some non-zero multiple of 64 uint8 elements.
+        do {
+            InnerProductStep(pVect1, pVect2, sum);
+        } while (pVect1 < pEnd1);
+    }
+
+    return _mm512_reduce_add_epi32(sum);
+}
+
+// Inner-product distance (1 - <v1, v2>) for uint8 vectors, using the AVX512 kernel above.
+// The subtraction is done in integer arithmetic; the result is converted to float on return.
+template <unsigned char residual> // 0..63
+float UINT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
+                                                  size_t dimension) {
+    const int dot = UINT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
+    return 1 - dot;
+}
+// Cosine distance (1 - <v1, v2> / (|v1| * |v2|)) for uint8 vectors, using the AVX512 kernel above.
+template <unsigned char residual> // 0..63
+float UINT8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
+                                            size_t dimension) {
+    const auto *v1 = static_cast<const uint8_t *>(pVect1v);
+    const auto *v2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Each vector's norm is read as a float located right after its `dimension` elements.
+    const float norms_product = *reinterpret_cast<const float *>(v1 + dimension) *
+                                *reinterpret_cast<const float *>(v2 + dimension);
+
+    const float dot = UINT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
+    return 1.0f - dot / norms_product;
+}
@@ -252,4 +252,62 @@ dist_func_t<float> Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
 #endif // __x86_64__
     return ret_dist_func;
 }
+
+// Selects the inner-product distance function for uint8 vectors of dimension `dim`.
+//   dim       - vector dimension.
+//   alignment - optional out-parameter; written only when the AVX512 implementation is chosen
+//               and `dim` is a multiple of 32. May be nullptr.
+//   arch_opt  - optional pointer to a cpu_features::X86Features to use instead of the features
+//               reported by cpu_features::GetX86Info(). May be nullptr.
+dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment,
+                                        const void *arch_opt) {
+    // Gives the code below a valid target to write through when the caller passed nullptr.
+    unsigned char unused_alignment;
+    if (!alignment) {
+        alignment = &unused_alignment;
+    }
+
+    // Optimizations assume at least 32 uint8. If we have less, we use the naive implementation.
+    if (dim < 32) {
+        return UINT8_InnerProduct;
+    }
+#ifdef CPU_FEATURES_ARCH_X86_64
+    auto features = (arch_opt != nullptr)
+                        ? *static_cast<const cpu_features::X86Features *>(arch_opt)
+                        : cpu_features::GetX86Info().features;
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    const bool has_avx512_vnni =
+        features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni;
+    if (has_avx512_vnni) {
+        // No point in aligning if we have an offsetting residual.
+        if (dim % 32 == 0) {
+            *alignment = 32 * sizeof(uint8_t); // align to 256 bits.
+        }
+        return Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_X86_64
+    return UINT8_InnerProduct;
+}
+
+// Selects the cosine distance function for uint8 vectors of dimension `dim`.
+//   dim       - vector dimension.
+//   alignment - optional out-parameter; this function never writes an alignment hint through it.
+//               May be nullptr.
+//   arch_opt  - optional pointer to a cpu_features::X86Features to use instead of the features
+//               reported by cpu_features::GetX86Info(). May be nullptr.
+dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment,
+                                            const void *arch_opt) {
+    // Keeps `alignment` non-null, mirroring the other GetDistFunc selectors.
+    unsigned char unused_alignment;
+    if (!alignment) {
+        alignment = &unused_alignment;
+    }
+
+    // Optimizations assume at least 32 uint8. If we have less, we use the naive implementation.
+    if (dim < 32) {
+        return UINT8_Cosine;
+    }
+#ifdef CPU_FEATURES_ARCH_X86_64
+    auto features = (arch_opt != nullptr)
+                        ? *static_cast<const cpu_features::X86Features *>(arch_opt)
+                        : cpu_features::GetX86Info().features;
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    const bool has_avx512_vnni =
+        features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni;
+    if (has_avx512_vnni) {
+        // For uint8 vectors with cosine distance, the extra float for the norm shifts alignment to
+        // `(dim + sizeof(float)) % 32`.
+        // Vectors satisfying this have a residual, causing offset loads during calculation.
+        // To avoid complexity, we skip alignment here, assuming the performance impact is
+        // negligible.
+        return Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_X86_64
+    return UINT8_Cosine;
+}
+
 } // namespace spaces
@@ -20,4 +20,8 @@ dist_func_t<float> IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nu
                                        const void *arch_opt = nullptr);
 dist_func_t<float> Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
                                            const void *arch_opt = nullptr);
+dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                        const void *arch_opt = nullptr);
+dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
+                                            const void *arch_opt = nullptr);
 } // namespace spaces
Original file line number	Diff line number	Diff line change
`@@ -85,6 +85,8 @@ inline size_t EstimateInitialSize(const TieredIndexParams *params) {`
`85`	`85`	`est += sizeof(TieredHNSWIndex<float16, float>);`
`86`	`86`	`} else if (hnsw_params.type == VecSimType_INT8) {`
`87`	`87`	`est += sizeof(TieredHNSWIndex<int8_t, float>);`
	`88`	`+ } else if (hnsw_params.type == VecSimType_UINT8) {`
	`89`	`+ est += sizeof(TieredHNSWIndex<uint8_t, float>);`
`88`	`90`	`} else {`
`89`	`91`	`throw std::invalid_argument("Invalid hnsw_params.type");`
`90`	`92`	`}`
`@@ -105,6 +107,8 @@ VecSimIndex NewIndex(const TieredIndexParams params) {`
`105`	`107`	`return TieredHNSWFactory::NewIndex<float16, float>(params);`
`106`	`108`	`} else if (type == VecSimType_INT8) {`
`107`	`109`	`return TieredHNSWFactory::NewIndex<int8_t, float>(params);`
	`110`	`+ } else if (type == VecSimType_UINT8) {`
	`111`	`+ return TieredHNSWFactory::NewIndex<uint8_t, float>(params);`
`108`	`112`	`}`
`109`	`113`	`return nullptr; // Invalid type.`
`110`	`114`	`}`