Skip to content

Commit 83ba99f

Browse files
[8.0] Implement UINT8 vector type - [MOD-8230, MOD-8408] (#592)
Implement UINT8 vector type - [MOD-8230, MOD-8408] (#584) * cleanup implementation chooser * defining new API and naive implementation * define new uint8 type definitions * add new type to all the factories * add new type tp the python bindings * format * first attempt of implementing optimized implementation * implement benchmarks for uint8 * implement unit tests * fix L2 implementation * fix uint8 range test * format * cleanup * added flow/bindings tests * fix int benchmarks files * added cosine to naive int implementations * fix flow tests * unpack lo before high * remove todos * address some review comments * alternative implementation of residual handling * extend test coverage * fix build for old GCC versions * another attempt * fix uninitialized data read * fix test for mac (cherry picked from commit 8055e5c) Co-authored-by: GuyAv46 <[email protected]>
1 parent 5b19aeb commit 83ba99f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1877
-82
lines changed

src/VecSim/algorithms/hnsw/hnsw_tiered_tests_friends.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ INDEX_TEST_FRIEND_CLASS(HNSWTieredIndexTestBasic_switchDeleteModes_Test)
5858
friend class BF16TieredTest;
5959
friend class FP16TieredTest;
6060
friend class INT8TieredTest;
61+
friend class UINT8TieredTest;
6162
friend class CommonTypeMetricTieredTests_TestDataSizeTieredHNSW_Test;
6263

6364
INDEX_TEST_FRIEND_CLASS(BM_VecSimBasics)

src/VecSim/index_factories/brute_force_factory.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@ VecSimIndex *NewIndex(const BFParams *bfparams, const AbstractIndexInitParams &a
7878
abstractInitParams.allocator, bfparams->metric, bfparams->dim, is_normalized);
7979
return NewIndex_ChooseMultiOrSingle<int8_t, float>(bfparams, abstractInitParams,
8080
indexComponents);
81+
} else if (bfparams->type == VecSimType_UINT8) {
82+
IndexComponents<uint8_t, float> indexComponents = CreateIndexComponents<uint8_t, float>(
83+
abstractInitParams.allocator, bfparams->metric, bfparams->dim, is_normalized);
84+
return NewIndex_ChooseMultiOrSingle<uint8_t, float>(bfparams, abstractInitParams,
85+
indexComponents);
8186
}
8287

8388
// If we got here something is wrong.
@@ -120,6 +125,9 @@ size_t EstimateInitialSize(const BFParams *params, bool is_normalized) {
120125
} else if (params->type == VecSimType_INT8) {
121126
est += EstimateComponentsMemory<int8_t, float>(params->metric, is_normalized);
122127
est += EstimateInitialSize_ChooseMultiOrSingle<int8_t, float>(params->multi);
128+
} else if (params->type == VecSimType_UINT8) {
129+
est += EstimateComponentsMemory<uint8_t, float>(params->metric, is_normalized);
130+
est += EstimateInitialSize_ChooseMultiOrSingle<uint8_t, float>(params->multi);
123131
} else {
124132
throw std::invalid_argument("Invalid params->type");
125133
}

src/VecSim/index_factories/hnsw_factory.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@ VecSimIndex *NewIndex(const VecSimParams *params, bool is_normalized) {
7878
abstractInitParams.allocator, hnswParams->metric, hnswParams->dim, is_normalized);
7979
return NewIndex_ChooseMultiOrSingle<int8_t, float>(hnswParams, abstractInitParams,
8080
indexComponents);
81+
} else if (hnswParams->type == VecSimType_UINT8) {
82+
IndexComponents<uint8_t, float> indexComponents = CreateIndexComponents<uint8_t, float>(
83+
abstractInitParams.allocator, hnswParams->metric, hnswParams->dim, is_normalized);
84+
return NewIndex_ChooseMultiOrSingle<uint8_t, float>(hnswParams, abstractInitParams,
85+
indexComponents);
8186
}
8287

8388
// If we got here something is wrong.
@@ -117,6 +122,9 @@ size_t EstimateInitialSize(const HNSWParams *params, bool is_normalized) {
117122
} else if (params->type == VecSimType_INT8) {
118123
est += EstimateComponentsMemory<int8_t, float>(params->metric, is_normalized);
119124
est += EstimateInitialSize_ChooseMultiOrSingle<int8_t, float>(params->multi);
125+
} else if (params->type == VecSimType_UINT8) {
126+
est += EstimateComponentsMemory<uint8_t, float>(params->metric, is_normalized);
127+
est += EstimateInitialSize_ChooseMultiOrSingle<uint8_t, float>(params->multi);
120128
} else {
121129
throw std::invalid_argument("Invalid params->type");
122130
}
@@ -236,6 +244,11 @@ VecSimIndex *NewIndex(const std::string &location, bool is_normalized) {
236244
abstractInitParams.allocator, params.metric, abstractInitParams.dim, is_normalized);
237245
return NewIndex_ChooseMultiOrSingle<int8_t, float>(input, &params, abstractInitParams,
238246
indexComponents, version);
247+
} else if (params.type == VecSimType_UINT8) {
248+
IndexComponents<uint8_t, float> indexComponents = CreateIndexComponents<uint8_t, float>(
249+
abstractInitParams.allocator, params.metric, abstractInitParams.dim, is_normalized);
250+
return NewIndex_ChooseMultiOrSingle<uint8_t, float>(input, &params, abstractInitParams,
251+
indexComponents, version);
239252
} else {
240253
auto bad_name = VecSimType_ToString(params.type);
241254
if (bad_name == nullptr) {

src/VecSim/index_factories/tiered_factory.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ inline size_t EstimateInitialSize(const TieredIndexParams *params) {
8585
est += sizeof(TieredHNSWIndex<float16, float>);
8686
} else if (hnsw_params.type == VecSimType_INT8) {
8787
est += sizeof(TieredHNSWIndex<int8_t, float>);
88+
} else if (hnsw_params.type == VecSimType_UINT8) {
89+
est += sizeof(TieredHNSWIndex<uint8_t, float>);
8890
} else {
8991
throw std::invalid_argument("Invalid hnsw_params.type");
9092
}
@@ -105,6 +107,8 @@ VecSimIndex *NewIndex(const TieredIndexParams *params) {
105107
return TieredHNSWFactory::NewIndex<float16, float>(params);
106108
} else if (type == VecSimType_INT8) {
107109
return TieredHNSWFactory::NewIndex<int8_t, float>(params);
110+
} else if (type == VecSimType_UINT8) {
111+
return TieredHNSWFactory::NewIndex<uint8_t, float>(params);
108112
}
109113
return nullptr; // Invalid type.
110114
}

src/VecSim/spaces/IP/IP.cpp

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -67,26 +67,50 @@ float FP16_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension
6767
return 1.0f - res;
6868
}
6969

70-
static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
71-
int8_t *pVect1 = (int8_t *)pVect1v;
72-
int8_t *pVect2 = (int8_t *)pVect2v;
73-
74-
int res = 0;
70+
// Return type for the inner product functions.
71+
// The type should be able to hold `dimension * MAX_VAL(int_elem_t) * MAX_VAL(int_elem_t)`.
72+
// To support dimension up to 2^16, we need the difference between the type and int_elem_t to be at
73+
// least 2 bytes. We assert that in the implementation.
74+
template <typename int_elem_t>
75+
using ret_t = std::conditional_t<sizeof(int_elem_t) == 1, int, long long>;
76+
77+
template <typename int_elem_t>
78+
static inline ret_t<int_elem_t>
79+
INTEGER_InnerProductImp(const int_elem_t *pVect1, const int_elem_t *pVect2, size_t dimension) {
80+
static_assert(sizeof(ret_t<int_elem_t>) - sizeof(int_elem_t) * 2 >= sizeof(uint16_t));
81+
ret_t<int_elem_t> res = 0;
7582
for (size_t i = 0; i < dimension; i++) {
7683
res += pVect1[i] * pVect2[i];
7784
}
7885
return res;
7986
}
8087

8188
float INT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
82-
return 1 - INT8_InnerProductImp(pVect1v, pVect2v, dimension);
89+
const auto *pVect1 = static_cast<const int8_t *>(pVect1v);
90+
const auto *pVect2 = static_cast<const int8_t *>(pVect2v);
91+
return 1 - INTEGER_InnerProductImp(pVect1, pVect2, dimension);
8392
}
8493

8594
float INT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
95+
const auto *pVect1 = static_cast<const int8_t *>(pVect1v);
96+
const auto *pVect2 = static_cast<const int8_t *>(pVect2v);
97+
// We expect the vectors' norm to be stored at the end of the vector.
98+
float norm_v1 = *reinterpret_cast<const float *>(pVect1 + dimension);
99+
float norm_v2 = *reinterpret_cast<const float *>(pVect2 + dimension);
100+
return 1.0f - float(INTEGER_InnerProductImp(pVect1, pVect2, dimension)) / (norm_v1 * norm_v2);
101+
}
102+
103+
float UINT8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
104+
const auto *pVect1 = static_cast<const uint8_t *>(pVect1v);
105+
const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
106+
return 1 - INTEGER_InnerProductImp(pVect1, pVect2, dimension);
107+
}
108+
109+
float UINT8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
110+
const auto *pVect1 = static_cast<const uint8_t *>(pVect1v);
111+
const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
86112
// We expect the vectors' norm to be stored at the end of the vector.
87-
float norm_v1 =
88-
*reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect1v) + dimension);
89-
float norm_v2 =
90-
*reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect2v) + dimension);
91-
return 1.0f - float(INT8_InnerProductImp(pVect1v, pVect2v, dimension)) / (norm_v1 * norm_v2);
113+
float norm_v1 = *reinterpret_cast<const float *>(pVect1 + dimension);
114+
float norm_v2 = *reinterpret_cast<const float *>(pVect2 + dimension);
115+
return 1.0f - float(INTEGER_InnerProductImp(pVect1, pVect2, dimension)) / (norm_v1 * norm_v2);
92116
}

src/VecSim/spaces/IP/IP.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,6 @@ float BF16_InnerProduct_BigEndian(const void *pVect1v, const void *pVect2v, size
1919

2020
float INT8_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
2121
float INT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension);
22+
23+
float UINT8_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
24+
float UINT8_Cosine(const void *pVect1, const void *pVect2, size_t dimension);

src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_INT8.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ static inline void InnerProductStep(int8_t *&pVect1, int8_t *&pVect2, __m512i &s
2222
sum = _mm512_dpwssd_epi32(sum, va, vb);
2323
}
2424

25-
template <unsigned char residual> // 0..64
25+
template <unsigned char residual> // 0..63
2626
static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
2727
int8_t *pVect1 = (int8_t *)pVect1v;
2828
int8_t *pVect2 = (int8_t *)pVect2v;
@@ -59,13 +59,13 @@ static inline int INT8_InnerProductImp(const void *pVect1v, const void *pVect2v,
5959
return _mm512_reduce_add_epi32(sum);
6060
}
6161

62-
template <unsigned char residual> // 0..64
62+
template <unsigned char residual> // 0..63
6363
float INT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
6464
size_t dimension) {
6565

6666
return 1 - INT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
6767
}
68-
template <unsigned char residual> // 0..64
68+
template <unsigned char residual> // 0..63
6969
float INT8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
7070
size_t dimension) {
7171
float ip = INT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/*
2+
*Copyright Redis Ltd. 2021 - present
3+
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
4+
*the Server Side Public License v1 (SSPLv1).
5+
*/
6+
7+
#include "VecSim/spaces/space_includes.h"
8+
9+
static inline void InnerProductStep(uint8_t *&pVect1, uint8_t *&pVect2, __m512i &sum) {
10+
__m512i va = _mm512_loadu_epi8(pVect1); // AVX512BW
11+
pVect1 += 64;
12+
13+
__m512i vb = _mm512_loadu_epi8(pVect2); // AVX512BW
14+
pVect2 += 64;
15+
16+
__m512i va_lo = _mm512_unpacklo_epi8(va, _mm512_setzero_si512()); // AVX512BW
17+
__m512i vb_lo = _mm512_unpacklo_epi8(vb, _mm512_setzero_si512());
18+
sum = _mm512_dpwssd_epi32(sum, va_lo, vb_lo);
19+
20+
__m512i va_hi = _mm512_unpackhi_epi8(va, _mm512_setzero_si512()); // AVX512BW
21+
__m512i vb_hi = _mm512_unpackhi_epi8(vb, _mm512_setzero_si512());
22+
sum = _mm512_dpwssd_epi32(sum, va_hi, vb_hi);
23+
24+
// _mm512_dpwssd_epi32(src, a, b)
25+
// Multiply groups of 2 adjacent pairs of signed 16-bit integers in `a` with corresponding
26+
// 16-bit integers in `b`, producing 2 intermediate signed 32-bit results. Sum these 2 results
27+
// with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
28+
}
29+
30+
template <unsigned char residual> // 0..63
31+
static inline int UINT8_InnerProductImp(const void *pVect1v, const void *pVect2v,
32+
size_t dimension) {
33+
uint8_t *pVect1 = (uint8_t *)pVect1v;
34+
uint8_t *pVect2 = (uint8_t *)pVect2v;
35+
36+
const uint8_t *pEnd1 = pVect1 + dimension;
37+
38+
__m512i sum = _mm512_setzero_epi32();
39+
40+
// Deal with remainder first.
41+
if constexpr (residual) {
42+
if constexpr (residual < 32) {
43+
constexpr __mmask32 mask = (1LU << residual) - 1;
44+
__m256i temp_a = _mm256_maskz_loadu_epi8(mask, pVect1);
45+
__m512i va = _mm512_cvtepu8_epi16(temp_a);
46+
47+
__m256i temp_b = _mm256_maskz_loadu_epi8(mask, pVect2);
48+
__m512i vb = _mm512_cvtepu8_epi16(temp_b);
49+
50+
sum = _mm512_dpwssd_epi32(sum, va, vb);
51+
} else if constexpr (residual == 32) {
52+
__m256i temp_a = _mm256_loadu_epi8(pVect1);
53+
__m512i va = _mm512_cvtepu8_epi16(temp_a);
54+
55+
__m256i temp_b = _mm256_loadu_epi8(pVect2);
56+
__m512i vb = _mm512_cvtepu8_epi16(temp_b);
57+
58+
sum = _mm512_dpwssd_epi32(sum, va, vb);
59+
} else {
60+
constexpr __mmask64 mask = (1LU << residual) - 1;
61+
__m512i va = _mm512_maskz_loadu_epi8(mask, pVect1);
62+
__m512i vb = _mm512_maskz_loadu_epi8(mask, pVect2);
63+
64+
__m512i va_lo = _mm512_unpacklo_epi8(va, _mm512_setzero_si512());
65+
__m512i vb_lo = _mm512_unpacklo_epi8(vb, _mm512_setzero_si512());
66+
sum = _mm512_dpwssd_epi32(sum, va_lo, vb_lo);
67+
68+
__m512i va_hi = _mm512_unpackhi_epi8(va, _mm512_setzero_si512());
69+
__m512i vb_hi = _mm512_unpackhi_epi8(vb, _mm512_setzero_si512());
70+
sum = _mm512_dpwssd_epi32(sum, va_hi, vb_hi);
71+
}
72+
pVect1 += residual;
73+
pVect2 += residual;
74+
75+
// We dealt with the residual part.
76+
// We are left with some multiple of 64-uint_8 (might be 0).
77+
while (pVect1 < pEnd1) {
78+
InnerProductStep(pVect1, pVect2, sum);
79+
}
80+
} else {
81+
// We have no residual, we have some non-zero multiple of 64-uint_8.
82+
do {
83+
InnerProductStep(pVect1, pVect2, sum);
84+
} while (pVect1 < pEnd1);
85+
}
86+
87+
return _mm512_reduce_add_epi32(sum);
88+
}
89+
90+
template <unsigned char residual> // 0..63
91+
float UINT8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
92+
size_t dimension) {
93+
94+
return 1 - UINT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
95+
}
96+
template <unsigned char residual> // 0..63
97+
float UINT8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVect1v, const void *pVect2v,
98+
size_t dimension) {
99+
float ip = UINT8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
100+
float norm_v1 =
101+
*reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect1v) + dimension);
102+
float norm_v2 =
103+
*reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect2v) + dimension);
104+
return 1.0f - ip / (norm_v1 * norm_v2);
105+
}

src/VecSim/spaces/IP_space.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,4 +252,62 @@ dist_func_t<float> Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
252252
#endif // __x86_64__
253253
return ret_dist_func;
254254
}
255+
256+
dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment,
257+
const void *arch_opt) {
258+
unsigned char dummy_alignment;
259+
if (alignment == nullptr) {
260+
alignment = &dummy_alignment;
261+
}
262+
263+
dist_func_t<float> ret_dist_func = UINT8_InnerProduct;
264+
// Optimizations assume at least 32 uint8. If we have less, we use the naive implementation.
265+
if (dim < 32) {
266+
return ret_dist_func;
267+
}
268+
#ifdef CPU_FEATURES_ARCH_X86_64
269+
auto features = (arch_opt == nullptr)
270+
? cpu_features::GetX86Info().features
271+
: *static_cast<const cpu_features::X86Features *>(arch_opt);
272+
#ifdef OPT_AVX512_F_BW_VL_VNNI
273+
if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
274+
if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
275+
*alignment = 32 * sizeof(uint8_t); // align to 256 bits.
276+
return Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(dim);
277+
}
278+
#endif
279+
#endif // __x86_64__
280+
return ret_dist_func;
281+
}
282+
283+
dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment,
284+
const void *arch_opt) {
285+
unsigned char dummy_alignment;
286+
if (alignment == nullptr) {
287+
alignment = &dummy_alignment;
288+
}
289+
290+
dist_func_t<float> ret_dist_func = UINT8_Cosine;
291+
// Optimizations assume at least 32 uint8. If we have less, we use the naive implementation.
292+
if (dim < 32) {
293+
return ret_dist_func;
294+
}
295+
#ifdef CPU_FEATURES_ARCH_X86_64
296+
auto features = (arch_opt == nullptr)
297+
? cpu_features::GetX86Info().features
298+
: *static_cast<const cpu_features::X86Features *>(arch_opt);
299+
#ifdef OPT_AVX512_F_BW_VL_VNNI
300+
if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
301+
// For uint8 vectors with cosine distance, the extra float for the norm shifts alignment to
302+
// `(dim + sizeof(float)) % 32`.
303+
// Vectors satisfying this have a residual, causing offset loads during calculation.
304+
// To avoid complexity, we skip alignment here, assuming the performance impact is
305+
// negligible.
306+
return Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim);
307+
}
308+
#endif
309+
#endif // __x86_64__
310+
return ret_dist_func;
311+
}
312+
255313
} // namespace spaces

src/VecSim/spaces/IP_space.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,8 @@ dist_func_t<float> IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nu
2020
const void *arch_opt = nullptr);
2121
dist_func_t<float> Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
2222
const void *arch_opt = nullptr);
23+
dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
24+
const void *arch_opt = nullptr);
25+
dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr,
26+
const void *arch_opt = nullptr);
2327
} // namespace spaces

0 commit comments

Comments
 (0)