From f9f2b2a47f2885024e87006a32584350624f506a Mon Sep 17 00:00:00 2001 From: Srinivas Lade Date: Sun, 21 Sep 2025 22:16:01 -0700 Subject: [PATCH] Save temp --- Cargo.toml | 2 +- src/daft-core/Cargo.toml | 5 +-- src/daft-core/src/array/ops/hash.rs | 42 ++++++++++++++++++------- src/daft-core/src/kernels/hashing.rs | 16 +++++----- src/daft-core/src/python/series.rs | 10 +++++- src/daft-core/src/series/ops/minhash.rs | 35 +++++++++++++++++++++ src/daft-functions/src/hash.rs | 2 +- src/daft-functions/src/minhash.rs | 2 +- src/daft-hash/src/lib.rs | 8 +++-- 9 files changed, 92 insertions(+), 30 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f893e99906..35690ed2a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -338,7 +338,7 @@ typed-builder = "0.20.0" typetag = "0.2.18" url = "2.4.0" uuid = {version = "1.17.0", features = ["v4"]} -xxhash-rust = "0.8.12" +xxhash-rust = {version = "0.8.15", default-features = false, features = ["xxh64", "xxh3", "xxh32"]} [workspace.dependencies.arrow2] features = ["serde_types"] diff --git a/src/daft-core/Cargo.toml b/src/daft-core/Cargo.toml index b67f8c97ae..3a430b076d 100644 --- a/src/daft-core/Cargo.toml +++ b/src/daft-core/Cargo.toml @@ -56,10 +56,7 @@ pyo3 = {workspace = true, optional = true} rand = "0.9.1" serde = {workspace = true} sketches-ddsketch = {workspace = true} - -[dependencies.xxhash-rust] -features = ["xxh3", "const_xxh3", "xxh64"] -version = "0.8.5" +xxhash-rust = {workspace = true} [dev-dependencies] rstest = {workspace = true} diff --git a/src/daft-core/src/array/ops/hash.rs b/src/daft-core/src/array/ops/hash.rs index f98dd64311..0b7e917cd9 100644 --- a/src/daft-core/src/array/ops/hash.rs +++ b/src/daft-core/src/array/ops/hash.rs @@ -7,7 +7,11 @@ use arrow2::types::Index; use common_error::{DaftError, DaftResult}; use daft_hash::{HashFunctionKind, MurBuildHasher, Sha1Hasher}; use daft_schema::{dtype::DataType, field::Field}; -use xxhash_rust::xxh3::{xxh3_64, xxh3_64_with_seed}; +use xxhash_rust::{ + xxh3::{xxh3_64, xxh3_64_with_seed}, + xxh32::{xxh3_32, xxh3_32_with_seed}, + xxh64::xxh64, +}; use super::as_arrow::AsArrow; use crate::{ @@ -28,7 +32,7 @@ where T: DaftPrimitiveType, { pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult { - self.hash_with(seed, HashFunctionKind::XxHash) + self.hash_with(seed, HashFunctionKind::XxHash64) } pub fn hash_with( &self, @@ -44,7 +48,7 @@ where impl Utf8Array { pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult { - self.hash_with(seed, HashFunctionKind::XxHash) + self.hash_with(seed, HashFunctionKind::XxHash64) } pub fn hash_with( &self, @@ -60,7 +64,7 @@ impl Utf8Array { impl BinaryArray { pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult { - self.hash_with(seed, HashFunctionKind::XxHash) + self.hash_with(seed, HashFunctionKind::XxHash64) } pub fn hash_with( &self, @@ -76,7 +80,7 @@ impl BinaryArray { impl FixedSizeBinaryArray { pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult { - self.hash_with(seed, HashFunctionKind::XxHash) + self.hash_with(seed, HashFunctionKind::XxHash64) } pub fn hash_with( &self, @@ -92,7 +96,7 @@ impl FixedSizeBinaryArray { impl BooleanArray { pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult { - self.hash_with(seed, HashFunctionKind::XxHash) + self.hash_with(seed, HashFunctionKind::XxHash64) } pub fn hash_with( &self, @@ -108,7 +112,7 @@ impl BooleanArray { impl NullArray { pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult { - self.hash_with(seed, HashFunctionKind::XxHash) + self.hash_with(seed, HashFunctionKind::XxHash64) } pub fn hash_with( &self, @@ -159,13 +163,27 @@ fn hash_list( .collect(); match hash_function { - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { if let Some(cur_seed) = cur_seed_opt { Some(xxh3_64_with_seed(&child_bytes, cur_seed)) } else { Some(xxh3_64(&child_bytes)) } } + HashFunctionKind::XxHash32 => { + if let Some(cur_seed) = cur_seed_opt { + Some(xxh3_32_with_seed(&child_bytes, cur_seed)) + } else { + Some(xxh3_32(&child_bytes)) + } + } + HashFunctionKind::XxHash3 => { + if let Some(cur_seed) = cur_seed_opt { + Some(xxh3_with_seed(&child_bytes, cur_seed)) + } else { + Some(xxh3(&child_bytes)) + } + } HashFunctionKind::MurmurHash3 => { // Use 42 as default seed, // refer to: https://github.com/Eventual-Inc/Daft/blob/7be4b1ff9ed3fdc3a45947beefab7e7291cd3be7/src/daft-hash/src/lib.rs#L18 @@ -199,7 +217,7 @@ fn hash_list( let end = (offsets[i as usize + 1] as usize) * OFFSET; match hash_function { - HashFunctionKind::XxHash => Some(xxh3_64(&child_bytes[start..end])), + HashFunctionKind::XxHash64 => Some(xxh3_64(&child_bytes[start..end])), HashFunctionKind::MurmurHash3 => { // Use 42 as default seed, // refer to: https://github.com/Eventual-Inc/Daft/blob/7be4b1ff9ed3fdc3a45947beefab7e7291cd3be7/src/daft-hash/src/lib.rs#L18 @@ -222,7 +240,7 @@ fn hash_list( impl ListArray { pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult { - self.hash_with(seed, HashFunctionKind::XxHash) + self.hash_with(seed, HashFunctionKind::XxHash64) } pub fn hash_with( &self, @@ -242,7 +260,7 @@ impl ListArray { impl FixedSizeListArray { pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult { - self.hash_with(seed, HashFunctionKind::XxHash) + self.hash_with(seed, HashFunctionKind::XxHash64) } pub fn hash_with( &self, @@ -265,7 +283,7 @@ impl FixedSizeListArray { impl StructArray { pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult { - self.hash_with(seed, HashFunctionKind::XxHash) + self.hash_with(seed, HashFunctionKind::XxHash64) } pub fn hash_with( diff --git a/src/daft-core/src/kernels/hashing.rs b/src/daft-core/src/kernels/hashing.rs index 2914a59f8a..984c012e50 100644 --- a/src/daft-core/src/kernels/hashing.rs +++ b/src/daft-core/src/kernels/hashing.rs @@ -21,7 +21,7 @@ fn hash_primitive( hash_function: HashFunctionKind, ) -> PrimitiveArray { match hash_function { - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { const NULL_HASH: u64 = const_xxh3::xxh3_64(b""); let hashes = if let Some(seed) = seed { array @@ -91,7 +91,7 @@ fn hash_boolean( hash_function: HashFunctionKind, ) -> PrimitiveArray { match hash_function { - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { const NULL_HASH: u64 = const_xxh3::xxh3_64(b""); const FALSE_HASH: u64 = const_xxh3::xxh3_64(b"0"); const TRUE_HASH: u64 = const_xxh3::xxh3_64(b"1"); @@ -174,7 +174,7 @@ fn hash_null( hash_function: HashFunctionKind, ) -> PrimitiveArray { match hash_function { - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { const NULL_HASH: u64 = const_xxh3::xxh3_64(b""); let hashes = if let Some(seed) = seed { seed.values_iter() @@ -215,7 +215,7 @@ fn hash_binary( hash_function: HashFunctionKind, ) -> PrimitiveArray { match hash_function { - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { let hashes = if let Some(seed) = seed { array .values_iter() @@ -259,7 +259,7 @@ fn hash_fixed_size_binary( hash_function: HashFunctionKind, ) -> PrimitiveArray { match hash_function { - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { let hashes = if let Some(seed) = seed { array .values_iter() @@ -303,7 +303,7 @@ fn hash_utf8( hash_function: HashFunctionKind, ) -> PrimitiveArray { match hash_function { - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { let hashes = if let Some(seed) = seed { array .values_iter() @@ -353,7 +353,7 @@ fn hash_timestamp_with_timezone( // For timestamps with timezone, we combine the timestamp value with the timezone string // to ensure that the same instant in different timezones produces different hashes match hash_function { - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { const NULL_HASH: u64 = const_xxh3::xxh3_64(b""); let hashes = if let Some(seed) = seed { array @@ -492,7 +492,7 @@ fn hash_decimal( }; match hash_function { - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { const NULL_HASH: u64 = const_xxh3::xxh3_64(b""); let hashes = if let Some(seed) = seed { array diff --git a/src/daft-core/src/python/series.rs b/src/daft-core/src/python/series.rs index f4e29d0ea9..112376d424 100644 --- a/src/daft-core/src/python/series.rs +++ b/src/daft-core/src/python/series.rs @@ -240,10 +240,18 @@ impl PySeries { let hasher = MurBuildHasher::new(seed); self.series.minhash(num_hashes, ngram_size, seed, &hasher) } - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { let hasher = xxhash_rust::xxh64::Xxh64Builder::new(seed as u64); self.series.minhash(num_hashes, ngram_size, seed, &hasher) } + HashFunctionKind::XxHash32 => { + let hasher = xxhash_rust::xxh32::Xxh32::new(seed as u32); + self.series.minhash(num_hashes, ngram_size, seed, &hasher) + } + HashFunctionKind::XxHash3 => { + let hasher = xxhash_rust::xxh3::Xxh3Builder::new().with_seed(seed as u64); + self.series.minhash(num_hashes, ngram_size, seed, &hasher) + } HashFunctionKind::Sha1 => { let hasher = BuildHasherDefault::::default(); self.series.minhash(num_hashes, ngram_size, seed, &hasher) diff --git a/src/daft-core/src/series/ops/minhash.rs b/src/daft-core/src/series/ops/minhash.rs index bbcff86313..ccacfb2ca7 100644 --- a/src/daft-core/src/series/ops/minhash.rs +++ b/src/daft-core/src/series/ops/minhash.rs @@ -1,4 +1,5 @@ use common_error::{DaftError, DaftResult}; +use xxhash_rust::xxh32::Xxh32; use crate::{ array::ops::DaftMinHash, @@ -6,6 +7,40 @@ use crate::{ series::{IntoSeries, Series}, }; +struct Xxh32Wrapper { + inner: Xxh32, +} + +impl std::hash::Hasher for Xxh32Wrapper { + fn write(&mut self, bytes: &[u8]) { + self.inner.update(bytes); + } + + fn finish(&self) -> u64 { + self.inner.digest() as u64 + } +} + +struct Xxh32BuildHasher { + seed: u32, +} + +impl Xxh32BuildHasher { + pub fn new(seed: u32) -> Self { + Self { seed } + } +} + +impl std::hash::BuildHasher for Xxh32BuildHasher { + type Hasher = Xxh32Wrapper; + + fn build_hasher(&self) -> Self::Hasher { + Xxh32Wrapper { + inner: Xxh32::new(self.seed), + } + } +} + impl Series { pub fn minhash( &self, diff --git a/src/daft-functions/src/hash.rs b/src/daft-functions/src/hash.rs index 24818dfa08..8f44fc6277 100644 --- a/src/daft-functions/src/hash.rs +++ b/src/daft-functions/src/hash.rs @@ -34,7 +34,7 @@ impl ScalarUDF for HashFunction { let hash_function = hash_function .map(|s| s.parse::()) .transpose()? - .unwrap_or(HashFunctionKind::XxHash); + .unwrap_or(HashFunctionKind::XxHash64); if let Some(seed) = seed { match seed.len() { diff --git a/src/daft-functions/src/minhash.rs b/src/daft-functions/src/minhash.rs index 5bfff0e6c6..b1eb412fb4 100644 --- a/src/daft-functions/src/minhash.rs +++ b/src/daft-functions/src/minhash.rs @@ -46,7 +46,7 @@ impl ScalarUDF for MinHashFunction { let hasher = MurBuildHasher::new(seed); input.minhash(num_hashes, ngram_size, seed, &hasher) } - HashFunctionKind::XxHash => { + HashFunctionKind::XxHash64 => { let hasher = xxhash_rust::xxh64::Xxh64Builder::new(seed as u64); input.minhash(num_hashes, ngram_size, seed, &hasher) } diff --git a/src/daft-hash/src/lib.rs b/src/daft-hash/src/lib.rs index 42ad539aa9..e87e56cfc8 100644 --- a/src/daft-hash/src/lib.rs +++ b/src/daft-hash/src/lib.rs @@ -54,7 +54,9 @@ impl Hasher for Sha1Hasher { pub enum HashFunctionKind { #[default] MurmurHash3, - XxHash, + XxHash64, + XxHash32, + XxHash3, Sha1, } @@ -64,7 +66,9 @@ impl FromStr for HashFunctionKind { fn from_str(s: &str) -> Result { match s.to_lowercase().as_str() { "murmurhash3" => Ok(Self::MurmurHash3), - "xxhash" => Ok(Self::XxHash), + "xxhash64" => Ok(Self::XxHash64), + "xxhash32" => Ok(Self::XxHash32), + "xxhash3" => Ok(Self::XxHash3), "sha1" => Ok(Self::Sha1), _ => Err(DaftError::ValueError(format!( "Invalid hash function: {}",