Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ typed-builder = "0.20.0"
typetag = "0.2.18"
url = "2.4.0"
uuid = {version = "1.17.0", features = ["v4"]}
xxhash-rust = "0.8.12"
xxhash-rust = {version = "0.8.15", default-features = false, features = ["xxh64", "xxh3", "xxh32"]}

[workspace.dependencies.arrow2]
features = ["serde_types"]
Expand Down
5 changes: 1 addition & 4 deletions src/daft-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,7 @@ pyo3 = {workspace = true, optional = true}
rand = "0.9.1"
serde = {workspace = true}
sketches-ddsketch = {workspace = true}

[dependencies.xxhash-rust]
features = ["xxh3", "const_xxh3", "xxh64"]
version = "0.8.5"
xxhash-rust = {workspace = true}

[dev-dependencies]
rstest = {workspace = true}
Expand Down
42 changes: 30 additions & 12 deletions src/daft-core/src/array/ops/hash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ use arrow2::types::Index;
use common_error::{DaftError, DaftResult};
use daft_hash::{HashFunctionKind, MurBuildHasher, Sha1Hasher};
use daft_schema::{dtype::DataType, field::Field};
use xxhash_rust::xxh3::{xxh3_64, xxh3_64_with_seed};
use xxhash_rust::{
xxh3::{xxh3_64, xxh3_64_with_seed},
xxh32::{xxh3_32, xxh3_32_with_seed},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

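syntax: Incorrect import: xxh3_32 and xxh3_32_with_seed don't exist in xxhash-rust. The xxh32 module exposes a single function, xxh32(input, seed), which always takes the seed as its second argument, so there is no separate xxh32_with_seed; call sites without a seed should pass an explicit default seed.

Suggested change
xxh32::{xxh3_32, xxh3_32_with_seed},
xxh32::xxh32,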

xxh64::xxh64,
};

use super::as_arrow::AsArrow;
use crate::{
Expand All @@ -28,7 +32,7 @@ where
T: DaftPrimitiveType,
{
pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult<UInt64Array> {
self.hash_with(seed, HashFunctionKind::XxHash)
self.hash_with(seed, HashFunctionKind::XxHash64)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logic: Default hash function should be XxHash3 instead of XxHash64 to align with PR description

}
pub fn hash_with(
&self,
Expand All @@ -44,7 +48,7 @@ where

impl Utf8Array {
pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult<UInt64Array> {
self.hash_with(seed, HashFunctionKind::XxHash)
self.hash_with(seed, HashFunctionKind::XxHash64)
}
pub fn hash_with(
&self,
Expand All @@ -60,7 +64,7 @@ impl Utf8Array {

impl BinaryArray {
pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult<UInt64Array> {
self.hash_with(seed, HashFunctionKind::XxHash)
self.hash_with(seed, HashFunctionKind::XxHash64)
}
pub fn hash_with(
&self,
Expand All @@ -76,7 +80,7 @@ impl BinaryArray {

impl FixedSizeBinaryArray {
pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult<UInt64Array> {
self.hash_with(seed, HashFunctionKind::XxHash)
self.hash_with(seed, HashFunctionKind::XxHash64)
}
pub fn hash_with(
&self,
Expand All @@ -92,7 +96,7 @@ impl FixedSizeBinaryArray {

impl BooleanArray {
pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult<UInt64Array> {
self.hash_with(seed, HashFunctionKind::XxHash)
self.hash_with(seed, HashFunctionKind::XxHash64)
}
pub fn hash_with(
&self,
Expand All @@ -108,7 +112,7 @@ impl BooleanArray {

impl NullArray {
pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult<UInt64Array> {
self.hash_with(seed, HashFunctionKind::XxHash)
self.hash_with(seed, HashFunctionKind::XxHash64)
}
pub fn hash_with(
&self,
Expand Down Expand Up @@ -159,13 +163,27 @@ fn hash_list(
.collect();

match hash_function {
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
if let Some(cur_seed) = cur_seed_opt {
Some(xxh3_64_with_seed(&child_bytes, cur_seed))
} else {
Some(xxh3_64(&child_bytes))
}
}
HashFunctionKind::XxHash32 => {
if let Some(cur_seed) = cur_seed_opt {
Some(xxh3_32_with_seed(&child_bytes, cur_seed))
} else {
Some(xxh3_32(&child_bytes))
}
}
HashFunctionKind::XxHash3 => {
if let Some(cur_seed) = cur_seed_opt {
Some(xxh3_with_seed(&child_bytes, cur_seed))
} else {
Some(xxh3(&child_bytes))
}
}
HashFunctionKind::MurmurHash3 => {
// Use 42 as default seed,
// refer to: https://github.com/Eventual-Inc/Daft/blob/7be4b1ff9ed3fdc3a45947beefab7e7291cd3be7/src/daft-hash/src/lib.rs#L18
Expand Down Expand Up @@ -199,7 +217,7 @@ fn hash_list(
let end = (offsets[i as usize + 1] as usize) * OFFSET;

match hash_function {
HashFunctionKind::XxHash => Some(xxh3_64(&child_bytes[start..end])),
HashFunctionKind::XxHash64 => Some(xxh3_64(&child_bytes[start..end])),
HashFunctionKind::MurmurHash3 => {
// Use 42 as default seed,
// refer to: https://github.com/Eventual-Inc/Daft/blob/7be4b1ff9ed3fdc3a45947beefab7e7291cd3be7/src/daft-hash/src/lib.rs#L18
Expand All @@ -222,7 +240,7 @@ fn hash_list(

impl ListArray {
pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult<UInt64Array> {
self.hash_with(seed, HashFunctionKind::XxHash)
self.hash_with(seed, HashFunctionKind::XxHash64)
}
pub fn hash_with(
&self,
Expand All @@ -242,7 +260,7 @@ impl ListArray {

impl FixedSizeListArray {
pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult<UInt64Array> {
self.hash_with(seed, HashFunctionKind::XxHash)
self.hash_with(seed, HashFunctionKind::XxHash64)
}
pub fn hash_with(
&self,
Expand All @@ -265,7 +283,7 @@ impl FixedSizeListArray {

impl StructArray {
pub fn hash(&self, seed: Option<&UInt64Array>) -> DaftResult<UInt64Array> {
self.hash_with(seed, HashFunctionKind::XxHash)
self.hash_with(seed, HashFunctionKind::XxHash64)
}

pub fn hash_with(
Expand Down
16 changes: 8 additions & 8 deletions src/daft-core/src/kernels/hashing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fn hash_primitive<T: NativeType>(
hash_function: HashFunctionKind,
) -> PrimitiveArray<u64> {
match hash_function {
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
const NULL_HASH: u64 = const_xxh3::xxh3_64(b"");
let hashes = if let Some(seed) = seed {
array
Expand Down Expand Up @@ -91,7 +91,7 @@ fn hash_boolean(
hash_function: HashFunctionKind,
) -> PrimitiveArray<u64> {
match hash_function {
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
const NULL_HASH: u64 = const_xxh3::xxh3_64(b"");
const FALSE_HASH: u64 = const_xxh3::xxh3_64(b"0");
const TRUE_HASH: u64 = const_xxh3::xxh3_64(b"1");
Expand Down Expand Up @@ -174,7 +174,7 @@ fn hash_null(
hash_function: HashFunctionKind,
) -> PrimitiveArray<u64> {
match hash_function {
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
const NULL_HASH: u64 = const_xxh3::xxh3_64(b"");
let hashes = if let Some(seed) = seed {
seed.values_iter()
Expand Down Expand Up @@ -215,7 +215,7 @@ fn hash_binary<O: Offset>(
hash_function: HashFunctionKind,
) -> PrimitiveArray<u64> {
match hash_function {
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
let hashes = if let Some(seed) = seed {
array
.values_iter()
Expand Down Expand Up @@ -259,7 +259,7 @@ fn hash_fixed_size_binary(
hash_function: HashFunctionKind,
) -> PrimitiveArray<u64> {
match hash_function {
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
let hashes = if let Some(seed) = seed {
array
.values_iter()
Expand Down Expand Up @@ -303,7 +303,7 @@ fn hash_utf8<O: Offset>(
hash_function: HashFunctionKind,
) -> PrimitiveArray<u64> {
match hash_function {
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
let hashes = if let Some(seed) = seed {
array
.values_iter()
Expand Down Expand Up @@ -353,7 +353,7 @@ fn hash_timestamp_with_timezone(
// For timestamps with timezone, we combine the timestamp value with the timezone string
// to ensure that the same instant in different timezones produces different hashes
match hash_function {
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
const NULL_HASH: u64 = const_xxh3::xxh3_64(b"");
let hashes = if let Some(seed) = seed {
array
Expand Down Expand Up @@ -492,7 +492,7 @@ fn hash_decimal(
};

match hash_function {
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
const NULL_HASH: u64 = const_xxh3::xxh3_64(b"");
let hashes = if let Some(seed) = seed {
array
Expand Down
10 changes: 9 additions & 1 deletion src/daft-core/src/python/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,10 +240,18 @@ impl PySeries {
let hasher = MurBuildHasher::new(seed);
self.series.minhash(num_hashes, ngram_size, seed, &hasher)
}
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
let hasher = xxhash_rust::xxh64::Xxh64Builder::new(seed as u64);
self.series.minhash(num_hashes, ngram_size, seed, &hasher)
}
HashFunctionKind::XxHash32 => {
let hasher = xxhash_rust::xxh32::Xxh32::new(seed as u32);
self.series.minhash(num_hashes, ngram_size, seed, &hasher)
}
HashFunctionKind::XxHash3 => {
let hasher = xxhash_rust::xxh3::Xxh3Builder::new().with_seed(seed as u64);
self.series.minhash(num_hashes, ngram_size, seed, &hasher)
}
HashFunctionKind::Sha1 => {
let hasher = BuildHasherDefault::<Sha1Hasher>::default();
self.series.minhash(num_hashes, ngram_size, seed, &hasher)
Expand Down
35 changes: 35 additions & 0 deletions src/daft-core/src/series/ops/minhash.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,46 @@
use common_error::{DaftError, DaftResult};
use xxhash_rust::xxh32::Xxh32;

use crate::{
array::ops::DaftMinHash,
datatypes::DataType,
series::{IntoSeries, Series},
};

/// Adapter that lets an [`Xxh32`] streaming state be driven through the
/// standard [`std::hash::Hasher`] interface.
struct Xxh32Wrapper {
    /// Underlying streaming state that receives every byte slice passed to `write`.
    inner: Xxh32,
}

impl std::hash::Hasher for Xxh32Wrapper {
    /// Reports the digest of everything written so far.
    ///
    /// `Hasher::finish` must return a `u64`, so the digest produced by the
    /// inner state is widened losslessly.
    fn finish(&self) -> u64 {
        let digest = self.inner.digest();
        u64::from(digest)
    }

    /// Forwards `bytes` to the inner state.
    fn write(&mut self, bytes: &[u8]) {
        self.inner.update(bytes);
    }
}

/// Factory for [`Xxh32Wrapper`] hashers that all start from the same seed.
struct Xxh32BuildHasher {
    /// Seed handed to `Xxh32::new` for every hasher this builder creates.
    seed: u32,
}

impl Xxh32BuildHasher {
    /// Creates a builder whose hashers are initialised with `seed`.
    pub fn new(seed: u32) -> Self {
        Self { seed }
    }
}

impl std::hash::BuildHasher for Xxh32BuildHasher {
    type Hasher = Xxh32Wrapper;

    /// Returns a fresh hasher whose inner state starts from this builder's seed.
    fn build_hasher(&self) -> Self::Hasher {
        let inner = Xxh32::new(self.seed);
        Xxh32Wrapper { inner }
    }
}

impl Series {
pub fn minhash(
&self,
Expand Down
2 changes: 1 addition & 1 deletion src/daft-functions/src/hash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ impl ScalarUDF for HashFunction {
let hash_function = hash_function
.map(|s| s.parse::<HashFunctionKind>())
.transpose()?
.unwrap_or(HashFunctionKind::XxHash);
.unwrap_or(HashFunctionKind::XxHash64);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logic: Default hash function should be XxHash3 instead of XxHash64 to maintain consistency with the PR description that states "default xxhash will be an alias for xxhash3"

Suggested change
.unwrap_or(HashFunctionKind::XxHash64);
.unwrap_or(HashFunctionKind::XxHash3);


if let Some(seed) = seed {
match seed.len() {
Expand Down
2 changes: 1 addition & 1 deletion src/daft-functions/src/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ impl ScalarUDF for MinHashFunction {
let hasher = MurBuildHasher::new(seed);
input.minhash(num_hashes, ngram_size, seed, &hasher)
}
HashFunctionKind::XxHash => {
HashFunctionKind::XxHash64 => {
let hasher = xxhash_rust::xxh64::Xxh64Builder::new(seed as u64);
input.minhash(num_hashes, ngram_size, seed, &hasher)
}
Expand Down
8 changes: 6 additions & 2 deletions src/daft-hash/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ impl Hasher for Sha1Hasher {
pub enum HashFunctionKind {
#[default]
MurmurHash3,
XxHash,
XxHash64,
XxHash32,
XxHash3,
Sha1,
}

Expand All @@ -64,7 +66,9 @@ impl FromStr for HashFunctionKind {
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s.to_lowercase().as_str() {
"murmurhash3" => Ok(Self::MurmurHash3),
"xxhash" => Ok(Self::XxHash),
"xxhash64" => Ok(Self::XxHash64),
"xxhash32" => Ok(Self::XxHash32),
"xxhash3" => Ok(Self::XxHash3),
"sha1" => Ok(Self::Sha1),
_ => Err(DaftError::ValueError(format!(
"Invalid hash function: {}",
Expand Down