@@ -1,6 +1,8 @@
+ import hashlib
+ import json
+
from collections.abc import Iterator
from datetime import date, datetime, timezone
- from typing import Optional

import dlt
import pyarrow as pa
@@ -33,10 +35,68 @@ def add_partition_column(batch: pa.RecordBatch, partition_key: str) -> pa.Record
    return new_batch


+ def add_row_hash(
+     batch: pa.RecordBatch, key_columns: list[str], hash_column_name: str = "row_hash"
+ ) -> pa.RecordBatch:
+     """
+     Add a hash column to a PyArrow RecordBatch based on selected columns.
+
+     Args:
+         batch: PyArrow RecordBatch to process
+         key_columns: List of column names to include in hash
+         hash_column_name: Name for the new hash column
+
+     Returns:
+         PyArrow RecordBatch with added hash column
+     """
+     # Filter out columns that don't exist in the batch
+     existing_columns = [col for col in key_columns if col in batch.schema.names]
+
+     if not existing_columns:
+         raise ValueError(
+             f"None of the key columns {key_columns} exist in the batch schema"
+         )
+
+     # Initialize a list to store hash values
+     hash_values = []
+
+     # Process each row directly using PyArrow
+     for i in range(batch.num_rows):
+         # Create a dictionary for this row
+         row_dict = {}
+
+         # Extract values for each column in this row
+         for col in existing_columns:
+             # Get the column array
+             col_array = batch.column(batch.schema.get_field_index(col))
+             # Get the value at this row index
+             value = col_array[i].as_py()
+
+             # Only include non-None values
+             if value is not None:
+                 row_dict[col] = value
+
+         # Convert to sorted JSON string for consistent hashing
+         json_str = json.dumps(row_dict, sort_keys=True, default=str)
+
+         # Create hash
+         hash_obj = hashlib.md5(json_str.encode())
+         hash_value = hash_obj.hexdigest()
+         hash_values.append(hash_value)
+
+     # Create PyArrow array from hash values
+     hash_array = pa.array(hash_values, type=pa.string())
+
+     # Add hash column to batch
+     new_batch = batch.append_column(hash_column_name, hash_array)
+     return new_batch
+
+
@dlt.transformer(standalone=True)
def read_parquet_custom(
    items: Iterator[FileItemDict],
-     partition_key: Optional[str] = None,
+     partition_key: str,
+     key_columns: list[str],
    batch_size: int = 64_000,
) -> Iterator[pa.RecordBatch]:
    """
@@ -45,6 +105,8 @@ def read_parquet_custom(

    Args:
        items (Iterator[FileItemDict]): Iterator over file items.
+         partition_key (str): Partition key to add to the data.
+         key_columns (list[str]): Columns to use for row hash calculation.
        batch_size (int, optional): Maximum number of rows to process per batch

    Yields:
@@ -54,8 +116,12 @@ def read_parquet_custom(
        with file_obj.open() as f:
            parquet_file = pq.ParquetFile(f)
            # Iterate over RecordBatch objects
-             for batch in parquet_file.iter_batches(batch_size=batch_size):
-                 # Create a new RecordBatch with the existing columns and the new column
-                 batch_with_metadata = add_partition_column(batch, partition_key)
+             for raw_batch in parquet_file.iter_batches(batch_size=batch_size):
+                 # Add partition column
+                 processed_batch = add_partition_column(raw_batch, partition_key)
+
+                 # Add row hash
+                 processed_batch = add_row_hash(processed_batch, key_columns)
+
                # Yield the enriched RecordBatch
-                 yield batch_with_metadata
+                 yield processed_batch
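The new helper can be sanity-checked against a small in-memory batch. A minimal sketch, assuming `add_row_hash` is imported from the module touched in this diff; the column names below are invented for illustration only:

```python
import pyarrow as pa

# from <module in this diff> import add_row_hash  # module path not shown here

# Hypothetical two-row batch; column names are illustrative only.
batch = pa.RecordBatch.from_pydict(
    {
        "customer_id": [1, 2],
        "updated_at": ["2024-01-01", None],
        "amount": [10.5, 20.0],
    }
)

hashed = add_row_hash(batch, key_columns=["customer_id", "updated_at"])

# A string "row_hash" column is appended; None values are skipped, so the
# second row is hashed on customer_id alone.
print(hashed.schema.names)
print(hashed.to_pydict()["row_hash"])
```

Because each row dict is serialized with `sort_keys=True` before hashing, the same key values always yield the same hash, independent of column order.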
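For reference, a rough sketch of how the updated transformer signature is wired into a pipeline now that `partition_key` and `key_columns` are required. It assumes the dlt filesystem resource and a local DuckDB destination; the bucket URL, glob, column names, and exact import paths are placeholders that depend on how this source is vendored:

```python
import dlt
from dlt.sources.filesystem import filesystem  # import path may differ in this repo

# from <module in this diff> import read_parquet_custom

# Placeholder bucket and glob.
files = filesystem(bucket_url="s3://my-bucket/events/", file_glob="**/*.parquet")

# Pipe the files into the custom reader; both new arguments must be supplied.
events = (files | read_parquet_custom(
    partition_key="event_date",
    key_columns=["event_id", "event_date"],
)).with_name("events")

# Optionally use the generated hash for deduplication on merge.
events.apply_hints(primary_key="row_hash", write_disposition="merge")

pipeline = dlt.pipeline(
    pipeline_name="parquet_ingest",
    destination="duckdb",
    dataset_name="raw",
)
print(pipeline.run(events))
```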