Commit 29f326d

Add row_hash via PyArrow to ingested data
1 parent 4193b8a commit 29f326d

File tree

  • opendata_stack_platform_project/opendata_stack_platform/dlt/sources/taxi_trip

2 files changed: +109 -22 lines changed
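
The core of the change is a deterministic per-row hash: for each record, the values of a small set of key columns are serialized to a sorted JSON string and hashed with MD5, and the resulting row_hash becomes the natural key for the bronze tables. A minimal sketch of that scheme, using made-up values rather than repo code:

import hashlib
import json

# Hypothetical key-column values for one trip record
row = {
    "pickup_datetime": "2024-01-01T00:15:00",
    "pu_location_id": 132,
    "do_location_id": 230,
    "partition_key": "2024-01",
}

# sort_keys makes the serialization, and therefore the hash, order-independent
row_hash = hashlib.md5(
    json.dumps(row, sort_keys=True, default=str).encode()
).hexdigest()
print(row_hash)  # 32-character hex digest, stable for identical key values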

opendata_stack_platform_project/opendata_stack_platform/dlt/sources/taxi_trip/__init__.py

Lines changed: 37 additions & 16 deletions
@@ -13,40 +13,57 @@
 BUCKET_URL = "s3://datalake"
 
 
-@dlt.source(name="taxi_trip_source")
-def taxi_trip_source(dataset_type: str, partition_key: Optional[str] = None) -> DltSource:
-    """Source for taxi trips data (yellow, green, or FHV) based on file path.
+def get_key_columns_for_dataset(dataset_type: str) -> list[str]:
+    """
+    Get the list of columns to use for row hash calculation based on dataset type.
 
     Args:
-        dataset_type: Type of dataset ('yellow', 'green', or 'fhvhv')
-        partition_key: Optional partition key for filtering data
+        dataset_type (str): Type of dataset ('yellow', 'green', or 'fhvhv')
 
     Returns:
-        DltSource: A data source for the specified taxi trip type
+        List[str]: List of column names to use for row hash
     """
     if dataset_type not in ["yellow", "green", "fhvhv"]:
         raise ValueError("dataset_type must be one of 'yellow', 'green', or 'fhvhv'.")
 
-    # Define natural key based on dataset type, using snake_case for DuckDB
     if dataset_type in ["yellow", "green"]:
         pickup_datetime = (
             "tpep_pickup_datetime" if dataset_type == "yellow" else "lpep_pickup_datetime"
         )
-        natural_key = (
-            "vendor_id",
+        return [
             pickup_datetime,
             "pu_location_id",
             "do_location_id",
             "partition_key",
-        )
+        ]
     else:  # fhvhv
-        natural_key = (
-            "hvfhs_license_num",
+        return [
             "pickup_datetime",
             "pu_location_id",
             "do_location_id",
             "partition_key",
-        )
+        ]
+
+
+@dlt.source(name="taxi_trip_source")
+def taxi_trip_source(dataset_type: str, partition_key: Optional[str] = None) -> DltSource:
+    """Source for taxi trips data (yellow, green, or FHV) based on file path.
+
+    Args:
+        dataset_type: Type of dataset ('yellow', 'green', or 'fhvhv')
+        partition_key: Optional partition key for filtering data
+
+    Returns:
+        DltSource: A data source for the specified taxi trip type
+    """
+    if dataset_type not in ["yellow", "green", "fhvhv"]:
+        raise ValueError("dataset_type must be one of 'yellow', 'green', or 'fhvhv'.")
+
+    # Get key columns for row hash from utility function
+    key_columns = get_key_columns_for_dataset(dataset_type)
+
+    # Natural key is always the row hash
+    natural_key = ["row_hash"]
 
     # Construct file glob pattern for the dataset type
     file_glob = constants.TAXI_TRIPS_RAW_KEY_TEMPLATE.format(
@@ -62,9 +79,13 @@ def taxi_trip_source(dataset_type: str, partition_key: Optional[str] = None) ->
         raw_files.add_filter(lambda item: partition_key[:-3] in item["file_name"])
 
     # Create source with transformations
-    source = (raw_files | read_parquet_custom(partition_key=partition_key)).with_name(
-        f"{dataset_type}_taxi_trip_bronze"
-    )
+    source = (
+        raw_files
+        | read_parquet_custom(
+            partition_key=partition_key,
+            key_columns=key_columns,
+        )
+    ).with_name(f"{dataset_type}_taxi_trip_bronze")
 
     # Apply write configuration hints
     source.apply_hints(
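
As a rough usage sketch of the new helper (the import path is assumed from the file location in this commit), the key column sets differ only in the pickup timestamp column, while the source itself now keys on row_hash:

from opendata_stack_platform.dlt.sources.taxi_trip import get_key_columns_for_dataset

print(get_key_columns_for_dataset("yellow"))
# ['tpep_pickup_datetime', 'pu_location_id', 'do_location_id', 'partition_key']

print(get_key_columns_for_dataset("fhvhv"))
# ['pickup_datetime', 'pu_location_id', 'do_location_id', 'partition_key']

# Anything else is rejected up front:
# get_key_columns_for_dataset("uber")  -> ValueError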

opendata_stack_platform_project/opendata_stack_platform/dlt/sources/taxi_trip/utils.py

Lines changed: 72 additions & 6 deletions
@@ -1,6 +1,8 @@
+import hashlib
+import json
+
 from collections.abc import Iterator
 from datetime import date, datetime, timezone
-from typing import Optional
 
 import dlt
 import pyarrow as pa
@@ -33,10 +35,68 @@ def add_partition_column(batch: pa.RecordBatch, partition_key: str) -> pa.RecordBatch
     return new_batch
 
 
+def add_row_hash(
+    batch: pa.RecordBatch, key_columns: list[str], hash_column_name: str = "row_hash"
+) -> pa.RecordBatch:
+    """
+    Add a hash column to a PyArrow RecordBatch based on selected columns.
+
+    Args:
+        batch: PyArrow RecordBatch to process
+        key_columns: List of column names to include in hash
+        hash_column_name: Name for the new hash column
+
+    Returns:
+        PyArrow RecordBatch with added hash column
+    """
+    # Filter out columns that don't exist in the batch
+    existing_columns = [col for col in key_columns if col in batch.schema.names]
+
+    if not existing_columns:
+        raise ValueError(
+            f"None of the key columns {key_columns} exist in the batch schema"
+        )
+
+    # Initialize a list to store hash values
+    hash_values = []
+
+    # Process each row directly using PyArrow
+    for i in range(batch.num_rows):
+        # Create a dictionary for this row
+        row_dict = {}
+
+        # Extract values for each column in this row
+        for col in existing_columns:
+            # Get the column array
+            col_array = batch.column(batch.schema.get_field_index(col))
+            # Get the value at this row index
+            value = col_array[i].as_py()
+
+            # Only include non-None values
+            if value is not None:
+                row_dict[col] = value
+
+        # Convert to sorted JSON string for consistent hashing
+        json_str = json.dumps(row_dict, sort_keys=True, default=str)
+
+        # Create hash
+        hash_obj = hashlib.md5(json_str.encode())
+        hash_value = hash_obj.hexdigest()
+        hash_values.append(hash_value)
+
+    # Create PyArrow array from hash values
+    hash_array = pa.array(hash_values, type=pa.string())
+
+    # Add hash column to batch
+    new_batch = batch.append_column(hash_column_name, hash_array)
+    return new_batch
+
+
 @dlt.transformer(standalone=True)
 def read_parquet_custom(
     items: Iterator[FileItemDict],
-    partition_key: Optional[str] = None,
+    partition_key: str,
+    key_columns: list[str],
     batch_size: int = 64_000,
 ) -> Iterator[pa.RecordBatch]:
     """
@@ -45,6 +105,8 @@ def read_parquet_custom(
 
     Args:
         items (Iterator[FileItemDict]): Iterator over file items.
+        partition_key (Optional[str]): Partition key to add to the data.
+        key_columns (Optional[List[str]]): Columns to use for row hash calculation.
         batch_size (int, optional): Maximum number of rows to process per batch
 
     Yields:
@@ -54,8 +116,12 @@ def read_parquet_custom(
         with file_obj.open() as f:
             parquet_file = pq.ParquetFile(f)
             # Iterate over RecordBatch objects
-            for batch in parquet_file.iter_batches(batch_size=batch_size):
-                # Create a new RecordBatch with the existing columns and the new column
-                batch_with_metadata = add_partition_column(batch, partition_key)
+            for raw_batch in parquet_file.iter_batches(batch_size=batch_size):
+                # Add partition column
+                processed_batch = add_partition_column(raw_batch, partition_key)
+
+                # Add row hash
+                processed_batch = add_row_hash(processed_batch, key_columns)
+
                 # Yield the enriched RecordBatch
-                yield batch_with_metadata
+                yield processed_batch
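
For a quick, self-contained check of add_row_hash outside the pipeline (import path assumed from this commit's file layout; column names follow the FHV key set), something like the following should work:

import pyarrow as pa

from opendata_stack_platform.dlt.sources.taxi_trip.utils import add_row_hash

# Two-row batch with the FHV key columns
batch = pa.RecordBatch.from_pydict(
    {
        "pickup_datetime": ["2024-01-01T00:15:00", "2024-01-01T00:20:00"],
        "pu_location_id": [132, 138],
        "do_location_id": [230, 230],
        "partition_key": ["2024-01", "2024-01"],
    }
)

hashed = add_row_hash(
    batch,
    key_columns=["pickup_datetime", "pu_location_id", "do_location_id", "partition_key"],
)

print(hashed.schema.names)  # original columns plus "row_hash"
print(hashed.column(hashed.schema.get_field_index("row_hash")))  # one MD5 hex digest per row

Note that missing key columns are silently skipped; only when none of them exist does the function raise ValueError, so schema drift in a single column will not break ingestion but will change the hash inputs.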
