diff --git a/src/op_analytics/datapipeline/models/compute/udfs.py b/src/op_analytics/datapipeline/models/compute/udfs.py
index 0250fd8b554..3ae9eb1ea6a 100644
--- a/src/op_analytics/datapipeline/models/compute/udfs.py
+++ b/src/op_analytics/datapipeline/models/compute/udfs.py
@@ -21,28 +21,37 @@ def count_zero_bytes(x: bytes) -> int:
 
 
 def hex_to_lossy(x: str | None) -> int | None:
-    """Assumes that "x" is a hex string with the leading "0x" prefix."""
+    """Assumes that "x" is a hex string with the leading "0x" prefix.
+
+    Handles variable-length hex strings that are multiples of 64 hex chars + "0x" prefix.
+    Examples: 66 chars (32 bytes), 130 chars (64 bytes), 194 chars (96 bytes), etc.
+    """
     if x is None:
         return None
 
-    # (pedrod) This fix was applied temporarily to let through some transfers on base
-    # that had a crazy amount with 258 hex characters. This was happening on a few
-    # transactions on batch (min_block=29132800, max_block=29133000)
-    # if len(x) == 258:
-    #     return None
-
-    assert len(x) == 66, f"Expected 66 characters, got {len(x)}: {x}"
+    # Validate format: must start with "0x" and have a multiple of 64 hex characters after
+    if not x.startswith("0x"):
+        return None
+
+    hex_chars = len(x) - 2  # Remove "0x" prefix
+    if hex_chars == 0 or hex_chars % 64 != 0:
+        return None
 
     # If the string beyond the 16 right-most bytes is zeros then the conversion
     # to BIGINT will be valid.
     #
    # NOTE (pedrod): I also attempted to use the HUGEINT return type but it resulted
    # in an incorrect conversion from the python type to the duckdb type.
-    if x[:-16] == "0x000000000000000000000000000000000000000000000000":
-        return int("0x" + x[-16:], 0)
-
-    # There are non-zero bytes beyond the right-most 32 bytes.
-    # This means this number cannot be represented as a hugeint.
+
+    # Check if all leading bytes (everything except last 16 hex chars) are zero
+    leading_part = x[:-16]  # Everything except last 16 hex chars (8 bytes)
+    expected_zeros = "0x" + "0" * (len(leading_part) - 2)  # "0x" + zeros for the leading part
+
+    if leading_part == expected_zeros:
+        return int("0x" + x[-16:], 0)  # Convert last 16 hex chars (8 bytes) to int
+
+    # There are non-zero bytes beyond the right-most 16 bytes.
+    # This means this number cannot be represented as a bigint.
     return None
 
 
@@ -139,6 +148,26 @@ def create_python_udfs(duckdb_context: DuckDBContext):
             AS a * 0.0625::DECIMAL(5, 5);
             """,
             #
+            # Cast to DECIMAL(38,0) - for large integer values to prevent overflow
+            """CREATE OR REPLACE MACRO dec38(a)
+            AS CAST(a AS DECIMAL(38, 0));
+            """,
+            #
+            # Cast to DECIMAL(38,12) - for values with 12 decimal places (L1 fee calculations)
+            """CREATE OR REPLACE MACRO dec38_12(a)
+            AS CAST(a AS DECIMAL(38, 12));
+            """,
+            #
+            # Multiply two values safely using DECIMAL(38,0) to prevent overflow
+            """CREATE OR REPLACE MACRO safe_mul(a, b)
+            AS dec38(a) * dec38(b);
+            """,
+            #
+            # Multiply value by decimal scalar safely (used for L1 fee scalars)
+            """CREATE OR REPLACE MACRO safe_mul_scalar(a, b)
+            AS dec38(a) * CAST(b AS DECIMAL(38, 6));
+            """,
+            #
             # Count zero bytes for binary data that is encoded as a hex string.
             """CREATE OR REPLACE MACRO hexstr_zero_bytes(a)
             AS count_zero_bytes(unhex(substr(a, 3)));
diff --git a/src/op_analytics/datapipeline/models/templates/SCHEMA_UPDATE_OVERFLOW_FIX.md b/src/op_analytics/datapipeline/models/templates/SCHEMA_UPDATE_OVERFLOW_FIX.md
new file mode 100644
index 00000000000..8ba59c5a43d
--- /dev/null
+++ b/src/op_analytics/datapipeline/models/templates/SCHEMA_UPDATE_OVERFLOW_FIX.md
@@ -0,0 +1,171 @@
+# ClickHouse Schema Updates for INT64 Overflow Fix
+
+## Overview
+
+This document contains the SQL commands needed to update existing ClickHouse tables to support the new decimal data types that prevent INT64 overflow when processing high gas price transactions (e.g., Ethereum L1 data during congestion).
+
+## What Changed
+
+The overflow fix changed the columns with actual INT64 overflow issues from `BIGINT` to `DECIMAL` types to safely handle large arithmetic operations:
+
+- **Gas fee calculations**: `BIGINT` → `DECIMAL(38,0)` (prevents overflow in gas_price × gas_used; see the sketch below)
+- **Size calculations**: Maintained `DECIMAL(38,12)` (preserves the original precision)
+- **Unified gas calculations**: `BIGINT` → `DECIMAL(38,0)` (prevents overflow)
+- **Trace calculations**: Kept as `DOUBLE` (no overflow issues; the existing calculations already work)
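+
+As a concrete illustration (a minimal DuckDB sketch using the `safe_mul` macro added in `compute/udfs.py`; the gas values are hypothetical but representative of L1 congestion), a 500 gwei gas price times 30M gas already exceeds the INT64 maximum of 9223372036854775807:
+
+```sql
+-- 500000000000 * 30000000 = 1.5e19 > INT64 max, so checked BIGINT
+-- arithmetic raises an out-of-range error in DuckDB.
+SELECT 500000000000::BIGINT * 30000000::BIGINT;
+
+-- safe_mul widens both operands to DECIMAL(38, 0) first, so the product fits.
+SELECT safe_mul(500000000000, 30000000);  -- 15000000000000000000
+```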
"""CREATE OR REPLACE MACRO hexstr_zero_bytes(a) AS count_zero_bytes(unhex(substr(a, 3))); diff --git a/src/op_analytics/datapipeline/models/templates/SCHEMA_UPDATE_OVERFLOW_FIX.md b/src/op_analytics/datapipeline/models/templates/SCHEMA_UPDATE_OVERFLOW_FIX.md new file mode 100644 index 00000000000..8ba59c5a43d --- /dev/null +++ b/src/op_analytics/datapipeline/models/templates/SCHEMA_UPDATE_OVERFLOW_FIX.md @@ -0,0 +1,171 @@ +# ClickHouse Schema Updates for INT64 Overflow Fix + +## Overview + +This document contains the SQL commands needed to update existing ClickHouse tables to support the new decimal data types that prevent INT64 overflow when processing high gas price transactions (e.g., Ethereum L1 data during congestion). + +## What Changed + +The overflow fix involved changing columns with actual INT64 overflow issues from `BIGINT` to `DECIMAL` types to safely handle large arithmetic operations: + +- **Gas fee calculations**: `BIGINT` → `DECIMAL(38,0)` (prevents overflow in gas_price × gas_used) +- **Size calculations**: Maintained `DECIMAL(38,12)` (preserved original precision) +- **Unified gas calculations**: `BIGINT` → `DECIMAL(38,0)` (prevents overflow) +- **Trace calculations**: Kept as `DOUBLE` (no overflow issues, calculations already work fine) + +## Required Schema Updates + +### 1. Update `refined_transactions_fees_v2` Table + +```sql +-- Update fee calculation columns to DECIMAL(38,0) +ALTER TABLE refined_transactions_fees_v2 + MODIFY COLUMN legacy_extra_fee_per_gas DECIMAL(38,0), + MODIFY COLUMN l2_fee DECIMAL(38,0), + MODIFY COLUMN l2_priority_fee DECIMAL(38,0), + MODIFY COLUMN l2_base_fee DECIMAL(38,0), + MODIFY COLUMN tx_fee DECIMAL(38,0), + MODIFY COLUMN l2_legacy_extra_fee DECIMAL(38,0), + MODIFY COLUMN l1_gas_used_unified DECIMAL(38,0); + +-- Update L1 fee calculation columns to DECIMAL(38,12) to maintain original precision +ALTER TABLE refined_transactions_fees_v2 + MODIFY COLUMN l1_base_fee DECIMAL(38,12), + MODIFY COLUMN l1_base_scaled_size DECIMAL(38,12), + MODIFY COLUMN l1_blob_fee DECIMAL(38,12), + MODIFY COLUMN l1_blob_scaled_size DECIMAL(38,12); +``` + +### 2. Update `refined_traces_fees_v2` Table + +```sql +-- Note: These columns remain as DOUBLE since they don't have overflow issues +-- ALTER TABLE refined_traces_fees_v2 +-- MODIFY COLUMN tx_l2_fee_native_minus_subtraces DOUBLE, +-- MODIFY COLUMN tx_l2_base_fee_native_minus_subtraces DOUBLE, +-- MODIFY COLUMN tx_l2_priority_fee_native_minus_subtraces DOUBLE, +-- MODIFY COLUMN tx_l2_legacy_base_fee_native_minus_subtraces DOUBLE; +-- No changes needed - these were already DOUBLE and work fine. 
+
+## Verification Queries
+
+After running the schema updates, verify the changes were applied correctly:
+
+```sql
+-- Check refined_transactions_fees_v2 schema
+DESCRIBE refined_transactions_fees_v2;
+
+-- Check refined_traces_fees_v2 schema
+DESCRIBE refined_traces_fees_v2;
+
+-- Verify specific columns
+SELECT
+    name,
+    type
+FROM system.columns
+WHERE table = 'refined_transactions_fees_v2'
+  AND name IN (
+    'legacy_extra_fee_per_gas',
+    'l2_fee',
+    'l2_priority_fee',
+    'l2_base_fee',
+    'tx_fee',
+    'l2_legacy_extra_fee',
+    'l1_gas_used_unified',
+    'l1_base_fee',
+    'l1_base_scaled_size',
+    'l1_blob_fee',
+    'l1_blob_scaled_size'
+  );
+```
+
+## Impact and Considerations
+
+### Benefits
+- **Prevents INT64 Overflow**: Can now handle Ethereum L1 gas prices during high congestion
+- **Future-Proof**: Supports even higher values as networks evolve
+- **Maintains Precision**: DECIMAL types preserve exact values for financial calculations
+
+### Potential Impacts
+- **Storage**: DECIMAL columns may use slightly more storage than BIGINT
+- **Performance**: Minimal impact on query performance
+- **Compatibility**: Existing queries should work without modification
+- **Memory**: DECIMAL operations may use slightly more memory
+
+### Testing Recommendations
+1. Test the schema changes on a staging environment first
+2. Verify that existing queries still return expected results
+3. Run performance tests on critical queries
+4. Check that data pipeline ingestion continues to work correctly (see the spot check below)
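+
+A quick data-integrity spot check for the last recommendation (illustrative; `tx_fee` is one of the widened columns and the literal is the INT64 maximum):
+
+```sql
+-- Rows like these would previously have overflowed a BIGINT column.
+SELECT count(*) AS rows_beyond_int64
+FROM refined_transactions_fees_v2
+WHERE tx_fee > 9223372036854775807;
+```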
+
+## Rollback Plan
+
+If you need to roll back the changes (not recommended after new data is ingested):
+
+```sql
+-- WARNING: This may cause data loss if values exceed BIGINT limits
+ALTER TABLE refined_transactions_fees_v2
+    MODIFY COLUMN legacy_extra_fee_per_gas BIGINT,
+    MODIFY COLUMN l2_fee BIGINT,
+    MODIFY COLUMN l2_priority_fee BIGINT,
+    MODIFY COLUMN l2_base_fee BIGINT,
+    MODIFY COLUMN tx_fee BIGINT,
+    MODIFY COLUMN l2_legacy_extra_fee BIGINT,
+    MODIFY COLUMN l1_gas_used_unified BIGINT;
+
+-- Note: L1 fee columns were already DECIMAL(38,12) and were not changed by this
+-- update, so the statement below is a no-op kept here for completeness
+ALTER TABLE refined_transactions_fees_v2
+    MODIFY COLUMN l1_base_fee DECIMAL(38,12),
+    MODIFY COLUMN l1_base_scaled_size DECIMAL(38,12),
+    MODIFY COLUMN l1_blob_fee DECIMAL(38,12),
+    MODIFY COLUMN l1_blob_scaled_size DECIMAL(38,12);
+
+-- Note: refined_traces_fees_v2 was unchanged, so no rollback needed
+```
+
+## Related Files
+
+The following files were updated with the overflow prevention logic:
+- `refined_transactions_fees.sql.j2` - Core transaction fee calculations
+- `refined_traces/traces_txs_join.sql.j2` - Trace-transaction fee joins
+- `compute/udfs.py` - Added helper functions: `dec38()`, `dec38_12()`, `safe_mul()`, `safe_mul_scalar()` (see the usage sketch below)
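+
+For reference, a sketch of how the helpers compose (column names as in `refined_transactions_fees.sql.j2`; the `FROM` source is illustrative, not a real table name):
+
+```sql
+SELECT
+    safe_mul(t.gas_price, t.receipt_gas_used) AS l2_fee          -- DECIMAL(38,0) product
+  , safe_mul_scalar(16, t.l1_base_fee_scalar) AS l1_scalar_x16   -- second operand cast to DECIMAL(38,6)
+FROM transactions AS t;
+```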
+
+## Contact
+
+If you encounter any issues with these schema updates, check:
+1. ClickHouse logs for any error messages
+2. Data pipeline monitoring for ingestion issues
+3. Test queries to verify data integrity
+
+The changes are designed to be backward compatible while preventing the INT64 overflow that was occurring with high Ethereum L1 gas prices.
\ No newline at end of file
diff --git a/src/op_analytics/datapipeline/models/templates/refined_traces/traces_txs_join.sql.j2 b/src/op_analytics/datapipeline/models/templates/refined_traces/traces_txs_join.sql.j2
index 12756e25372..4e0fa8e75f4 100644
--- a/src/op_analytics/datapipeline/models/templates/refined_traces/traces_txs_join.sql.j2
+++ b/src/op_analytics/datapipeline/models/templates/refined_traces/traces_txs_join.sql.j2
@@ -36,16 +36,16 @@ SELECT
 
   -- Parent trace fees.
   -- the subtraces will never add up to part of whole, but leave as is
-  , r.gas_used_minus_subtraces * gwei_to_eth(t.l2_gas_price_gwei)
+  , CAST(r.gas_used_minus_subtraces * gwei_to_eth(t.l2_gas_price_gwei) AS DOUBLE)
   AS tx_l2_fee_native_minus_subtraces
 
-  , r.gas_used_minus_subtraces * gwei_to_eth(t.l2_base_gas_price_gwei)
+  , CAST(r.gas_used_minus_subtraces * gwei_to_eth(t.l2_base_gas_price_gwei) AS DOUBLE)
  AS tx_l2_base_fee_native_minus_subtraces
 
-  , r.gas_used_minus_subtraces * gwei_to_eth(t.l2_priority_gas_price_gwei)
+  , CAST(r.gas_used_minus_subtraces * gwei_to_eth(t.l2_priority_gas_price_gwei) AS DOUBLE)
  AS tx_l2_priority_fee_native_minus_subtraces
 
-  , r.gas_used_minus_subtraces * gwei_to_eth(t.l2_legacy_extra_gas_price_gwei)
+  , CAST(r.gas_used_minus_subtraces * gwei_to_eth(t.l2_legacy_extra_gas_price_gwei) AS DOUBLE)
  AS tx_l2_legacy_base_fee_native_minus_subtraces
 
   -- Amortize evenly across all calls
diff --git a/src/op_analytics/datapipeline/models/templates/refined_transactions_fees.sql.j2 b/src/op_analytics/datapipeline/models/templates/refined_transactions_fees.sql.j2
index b3f60cd2b17..0149a5d6991 100644
--- a/src/op_analytics/datapipeline/models/templates/refined_transactions_fees.sql.j2
+++ b/src/op_analytics/datapipeline/models/templates/refined_transactions_fees.sql.j2
@@ -50,10 +50,10 @@ pb AS (
   , micro(t.receipt_l1_blob_base_fee_scalar) AS l1_blob_base_fee_scalar
 
   -- L2 Fees and breakdown into BASE and PRIORITY contributions
-  , if(t.max_priority_fee_per_gas = 0 AND t.gas_price > 0, t.gas_price - b.base_fee_per_gas, 0) AS legacy_extra_fee_per_gas
-  , CASE WHEN t.gas_price = 0 THEN 0 ELSE t.gas_price * t.receipt_gas_used END AS l2_fee
-  , CASE WHEN t.gas_price = 0 THEN 0 ELSE t.max_priority_fee_per_gas * t.receipt_gas_used END AS l2_priority_fee
-  , CASE WHEN t.gas_price = 0 THEN 0 ELSE b.base_fee_per_gas * t.receipt_gas_used END AS l2_base_fee
+  , if(t.max_priority_fee_per_gas = 0 AND t.gas_price > 0, dec38(t.gas_price) - dec38(b.base_fee_per_gas), 0) AS legacy_extra_fee_per_gas
+  , CASE WHEN t.gas_price = 0 THEN 0 ELSE safe_mul(t.gas_price, t.receipt_gas_used) END AS l2_fee
+  , CASE WHEN t.gas_price = 0 THEN 0 ELSE safe_mul(t.max_priority_fee_per_gas, t.receipt_gas_used) END AS l2_priority_fee
+  , CASE WHEN t.gas_price = 0 THEN 0 ELSE safe_mul(b.base_fee_per_gas, t.receipt_gas_used) END AS l2_base_fee
 
   -- Transaction Information
   , hexstr_method_id(t.input) AS method_id
@@ -85,7 +85,7 @@ pb AS (
   *
   -- Note that this is the "unified" L1 Gas Used.
   -- The meaning of the field has changed in Fjord.
-  , coalesce(receipt_l1_gas_used, 16 * estimated_size) AS l1_gas_used_unified
+  , coalesce(receipt_l1_gas_used, 16 * dec38(estimated_size)) AS l1_gas_used_unified
 
  FROM (
    SELECT
      *
@@ -93,11 +93,11 @@ pb AS (
      -- This is not equivalent to L1 Gas Used.
      , CASE
-          WHEN ((16 * l1_base_fee_scalar * l1_gas_price) + (l1_blob_base_fee_scalar * l1_blob_base_fee)) = 0
+          WHEN ((dec38_12(16) * l1_base_fee_scalar * dec38(l1_gas_price)) + (l1_blob_base_fee_scalar * dec38(l1_blob_base_fee))) = 0
          THEN NULL
-          ELSE (l1_fee / (
-            (16 * l1_base_fee_scalar * l1_gas_price)
-            + (l1_blob_base_fee_scalar * l1_blob_base_fee)
+          ELSE (dec38(l1_fee) / (
+            (dec38_12(16) * l1_base_fee_scalar * dec38(l1_gas_price))
+            + (l1_blob_base_fee_scalar * dec38(l1_blob_base_fee))
          ))::INT64
      END AS estimated_size
 
@@ -119,12 +119,12 @@ pb AS (
   , if(l2_priority_fee = 0, l2_fee - l2_base_fee, 0) AS l2_legacy_extra_fee
 
   -- L1 Base
-  , div16(l1_gas_used_unified) * coalesce(16 * l1_base_fee_scalar, l1_fee_scalar) * l1_gas_price AS l1_base_fee
-  , div16(l1_gas_used_unified) * coalesce(16 * l1_base_fee_scalar, l1_fee_scalar) AS l1_base_scaled_size
+  , dec38(div16(l1_gas_used_unified)) * dec38_12(coalesce(safe_mul_scalar(16, l1_base_fee_scalar), l1_fee_scalar)) * dec38(l1_gas_price) AS l1_base_fee
+  , dec38(div16(l1_gas_used_unified)) * dec38_12(coalesce(safe_mul_scalar(16, l1_base_fee_scalar), l1_fee_scalar)) AS l1_base_scaled_size
 
   -- L1 Blob
-  , coalesce(div16(l1_gas_used_unified) * l1_blob_base_fee_scalar * l1_blob_base_fee, 0) AS l1_blob_fee
-  , coalesce(div16(l1_gas_used_unified) * l1_blob_base_fee_scalar, 0) AS l1_blob_scaled_size
+  , coalesce(dec38(div16(l1_gas_used_unified)) * dec38_12(l1_blob_base_fee_scalar) * dec38(l1_blob_base_fee), 0) AS l1_blob_fee
+  , coalesce(dec38(div16(l1_gas_used_unified)) * dec38_12(l1_blob_base_fee_scalar), 0) AS l1_blob_scaled_size
 
  FROM pt2
 )
diff --git a/tests/op_analytics/datapipeline/etl/models/test_refined_traces.py b/tests/op_analytics/datapipeline/etl/models/test_refined_traces.py
index 6c86fe37da0..95fc56dc60f 100644
--- a/tests/op_analytics/datapipeline/etl/models/test_refined_traces.py
+++ b/tests/op_analytics/datapipeline/etl/models/test_refined_traces.py
@@ -78,10 +78,10 @@ def test_refined_txs_schema(self):
             "l1_fee_scalar": "DECIMAL(12,6)",
             "l1_base_fee_scalar": "DECIMAL(26,7)",
             "l1_blob_base_fee_scalar": "DECIMAL(26,7)",
-            "legacy_extra_fee_per_gas": "BIGINT",
-            "l2_fee": "BIGINT",
-            "l2_priority_fee": "BIGINT",
-            "l2_base_fee": "BIGINT",
+            "legacy_extra_fee_per_gas": "DECIMAL(38,0)",
+            "l2_fee": "DECIMAL(38,0)",
+            "l2_priority_fee": "DECIMAL(38,0)",
+            "l2_base_fee": "DECIMAL(38,0)",
             "method_id": "VARCHAR",
             "success": "BOOLEAN",
             "input_byte_length": "INTEGER",
@@ -90,9 +90,9 @@
             "is_attributes_deposited_transaction": "BOOLEAN",
             "block_hour": "TIMESTAMP",
             "estimated_size": "BIGINT",
-            "l1_gas_used_unified": "BIGINT",
-            "tx_fee": "BIGINT",
-            "l2_legacy_extra_fee": "BIGINT",
+            "l1_gas_used_unified": "DECIMAL(38,0)",
+            "tx_fee": "DECIMAL(38,0)",
+            "l2_legacy_extra_fee": "DECIMAL(38,0)",
             "l1_base_fee": "DECIMAL(38,12)",
             "l1_base_scaled_size": "DECIMAL(38,12)",
             "l1_blob_fee": "DECIMAL(38,12)",
diff --git a/tests/op_analytics/datapipeline/models/compute/test_udfs.py b/tests/op_analytics/datapipeline/models/compute/test_udfs.py
index 9966a1e796f..7ccd38d77d2 100644
--- a/tests/op_analytics/datapipeline/models/compute/test_udfs.py
+++ b/tests/op_analytics/datapipeline/models/compute/test_udfs.py
@@ -119,6 +119,27 @@ def test_div16():
     assert actual == expected
 
 
+def test_decimal_casting_functions():
+    ctx = init_client()
+    create_duckdb_macros(ctx)
+
+    actual = ctx.client.sql("""
+        SELECT
+            dec38(1285446742109188) AS large_int,
+            dec38_12(1.123456789012) AS decimal_precision_12,
+            safe_mul(1000000000, 25000) AS safe_multiplication,
+            safe_mul_scalar(1000000000, 1.5) AS safe_scalar_mul
+    """).fetchall()[0]
+
+    expected = (
+        Decimal("1285446742109188"),
+        Decimal("1.123456789012"),
+        Decimal("25000000000000"),
+        Decimal("1500000000.000000"),
+    )
+    assert actual == expected
+
+
 def test_hexstr_bytelen():
     ctx = init_client()
     create_duckdb_macros(ctx)
@@ -288,6 +309,39 @@ def test_conversion_from_hex_to_number():
     assert lossy == int(lossless)
 
 
+def test_hex_to_lossy_variable_length():
+    """Test hex_to_lossy with variable length hex strings (multiples of 64 hex chars + "0x").
+
+    This test demonstrates that we fixed the original issue where hex_to_lossy
+    would fail with AssertionError on strings longer than 66 characters.
+    """
+    ctx = init_client()
+    create_duckdb_macros(ctx)
+
+    actual = ctx.client.sql("""
+        SELECT
+            -- Standard 66 chars (should work as before)
+            hex_to_lossy('0x00000000000000000000000000000000000000000000000000000001c0000001') AS test_66,
+
+            -- 130 chars (should now work instead of crashing)
+            hex_to_lossy('0x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001c0000001') AS test_130,
+
+            -- Invalid: not multiple of 64 (should return NULL)
+            hex_to_lossy('0x123456789') AS test_invalid_length,
+
+            -- Invalid: no 0x prefix (should return NULL)
+            hex_to_lossy('123456789abcdef123456789abcdef123456789abcdef123456789abcdef1234') AS test_no_prefix
+    """).fetchall()[0]
+
+    expected = (
+        0x01c0000001,  # 66 chars - last 16 hex chars
+        0x01c0000001,  # 130 chars - last 16 hex chars (now works!)
+        None,  # Invalid length
+        None,  # No 0x prefix
+    )
+    assert actual == expected
+
+
 def test_indexed_event_arg_to_address():
     ctx = init_client()
     create_duckdb_macros(ctx)