Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 42 additions & 13 deletions src/op_analytics/datapipeline/models/compute/udfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,37 @@ def count_zero_bytes(x: bytes) -> int:


def hex_to_lossy(x: str | None) -> int | None:
"""Assumes that "x" is a hex string with the leading "0x" prefix."""
"""Assumes that "x" is a hex string with the leading "0x" prefix.

Handles variable-length hex strings that are multiples of 64 hex chars + "0x" prefix.
Examples: 66 chars (32 bytes), 130 chars (64 bytes), 194 chars (96 bytes), etc.
"""
if x is None:
return None

# (pedrod) This fix was applied temporarily to let through some transfers on base
# that had a crazy amount with 258 hex characters. This was happening on a few
# transactions on batch (min_block=29132800, max_block=29133000)
# if len(x) == 258:
# return None

assert len(x) == 66, f"Expected 66 characters, got {len(x)}: {x}"
# Validate format: must start with "0x" and have a multiple of 64 hex characters after
if not x.startswith("0x"):
return None

hex_chars = len(x) - 2 # Remove "0x" prefix
if hex_chars == 0 or hex_chars % 64 != 0:
return None

# If the string beyond the 16 right-most bytes is zeros then the conversion
# to BIGINT will be valid.
#
# NOTE (pedrod): I also attempted to use the HUGEINT return type but it resulted
# in an incorrect conversion from the python type to the duckdb type.
if x[:-16] == "0x000000000000000000000000000000000000000000000000":
return int("0x" + x[-16:], 0)

# There are non-zero bytes beyond the right-most 32 bytes.
# This means this number cannot be represented as a hugeint.

# Check if all leading bytes (everything except last 16 hex chars) are zero
leading_part = x[:-16] # Everything except last 16 hex chars (8 bytes)
expected_zeros = "0x" + "0" * (len(leading_part) - 2) # "0x" + zeros for the leading part

if leading_part == expected_zeros:
return int("0x" + x[-16:], 0) # Convert last 16 hex chars (8 bytes) to int

# There are non-zero bytes beyond the right-most 16 bytes.
# This means this number cannot be represented as a bigint.
return None


Expand Down Expand Up @@ -139,6 +148,26 @@ def create_python_udfs(duckdb_context: DuckDBContext):
AS a * 0.0625::DECIMAL(5, 5);
""",
#
# Cast to DECIMAL(38,0) - for large integer values to prevent overflow
"""CREATE OR REPLACE MACRO dec38(a)
AS CAST(a AS DECIMAL(38, 0));
""",
#
# Cast to DECIMAL(38,12) - for values with 12 decimal places (L1 fee calculations)
"""CREATE OR REPLACE MACRO dec38_12(a)
AS CAST(a AS DECIMAL(38, 12));
""",
#
# Multiply two values safely using DECIMAL(38,0) to prevent overflow
"""CREATE OR REPLACE MACRO safe_mul(a, b)
AS dec38(a) * dec38(b);
""",
#
# Multiply value by decimal scalar safely (used for L1 fee scalars)
"""CREATE OR REPLACE MACRO safe_mul_scalar(a, b)
AS dec38(a) * CAST(b AS DECIMAL(38, 6));
""",
#
# Count zero bytes for binary data that is encoded as a hex string.
"""CREATE OR REPLACE MACRO hexstr_zero_bytes(a)
AS count_zero_bytes(unhex(substr(a, 3)));
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
# ClickHouse Schema Updates for INT64 Overflow Fix

## Overview

This document contains the SQL commands needed to update existing ClickHouse tables to support the new decimal data types that prevent INT64 overflow when processing high gas price transactions (e.g., Ethereum L1 data during congestion).

## What Changed

The overflow fix involved changing columns with actual INT64 overflow issues from `BIGINT` to `DECIMAL` types to safely handle large arithmetic operations:

- **Gas fee calculations**: `BIGINT` → `DECIMAL(38,0)` (prevents overflow in gas_price × gas_used)
- **Size calculations**: Maintained `DECIMAL(38,12)` (preserved original precision)
- **Unified gas calculations**: `BIGINT` → `DECIMAL(38,0)` (prevents overflow)
- **Trace calculations**: Kept as `DOUBLE` (no overflow issues, calculations already work fine)

## Required Schema Updates

### 1. Update `refined_transactions_fees_v2` Table

```sql
-- Update fee calculation columns to DECIMAL(38,0)
ALTER TABLE refined_transactions_fees_v2
MODIFY COLUMN legacy_extra_fee_per_gas DECIMAL(38,0),
MODIFY COLUMN l2_fee DECIMAL(38,0),
MODIFY COLUMN l2_priority_fee DECIMAL(38,0),
MODIFY COLUMN l2_base_fee DECIMAL(38,0),
MODIFY COLUMN tx_fee DECIMAL(38,0),
MODIFY COLUMN l2_legacy_extra_fee DECIMAL(38,0),
MODIFY COLUMN l1_gas_used_unified DECIMAL(38,0);

-- Update L1 fee calculation columns to DECIMAL(38,12) to maintain original precision
ALTER TABLE refined_transactions_fees_v2
MODIFY COLUMN l1_base_fee DECIMAL(38,12),
MODIFY COLUMN l1_base_scaled_size DECIMAL(38,12),
MODIFY COLUMN l1_blob_fee DECIMAL(38,12),
MODIFY COLUMN l1_blob_scaled_size DECIMAL(38,12);
```

### 2. Update `refined_traces_fees_v2` Table

```sql
-- Note: These columns remain as DOUBLE since they don't have overflow issues
-- ALTER TABLE refined_traces_fees_v2
-- MODIFY COLUMN tx_l2_fee_native_minus_subtraces DOUBLE,
-- MODIFY COLUMN tx_l2_base_fee_native_minus_subtraces DOUBLE,
-- MODIFY COLUMN tx_l2_priority_fee_native_minus_subtraces DOUBLE,
-- MODIFY COLUMN tx_l2_legacy_base_fee_native_minus_subtraces DOUBLE;
-- No changes needed - these were already DOUBLE and work fine.
```

## Execution Instructions

### Option 1: Execute All at Once
```sql
-- Execute both ALTER statements in sequence
ALTER TABLE refined_transactions_fees_v2
MODIFY COLUMN legacy_extra_fee_per_gas DECIMAL(38,0),
MODIFY COLUMN l2_fee DECIMAL(38,0),
MODIFY COLUMN l2_priority_fee DECIMAL(38,0),
MODIFY COLUMN l2_base_fee DECIMAL(38,0),
MODIFY COLUMN tx_fee DECIMAL(38,0),
MODIFY COLUMN l2_legacy_extra_fee DECIMAL(38,0),
MODIFY COLUMN l1_gas_used_unified DECIMAL(38,0),
MODIFY COLUMN l1_base_fee DECIMAL(38,12),
MODIFY COLUMN l1_base_scaled_size DECIMAL(38,12),
MODIFY COLUMN l1_blob_fee DECIMAL(38,12),
MODIFY COLUMN l1_blob_scaled_size DECIMAL(38,12);

-- Note: refined_traces_fees_v2 columns remain DOUBLE - no changes needed
```

### Option 2: Execute Incrementally (Safer for Large Tables)
```sql
-- Execute one column at a time if tables are very large
ALTER TABLE refined_transactions_fees_v2 MODIFY COLUMN legacy_extra_fee_per_gas DECIMAL(38,0);
ALTER TABLE refined_transactions_fees_v2 MODIFY COLUMN l2_fee DECIMAL(38,0);
ALTER TABLE refined_transactions_fees_v2 MODIFY COLUMN l2_priority_fee DECIMAL(38,0);
-- ... continue for each column
```

## Verification Queries

After running the schema updates, verify the changes were applied correctly:

```sql
-- Check refined_transactions_fees_v2 schema
DESCRIBE refined_transactions_fees_v2;

-- Check refined_traces_fees_v2 schema
DESCRIBE refined_traces_fees_v2;

-- Verify specific columns
SELECT
name,
type
FROM system.columns
WHERE table = 'refined_transactions_fees_v2'
AND name IN (
'legacy_extra_fee_per_gas',
'l2_fee',
'l2_priority_fee',
'l2_base_fee',
'tx_fee',
'l2_legacy_extra_fee',
'l1_gas_used_unified',
'l1_base_fee',
'l1_base_scaled_size',
'l1_blob_fee',
'l1_blob_scaled_size'
);
```

## Impact and Considerations

### Benefits
- **Prevents INT64 Overflow**: Can now handle Ethereum L1 gas prices during high congestion
- **Future-Proof**: Supports even higher values as networks evolve
- **Maintains Precision**: DECIMAL types preserve exact values for financial calculations

### Potential Impacts
- **Storage**: DECIMAL columns may use slightly more storage than BIGINT
- **Performance**: Minimal impact on query performance
- **Compatibility**: Existing queries should work without modification
- **Memory**: DECIMAL operations may use slightly more memory

### Testing Recommendations
1. Test the schema changes on a staging environment first
2. Verify that existing queries still return expected results
3. Run performance tests on critical queries
4. Check that data pipeline ingestion continues to work correctly

## Rollback Plan

If you need to rollback the changes (not recommended after new data is ingested):

```sql
-- WARNING: This may cause data loss if values exceed BIGINT limits
ALTER TABLE refined_transactions_fees_v2
MODIFY COLUMN legacy_extra_fee_per_gas BIGINT,
MODIFY COLUMN l2_fee BIGINT,
MODIFY COLUMN l2_priority_fee BIGINT,
MODIFY COLUMN l2_base_fee BIGINT,
MODIFY COLUMN tx_fee BIGINT,
MODIFY COLUMN l2_legacy_extra_fee BIGINT,
MODIFY COLUMN l1_gas_used_unified BIGINT;

-- Note: L1 fee columns were already DECIMAL, changing precision back
ALTER TABLE refined_transactions_fees_v2
MODIFY COLUMN l1_base_fee DECIMAL(38,12),
MODIFY COLUMN l1_base_scaled_size DECIMAL(38,12),
MODIFY COLUMN l1_blob_fee DECIMAL(38,12),
MODIFY COLUMN l1_blob_scaled_size DECIMAL(38,12);

-- Note: refined_traces_fees_v2 was unchanged, so no rollback needed
```

## Related Files

The following template files were updated with the overflow prevention logic:
- `refined_transactions_fees.sql.j2` - Core transaction fee calculations
- `refined_traces/traces_txs_join.sql.j2` - Trace-transaction fee joins
- `compute/udfs.py` - Added helper functions: `dec38()`, `dec38_6()`, `safe_mul()`, `safe_mul_scalar()`

## Contact

If you encounter any issues with these schema updates, check:
1. ClickHouse logs for any error messages
2. Data pipeline monitoring for ingestion issues
3. Test queries to verify data integrity

The changes are designed to be backward compatible while preventing the INT64 overflow that was occurring with high Ethereum L1 gas prices.
Original file line number Diff line number Diff line change
Expand Up @@ -36,16 +36,16 @@ SELECT

-- Parent trace fees.
-- the subtraces will never add up to part of whole, but leave as is
, r.gas_used_minus_subtraces * gwei_to_eth(t.l2_gas_price_gwei)
, CAST(r.gas_used_minus_subtraces * gwei_to_eth(t.l2_gas_price_gwei) AS DOUBLE)
AS tx_l2_fee_native_minus_subtraces

, r.gas_used_minus_subtraces * gwei_to_eth(t.l2_base_gas_price_gwei)
, CAST(r.gas_used_minus_subtraces * gwei_to_eth(t.l2_base_gas_price_gwei) AS DOUBLE)
AS tx_l2_base_fee_native_minus_subtraces

, r.gas_used_minus_subtraces * gwei_to_eth(t.l2_priority_gas_price_gwei)
, CAST(r.gas_used_minus_subtraces * gwei_to_eth(t.l2_priority_gas_price_gwei) AS DOUBLE)
AS tx_l2_priority_fee_native_minus_subtraces

, r.gas_used_minus_subtraces * gwei_to_eth(t.l2_legacy_extra_gas_price_gwei)
, CAST(r.gas_used_minus_subtraces * gwei_to_eth(t.l2_legacy_extra_gas_price_gwei) AS DOUBLE)
AS tx_l2_legacy_base_fee_native_minus_subtraces

-- Amortize evenly across all calls
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ pb AS (
, micro(t.receipt_l1_blob_base_fee_scalar) AS l1_blob_base_fee_scalar

-- L2 Fees and breakdown into BASE and PRIORITY contributions
, if(t.max_priority_fee_per_gas = 0 AND t.gas_price > 0, t.gas_price - b.base_fee_per_gas, 0) AS legacy_extra_fee_per_gas
, CASE WHEN t.gas_price = 0 THEN 0 ELSE t.gas_price * t.receipt_gas_used END AS l2_fee
, CASE WHEN t.gas_price = 0 THEN 0 ELSE t.max_priority_fee_per_gas * t.receipt_gas_used END AS l2_priority_fee
, CASE WHEN t.gas_price = 0 THEN 0 ELSE b.base_fee_per_gas * t.receipt_gas_used END AS l2_base_fee
, if(t.max_priority_fee_per_gas = 0 AND t.gas_price > 0, dec38(t.gas_price) - dec38(b.base_fee_per_gas), 0) AS legacy_extra_fee_per_gas
, CASE WHEN t.gas_price = 0 THEN 0 ELSE safe_mul(t.gas_price, t.receipt_gas_used) END AS l2_fee
, CASE WHEN t.gas_price = 0 THEN 0 ELSE safe_mul(t.max_priority_fee_per_gas, t.receipt_gas_used) END AS l2_priority_fee
, CASE WHEN t.gas_price = 0 THEN 0 ELSE safe_mul(b.base_fee_per_gas, t.receipt_gas_used) END AS l2_base_fee

-- Transaction Information
, hexstr_method_id(t.input) AS method_id
Expand Down Expand Up @@ -85,19 +85,19 @@ pb AS (
*
-- Note that this is the "unified" L1 Gas Used.
-- The meaning of the field has changed in Fjord.
, coalesce(receipt_l1_gas_used, 16 * estimated_size) AS l1_gas_used_unified
, coalesce(receipt_l1_gas_used, 16 * dec38(estimated_size)) AS l1_gas_used_unified
FROM (
SELECT
*
-- Estimated Size in bytes calculation, post-Fjord.
-- This is not equivalent to L1 Gas Used.

, CASE
WHEN ((16 * l1_base_fee_scalar * l1_gas_price) + (l1_blob_base_fee_scalar * l1_blob_base_fee)) = 0
WHEN ((dec38_12(16) * l1_base_fee_scalar * dec38(l1_gas_price)) + (l1_blob_base_fee_scalar * dec38(l1_blob_base_fee))) = 0
THEN NULL
ELSE (l1_fee / (
(16 * l1_base_fee_scalar * l1_gas_price)
+ (l1_blob_base_fee_scalar * l1_blob_base_fee)
ELSE (dec38(l1_fee) / (
(dec38_12(16) * l1_base_fee_scalar * dec38(l1_gas_price))
+ (l1_blob_base_fee_scalar * dec38(l1_blob_base_fee))
))::INT64
END
AS estimated_size
Expand All @@ -119,12 +119,12 @@ pb AS (
, if(l2_priority_fee = 0, l2_fee - l2_base_fee, 0) AS l2_legacy_extra_fee

-- L1 Base
, div16(l1_gas_used_unified) * coalesce(16 * l1_base_fee_scalar, l1_fee_scalar) * l1_gas_price AS l1_base_fee
, div16(l1_gas_used_unified) * coalesce(16 * l1_base_fee_scalar, l1_fee_scalar) AS l1_base_scaled_size
, dec38(div16(l1_gas_used_unified)) * dec38_12(coalesce(safe_mul_scalar(16, l1_base_fee_scalar), l1_fee_scalar)) * dec38(l1_gas_price) AS l1_base_fee
, dec38(div16(l1_gas_used_unified)) * dec38_12(coalesce(safe_mul_scalar(16, l1_base_fee_scalar), l1_fee_scalar)) AS l1_base_scaled_size

-- L1 Blob
, coalesce(div16(l1_gas_used_unified) * l1_blob_base_fee_scalar * l1_blob_base_fee, 0) AS l1_blob_fee
, coalesce(div16(l1_gas_used_unified) * l1_blob_base_fee_scalar, 0) AS l1_blob_scaled_size
, coalesce(dec38(div16(l1_gas_used_unified)) * dec38_12(l1_blob_base_fee_scalar) * dec38(l1_blob_base_fee), 0) AS l1_blob_fee
, coalesce(dec38(div16(l1_gas_used_unified)) * dec38_12(l1_blob_base_fee_scalar), 0) AS l1_blob_scaled_size

FROM pt2
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ def test_refined_txs_schema(self):
"l1_fee_scalar": "DECIMAL(12,6)",
"l1_base_fee_scalar": "DECIMAL(26,7)",
"l1_blob_base_fee_scalar": "DECIMAL(26,7)",
"legacy_extra_fee_per_gas": "BIGINT",
"l2_fee": "BIGINT",
"l2_priority_fee": "BIGINT",
"l2_base_fee": "BIGINT",
"legacy_extra_fee_per_gas": "DECIMAL(38,0)",
"l2_fee": "DECIMAL(38,0)",
"l2_priority_fee": "DECIMAL(38,0)",
"l2_base_fee": "DECIMAL(38,0)",
"method_id": "VARCHAR",
"success": "BOOLEAN",
"input_byte_length": "INTEGER",
Expand All @@ -90,9 +90,9 @@ def test_refined_txs_schema(self):
"is_attributes_deposited_transaction": "BOOLEAN",
"block_hour": "TIMESTAMP",
"estimated_size": "BIGINT",
"l1_gas_used_unified": "BIGINT",
"tx_fee": "BIGINT",
"l2_legacy_extra_fee": "BIGINT",
"l1_gas_used_unified": "DECIMAL(38,0)",
"tx_fee": "DECIMAL(38,0)",
"l2_legacy_extra_fee": "DECIMAL(38,0)",
"l1_base_fee": "DECIMAL(38,12)",
"l1_base_scaled_size": "DECIMAL(38,12)",
"l1_blob_fee": "DECIMAL(38,12)",
Expand Down
Loading