Skip to content

Commit 323beb9

Browse files
authored
POD-2826: Fix how floats and integers are handled when determining TDR Schema (#69)
* look through all values before determining type
* update version
* pull common logic into separate function
* update bool logic, update unit tests
1 parent 81a5d8f commit 323beb9

File tree

3 files changed

+97
-33
lines changed

3 files changed

+97
-33
lines changed

VERSION.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
11.1.0
2-
- Add API to make snapshot public
1+
11.2.0
2+
- Update logic on how floats and integers are handled when determining a TDR schema from Terra input metadata

ops_utils/tdr_utils/tdr_schema_utils.py

Lines changed: 65 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
import time
66
import numpy as np
77
import pandas as pd
8+
import math
89
from datetime import date, datetime
9-
from typing import Any, Optional
10+
from typing import Any, Optional, Union
1011

1112

1213
class InferTDRSchema:
@@ -150,46 +151,81 @@ def _check_type_consistency(self, key_value_type_mappings: dict) -> list[dict]:
150151

151152
return disparate_header_info
152153

153-
def _python_type_to_tdr_type_conversion(self, value_for_header: Any) -> str:
154+
@staticmethod
155+
def _interpret_number(x: Union[float, int]) -> Union[int, float]:
156+
if isinstance(x, float) and x.is_integer():
157+
return int(x)
158+
return x
159+
160+
def _determine_if_float_or_int(self, interpreted_numbers: list[Union[int, float]]) -> str:
161+
# Remove NaNs before type checks
162+
non_nan_numbers = [x for x in interpreted_numbers if not (isinstance(x, float) and math.isnan(x))]
163+
164+
# If all values are int, return int type
165+
if all(isinstance(row_value, int) for row_value in non_nan_numbers):
166+
return self.PYTHON_TDR_DATA_TYPE_MAPPING[int]
167+
# If all values are float, return float type
168+
elif all(isinstance(row_value, float) for row_value in non_nan_numbers):
169+
return self.PYTHON_TDR_DATA_TYPE_MAPPING[float]
170+
# If ANY are float, return float type
171+
else:
172+
return self.PYTHON_TDR_DATA_TYPE_MAPPING[float]
173+
174+
def _python_type_to_tdr_type_conversion(self, values_for_header: list[Any]) -> str:
154175
"""
155176
Convert Python data types to TDR data types.
156177
157178
Args:
158-
value_for_header (Any): The value to determine the TDR type for.
179+
values_for_header (list[Any]): All values for a column header.
159180
160181
Returns:
161182
str: The TDR data type.
162183
"""
163184
gcp_fileref_regex = "^gs://.*"
164185

165-
# Find potential file references
166-
if isinstance(value_for_header, str):
167-
gcp_match = re.search(
168-
pattern=gcp_fileref_regex, string=value_for_header)
169-
if gcp_match:
186+
# Collect all the non-None values for the column
187+
non_none_values = [v for v in values_for_header if v is not None]
188+
189+
# HANDLE SPECIAL CASES
190+
191+
# FILE REFS AND LISTS OF FILE REFS
192+
# If ANY of the values for a header are of type "fileref", we assume that the column is a fileref
193+
for row_value in non_none_values:
194+
if isinstance(row_value, str) and re.search(pattern=gcp_fileref_regex, string=row_value):
170195
return self.PYTHON_TDR_DATA_TYPE_MAPPING["fileref"]
171196

172-
# Tried to use this to parse datetimes, but it was turning too many
173-
# regular ints into datetimes. Commenting out for now
174-
# try:
175-
# date_or_time = parser.parse(value_for_header)
176-
# return self.PYTHON_TDR_DATA_TYPE_MAPPING[type(date_or_time)]
177-
# pass
178-
# except (TypeError, ParserError):
179-
# pass
180-
181-
if isinstance(value_for_header, list):
182-
# check for potential list of filerefs
183-
for v in value_for_header:
184-
if isinstance(v, str):
185-
gcp_match = re.search(pattern=gcp_fileref_regex, string=v)
186-
if gcp_match:
197+
if isinstance(row_value, list):
198+
# Check for a potential array of filerefs - if ANY of the items in a list are
199+
# of type "fileref" we assume that the whole column is a fileref
200+
for item in row_value:
201+
if isinstance(item, str) and re.search(pattern=gcp_fileref_regex, string=item):
187202
return self.PYTHON_TDR_DATA_TYPE_MAPPING["fileref"]
188-
non_none_entry_in_list = [a for a in value_for_header if a is not None][0]
189-
return self.PYTHON_TDR_DATA_TYPE_MAPPING[type(non_none_entry_in_list)]
190203

191-
# if none of the above special cases apply, just pass the type of the value to determine the TDR type
192-
return self.PYTHON_TDR_DATA_TYPE_MAPPING[type(value_for_header)]
204+
# INTEGERS/FLOATS AND LISTS OF INTEGERS AND FLOATS
205+
# Case 1: All values are plain numbers (int or float) - specifically excluding bools
206+
if all(isinstance(x, (int, float)) and not isinstance(x, bool) for x in non_none_values):
207+
interpreted_numbers = [self._interpret_number(row_value) for row_value in non_none_values]
208+
return self._determine_if_float_or_int(interpreted_numbers)
209+
210+
# Case 2: Values are lists of numbers (e.g., [[1, 2], [3.1], [4]])
211+
if all(isinstance(row_value, list) for row_value in non_none_values):
212+
if all(
213+
all(isinstance(item, (int, float)) and not isinstance(item, bool) for item in row_value)
214+
for row_value in non_none_values
215+
):
216+
# Flatten the list of lists and interpret all non-None elements
217+
interpreted_numbers = [self._interpret_number(item)
218+
for row_value in non_none_values for item in row_value if item is not None]
219+
220+
return self._determine_if_float_or_int(interpreted_numbers)
221+
222+
# If none of the above special cases apply, use the first of the non-null values to determine the
223+
# TDR data type
224+
225+
first_value = non_none_values[0]
226+
if isinstance(first_value, list):
227+
return self.PYTHON_TDR_DATA_TYPE_MAPPING[type(first_value[0])]
228+
return self.PYTHON_TDR_DATA_TYPE_MAPPING[type(first_value)]
193229

194230
def _format_column_metadata(self, key_value_type_mappings: dict, disparate_header_info: list[dict]) -> list[dict]:
195231
"""
@@ -214,8 +250,8 @@ def _format_column_metadata(self, key_value_type_mappings: dict, disparate_heade
214250
logging.info(f"Header '{header}' was forced to string to to mismatched datatypes in column")
215251
data_type = self.PYTHON_TDR_DATA_TYPE_MAPPING[str]
216252
else:
217-
# find either the first item that's non-None, or the first non-empty list
218-
data_type = self._python_type_to_tdr_type_conversion([a for a in values_for_header if a is not None][0])
253+
# Use all existing values for the header to determine the data type
254+
data_type = self._python_type_to_tdr_type_conversion(values_for_header)
219255

220256
column_metadata = {
221257
"name": header,

ops_utils/tests/test_tdr_schema_utils.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,37 @@ def test_check_type_consistency_disparate(self):
127127
self.assertEqual(actual_disparate_header_info, expected_disparate_header_info)
128128

129129
def test_python_type_to_tdr_type_conversion_file_ref(self):
130-
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(value_for_header="gs://bucket/some/file.txt")
130+
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(values_for_header=["gs://bucket/some/file.txt"])
131131
self.assertEqual(res, "fileref")
132132

133133
def test_python_type_to_tdr_type_conversion_boolean(self):
134-
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(value_for_header=True)
134+
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(values_for_header=[True])
135135
self.assertEqual(res, "boolean")
136+
137+
def test_python_type_to_tdr_type_conversion_list_of_booleans(self):
138+
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(values_for_header=[[True], [True, False], [False]])
139+
self.assertEqual(res, "boolean")
140+
141+
def test_python_type_to_tdr_type_conversion_ints(self):
142+
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(values_for_header=[1.0, 2, 3.0])
143+
self.assertEqual(res, "int64")
144+
145+
def test_python_type_to_tdr_type_conversion_floats(self):
146+
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(values_for_header=[1.1, 2.2, 3.3])
147+
self.assertEqual(res, "float64")
148+
149+
def test_python_type_to_tdr_type_conversion_float_and_ints(self):
150+
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(values_for_header=[1.1, 2, 3.0])
151+
self.assertEqual(res, "float64")
152+
153+
def test_python_type_to_tdr_type_conversion_list_of_ints(self):
154+
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(values_for_header=[[1.0, 2], [3, 4.0], [5]])
155+
self.assertEqual(res, "int64")
156+
157+
def test_python_type_to_tdr_type_conversion_list_of_floats(self):
158+
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(values_for_header=[[1.0, 2.2], [3.4, 4.1], [5.9]])
159+
self.assertEqual(res, "float64")
160+
161+
def test_python_type_to_tdr_type_conversion_list_of_floats_and_ints(self):
162+
res = self.infer_tdr_schema._python_type_to_tdr_type_conversion(values_for_header=[[1.0, 2], [3.4, 4.1], [5.0]])
163+
self.assertEqual(res, "float64")

0 commit comments

Comments
 (0)