5
5
import time
6
6
import numpy as np
7
7
import pandas as pd
8
+ import math
8
9
from datetime import date , datetime
9
- from typing import Any , Optional
10
+ from typing import Any , Optional , Union
10
11
11
12
12
13
class InferTDRSchema :
@@ -150,46 +151,81 @@ def _check_type_consistency(self, key_value_type_mappings: dict) -> list[dict]:
150
151
151
152
return disparate_header_info
152
153
153
- def _python_type_to_tdr_type_conversion (self , value_for_header : Any ) -> str :
154
+ @staticmethod
155
+ def _interpret_number (x : Union [float , int ]) -> Union [int , float ]:
156
+ if isinstance (x , float ) and x .is_integer ():
157
+ return int (x )
158
+ return x
159
+
160
+ def _determine_if_float_or_int (self , interpreted_numbers : list [Union [int , float ]]) -> str :
161
+ # Remove NaNs before type checks
162
+ non_nan_numbers = [x for x in interpreted_numbers if not (isinstance (x , float ) and math .isnan (x ))]
163
+
164
+ # If all values are int, return int type
165
+ if all (isinstance (row_value , int ) for row_value in non_nan_numbers ):
166
+ return self .PYTHON_TDR_DATA_TYPE_MAPPING [int ]
167
+ # If all values are float, return float type
168
+ elif all (isinstance (row_value , float ) for row_value in non_nan_numbers ):
169
+ return self .PYTHON_TDR_DATA_TYPE_MAPPING [float ]
170
+ # If ANY are float, return float type
171
+ else :
172
+ return self .PYTHON_TDR_DATA_TYPE_MAPPING [float ]
173
+
174
+ def _python_type_to_tdr_type_conversion (self , values_for_header : list [Any ]) -> str :
154
175
"""
155
176
Convert Python data types to TDR data types.
156
177
157
178
Args:
158
- value_for_header (Any): The value to determine the TDR type for .
179
+ values_for_header (Any): All values for a column header .
159
180
160
181
Returns:
161
182
str: The TDR data type.
162
183
"""
163
184
gcp_fileref_regex = "^gs://.*"
164
185
165
- # Find potential file references
166
- if isinstance (value_for_header , str ):
167
- gcp_match = re .search (
168
- pattern = gcp_fileref_regex , string = value_for_header )
169
- if gcp_match :
186
+ # Collect all the non-None values for the column
187
+ non_none_values = [v for v in values_for_header if v is not None ]
188
+
189
+ # HANDLE SPECIAL CASES
190
+
191
+ # FILE REFS AND LISTS OF FILE REFS
192
+ # If ANY of the values for a header are of type "fileref", we assume that the column is a fileref
193
+ for row_value in non_none_values :
194
+ if isinstance (row_value , str ) and re .search (pattern = gcp_fileref_regex , string = row_value ):
170
195
return self .PYTHON_TDR_DATA_TYPE_MAPPING ["fileref" ]
171
196
172
- # Tried to use this to parse datetimes, but it was turning too many
173
- # regular ints into datetimes. Commenting out for now
174
- # try:
175
- # date_or_time = parser.parse(value_for_header)
176
- # return self.PYTHON_TDR_DATA_TYPE_MAPPING[type(date_or_time)]
177
- # pass
178
- # except (TypeError, ParserError):
179
- # pass
180
-
181
- if isinstance (value_for_header , list ):
182
- # check for potential list of filerefs
183
- for v in value_for_header :
184
- if isinstance (v , str ):
185
- gcp_match = re .search (pattern = gcp_fileref_regex , string = v )
186
- if gcp_match :
197
+ if isinstance (row_value , list ):
198
+ # Check for a potential array of filerefs - if ANY of the items in a list are
199
+ # of type "fileref" we assume that the whole column is a fileref
200
+ for item in row_value :
201
+ if isinstance (item , str ) and re .search (pattern = gcp_fileref_regex , string = item ):
187
202
return self .PYTHON_TDR_DATA_TYPE_MAPPING ["fileref" ]
188
- non_none_entry_in_list = [a for a in value_for_header if a is not None ][0 ]
189
- return self .PYTHON_TDR_DATA_TYPE_MAPPING [type (non_none_entry_in_list )]
190
203
191
- # if none of the above special cases apply, just pass the type of the value to determine the TDR type
192
- return self .PYTHON_TDR_DATA_TYPE_MAPPING [type (value_for_header )]
204
+ # INTEGERS/FLOATS AND LISTS OF INTEGERS AND FLOATS
205
+ # Case 1: All values are plain numbers (int or float) - specifically excluding bools
206
+ if all (isinstance (x , (int , float )) and not isinstance (x , bool ) for x in non_none_values ):
207
+ interpreted_numbers = [self ._interpret_number (row_value ) for row_value in non_none_values ]
208
+ return self ._determine_if_float_or_int (interpreted_numbers )
209
+
210
+ # Case 2: Values are lists of numbers (e.g., [[1, 2], [3.1], [4]])
211
+ if all (isinstance (row_value , list ) for row_value in non_none_values ):
212
+ if all (
213
+ all (isinstance (item , (int , float )) and not isinstance (item , bool ) for item in row_value )
214
+ for row_value in non_none_values
215
+ ):
216
+ # Flatten the list of lists and interpret all non-None elements
217
+ interpreted_numbers = [self ._interpret_number (item )
218
+ for row_value in non_none_values for item in row_value if item is not None ]
219
+
220
+ return self ._determine_if_float_or_int (interpreted_numbers )
221
+
222
+ # If none of the above special cases apply, use the first of the non-null values to determine the
223
+ # TDR data type
224
+
225
+ first_value = non_none_values [0 ]
226
+ if isinstance (first_value , list ):
227
+ return self .PYTHON_TDR_DATA_TYPE_MAPPING [type (first_value [0 ])]
228
+ return self .PYTHON_TDR_DATA_TYPE_MAPPING [type (first_value )]
193
229
194
230
def _format_column_metadata (self , key_value_type_mappings : dict , disparate_header_info : list [dict ]) -> list [dict ]:
195
231
"""
@@ -214,8 +250,8 @@ def _format_column_metadata(self, key_value_type_mappings: dict, disparate_heade
214
250
logging .info (f"Header '{ header } ' was forced to string to to mismatched datatypes in column" )
215
251
data_type = self .PYTHON_TDR_DATA_TYPE_MAPPING [str ]
216
252
else :
217
- # find either the first item that's non-None, or the first non-empty list
218
- data_type = self ._python_type_to_tdr_type_conversion ([ a for a in values_for_header if a is not None ][ 0 ] )
253
+ # Use all existing values for the header to determine the data type
254
+ data_type = self ._python_type_to_tdr_type_conversion (values_for_header )
219
255
220
256
column_metadata = {
221
257
"name" : header ,
0 commit comments