From 421024cf9beac599351b2bf6d65e7839d8909c2b Mon Sep 17 00:00:00 2001 From: lkhagvadorj-amp Date: Thu, 17 Apr 2025 16:56:20 +0100 Subject: [PATCH] feat: allow loading table from dataframe with extra fields, #1812 --- google/cloud/bigquery/_pandas_helpers.py | 11 ++++++-- tests/unit/test__pandas_helpers.py | 34 ++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 457eb9078..dacbe1515 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -484,6 +484,10 @@ def dataframe_to_bq_schema(dataframe, bq_schema): Optional[Sequence[google.cloud.bigquery.schema.SchemaField]]: The automatically determined schema. Returns None if the type of any column cannot be determined. + + Note: + - If `bq_schema` contains fields not found in the DataFrame, they will + still be included in the resulting schema, and a warning will be issued. """ if pandas_gbq is None: warnings.warn( @@ -537,11 +541,14 @@ def dataframe_to_bq_schema(dataframe, bq_schema): # Catch any schema mismatch. The developer explicitly asked to serialize a # column, but it was not found. if bq_schema_unused: - raise ValueError( + warnings.warn( "bq_schema contains fields not present in dataframe: {}".format( bq_schema_unused - ) + ), + category=UserWarning, ) + for unused_field_name in bq_schema_unused: + bq_schema_out.append(bq_schema_index.get(unused_field_name)) # If schema detection was not successful for all columns, also try with # pyarrow, if available. diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 48c085c1d..c59b6d7d3 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -1385,6 +1385,40 @@ def test_dataframe_to_bq_schema_w_bq_schema(module_under_test, monkeypatch): assert returned_schema == expected_schema +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_dataframe_to_bq_schema_allows_extra_fields(module_under_test, monkeypatch): + monkeypatch.setattr(module_under_test, "pandas_gbq", None) + + df_data = collections.OrderedDict( + [ + ("str_column", ["hello", "world"]), + ("int_column", [42, 8]), + ("bool_column", [True, False]), + ] + ) + dataframe = pandas.DataFrame(df_data) + + dict_schema = [ + {"name": "str_column", "type": "STRING", "mode": "NULLABLE"}, + {"name": "int_column", "type": "INTEGER", "mode": "NULLABLE"}, + {"name": "bool_column", "type": "BOOL", "mode": "REQUIRED"}, + {"name": "extra_column", "type": "STRING", "mode": "NULLABLE"}, + ] + + with pytest.warns(UserWarning, match="bq_schema contains fields not present"): + returned_schema = module_under_test.dataframe_to_bq_schema( + dataframe, dict_schema + ) + + expected_schema = ( + schema.SchemaField("str_column", "STRING", "NULLABLE"), + schema.SchemaField("int_column", "INTEGER", "NULLABLE"), + schema.SchemaField("bool_column", "BOOL", "REQUIRED"), + schema.SchemaField("extra_column", "STRING", "NULLABLE"), + ) + assert returned_schema == expected_schema + + @pytest.mark.skipif(pandas is None, reason="Requires `pandas`") def test_dataframe_to_bq_schema_fallback_needed_wo_pyarrow( module_under_test, monkeypatch