Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: add warning when encountering unknown field types #1989

Merged
merged 15 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions google/cloud/bigquery/_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import math
import re
import os
import warnings
from typing import Optional, Union

from dateutil import relativedelta
Expand Down Expand Up @@ -297,12 +298,7 @@ def _record_from_json(value, field):
record = {}
record_iter = zip(field.fields, value["f"])
for subfield, cell in record_iter:
converter = _CELLDATA_FROM_JSON[subfield.field_type]
if subfield.mode == "REPEATED":
value = [converter(item["v"], subfield) for item in cell["v"]]
else:
value = converter(cell["v"], subfield)
record[subfield.name] = value
record[subfield.name] = _field_from_json(cell["v"], subfield)
return record


Expand Down Expand Up @@ -382,7 +378,14 @@ def _field_to_index_mapping(schema):


def _field_from_json(resource, field):
converter = _CELLDATA_FROM_JSON.get(field.field_type, lambda value, _: value)
def default_converter(value, field):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we can move this function to outside, so we can reuse it for _scalar_field_to_json() as well?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I ended up just moving the warning itself to a new function, since the converter in _scalar_field_to_json has a different function signature.

warnings.warn(
"Unknown type '{}' for field '{}'.".format(field.field_type, field.name),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's be explicit and say "Behavior reading this type is not officially supported and may change in future." or something scary along those lines.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before we mark this as "fixed", I think the inverse direction is needed too (where we write JSON from a record), which is used in methods like client.insert_rows() (though maybe that already fails when encountering unknown types?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. Yeah looks like

# Unknown fields should not be silently dropped, include them. Since there
and
converter = _SCALAR_VALUE_TO_JSON_ROW.get(field.field_type)
are two points. The first there is no type info so probably just _scalar_field_to_json needs to be updated. Updated PR to include a warning for the write.

FutureWarning,
)
return value

converter = _CELLDATA_FROM_JSON.get(field.field_type, default_converter)
if field.mode == "REPEATED":
return [converter(item["v"], field) for item in resource]
else:
Expand Down
4 changes: 3 additions & 1 deletion google/cloud/bigquery/_pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,9 @@ def bq_to_arrow_field(bq_field, array_type=None):
metadata=metadata,
)

warnings.warn("Unable to determine type for field '{}'.".format(bq_field.name))
warnings.warn(
"Unable to determine Arrow type for field '{}'.".format(bq_field.name)
)
return None


Expand Down
56 changes: 56 additions & 0 deletions tests/unit/test__helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import decimal
import json
import os
import warnings
import pytest
import packaging
import unittest
Expand Down Expand Up @@ -640,6 +641,17 @@ def test_w_single_scalar_column(self):
row = {"f": [{"v": "1"}]}
self.assertEqual(self._call_fut(row, schema=[col]), (1,))

def test_w_unknown_type(self):
# SELECT 1 AS col
col = _Field("REQUIRED", "col", "UNKNOWN")
row = {"f": [{"v": "1"}]}
with warnings.catch_warnings(record=True) as warned:
self.assertEqual(self._call_fut(row, schema=[col]), ("1",))
self.assertEqual(len(warned), 1)
warning = warned[0]
self.assertTrue("UNKNOWN" in str(warning))
self.assertTrue("col" in str(warning))

def test_w_single_scalar_geography_column(self):
# SELECT 1 AS col
col = _Field("REQUIRED", "geo", "GEOGRAPHY")
Expand All @@ -660,6 +672,17 @@ def test_w_single_array_column(self):
row = {"f": [{"v": [{"v": "1"}, {"v": "2"}, {"v": "3"}]}]}
self.assertEqual(self._call_fut(row, schema=[col]), ([1, 2, 3],))

def test_w_unknown_type_repeated(self):
# SELECT 1 AS col
col = _Field("REPEATED", "col", "UNKNOWN")
row = {"f": [{"v": [{"v": "1"}, {"v": "2"}, {"v": "3"}]}]}
with warnings.catch_warnings(record=True) as warned:
self.assertEqual(self._call_fut(row, schema=[col]), (["1", "2", "3"],))
self.assertEqual(len(warned), 1)
warning = warned[0]
self.assertTrue("UNKNOWN" in str(warning))
self.assertTrue("col" in str(warning))

def test_w_struct_w_nested_array_column(self):
# SELECT ([1, 2], 3, [4, 5]) as col
first = _Field("REPEATED", "first", "INTEGER")
Expand All @@ -684,6 +707,39 @@ def test_w_struct_w_nested_array_column(self):
({"first": [1, 2], "second": 3, "third": [4, 5]},),
)

def test_w_unknown_type_subfield(self):
# SELECT [(1, 2, 3), (4, 5, 6)] as col
first = _Field("REPEATED", "first", "UNKNOWN1")
second = _Field("REQUIRED", "second", "UNKNOWN2")
third = _Field("REPEATED", "third", "INTEGER")
col = _Field("REQUIRED", "col", "RECORD", fields=[first, second, third])
row = {
"f": [
{
"v": {
"f": [
{"v": [{"v": "1"}, {"v": "2"}]},
{"v": "3"},
{"v": [{"v": "4"}, {"v": "5"}]},
]
}
}
]
}
with warnings.catch_warnings(record=True) as warned:
self.assertEqual(
self._call_fut(row, schema=[col]),
({"first": ["1", "2"], "second": "3", "third": [4, 5]},),
)
self.assertEqual(len(warned), 2) # 1 warning per unknown field.
warned = [str(warning) for warning in warned]
self.assertTrue(
any("first" in warning and "UNKNOWN1" in warning for warning in warned)
)
self.assertTrue(
any("second" in warning and "UNKNOWN2" in warning for warning in warned)
)

def test_w_array_of_struct(self):
# SELECT [(1, 2, 3), (4, 5, 6)] as col
first = _Field("REQUIRED", "first", "INTEGER")
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2751,9 +2751,9 @@ def test_to_arrow_w_unknown_type(self):
self.assertEqual(ages, [33, 29])
self.assertEqual(sports, ["volleyball", "basketball"])

self.assertEqual(len(warned), 1)
warning = warned[0]
self.assertTrue("sport" in str(warning))
# Expect warning from both the arrow conversion, and the json deserialization.
self.assertEqual(len(warned), 2)
self.assertTrue(all("sport" in str(warning) for warning in warned))

def test_to_arrow_w_empty_table(self):
pyarrow = pytest.importorskip(
Expand Down