Skip to content

Commit

Permalink
feat: support nullable boolean and Int64 dtypes in `insert_rows_from_dataframe` (#1816)
Browse files Browse the repository at this point in the history
  • Loading branch information
tswast authored Feb 12, 2024
1 parent 57be031 commit ab0cf4c
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 18 deletions.
19 changes: 19 additions & 0 deletions google/cloud/bigquery/_pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,6 +958,25 @@ def dataframe_to_json_generator(dataframe):
# considered a NaN, however.
if isinstance(is_nan, bool) and is_nan:
continue

# Convert numpy types to corresponding Python types.
# https://stackoverflow.com/a/60441783/101923
if isinstance(value, numpy.bool_):
value = bool(value)
elif isinstance(
value,
(
numpy.int64,
numpy.int32,
numpy.int16,
numpy.int8,
numpy.uint64,
numpy.uint32,
numpy.uint16,
numpy.uint8,
),
):
value = int(value)
output[column] = value

yield output
Expand Down
13 changes: 12 additions & 1 deletion tests/system/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,7 +835,9 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
schema = [
SF("float_col", "FLOAT", mode="REQUIRED"),
SF("int_col", "INTEGER", mode="REQUIRED"),
SF("int64_col", "INTEGER", mode="NULLABLE"),
SF("bool_col", "BOOLEAN", mode="REQUIRED"),
SF("boolean_col", "BOOLEAN", mode="NULLABLE"),
SF("string_col", "STRING", mode="NULLABLE"),
SF("date_col", "DATE", mode="NULLABLE"),
SF("time_col", "TIME", mode="NULLABLE"),
Expand Down Expand Up @@ -898,6 +900,15 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
dataframe["date_col"] = dataframe["date_col"].astype("dbdate")
dataframe["time_col"] = dataframe["time_col"].astype("dbtime")

# Support nullable integer and boolean dtypes.
# https://github.com/googleapis/python-bigquery/issues/1815
dataframe["int64_col"] = pandas.Series(
[-11, -22, pandas.NA, -44, -55, -66], dtype="Int64"
)
dataframe["boolean_col"] = pandas.Series(
[True, False, True, pandas.NA, True, False], dtype="boolean"
)

table_id = f"{bigquery_client.project}.{dataset_id}.test_insert_rows_from_dataframe"
table_arg = bigquery.Table(table_id, schema=schema)
table = helpers.retry_403(bigquery_client.create_table)(table_arg)
Expand All @@ -910,7 +921,7 @@ def test_insert_rows_from_dataframe(bigquery_client, dataset_id):
expected = [
# Pandas often represents NULL values as NaN. Convert to None for
# easier comparison.
tuple(None if col != col else col for col in data_row)
tuple(None if pandas.isna(col) else col for col in data_row)
for data_row in dataframe.itertuples(index=False)
]

Expand Down
65 changes: 48 additions & 17 deletions tests/unit/test__pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,29 +808,60 @@ def test_list_columns_and_indexes_with_named_index_same_as_column_name(
@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_dataframe_to_json_generator(module_under_test):
utcnow = datetime.datetime.utcnow()
df_data = collections.OrderedDict(
[
("a_series", [pandas.NA, 2, 3, 4]),
("b_series", [0.1, float("NaN"), 0.3, 0.4]),
("c_series", ["a", "b", pandas.NA, "d"]),
("d_series", [utcnow, utcnow, utcnow, pandas.NaT]),
("e_series", [True, False, True, None]),
]
)
dataframe = pandas.DataFrame(
df_data, index=pandas.Index([4, 5, 6, 7], name="a_index")
{
"a_series": [1, 2, 3, 4],
"b_series": [0.1, float("NaN"), 0.3, 0.4],
"c_series": ["a", "b", pandas.NA, "d"],
"d_series": [utcnow, utcnow, utcnow, pandas.NaT],
"e_series": [True, False, True, None],
# Support nullable dtypes.
# https://github.com/googleapis/python-bigquery/issues/1815
"boolean_series": pandas.Series(
[True, False, pandas.NA, False], dtype="boolean"
),
"int64_series": pandas.Series([-1, pandas.NA, -3, -4], dtype="Int64"),
}
)

dataframe = dataframe.astype({"a_series": pandas.Int64Dtype()})
# Index is not included, even if it is not the default and has a name.
dataframe = dataframe.rename(index=lambda idx: idx + 4)
dataframe.index.name = "a_index"

rows = module_under_test.dataframe_to_json_generator(dataframe)
rows = list(module_under_test.dataframe_to_json_generator(dataframe))
expected = [
{"b_series": 0.1, "c_series": "a", "d_series": utcnow, "e_series": True},
{"a_series": 2, "c_series": "b", "d_series": utcnow, "e_series": False},
{"a_series": 3, "b_series": 0.3, "d_series": utcnow, "e_series": True},
{"a_series": 4, "b_series": 0.4, "c_series": "d"},
{
"a_series": 1,
"b_series": 0.1,
"c_series": "a",
"d_series": utcnow,
"e_series": True,
"boolean_series": True,
"int64_series": -1,
},
{
"a_series": 2,
"c_series": "b",
"d_series": utcnow,
"e_series": False,
"boolean_series": False,
},
{
"a_series": 3,
"b_series": 0.3,
"d_series": utcnow,
"e_series": True,
"int64_series": -3,
},
{
"a_series": 4,
"b_series": 0.4,
"c_series": "d",
"boolean_series": False,
"int64_series": -4,
},
]
assert list(rows) == expected
assert rows == expected


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
Expand Down

0 comments on commit ab0cf4c

Please sign in to comment.