
Commit 0f4024c

fix: support large lists of lists in bpd.Series() constructor (#1662)
1 parent 7489c27 commit 0f4024c

3 files changed: +64 -6 lines changed


bigframes/session/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -1054,6 +1054,13 @@ def read_parquet(
         if engine == "bigquery":
             job_config = bigquery.LoadJobConfig()
             job_config.source_format = bigquery.SourceFormat.PARQUET
+
+            # Ensure we can load pyarrow.list_ / BQ ARRAY type.
+            # See internal issue 414374215.
+            parquet_options = bigquery.ParquetOptions()
+            parquet_options.enable_list_inference = True
+            job_config.parquet_options = parquet_options
+
             job_config.labels = {"bigframes-api": "read_parquet"}
             table_id = self._loader.load_file(path, job_config=job_config)
             return self._loader.read_gbq_table(table_id)
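
The read_parquet change turns on the BigQuery load-job option that infers the Parquet LIST logical type as an ARRAY column. The sketch below shows the same configuration used directly with the google-cloud-bigquery client; the destination table name is a placeholder and a client with write access is assumed.

import io

import pyarrow as pa
import pyarrow.parquet as pq
from google.cloud import bigquery

client = bigquery.Client()

# A column whose values are lists of ints (a pyarrow list_ column).
values = pa.array([[1, 2, 3], [4, 5]], type=pa.list_(pa.int64()))
parquet_buffer = io.BytesIO()
pq.write_table(pa.table({"values": values}), parquet_buffer)
parquet_buffer.seek(0)

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.PARQUET

# Without list inference, the LIST logical type may load as a nested
# record (list.element) rather than a repeated (ARRAY) column.
parquet_options = bigquery.ParquetOptions()
parquet_options.enable_list_inference = True
job_config.parquet_options = parquet_options

load_job = client.load_table_from_file(
    parquet_buffer,
    "my-project.my_dataset.my_table",  # placeholder destination
    job_config=job_config,
)
load_job.result()  # Wait for the load to complete.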

bigframes/session/loader.py

Lines changed: 7 additions & 0 deletions
@@ -217,6 +217,13 @@ def load_data(
 
         job_config = bigquery.LoadJobConfig()
         job_config.source_format = bigquery.SourceFormat.PARQUET
+
+        # Ensure we can load pyarrow.list_ / BQ ARRAY type.
+        # See internal issue 414374215.
+        parquet_options = bigquery.ParquetOptions()
+        parquet_options.enable_list_inference = True
+        job_config.parquet_options = parquet_options
+
         job_config.schema = bq_schema
         if api_name:
             job_config.labels = {"bigframes-api": api_name}
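
The loader change applies the same option to load_data, the Parquet load path used when local data is uploaded rather than embedded in SQL. The sketch below is a hedged illustration rather than BigQuery DataFrames internals; it shows why list inference matters there: a pandas column of Python lists becomes a pyarrow list_ column once converted for a Parquet upload.

import pandas as pd
import pyarrow as pa

local = pd.DataFrame({"values": [[1, 2, 3], [4, 5]]})
arrow_table = pa.Table.from_pandas(local, preserve_index=False)

# Prints a schema like: values: list<item: int64>
# That LIST logical type is what enable_list_inference maps to ARRAY.
print(arrow_table.schema)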

tests/system/small/bigquery/test_array.py

Lines changed: 50 additions & 6 deletions
@@ -17,17 +17,61 @@
 import pytest
 
 import bigframes.bigquery as bbq
+import bigframes.dtypes
 import bigframes.pandas as bpd
 
 
-def test_array_length():
-    series = bpd.Series([["A", "AA", "AAA"], ["BB", "B"], np.nan, [], ["C"]])
-    # TODO(b/336880368): Allow for NULL values to be input for ARRAY columns.
-    # Once we actually store NULL values, this will be NULL where the input is NULL.
-    expected = bpd.Series([3, 2, 0, 0, 1])
+@pytest.mark.parametrize(
+    ["input_data", "expected"],
+    [
+        pytest.param(
+            [["A", "AA", "AAA"], ["BB", "B"], np.nan, [], ["C"]],
+            [
+                3,
+                2,
+                # TODO(b/336880368): Allow for NULL values to be input for ARRAY
+                # columns. Once we actually store NULL values, this will be
+                # NULL where the input is NULL.
+                0,
+                0,
+                1,
+            ],
+            id="small-string",
+        ),
+        pytest.param(
+            [[1, 2, 3], [4, 5], [], [], [6]], [3, 2, 0, 0, 1], id="small-int64"
+        ),
+        pytest.param(
+            [
+                # Regression test for b/414374215 where the Series constructor
+                # returns empty lists when the lists are too big to embed in
+                # SQL.
+                list(np.random.randint(-1_000_000, 1_000_000, size=1000)),
+                list(np.random.randint(-1_000_000, 1_000_000, size=967)),
+                list(np.random.randint(-1_000_000, 1_000_000, size=423)),
+                list(np.random.randint(-1_000_000, 1_000_000, size=5000)),
+                list(np.random.randint(-1_000_000, 1_000_000, size=1003)),
+                list(np.random.randint(-1_000_000, 1_000_000, size=9999)),
+            ],
+            [
+                1000,
+                967,
+                423,
+                5000,
+                1003,
+                9999,
+            ],
+            id="larger-int64",
+        ),
+    ],
+)
+def test_array_length(input_data, expected):
+    series = bpd.Series(input_data)
+    expected = pd.Series(expected, dtype=bigframes.dtypes.INT_DTYPE)
     pd.testing.assert_series_equal(
         bbq.array_length(series).to_pandas(),
-        expected.to_pandas(),
+        expected,
+        check_index_type=False,
     )
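
The new larger-int64 case covers the user-facing regression: lists too big to embed in SQL coming back empty from the Series constructor. A short usage sketch of that scenario, assuming a configured BigQuery DataFrames session:

import numpy as np

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

# Lists this large are too big to embed in the generated SQL, so they go
# through the Parquet load path changed above.
data = [
    list(np.random.randint(-1_000_000, 1_000_000, size=size))
    for size in (1000, 5000, 9999)
]
series = bpd.Series(data)

# Before this fix the large lists could come back empty; now the computed
# lengths match the input lists (1000, 5000, 9999).
print(bbq.array_length(series).to_pandas())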