
Commit 0f4024c

fix: support large lists of lists in bpd.Series() constructor (#1662)
1 parent 7489c27 commit 0f4024c

3 files changed: +64 -6 lines changed


bigframes/session/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -1054,6 +1054,13 @@ def read_parquet(
         if engine == "bigquery":
             job_config = bigquery.LoadJobConfig()
             job_config.source_format = bigquery.SourceFormat.PARQUET
+
+            # Ensure we can load pyarrow.list_ / BQ ARRAY type.
+            # See internal issue 414374215.
+            parquet_options = bigquery.ParquetOptions()
+            parquet_options.enable_list_inference = True
+            job_config.parquet_options = parquet_options
+
             job_config.labels = {"bigframes-api": "read_parquet"}
             table_id = self._loader.load_file(path, job_config=job_config)
             return self._loader.read_gbq_table(table_id)
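
The read_parquet change turns on the BigQuery load-job option that infers the Parquet LIST logical type as an ARRAY column. The sketch below shows the same configuration used directly with the google-cloud-bigquery client; the destination table name is a placeholder and a client with write access is assumed.

import io

import pyarrow as pa
import pyarrow.parquet as pq
from google.cloud import bigquery

client = bigquery.Client()

# A column whose values are lists of ints (a pyarrow list_ column).
values = pa.array([[1, 2, 3], [4, 5]], type=pa.list_(pa.int64()))
parquet_buffer = io.BytesIO()
pq.write_table(pa.table({"values": values}), parquet_buffer)
parquet_buffer.seek(0)

job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.PARQUET

# Without list inference, the LIST logical type may load as a nested
# record (list.element) rather than a repeated (ARRAY) column.
parquet_options = bigquery.ParquetOptions()
parquet_options.enable_list_inference = True
job_config.parquet_options = parquet_options

load_job = client.load_table_from_file(
    parquet_buffer,
    "my-project.my_dataset.my_table",  # placeholder destination
    job_config=job_config,
)
load_job.result()  # Wait for the load to complete.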

bigframes/session/loader.py

Lines changed: 7 additions & 0 deletions
@@ -217,6 +217,13 @@ def load_data(
 
         job_config = bigquery.LoadJobConfig()
         job_config.source_format = bigquery.SourceFormat.PARQUET
+
+        # Ensure we can load pyarrow.list_ / BQ ARRAY type.
+        # See internal issue 414374215.
+        parquet_options = bigquery.ParquetOptions()
+        parquet_options.enable_list_inference = True
+        job_config.parquet_options = parquet_options
+
         job_config.schema = bq_schema
         if api_name:
             job_config.labels = {"bigframes-api": api_name}
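
The loader change applies the same option to load_data, the Parquet load path used when local data is uploaded rather than embedded in SQL. The sketch below is a hedged illustration rather than BigQuery DataFrames internals; it shows why list inference matters there: a pandas column of Python lists becomes a pyarrow list_ column once converted for a Parquet upload.

import pandas as pd
import pyarrow as pa

local = pd.DataFrame({"values": [[1, 2, 3], [4, 5]]})
arrow_table = pa.Table.from_pandas(local, preserve_index=False)

# Prints a schema like: values: list<item: int64>
# That LIST logical type is what enable_list_inference maps to ARRAY.
print(arrow_table.schema)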

tests/system/small/bigquery/test_array.py

Lines changed: 50 additions & 6 deletions
@@ -17,17 +17,61 @@
 import pytest
 
 import bigframes.bigquery as bbq
+import bigframes.dtypes
 import bigframes.pandas as bpd
 
 
-def test_array_length():
-    series = bpd.Series([["A", "AA", "AAA"], ["BB", "B"], np.nan, [], ["C"]])
-    # TODO(b/336880368): Allow for NULL values to be input for ARRAY columns.
-    # Once we actually store NULL values, this will be NULL where the input is NULL.
-    expected = bpd.Series([3, 2, 0, 0, 1])
+@pytest.mark.parametrize(
+    ["input_data", "expected"],
+    [
+        pytest.param(
+            [["A", "AA", "AAA"], ["BB", "B"], np.nan, [], ["C"]],
+            [
+                3,
+                2,
+                # TODO(b/336880368): Allow for NULL values to be input for ARRAY
+                # columns. Once we actually store NULL values, this will be
+                # NULL where the input is NULL.
+                0,
+                0,
+                1,
+            ],
+            id="small-string",
+        ),
+        pytest.param(
+            [[1, 2, 3], [4, 5], [], [], [6]], [3, 2, 0, 0, 1], id="small-int64"
+        ),
+        pytest.param(
+            [
+                # Regression test for b/414374215 where the Series constructor
+                # returns empty lists when the lists are too big to embed in
+                # SQL.
+                list(np.random.randint(-1_000_000, 1_000_000, size=1000)),
+                list(np.random.randint(-1_000_000, 1_000_000, size=967)),
+                list(np.random.randint(-1_000_000, 1_000_000, size=423)),
+                list(np.random.randint(-1_000_000, 1_000_000, size=5000)),
+                list(np.random.randint(-1_000_000, 1_000_000, size=1003)),
+                list(np.random.randint(-1_000_000, 1_000_000, size=9999)),
+            ],
+            [
+                1000,
+                967,
+                423,
+                5000,
+                1003,
+                9999,
+            ],
+            id="larger-int64",
+        ),
+    ],
+)
+def test_array_length(input_data, expected):
+    series = bpd.Series(input_data)
+    expected = pd.Series(expected, dtype=bigframes.dtypes.INT_DTYPE)
     pd.testing.assert_series_equal(
         bbq.array_length(series).to_pandas(),
-        expected.to_pandas(),
+        expected,
+        check_index_type=False,
     )
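
The new larger-int64 case covers the user-facing regression: lists too big to embed in SQL coming back empty from the Series constructor. A short usage sketch of that scenario, assuming a configured BigQuery DataFrames session:

import numpy as np

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

# Lists this large are too big to embed in the generated SQL, so they go
# through the Parquet load path changed above.
data = [
    list(np.random.randint(-1_000_000, 1_000_000, size=size))
    for size in (1000, 5000, 9999)
]
series = bpd.Series(data)

# Before this fix the large lists could come back empty; now the computed
# lengths match the input lists (1000, 5000, 9999).
print(bbq.array_length(series).to_pandas())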