
Commit 8f3b5b2

fix: avoid 403 response too large to return error with read_gbq and large query results (#77)
Towards internal issue 303057336 🦕
1 parent 158c00c commit 8f3b5b2
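
The gist of the fix: when results exceed BigQuery's 10 GB limit on data returned through the REST API, the API responds with a 403 "response too large to return" error. `read_gbq` now avoids that path by pre-creating a clustered session temp table and pointing the query's destination at it, so results land in a table instead of the response body. A minimal sketch of the destination-table pattern using only the public google-cloud-bigquery client (project, dataset, and table names are placeholders):

import google.cloud.bigquery as bigquery

client = bigquery.Client()

# Hypothetical destination; the session code below creates a temp table via
# DDL and uses its fully-qualified reference here.
destination = bigquery.TableReference.from_string(
    "my-project.my_dataset.my_temp_table"
)

job_config = bigquery.QueryJobConfig()
# With a destination set, results are written to the table rather than
# returned inline, sidestepping the 10 GB response limit.
job_config.destination = destination

client.query("SELECT * FROM `my_dataset.big_table`", job_config=job_config).result()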

File tree

7 files changed: 193 additions & 75 deletions

bigframes/core/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -1198,8 +1198,8 @@ def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue:
         destination = self._session._ibis_to_session_table(
             ibis_expr, cluster_cols=cluster_cols, api_name="cache"
         )
-        table_expression = self._session.ibis_client.sql(
-            f"SELECT * FROM `_SESSION`.`{destination.table_id}`"
+        table_expression = self._session.ibis_client.table(
+            f"{destination.project}.{destination.dataset_id}.{destination.table_id}"
         )
         new_columns = [table_expression[column] for column in self.column_names]
         new_hidden_columns = [
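
A side benefit of having a fully-qualified destination table: downstream code can hand ibis a table name instead of opaque SQL, letting ibis resolve the schema directly. Roughly, with placeholder identifiers:

# Before: wrap the session-scoped temp table in raw SQL.
table_expression = ibis_client.sql("SELECT * FROM `_SESSION`.`bqdf_temp_table`")

# After: reference the same table by its fully-qualified name.
table_expression = ibis_client.table("my-project.my_dataset.bqdf_temp_table")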

bigframes/core/io.py

Lines changed: 44 additions & 1 deletion

@@ -16,7 +16,8 @@

 import datetime
 import textwrap
-from typing import Dict, Union
+import types
+from typing import Dict, Iterable, Union

 import google.cloud.bigquery as bigquery

@@ -89,6 +90,48 @@ def create_snapshot_sql(
     )


+# BigQuery REST API returns types in Legacy SQL format
+# https://cloud.google.com/bigquery/docs/data-types but we use Standard SQL
+# names
+# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
+BQ_STANDARD_TYPES = types.MappingProxyType(
+    {
+        "BOOLEAN": "BOOL",
+        "INTEGER": "INT64",
+        "FLOAT": "FLOAT64",
+    }
+)
+
+
+def bq_field_to_type_sql(field: bigquery.SchemaField):
+    if field.mode == "REPEATED":
+        nested_type = bq_field_to_type_sql(
+            bigquery.SchemaField(
+                field.name, field.field_type, mode="NULLABLE", fields=field.fields
+            )
+        )
+        return f"ARRAY<{nested_type}>"
+
+    if field.field_type == "RECORD":
+        nested_fields_sql = ", ".join(
+            bq_field_to_sql(child_field) for child_field in field.fields
+        )
+        return f"STRUCT<{nested_fields_sql}>"
+
+    type_ = field.field_type
+    return BQ_STANDARD_TYPES.get(type_, type_)
+
+
+def bq_field_to_sql(field: bigquery.SchemaField):
+    name = field.name
+    type_ = bq_field_to_type_sql(field)
+    return f"`{name}` {type_}"
+
+
+def bq_schema_to_sql(schema: Iterable[bigquery.SchemaField]):
+    return ", ".join(bq_field_to_sql(field) for field in schema)
+
+
 def format_option(key: str, value: Union[bool, str]) -> str:
     if isinstance(value, bool):
         return f"{key}=true" if value else f"{key}=false"
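
These helpers render the legacy type names that the BigQuery REST API reports (INTEGER, FLOAT, BOOLEAN) as Standard SQL column definitions for the DDL in session.py. A quick usage sketch, consistent with the unit tests at the end of this commit:

import google.cloud.bigquery as bigquery
import bigframes.core.io

schema = [
    bigquery.SchemaField("my_ints", "INTEGER", mode="REPEATED"),
    bigquery.SchemaField("my_floats", "FLOAT"),
]
print(bigframes.core.io.bq_schema_to_sql(schema))
# `my_ints` ARRAY<INT64>, `my_floats` FLOAT64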

bigframes/session.py

Lines changed: 86 additions & 69 deletions

@@ -449,13 +449,6 @@ def _query_to_destination(
         index_cols: List[str],
         api_name: str,
     ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]:
-        # If there are no index columns, then there's no reason to cache to a
-        # (clustered) session table, as we'll just have to query it again to
-        # create a default index & ordering.
-        if not index_cols:
-            _, query_job = self._start_query(query)
-            return query_job.destination, query_job
-
         # If a dry_run indicates this is not a query type job, then don't
         # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement.
         dry_run_config = bigquery.QueryJobConfig()

@@ -465,15 +458,24 @@ def _query_to_destination(
             _, query_job = self._start_query(query)
             return query_job.destination, query_job

-        # Make sure we cluster by the index column(s) so that subsequent
-        # operations are as speedy as they can be.
+        # Create a table to workaround BigQuery 10 GB query results limit. See:
+        # internal issue 303057336.
+        # Since we have a `statement_type == 'SELECT'`, schema should be populated.
+        schema = typing.cast(Iterable[bigquery.SchemaField], dry_run_job.schema)
+        temp_table = self._create_session_table_empty(api_name, schema, index_cols)
+
+        job_config = bigquery.QueryJobConfig()
+        job_config.destination = temp_table
+
         try:
-            ibis_expr = self.ibis_client.sql(query)
-            return self._ibis_to_session_table(ibis_expr, index_cols, api_name), None
+            # Write to temp table to workaround BigQuery 10 GB query results
+            # limit. See: internal issue 303057336.
+            _, query_job = self._start_query(query, job_config=job_config)
+            return query_job.destination, query_job
         except google.api_core.exceptions.BadRequest:
-            # Some SELECT statements still aren't compatible with CREATE TEMP
-            # TABLE ... AS SELECT ... statements. For example, if the query has
-            # a top-level ORDER BY, this conflicts with our ability to cluster
+            # Some SELECT statements still aren't compatible with cluster
+            # tables as the destination. For example, if the query has a
+            # top-level ORDER BY, this conflicts with our ability to cluster
             # the table by the index column(s).
             _, query_job = self._start_query(query)
             return query_job.destination, query_job
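
The rewritten _query_to_destination leans on the dry run to learn the result schema before any table exists. A standalone sketch of that pattern (the query text is illustrative):

import google.cloud.bigquery as bigquery

client = bigquery.Client()

dry_run_config = bigquery.QueryJobConfig()
dry_run_config.dry_run = True
dry_run_job = client.query(
    "SELECT 1 AS my_ints, 2.5 AS my_floats", job_config=dry_run_config
)

# For a SELECT, the dry run reports the statement type and output schema
# without running the query or incurring its cost.
assert dry_run_job.statement_type == "SELECT"
print([(field.name, field.field_type) for field in dry_run_job.schema])
# [('my_ints', 'INTEGER'), ('my_floats', 'FLOAT')]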
@@ -1231,6 +1233,54 @@ def _create_session_table(self) -> bigquery.TableReference:
         )
         return dataset.table(table_name)

+    def _create_session_table_empty(
+        self,
+        api_name: str,
+        schema: Iterable[bigquery.SchemaField],
+        cluster_cols: List[str],
+    ) -> bigquery.TableReference:
+        # Can't set a table in _SESSION as destination via query job API, so we
+        # run DDL, instead.
+        table = self._create_session_table()
+        schema_sql = bigframes_io.bq_schema_to_sql(schema)
+
+        clusterable_cols = [
+            col.name
+            for col in schema
+            if col.name in cluster_cols and _can_cluster_bq(col)
+        ][:_MAX_CLUSTER_COLUMNS]
+
+        if clusterable_cols:
+            cluster_cols_sql = ", ".join(
+                f"`{cluster_col}`" for cluster_col in clusterable_cols
+            )
+            cluster_sql = f"CLUSTER BY {cluster_cols_sql}"
+        else:
+            cluster_sql = ""
+
+        ddl_text = f"""
+        CREATE TEMP TABLE
+        `_SESSION`.`{table.table_id}`
+        ({schema_sql})
+        {cluster_sql}
+        """
+
+        job_config = bigquery.QueryJobConfig()
+
+        # Include a label so that Dataplex Lineage can identify temporary
+        # tables that BigQuery DataFrames creates. Googlers: See internal issue
+        # 296779699. We're labeling the job instead of the table because
+        # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not
+        # supported`.
+        job_config.labels = {"source": "bigquery-dataframes-temp"}
+        job_config.labels["bigframes-api"] = api_name
+
+        _, query_job = self._start_query(ddl_text, job_config=job_config)
+
+        # Use fully-qualified name instead of `_SESSION` name so that the
+        # created table can be used as the destination table.
+        return query_job.destination
+
     def _create_sequential_ordering(
         self,
         table: ibis_types.Table,
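
Combined with the io.py helpers, the DDL for a two-column result clustered on its integer index column would expand along these lines (the generated table id is hypothetical):

# ddl_text for schema_sql == "`my_ints` INT64, `my_floats` FLOAT64" and
# cluster_cols == ["my_ints"]; FLOAT64 is skipped by _can_cluster_bq:
#
#     CREATE TEMP TABLE
#     `_SESSION`.`bqdf_tmp_1234`
#     (`my_ints` INT64, `my_floats` FLOAT64)
#     CLUSTER BY `my_ints`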
@@ -1249,7 +1299,9 @@ def _create_sequential_ordering(
             cluster_cols=list(index_cols) + [default_ordering_name],
             api_name=api_name,
         )
-        table = self.ibis_client.sql(f"SELECT * FROM `{table_ref.table_id}`")
+        table = self.ibis_client.table(
+            f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}"
+        )
         ordering_reference = core.OrderingColumnReference(default_ordering_name)
         ordering = core.ExpressionOrdering(
             ordering_value_columns=[ordering_reference],
@@ -1264,55 +1316,13 @@ def _ibis_to_session_table(
         cluster_cols: Iterable[str],
         api_name: str,
     ) -> bigquery.TableReference:
-        clusterable_cols = [
-            col for col in cluster_cols if _can_cluster(table[col].type())
-        ][:_MAX_CLUSTER_COLUMNS]
-        return self._query_to_session_table(
+        destination, _ = self._query_to_destination(
             self.ibis_client.compile(table),
-            cluster_cols=clusterable_cols,
+            index_cols=list(cluster_cols),
             api_name=api_name,
         )
-
-    def _query_to_session_table(
-        self,
-        query_text: str,
-        cluster_cols: Iterable[str],
-        api_name: str,
-    ) -> bigquery.TableReference:
-        if len(list(cluster_cols)) > _MAX_CLUSTER_COLUMNS:
-            raise ValueError(
-                f"Too many cluster columns: {list(cluster_cols)}, max {_MAX_CLUSTER_COLUMNS} allowed."
-            )
-        # Can't set a table in _SESSION as destination via query job API, so we
-        # run DDL, instead.
-        table = self._create_session_table()
-        cluster_cols_sql = ", ".join(f"`{cluster_col}`" for cluster_col in cluster_cols)
-
-        # TODO(swast): This might not support multi-statement SQL queries (scripts).
-        ddl_text = f"""
-        CREATE TEMP TABLE `_SESSION`.`{table.table_id}`
-        CLUSTER BY {cluster_cols_sql}
-        AS {query_text}
-        """
-
-        job_config = bigquery.QueryJobConfig()
-
-        # Include a label so that Dataplex Lineage can identify temporary
-        # tables that BigQuery DataFrames creates. Googlers: See internal issue
-        # 296779699. We're labeling the job instead of the table because
-        # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not
-        # supported`.
-        job_config.labels = {"source": "bigquery-dataframes-temp"}
-        job_config.labels["bigframes-api"] = api_name
-
-        try:
-            self._start_query(
-                ddl_text, job_config=job_config
-            )  # Wait for the job to complete
-        except google.api_core.exceptions.Conflict:
-            # Allow query retry to succeed.
-            pass
-        return table
+        # There should always be a destination table for this query type.
+        return typing.cast(bigquery.TableReference, destination)

     def remote_function(
         self,
@@ -1494,14 +1504,21 @@ def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session:
     return Session(context)


-def _can_cluster(ibis_type: ibis_dtypes.DataType):
+def _can_cluster_bq(field: bigquery.SchemaField):
     # https://cloud.google.com/bigquery/docs/clustered-tables
     # Notably, float is excluded
-    return (
-        ibis_type.is_integer()
-        or ibis_type.is_string()
-        or ibis_type.is_decimal()
-        or ibis_type.is_date()
-        or ibis_type.is_timestamp()
-        or ibis_type.is_boolean()
+    type_ = field.field_type
+    return type_ in (
+        "INTEGER",
+        "INT64",
+        "STRING",
+        "NUMERIC",
+        "DECIMAL",
+        "BIGNUMERIC",
+        "BIGDECIMAL",
+        "DATE",
+        "DATETIME",
+        "TIMESTAMP",
+        "BOOL",
+        "BOOLEAN",
     )
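
Clustering eligibility is now decided on BigQuery schema fields rather than ibis types, accepting both legacy and Standard SQL spellings. A quick illustration (importing a private helper, for demonstration only):

import google.cloud.bigquery as bigquery
from bigframes.session import _can_cluster_bq  # private helper; illustrative

assert _can_cluster_bq(bigquery.SchemaField("my_ints", "INTEGER"))
assert not _can_cluster_bq(bigquery.SchemaField("my_floats", "FLOAT"))  # floats can't cluster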

tests/system/small/ml/test_core.py

Lines changed: 2 additions & 1 deletion

@@ -23,6 +23,7 @@

 import bigframes
 from bigframes.ml import core
+import tests.system.utils


 def test_model_eval(

@@ -224,7 +225,7 @@ def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlModel):
             "cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383],
         },
     )
-    pd.testing.assert_frame_equal(
+    tests.system.utils.assert_pandas_df_equal_ignore_ordering(
         result,
         expected,
         check_exact=False,
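
Note: this file and test_decomposition.py below swap pd.testing.assert_frame_equal for the ordering-insensitive helper, likely because results now arrive via a clustered destination table rather than a CREATE TEMP TABLE ... AS SELECT, so row order is no longer deterministic.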

tests/system/small/ml/test_decomposition.py

Lines changed: 3 additions & 2 deletions

@@ -15,6 +15,7 @@
 import pandas as pd

 from bigframes.ml import decomposition
+import tests.system.utils


 def test_pca_predict(penguins_pca_model, new_penguins_df):

@@ -129,7 +130,7 @@ def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA):
             "explained_variance": [3.278657, 1.270829, 1.125354],
         },
     )
-    pd.testing.assert_frame_equal(
+    tests.system.utils.assert_pandas_df_equal_ignore_ordering(
         result,
         expected,
         check_exact=False,

@@ -148,7 +149,7 @@ def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA):
             "explained_variance_ratio": [0.469357, 0.181926, 0.1611],
         },
     )
-    pd.testing.assert_frame_equal(
+    tests.system.utils.assert_pandas_df_equal_ignore_ordering(
         result,
         expected,
         check_exact=False,

tests/system/small/test_session.py

Lines changed: 1 addition & 0 deletions

@@ -57,6 +57,7 @@ def test_read_gbq_tokyo(
         ),
         pytest.param(
             """SELECT
+            t.int64_col + 1 as my_ints,
             t.float64_col * 2 AS my_floats,
             CONCAT(t.string_col, "_2") AS my_strings,
             t.int64_col > 0 AS my_bools,

tests/unit/core/test_io.py

Lines changed: 55 additions & 0 deletions

@@ -13,8 +13,10 @@
 # limitations under the License.

 import datetime
+from typing import Iterable

 import google.cloud.bigquery as bigquery
+import pytest

 import bigframes.core.io

@@ -47,3 +49,56 @@ def test_create_snapshot_sql_doesnt_timetravel_session_datasets():

     # Don't need the project ID for _SESSION tables.
     assert "my-test-project" not in sql
+
+
+@pytest.mark.parametrize(
+    ("schema", "expected"),
+    (
+        (
+            [bigquery.SchemaField("My Column", "INTEGER")],
+            "`My Column` INT64",
+        ),
+        (
+            [
+                bigquery.SchemaField("My Column", "INTEGER"),
+                bigquery.SchemaField("Float Column", "FLOAT"),
+                bigquery.SchemaField("Bool Column", "BOOLEAN"),
+            ],
+            "`My Column` INT64, `Float Column` FLOAT64, `Bool Column` BOOL",
+        ),
+        (
+            [
+                bigquery.SchemaField("My Column", "INTEGER", mode="REPEATED"),
+                bigquery.SchemaField("Float Column", "FLOAT", mode="REPEATED"),
+                bigquery.SchemaField("Bool Column", "BOOLEAN", mode="REPEATED"),
+            ],
+            "`My Column` ARRAY<INT64>, `Float Column` ARRAY<FLOAT64>, `Bool Column` ARRAY<BOOL>",
+        ),
+        (
+            [
+                bigquery.SchemaField(
+                    "My Column",
+                    "RECORD",
+                    mode="REPEATED",
+                    fields=(
+                        bigquery.SchemaField("Float Column", "FLOAT", mode="REPEATED"),
+                        bigquery.SchemaField("Bool Column", "BOOLEAN", mode="REPEATED"),
+                        bigquery.SchemaField(
+                            "Nested Column",
+                            "RECORD",
+                            fields=(bigquery.SchemaField("Int Column", "INTEGER"),),
+                        ),
+                    ),
+                ),
+            ],
+            (
+                "`My Column` ARRAY<STRUCT<"
+                + "`Float Column` ARRAY<FLOAT64>,"
+                + " `Bool Column` ARRAY<BOOL>,"
+                + " `Nested Column` STRUCT<`Int Column` INT64>>>"
+            ),
+        ),
+    ),
+)
+def test_bq_schema_to_sql(schema: Iterable[bigquery.SchemaField], expected: str):
+    sql = bigframes.core.io.bq_schema_to_sql(schema)
+    assert sql == expected
