@@ -449,13 +449,6 @@ def _query_to_destination(
         index_cols: List[str],
         api_name: str,
     ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]:
-        # If there are no index columns, then there's no reason to cache to a
-        # (clustered) session table, as we'll just have to query it again to
-        # create a default index & ordering.
-        if not index_cols:
-            _, query_job = self._start_query(query)
-            return query_job.destination, query_job
-
         # If a dry_run indicates this is not a query type job, then don't
         # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement.
         dry_run_config = bigquery.QueryJobConfig()
@@ -465,15 +458,24 @@ def _query_to_destination(
             _, query_job = self._start_query(query)
             return query_job.destination, query_job

-        # Make sure we cluster by the index column(s) so that subsequent
-        # operations are as speedy as they can be.
+        # Create a table to workaround BigQuery 10 GB query results limit. See:
+        # internal issue 303057336.
+        # Since we have a `statement_type == 'SELECT'`, schema should be populated.
+        schema = typing.cast(Iterable[bigquery.SchemaField], dry_run_job.schema)
+        temp_table = self._create_session_table_empty(api_name, schema, index_cols)
+
+        job_config = bigquery.QueryJobConfig()
+        job_config.destination = temp_table
+
         try:
-            ibis_expr = self.ibis_client.sql(query)
-            return self._ibis_to_session_table(ibis_expr, index_cols, api_name), None
+            # Write to temp table to workaround BigQuery 10 GB query results
+            # limit. See: internal issue 303057336.
+            _, query_job = self._start_query(query, job_config=job_config)
+            return query_job.destination, query_job
         except google.api_core.exceptions.BadRequest:
-            # Some SELECT statements still aren't compatible with CREATE TEMP
-            # TABLE ... AS SELECT ... statements. For example, if the query has
-            # a top-level ORDER BY, this conflicts with our ability to cluster
+            # Some SELECT statements still aren't compatible with cluster
+            # tables as the destination. For example, if the query has a
+            # top-level ORDER BY, this conflicts with our ability to cluster
             # the table by the index column(s).
             _, query_job = self._start_query(query)
             return query_job.destination, query_job
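For context outside the diff: the workaround the new code relies on is the standard BigQuery pattern of running a dry run first and then pointing the real job at an explicit destination table, which lifts the 10 GB cap on ordinary query results. A minimal standalone sketch follows; the client construction, project/dataset/table names, and query text are placeholders, not taken from this commit.

    # Hypothetical sketch of the destination-table workaround used above.
    from google.cloud import bigquery

    client = bigquery.Client()
    query = "SELECT name, number FROM `bigquery-public-data.usa_names.usa_1910_2013`"

    # Dry run: confirms the statement is a SELECT and exposes the result schema
    # without actually running the query.
    dry_run_config = bigquery.QueryJobConfig(dry_run=True)
    dry_run_job = client.query(query, job_config=dry_run_config)
    assert dry_run_job.statement_type == "SELECT"
    schema = dry_run_job.schema  # used to create the (clustered) destination table

    # Real run: route results into a pre-created table so large result sets are allowed.
    job_config = bigquery.QueryJobConfig(
        destination=bigquery.TableReference.from_string("my-project.my_dataset.temp_results")
    )
    query_job = client.query(query, job_config=job_config)
    query_job.result()  # wait for completion; rows now live in the destination table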
@@ -1231,6 +1233,54 @@ def _create_session_table(self) -> bigquery.TableReference:
         )
         return dataset.table(table_name)

+    def _create_session_table_empty(
+        self,
+        api_name: str,
+        schema: Iterable[bigquery.SchemaField],
+        cluster_cols: List[str],
+    ) -> bigquery.TableReference:
+        # Can't set a table in _SESSION as destination via query job API, so we
+        # run DDL, instead.
+        table = self._create_session_table()
+        schema_sql = bigframes_io.bq_schema_to_sql(schema)
+
+        clusterable_cols = [
+            col.name
+            for col in schema
+            if col.name in cluster_cols and _can_cluster_bq(col)
+        ][:_MAX_CLUSTER_COLUMNS]
+
+        if clusterable_cols:
+            cluster_cols_sql = ", ".join(
+                f"`{cluster_col}`" for cluster_col in clusterable_cols
+            )
+            cluster_sql = f"CLUSTER BY {cluster_cols_sql}"
+        else:
+            cluster_sql = ""
+
+        ddl_text = f"""
+        CREATE TEMP TABLE
+        `_SESSION`.`{table.table_id}`
+        ({schema_sql})
+        {cluster_sql}
+        """
+
+        job_config = bigquery.QueryJobConfig()
+
+        # Include a label so that Dataplex Lineage can identify temporary
+        # tables that BigQuery DataFrames creates. Googlers: See internal issue
+        # 296779699. We're labeling the job instead of the table because
+        # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not
+        # supported`.
+        job_config.labels = {"source": "bigquery-dataframes-temp"}
+        job_config.labels["bigframes-api"] = api_name
+
+        _, query_job = self._start_query(ddl_text, job_config=job_config)
+
+        # Use fully-qualified name instead of `_SESSION` name so that the
+        # created table can be used as the destination table.
+        return query_job.destination
+
     def _create_sequential_ordering(
         self,
         table: ibis_types.Table,
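To make the shape of the generated DDL concrete: assuming bq_schema_to_sql renders backtick-quoted "`name` TYPE" pairs (an assumption, as is every name below), the statement _create_session_table_empty sends looks roughly like this sketch.

    # Hypothetical rendering of the CREATE TEMP TABLE statement built above.
    table_id = "bigframes_temp_abc123"  # assumed _create_session_table() table id
    schema_sql = "`rowindex` INT64, `name` STRING, `score` FLOAT64"  # assumed bq_schema_to_sql output
    cluster_sql = "CLUSTER BY `rowindex`"  # `score` is FLOAT64, so _can_cluster_bq skips it

    ddl_text = f"""
    CREATE TEMP TABLE
    `_SESSION`.`{table_id}`
    ({schema_sql})
    {cluster_sql}
    """
    # The table is created empty with the desired schema and clustering; the
    # follow-up query job in _query_to_destination then writes its results into it.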
@@ -1249,7 +1299,9 @@ def _create_sequential_ordering(
             cluster_cols=list(index_cols) + [default_ordering_name],
             api_name=api_name,
         )
-        table = self.ibis_client.sql(f"SELECT * FROM `{table_ref.table_id}`")
+        table = self.ibis_client.table(
+            f"{table_ref.project}.{table_ref.dataset_id}.{table_ref.table_id}"
+        )
         ordering_reference = core.OrderingColumnReference(default_ordering_name)
         ordering = core.ExpressionOrdering(
             ordering_value_columns=[ordering_reference],
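The switch from sql() to table() matters because the old string interpolated only table_ref.table_id, so the query resolved against whatever default dataset the ibis client happened to use; the fully-qualified name removes that dependency. A hedged sketch of the difference, with an assumed ibis BigQuery connection and made-up names:

    import ibis

    # Assumed entry point; bigframes wires up its own client rather than calling this.
    con = ibis.bigquery.connect(project_id="my-project")

    # Old approach: bare table_id inside raw SQL, resolved via the default dataset.
    expr_old = con.sql("SELECT * FROM `bigframes_temp_abc123`")

    # New approach: fully-qualified name, so the table is found regardless of the
    # backend's default dataset.
    expr_new = con.table("my-project.my_dataset.bigframes_temp_abc123")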
@@ -1264,55 +1316,13 @@ def _ibis_to_session_table(
         cluster_cols: Iterable[str],
         api_name: str,
     ) -> bigquery.TableReference:
-        clusterable_cols = [
-            col for col in cluster_cols if _can_cluster(table[col].type())
-        ][:_MAX_CLUSTER_COLUMNS]
-        return self._query_to_session_table(
+        desination, _ = self._query_to_destination(
             self.ibis_client.compile(table),
-            cluster_cols=clusterable_cols,
+            index_cols=list(cluster_cols),
             api_name=api_name,
         )
-
-    def _query_to_session_table(
-        self,
-        query_text: str,
-        cluster_cols: Iterable[str],
-        api_name: str,
-    ) -> bigquery.TableReference:
-        if len(list(cluster_cols)) > _MAX_CLUSTER_COLUMNS:
-            raise ValueError(
-                f"Too many cluster columns: {list(cluster_cols)}, max {_MAX_CLUSTER_COLUMNS} allowed."
-            )
-        # Can't set a table in _SESSION as destination via query job API, so we
-        # run DDL, instead.
-        table = self._create_session_table()
-        cluster_cols_sql = ", ".join(f"`{cluster_col}`" for cluster_col in cluster_cols)
-
-        # TODO(swast): This might not support multi-statement SQL queries (scripts).
-        ddl_text = f"""
-        CREATE TEMP TABLE `_SESSION`.`{table.table_id}`
-        CLUSTER BY {cluster_cols_sql}
-        AS {query_text}
-        """
-
-        job_config = bigquery.QueryJobConfig()
-
-        # Include a label so that Dataplex Lineage can identify temporary
-        # tables that BigQuery DataFrames creates. Googlers: See internal issue
-        # 296779699. We're labeling the job instead of the table because
-        # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not
-        # supported`.
-        job_config.labels = {"source": "bigquery-dataframes-temp"}
-        job_config.labels["bigframes-api"] = api_name
-
-        try:
-            self._start_query(
-                ddl_text, job_config=job_config
-            )  # Wait for the job to complete
-        except google.api_core.exceptions.Conflict:
-            # Allow query retry to succeed.
-            pass
-        return table
+        # There should always be a destination table for this query type.
+        return typing.cast(bigquery.TableReference, desination)

     def remote_function(
         self,
@@ -1494,14 +1504,21 @@ def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session:
     return Session(context)


-def _can_cluster(ibis_type: ibis_dtypes.DataType):
+def _can_cluster_bq(field: bigquery.SchemaField):
    # https://cloud.google.com/bigquery/docs/clustered-tables
    # Notably, float is excluded
-    return (
-        ibis_type.is_integer()
-        or ibis_type.is_string()
-        or ibis_type.is_decimal()
-        or ibis_type.is_date()
-        or ibis_type.is_timestamp()
-        or ibis_type.is_boolean()
+    type_ = field.field_type
+    return type_ in (
+        "INTEGER",
+        "INT64",
+        "STRING",
+        "NUMERIC",
+        "DECIMAL",
+        "BIGNUMERIC",
+        "BIGDECIMAL",
+        "DATE",
+        "DATETIME",
+        "TIMESTAMP",
+        "BOOL",
+        "BOOLEAN",
     )
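The tuple lists both the legacy spellings ("INTEGER", "BOOLEAN", ...) and the GoogleSQL ones ("INT64", "BOOL", ...) because SchemaField.field_type echoes whichever spelling was supplied, and the REST API typically reports the legacy names. A small illustrative use of this kind of check to pick cluster-key candidates from a result schema (all names below are made up):

    from google.cloud import bigquery

    _CLUSTERABLE_TYPES = (
        "INTEGER", "INT64", "STRING", "NUMERIC", "DECIMAL", "BIGNUMERIC",
        "BIGDECIMAL", "DATE", "DATETIME", "TIMESTAMP", "BOOL", "BOOLEAN",
    )

    schema = [
        bigquery.SchemaField("user_id", "INTEGER"),
        bigquery.SchemaField("score", "FLOAT"),  # float types can't be cluster keys
        bigquery.SchemaField("signup_date", "DATE"),
    ]

    cluster_candidates = [f.name for f in schema if f.field_type in _CLUSTERABLE_TYPES]
    print(cluster_candidates)  # ['user_id', 'signup_date']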