|
28 | 28 | import google.cloud.bigquery as bigquery |
29 | 29 | import google.cloud.bigquery.table |
30 | 30 |
|
| 31 | +import bigframes.core |
31 | 32 | import bigframes.core.events |
32 | 33 | import bigframes.exceptions as bfe |
33 | 34 | import bigframes.session._io.bigquery |
|
37 | 38 | import bigframes.session |
38 | 39 |
|
39 | 40 |
|
| 41 | +def _convert_information_schema_table_id_to_table_reference( |
| 42 | + table_id: str, |
| 43 | + default_project: Optional[str], |
| 44 | +) -> bigquery.TableReference: |
| 45 | + """Squeeze an INFORMATION_SCHEMA reference into a TableReference. |
|  | 46 | +    This is kind of a hack. INFORMATION_SCHEMA is a view that isn't available |
| 47 | + via the tables.get REST API. |
| 48 | + """ |
| 49 | + parts = table_id.split(".") |
| 50 | + parts_casefold = [part.casefold() for part in parts] |
| 51 | + dataset_index = parts_casefold.index("INFORMATION_SCHEMA".casefold()) |
| 52 | + |
| 53 | + if dataset_index == 0: |
| 54 | + project = default_project |
| 55 | + else: |
| 56 | + project = ".".join(parts[:dataset_index]) |
| 57 | + |
| 58 | + if project is None: |
| 59 | + message = ( |
| 60 | + "Could not determine project ID. " |
|  | 61 | +            "Please provide a project or region in your INFORMATION_SCHEMA table ID. " |
| 62 | + "For example, 'region-REGION_NAME.INFORMATION_SCHEMA.JOBS'." |
| 63 | + ) |
| 64 | + raise ValueError(message) |
| 65 | + |
| 66 | + dataset = "INFORMATION_SCHEMA" |
| 67 | + table_id_short = ".".join(parts[dataset_index + 1 :]) |
| 68 | + return bigquery.TableReference( |
| 69 | + bigquery.DatasetReference(project, dataset), |
| 70 | + table_id_short, |
| 71 | + ) |
| 72 | + |
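
As a reviewer aid, here is a sketch of the mapping this helper performs. The IDs and project name below are placeholders; note how any region prefix gets absorbed into the project field of the resulting reference, which is the "hack" the docstring mentions:

```python
# Hypothetical inputs -> (project, dataset_id, table_id) of the result:
#   "INFORMATION_SCHEMA.JOBS"                   -> (default_project, "INFORMATION_SCHEMA", "JOBS")
#   "region-us.INFORMATION_SCHEMA.JOBS"         -> ("region-us", "INFORMATION_SCHEMA", "JOBS")
#   "my-proj.region-us.INFORMATION_SCHEMA.JOBS" -> ("my-proj.region-us", "INFORMATION_SCHEMA", "JOBS")
ref = _convert_information_schema_table_id_to_table_reference(
    "region-us.INFORMATION_SCHEMA.JOBS", default_project="my-proj"
)
assert (ref.project, ref.dataset_id, ref.table_id) == (
    "region-us",
    "INFORMATION_SCHEMA",
    "JOBS",
)
```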
| 73 | + |
| 74 | +def get_information_schema_metadata( |
| 75 | + bqclient: bigquery.Client, |
| 76 | + table_id: str, |
| 77 | + default_project: Optional[str], |
| 78 | +) -> bigquery.Table: |
| 79 | + job_config = bigquery.QueryJobConfig(dry_run=True) |
| 80 | + job = bqclient.query( |
| 81 | + f"SELECT * FROM `{table_id}`", |
| 82 | + job_config=job_config, |
| 83 | + ) |
| 84 | + table_ref = _convert_information_schema_table_id_to_table_reference( |
| 85 | + table_id=table_id, |
| 86 | + default_project=default_project, |
| 87 | + ) |
| 88 | + table = bigquery.Table.from_api_repr( |
| 89 | + { |
| 90 | + "tableReference": table_ref.to_api_repr(), |
| 91 | + "location": job.location, |
| 92 | + # Prevent ourselves from trying to read the table with the BQ |
| 93 | + # Storage API. |
| 94 | + "type": "VIEW", |
| 95 | + } |
| 96 | + ) |
| 97 | + table.schema = job.schema |
| 98 | + return table |
| 99 | + |
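
The schema and location here come from a dry run, which validates the query and reports its metadata without starting a job or incurring cost. A minimal standalone sketch of the same trick (assumes default credentials; the region is a placeholder):

```python
from google.cloud import bigquery

client = bigquery.Client()
dry_run = bigquery.QueryJobConfig(dry_run=True)
job = client.query(
    "SELECT * FROM `region-us.INFORMATION_SCHEMA.JOBS`", job_config=dry_run
)
print(job.location)  # dataset location, e.g. "us"
print([field.name for field in job.schema])  # column names of the view
```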
| 100 | + |
40 | 101 | def get_table_metadata( |
41 | 102 | bqclient: bigquery.Client, |
42 | | - table_ref: google.cloud.bigquery.table.TableReference, |
43 | | - bq_time: datetime.datetime, |
44 | 103 | *, |
45 | | - cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]], |
| 104 | + table_id: str, |
| 105 | + default_project: Optional[str], |
| 106 | + bq_time: datetime.datetime, |
| 107 | + cache: Dict[str, Tuple[datetime.datetime, bigquery.Table]], |
46 | 108 | use_cache: bool = True, |
47 | 109 | publisher: bigframes.core.events.Publisher, |
48 | 110 | ) -> Tuple[datetime.datetime, google.cloud.bigquery.table.Table]: |
49 | 111 | """Get the table metadata, either from cache or via REST API.""" |
50 | 112 |
|
51 | | - cached_table = cache.get(table_ref) |
| 113 | + cached_table = cache.get(table_id) |
52 | 114 | if use_cache and cached_table is not None: |
53 | 115 | snapshot_timestamp, table = cached_table |
54 | 116 |
|
@@ -90,18 +152,38 @@ def get_table_metadata( |
90 | 152 |
|
91 | 153 | return cached_table |
92 | 154 |
|
93 | | - table = bqclient.get_table(table_ref) |
| 155 | + if is_information_schema(table_id): |
| 156 | + table = get_information_schema_metadata( |
| 157 | + bqclient=bqclient, table_id=table_id, default_project=default_project |
| 158 | + ) |
| 159 | + else: |
| 160 | + table_ref = google.cloud.bigquery.table.TableReference.from_string( |
| 161 | + table_id, default_project=default_project |
| 162 | + ) |
| 163 | + table = bqclient.get_table(table_ref) |
| 164 | + |
94 | 165 |     # local time will lag a little bit due to network latency; |
95 | 166 |     # make sure it is at least the table creation time. |
96 | 167 | # This is relevant if the table was created immediately before loading it here. |
97 | 168 | if (table.created is not None) and (table.created > bq_time): |
98 | 169 | bq_time = table.created |
99 | 170 |
|
100 | 171 | cached_table = (bq_time, table) |
101 | | - cache[table_ref] = cached_table |
| 172 | + cache[table_id] = cached_table |
102 | 173 | return cached_table |
103 | 174 |
|
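
Note the cache is now keyed by the raw `table_id` string rather than a parsed `TableReference`, which sidesteps parsing multi-part INFORMATION_SCHEMA IDs up front. An illustrative call, with placeholder names:

```python
cache: Dict[str, Tuple[datetime.datetime, bigquery.Table]] = {}
bq_time, table = get_table_metadata(
    bqclient,
    table_id="region-us.INFORMATION_SCHEMA.JOBS",
    default_project="my-proj",
    bq_time=datetime.datetime.now(datetime.timezone.utc),
    cache=cache,
    publisher=publisher,  # an existing bigframes.core.events.Publisher
)
assert "region-us.INFORMATION_SCHEMA.JOBS" in cache  # keyed by the string ID
```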
104 | 175 |
|
|  | 176 | +def is_information_schema(table_id: str) -> bool: |
| 177 | + table_id_casefold = table_id.casefold() |
|  | 178 | +    # Include the "."s to ensure we don't get false positives from a |
|  | 179 | +    # user-defined dataset like MY_INFORMATION_SCHEMA or a table called |
| 180 | + # INFORMATION_SCHEMA. |
| 181 | + return ( |
| 182 | + ".INFORMATION_SCHEMA.".casefold() in table_id_casefold |
| 183 | + or table_id_casefold.startswith("INFORMATION_SCHEMA.".casefold()) |
| 184 | + ) |
| 185 | + |
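
Expected behavior of the predicate, shown as illustrative assertions (names are made up):

```python
assert is_information_schema("INFORMATION_SCHEMA.JOBS")
assert is_information_schema("region-us.INFORMATION_SCHEMA.JOBS")
assert is_information_schema("proj.region-us.information_schema.jobs")  # case-insensitive
assert not is_information_schema("proj.MY_INFORMATION_SCHEMA.t")  # needs a leading "."
assert not is_information_schema("proj.dataset.INFORMATION_SCHEMA")  # needs a trailing "."
```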
| 186 | + |
105 | 187 | def is_time_travel_eligible( |
106 | 188 | bqclient: bigquery.Client, |
107 | 189 | table: google.cloud.bigquery.table.Table, |
@@ -168,6 +250,8 @@ def is_time_travel_eligible( |
168 | 250 | msg, category=bfe.TimeTravelDisabledWarning, stacklevel=stacklevel |
169 | 251 | ) |
170 | 252 | return False |
| 253 | + elif table.table_type == "VIEW": |
| 254 | + return False |
171 | 255 |
|
172 | 256 |     # table might support time travel; let's do a dry-run query with time travel |
173 | 257 | if should_dry_run: |
|
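
For context on the new VIEW branch: views, including the synthetic INFORMATION_SCHEMA table built above, don't support `FOR SYSTEM_TIME AS OF`, so the function bails out before the dry-run probe. Roughly the shape of the probe being skipped (an assumed sketch; the real SQL construction lives elsewhere in bigframes.session._io.bigquery):

```python
probe = bigquery.QueryJobConfig(dry_run=True)
sql = (
    "SELECT * FROM `my-proj.my_dataset.my_table` "  # placeholder table ID
    "FOR SYSTEM_TIME AS OF TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 SECOND)"
)
try:
    bqclient.query(sql, job_config=probe)  # validates only; runs no job
    eligible = True
except Exception:
    eligible = False  # views and other ineligible sources land here
```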