Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(bigquery): add support for sheets ranges #9416

Merged
merged 4 commits into from
Oct 8, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 0 additions & 102 deletions bigquery/docs/snippets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2303,108 +2303,6 @@ def test_query_external_gcs_permanent_table(client, to_delete):
assert len(w_states) == 4


def test_query_external_sheets_temporary_table(client):
    """Query a Google Sheet through an external table definition.

    The ``client`` argument is rebound inside the sample region to a client
    built from credentials that carry both the Drive and BigQuery scopes.
    The test expects the query to return exactly four rows.
    """
    # [START bigquery_query_external_sheets_temp]
    # [START bigquery_auth_drive_scope]
    import google.auth

    # from google.cloud import bigquery

    # Create credentials with Drive & BigQuery API scopes
    # Both APIs must be enabled for your project before running this code
    scopes = [
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/bigquery",
    ]
    credentials, project = google.auth.default(scopes=scopes)
    client = bigquery.Client(credentials=credentials, project=project)
    # [END bigquery_auth_drive_scope]

    # Use a shareable link or grant viewing access to the email address you
    # used to authenticate with BigQuery (this example Sheet is public)
    sheet_url = (
        "https://docs.google.com/spreadsheets"
        "/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing"
    )

    # Configure the external data source and query job
    external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
    external_config.source_uris = [sheet_url]
    external_config.schema = [
        bigquery.SchemaField(column, "STRING") for column in ("name", "post_abbr")
    ]
    external_config.options.skip_leading_rows = 1  # optionally skip header row

    # Make the external source available to the query under ``table_id``
    table_id = "us_states"
    job_config = bigquery.QueryJobConfig()
    job_config.table_definitions = {table_id: external_config}

    # Example query to find states starting with 'W'
    sql = 'SELECT * FROM `{}` WHERE name LIKE "W%"'.format(table_id)

    query_job = client.query(sql, job_config=job_config)  # API request

    w_states = list(query_job)  # Waits for query to finish
    state_count = len(w_states)
    print("There are {} states with names starting with W.".format(state_count))
    # [END bigquery_query_external_sheets_temp]
    assert state_count == 4


def test_query_external_sheets_permanent_table(client, to_delete):
    """Create a Sheets-backed table in a new dataset and query it.

    A dataset named after the current ``_millis()`` value is created and
    appended to ``to_delete``. Inside the sample region ``client`` is rebound
    to a client built from Drive- and BigQuery-scoped credentials. The test
    expects the query to return exactly four rows.
    """
    dataset_id = "query_external_sheets_{}".format(_millis())
    dataset = bigquery.Dataset(client.dataset(dataset_id))
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_query_external_sheets_perm]
    import google.auth

    # from google.cloud import bigquery
    # dataset_id = 'my_dataset'

    # Create credentials with Drive & BigQuery API scopes
    # Both APIs must be enabled for your project before running this code
    scopes = [
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/bigquery",
    ]
    credentials, project = google.auth.default(scopes=scopes)
    client = bigquery.Client(credentials=credentials, project=project)

    # Describe the table: where it lives and which columns it has
    table_id = "us_states"
    table_ref = client.dataset(dataset_id).table(table_id)
    schema = [
        bigquery.SchemaField(column, "STRING") for column in ("name", "post_abbr")
    ]
    table = bigquery.Table(table_ref, schema=schema)

    # Use a shareable link or grant viewing access to the email address you
    # used to authenticate with BigQuery (this example Sheet is public)
    sheet_url = (
        "https://docs.google.com/spreadsheets"
        "/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing"
    )

    # Configure the external data source
    external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
    external_config.source_uris = [sheet_url]
    external_config.options.skip_leading_rows = 1  # optionally skip header row
    table.external_data_configuration = external_config

    # Create a permanent table linked to the Sheets file
    table = client.create_table(table)  # API request

    # Example query to find states starting with 'W'
    sql = 'SELECT * FROM `{}.{}` WHERE name LIKE "W%"'.format(dataset_id, table_id)

    query_job = client.query(sql)  # API request

    w_states = list(query_job)  # Waits for query to finish
    state_count = len(w_states)
    print("There are {} states with names starting with W.".format(state_count))
    # [END bigquery_query_external_sheets_perm]
    assert state_count == 4


def test_ddl_create_view(client, to_delete, capsys):
"""Create a view via a DDL query."""
project = client.project
Expand Down
14 changes: 14 additions & 0 deletions bigquery/google/cloud/bigquery/external_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from google.cloud.bigquery._helpers import _to_bytes
from google.cloud.bigquery._helpers import _bytes_to_json
from google.cloud.bigquery._helpers import _int_or_none
from google.cloud.bigquery._helpers import _str_or_none
from google.cloud.bigquery.schema import SchemaField


Expand Down Expand Up @@ -524,6 +525,19 @@ def skip_leading_rows(self):
def skip_leading_rows(self, value):
self._properties["skipLeadingRows"] = str(value)

@property
def range(self):
    """str: Range of the sheet that BigQuery will query from.

    The value is read from the ``"range"`` entry of the underlying
    properties mapping and passed through ``_str_or_none``.

    See
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#GoogleSheetsOptions
    """
    sheet_range = self._properties.get("range")
    return _str_or_none(sheet_range)

@range.setter
def range(self, value):
    # The value is stored as given; no conversion is applied on write.
    self._properties["range"] = value

def to_api_repr(self):
"""Build an API representation of this object.

Expand Down
69 changes: 69 additions & 0 deletions bigquery/samples/query_external_sheets_permanent_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def query_external_sheets_permanent_table(dataset_id):

# [START bigquery_query_external_sheets_perm]
from google.cloud import bigquery
import google.auth

# Create credentials with Drive & BigQuery API scopes.
# Both APIs must be enabled for your project before running this code.
credentials, project = google.auth.default(
scopes=[
"https://www.googleapis.com/auth/drive",
"https://www.googleapis.com/auth/bigquery",
]
)

# TODO(developer): Construct a BigQuery client object.
client = bigquery.Client(credentials=credentials, project=project)

# TODO(developer): Set dataset_id to the ID of the dataset to fetch.
# dataset_id = "your-project.your_dataset"

# Configure the external data source.
dataset = client.get_dataset(dataset_id)
table_id = "us_states"
schema = [
bigquery.SchemaField("name", "STRING"),
bigquery.SchemaField("post_abbr", "STRING"),
]
table = bigquery.Table(dataset.table(table_id), schema=schema)
external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
# Use a shareable link or grant viewing access to the email address you
# used to authenticate with BigQuery (this example Sheet is public).
sheet_url = (
"https://docs.google.com/spreadsheets/"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, let's continue using the previous sheet. https://docs.google.com/spreadsheets/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing

I was thinking we'd want a different set of data for the range queries, but if we're still querying US States, we should just use the existing sheet.

"d/1dCG0rrY0nkJpB8t6Ko1S3tY7w9-hp0e_jsYer4LgEuA/edit?usp=sharing"
)
external_config.source_uris = [sheet_url]
external_config.options.skip_leading_rows = 1 # Optionally skip header row.
external_config.options.range = (
"Sheet1!A10:B30"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use a range that makes it easier to verify the correct behavior. Since the original sheet is sorted alphabetically, the range A1:B48 should only return "Washington".

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I decided to use the A20:B49 range, to show that a range does not have to start at the beginning of the sheet.

) # Optionally set range of the sheet to query from.
table.external_data_configuration = external_config

# Create a permanent table linked to the Sheets file.
table = client.create_table(table) # Make an API request.

# Example query to find states starting with "W".
sql = 'SELECT * FROM `{}.{}` WHERE name LIKE "W%"'.format(dataset_id, table_id)
query_job = client.query(sql) # Make an API request.

# Wait for the query to complete.
w_states = list(query_job)
print("There are {} states with names starting with W.".format(len(w_states)))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's update the print statement to indicate that we used a range: ... starting with W in the selected range.

# [END bigquery_query_external_sheets_perm]
65 changes: 65 additions & 0 deletions bigquery/samples/query_external_sheets_temporary_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def query_external_sheets_temporary_table():
    """Query a range of a Google Sheet through an external table definition.

    Builds a client from credentials carrying the Drive and BigQuery scopes,
    attaches a ``GOOGLE_SHEETS`` external configuration to the query job under
    the name ``us_states``, runs the query, and prints how many rows matched.
    """
    # [START bigquery_query_external_sheets_temp]
    # [START bigquery_auth_drive_scope]
    from google.cloud import bigquery
    import google.auth

    # Create credentials with Drive & BigQuery API scopes.
    # Both APIs must be enabled for your project before running this code.
    scopes = [
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/bigquery",
    ]
    credentials, project = google.auth.default(scopes=scopes)

    # TODO(developer): Construct a BigQuery client object.
    client = bigquery.Client(credentials=credentials, project=project)
    # [END bigquery_auth_drive_scope]

    # Use a shareable link or grant viewing access to the email address you
    # used to authenticate with BigQuery (this example Sheet is public).
    sheet_url = (
        "https://docs.google.com/spreadsheets/"
        "d/1dCG0rrY0nkJpB8t6Ko1S3tY7w9-hp0e_jsYer4LgEuA/edit?usp=sharing"
    )

    # Configure the external data source and query job.
    external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
    external_config.source_uris = [sheet_url]
    external_config.schema = [
        bigquery.SchemaField(column, "STRING") for column in ("name", "post_abbr")
    ]
    # Optionally skip header row.
    external_config.options.skip_leading_rows = 1
    # Optionally set range of the sheet to query from.
    external_config.options.range = "Sheet1!A10:B30"

    # Make the external source available to the query under ``table_id``.
    table_id = "us_states"
    job_config = bigquery.QueryJobConfig()
    job_config.table_definitions = {table_id: external_config}

    # Example query to find states starting with "W".
    sql = 'SELECT * FROM `{}` WHERE name LIKE "W%"'.format(table_id)
    query_job = client.query(sql, job_config=job_config)  # Make an API request.

    # Wait for the query to complete.
    w_states = list(query_job)
    state_count = len(w_states)
    print("There are {} states with names starting with W.".format(state_count))
    # [END bigquery_query_external_sheets_temp]
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .. import query_external_sheets_permanent_table


def test_query_external_sheets_permanent_table(capsys, dataset_id):
    """Run the permanent-table Sheets sample and check its printed summary."""
    sample = query_external_sheets_permanent_table
    sample.query_external_sheets_permanent_table(dataset_id)

    # Only stdout is checked; captured stderr is discarded.
    out, _ = capsys.readouterr()
    assert "There are 4 states with names starting with W." in out
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .. import query_external_sheets_temporary_table


def test_query_external_sheets_temporary_table(capsys):
    """Run the temporary-table Sheets sample and check its printed summary."""
    sample = query_external_sheets_temporary_table
    sample.query_external_sheets_temporary_table()

    # Only stdout is checked; captured stderr is discarded.
    out, _ = capsys.readouterr()
    assert "There are 4 states with names starting with W." in out
11 changes: 9 additions & 2 deletions bigquery/tests/unit/test_external_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,10 @@ def test_from_api_repr_sheets(self):
self.BASE_RESOURCE,
{
"sourceFormat": "GOOGLE_SHEETS",
"googleSheetsOptions": {"skipLeadingRows": "123"},
"googleSheetsOptions": {
"skipLeadingRows": "123",
"range": "Sheet1!A5:B10",
},
},
)

Expand All @@ -140,26 +143,30 @@ def test_from_api_repr_sheets(self):
self.assertEqual(ec.source_format, "GOOGLE_SHEETS")
self.assertIsInstance(ec.options, external_config.GoogleSheetsOptions)
self.assertEqual(ec.options.skip_leading_rows, 123)
self.assertEqual(ec.options.range, "Sheet1!A5:B10")

got_resource = ec.to_api_repr()

self.assertEqual(got_resource, resource)

del resource["googleSheetsOptions"]["skipLeadingRows"]
del resource["googleSheetsOptions"]["range"]
ec = external_config.ExternalConfig.from_api_repr(resource)
self.assertIsNone(ec.options.skip_leading_rows)
self.assertIsNone(ec.options.range)
got_resource = ec.to_api_repr()
self.assertEqual(got_resource, resource)

def test_to_api_repr_sheets(self):
ec = external_config.ExternalConfig("GOOGLE_SHEETS")
options = external_config.GoogleSheetsOptions()
options.skip_leading_rows = 123
options.range = "Sheet1!A5:B10"
ec._options = options

exp_resource = {
"sourceFormat": "GOOGLE_SHEETS",
"googleSheetsOptions": {"skipLeadingRows": "123"},
"googleSheetsOptions": {"skipLeadingRows": "123", "range": "Sheet1!A5:B10"},
}

got_resource = ec.to_api_repr()
Expand Down