
refactor(bigquery): update code samples of load table file and uri #10175

Merged
refactor(bigquery): update code samples of load table file and uri
HemangChothani committed Jan 20, 2020
commit 22e0dd9627569024e485cc2b988aca7a74a339f4
262 changes: 0 additions & 262 deletions bigquery/docs/snippets.py
@@ -581,268 +581,6 @@ def test_manage_views(client, to_delete):
# [END bigquery_grant_view_access]


def test_load_table_from_file(client, to_delete):
"""Upload table data from a CSV file."""
dataset_id = "load_table_from_file_dataset_{}".format(_millis())
table_id = "load_table_from_file_table_{}".format(_millis())
dataset = bigquery.Dataset(client.dataset(dataset_id))
dataset.location = "US"
client.create_dataset(dataset)
to_delete.append(dataset)
snippets_dir = os.path.abspath(os.path.dirname(__file__))
filename = os.path.join(
snippets_dir, "..", "..", "bigquery", "tests", "data", "people.csv"
)

# [START bigquery_load_from_file]
# from google.cloud import bigquery
# client = bigquery.Client()
# filename = '/path/to/file.csv'
# dataset_id = 'my_dataset'
# table_id = 'my_table'

dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.autodetect = True

with open(filename, "rb") as source_file:
job = client.load_table_from_file(source_file, table_ref, job_config=job_config)

job.result() # Waits for table load to complete.

print("Loaded {} rows into {}:{}.".format(job.output_rows, dataset_id, table_id))
# [END bigquery_load_from_file]

table = client.get_table(table_ref)
rows = list(client.list_rows(table)) # API request

assert len(rows) == 2
# Order is not preserved, so compare individually
row1 = bigquery.Row(("Wylma Phlyntstone", 29), {"full_name": 0, "age": 1})
assert row1 in rows
row2 = bigquery.Row(("Phred Phlyntstone", 32), {"full_name": 0, "age": 1})
assert row2 in rows


def test_load_table_from_uri_avro(client, to_delete, capsys):
dataset_id = "load_table_from_uri_avro_{}".format(_millis())
dataset = bigquery.Dataset(client.dataset(dataset_id))
client.create_dataset(dataset)
to_delete.append(dataset)

# [START bigquery_load_table_gcs_avro]
# from google.cloud import bigquery
# client = bigquery.Client()
# dataset_id = 'my_dataset'

dataset_ref = client.dataset(dataset_id)
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.AVRO
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.avro"

load_job = client.load_table_from_uri(
uri, dataset_ref.table("us_states"), job_config=job_config
) # API request
print("Starting job {}".format(load_job.job_id))

load_job.result() # Waits for table load to complete.
print("Job finished.")

destination_table = client.get_table(dataset_ref.table("us_states"))
print("Loaded {} rows.".format(destination_table.num_rows))
# [END bigquery_load_table_gcs_avro]

out, _ = capsys.readouterr()
assert "Loaded 50 rows." in out


def test_load_table_from_uri_csv(client, to_delete, capsys):
dataset_id = "load_table_from_uri_csv_{}".format(_millis())
dataset = bigquery.Dataset(client.dataset(dataset_id))
client.create_dataset(dataset)
to_delete.append(dataset)

# [START bigquery_load_table_gcs_csv]
# from google.cloud import bigquery
# client = bigquery.Client()
# dataset_id = 'my_dataset'

dataset_ref = client.dataset(dataset_id)
job_config = bigquery.LoadJobConfig()
job_config.schema = [
bigquery.SchemaField("name", "STRING"),
bigquery.SchemaField("post_abbr", "STRING"),
]
job_config.skip_leading_rows = 1
# The source format defaults to CSV, so the line below is optional.
job_config.source_format = bigquery.SourceFormat.CSV
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"

load_job = client.load_table_from_uri(
uri, dataset_ref.table("us_states"), job_config=job_config
) # API request
print("Starting job {}".format(load_job.job_id))

load_job.result() # Waits for table load to complete.
print("Job finished.")

destination_table = client.get_table(dataset_ref.table("us_states"))
print("Loaded {} rows.".format(destination_table.num_rows))
# [END bigquery_load_table_gcs_csv]

out, _ = capsys.readouterr()
assert "Loaded 50 rows." in out


def test_load_table_from_uri_json(client, to_delete, capsys):
dataset_id = "load_table_from_uri_json_{}".format(_millis())
dataset = bigquery.Dataset(client.dataset(dataset_id))
dataset.location = "US"
client.create_dataset(dataset)
to_delete.append(dataset)

# [START bigquery_load_table_gcs_json]
# from google.cloud import bigquery
# client = bigquery.Client()
# dataset_id = 'my_dataset'

dataset_ref = client.dataset(dataset_id)
job_config = bigquery.LoadJobConfig()
job_config.schema = [
bigquery.SchemaField("name", "STRING"),
bigquery.SchemaField("post_abbr", "STRING"),
]
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"

load_job = client.load_table_from_uri(
uri,
dataset_ref.table("us_states"),
location="US", # Location must match that of the destination dataset.
job_config=job_config,
) # API request
print("Starting job {}".format(load_job.job_id))

load_job.result() # Waits for table load to complete.
print("Job finished.")

destination_table = client.get_table(dataset_ref.table("us_states"))
print("Loaded {} rows.".format(destination_table.num_rows))
# [END bigquery_load_table_gcs_json]

out, _ = capsys.readouterr()
assert "Loaded 50 rows." in out


def test_load_table_from_uri_cmek(client, to_delete):
dataset_id = "load_table_from_uri_cmek_{}".format(_millis())
dataset = bigquery.Dataset(client.dataset(dataset_id))
dataset.location = "US"
client.create_dataset(dataset)
to_delete.append(dataset)

# [START bigquery_load_table_gcs_json_cmek]
# from google.cloud import bigquery
# client = bigquery.Client()
# dataset_id = 'my_dataset'

dataset_ref = client.dataset(dataset_id)
job_config = bigquery.LoadJobConfig()
job_config.autodetect = True
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON

# Set the encryption key to use for the destination.
# TODO: Replace this key with a key you have created in KMS.
kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
"cloud-samples-tests", "us", "test", "test"
)
encryption_config = bigquery.EncryptionConfiguration(kms_key_name=kms_key_name)
job_config.destination_encryption_configuration = encryption_config
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"

load_job = client.load_table_from_uri(
uri,
dataset_ref.table("us_states"),
location="US", # Location must match that of the destination dataset.
job_config=job_config,
) # API request

assert load_job.job_type == "load"

load_job.result() # Waits for table load to complete.

assert load_job.state == "DONE"
table = client.get_table(dataset_ref.table("us_states"))
assert table.encryption_configuration.kms_key_name == kms_key_name
# [END bigquery_load_table_gcs_json_cmek]


def test_load_table_from_uri_parquet(client, to_delete, capsys):
dataset_id = "load_table_from_uri_parquet_{}".format(_millis())
dataset = bigquery.Dataset(client.dataset(dataset_id))
client.create_dataset(dataset)
to_delete.append(dataset)

# [START bigquery_load_table_gcs_parquet]
# from google.cloud import bigquery
# client = bigquery.Client()
# dataset_id = 'my_dataset'

dataset_ref = client.dataset(dataset_id)
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.PARQUET
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet"

load_job = client.load_table_from_uri(
uri, dataset_ref.table("us_states"), job_config=job_config
) # API request
print("Starting job {}".format(load_job.job_id))

load_job.result() # Waits for table load to complete.
print("Job finished.")

destination_table = client.get_table(dataset_ref.table("us_states"))
print("Loaded {} rows.".format(destination_table.num_rows))
# [END bigquery_load_table_gcs_parquet]

out, _ = capsys.readouterr()
assert "Loaded 50 rows." in out


def test_load_table_from_uri_orc(client, to_delete, capsys):
dataset_id = "load_table_from_uri_orc_{}".format(_millis())
dataset = bigquery.Dataset(client.dataset(dataset_id))
client.create_dataset(dataset)
to_delete.append(dataset)

# [START bigquery_load_table_gcs_orc]
# from google.cloud import bigquery
# client = bigquery.Client()
# dataset_id = 'my_dataset'

dataset_ref = client.dataset(dataset_id)
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.ORC
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.orc"

load_job = client.load_table_from_uri(
uri, dataset_ref.table("us_states"), job_config=job_config
) # API request
print("Starting job {}".format(load_job.job_id))

load_job.result() # Waits for table load to complete.
print("Job finished.")

destination_table = client.get_table(dataset_ref.table("us_states"))
print("Loaded {} rows.".format(destination_table.num_rows))
# [END bigquery_load_table_gcs_orc]

out, _ = capsys.readouterr()
assert "Loaded 50 rows." in out


def test_load_table_from_uri_autodetect(client, to_delete, capsys):
"""Load table from a GCS URI using various formats and auto-detected schema
2 changes: 1 addition & 1 deletion bigquery/docs/usage/encryption.rst
@@ -27,7 +27,7 @@ Change the key used to encrypt a table.
Load a file from Cloud Storage, using a customer-managed encryption key from
Cloud KMS for the destination table.

.. literalinclude:: ../snippets.py
.. literalinclude:: ../samples/load_table_uri_cmek.py
:language: python
:dedent: 4
:start-after: [START bigquery_load_table_gcs_json_cmek]
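For context, here is a plausible sketch of what the relocated samples/load_table_uri_cmek.py referenced above might contain, combining the deleted snippet from bigquery/docs/snippets.py with the standalone-sample pattern used by the new load_table_file.py later in this diff. The function name, its parameters, and the fully qualified table_id format are assumptions, not the PR's actual file contents:

def load_table_uri_cmek(table_id, kms_key_name):

    # [START bigquery_load_table_gcs_json_cmek]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create,
    # and kms_key_name to a Cloud KMS key you have created.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        # Set the encryption key to use for the destination table.
        destination_encryption_configuration=bigquery.EncryptionConfiguration(
            kms_key_name=kms_key_name
        ),
    )
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"

    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded table {} encrypted with Cloud KMS key {}.".format(
            table_id, table.encryption_configuration.kms_key_name
        )
    )
    # [END bigquery_load_table_gcs_json_cmek]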
24 changes: 20 additions & 4 deletions bigquery/docs/usage/tables.rst
@@ -70,7 +70,7 @@ Create an integer range partitioned table with the
Load table data from a file with the
:func:`~google.cloud.bigquery.client.Client.load_table_from_file` method:

.. literalinclude:: ../snippets.py
.. literalinclude:: ../samples/load_table_file.py
:language: python
:dedent: 4
:start-after: [START bigquery_load_from_file]
@@ -79,7 +79,7 @@ Load table data from a file with the
Load a CSV file from Cloud Storage with the
:func:`~google.cloud.bigquery.client.Client.load_table_from_uri` method:

.. literalinclude:: ../snippets.py
.. literalinclude:: ../samples/load_table_uri_csv.py
:language: python
:dedent: 4
:start-after: [START bigquery_load_table_gcs_csv]
@@ -90,7 +90,7 @@ See also: `Loading CSV data from Cloud Storage

Load a JSON file from Cloud Storage:

.. literalinclude:: ../snippets.py
.. literalinclude:: ../samples/load_table_uri_json.py
:language: python
:dedent: 4
:start-after: [START bigquery_load_table_gcs_json]
@@ -101,7 +101,7 @@ See also: `Loading JSON data from Cloud Storage

Load a Parquet file from Cloud Storage:

.. literalinclude:: ../snippets.py
.. literalinclude:: ../samples/load_table_uri_parquet.py
:language: python
:dedent: 4
:start-after: [START bigquery_load_table_gcs_parquet]
@@ -110,6 +110,22 @@ Load a Parquet file from Cloud Storage:
See also: `Loading Parquet data from Cloud Storage
<https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet>`_.

Load a Avro file from Cloud Storage:
Contributor comment: s/a Avro/an Avro/


.. literalinclude:: ../samples/load_table_uri_avro.py
:language: python
:dedent: 4
:start-after: [START bigquery_load_table_gcs_avro]
:end-before: [END bigquery_load_table_gcs_avro]

Load a ORC file from Cloud Storage:
Contributor comment: s/a Orc/an Orc/


.. literalinclude:: ../samples/load_table_uri_orc.py
:language: python
:dedent: 4
:start-after: [START bigquery_load_table_gcs_orc]
:end-before: [END bigquery_load_table_gcs_orc]

Updating a Table
^^^^^^^^^^^^^^^^

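As a rough illustration of the pattern the literalinclude targets above now follow, here is a speculative sketch of a standalone samples/load_table_uri_csv.py, modeled on the deleted snippet earlier in this diff and on the new load_table_file.py below. The exact contents of that file are not shown in this commit, so treat names and structure as assumptions:

def load_table_uri_csv(table_id):

    # [START bigquery_load_table_gcs_csv]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("post_abbr", "STRING"),
        ],
        skip_leading_rows=1,
        # The source format defaults to CSV, so the line below is optional.
        source_format=bigquery.SourceFormat.CSV,
    )
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"

    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = client.get_table(table_id)  # Make an API request.
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_csv]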
43 changes: 43 additions & 0 deletions bigquery/samples/load_table_file.py
@@ -0,0 +1,43 @@
# Copyright 2019 Google LLC
Contributor comment: Please update copyright notices to 2020 in the new files

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def load_table_file(file_path, table_id):

# [START bigquery_load_from_file]
from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client()

# TODO(developer): Set table_id to the ID of the table to create.
# table_id = "your-project.your_dataset.your_table_name"

job_config = bigquery.LoadJobConfig(
source_format=bigquery.SourceFormat.CSV, skip_leading_rows=1, autodetect=True,
)

with open(file_path, "rb") as source_file:
job = client.load_table_from_file(source_file, table_id, job_config=job_config)

job.result() # Waits for the job to complete.

table = client.get_table(table_id) # Make an API request.
print(
"Loaded {} rows and {} columns to {}".format(
table.num_rows, len(table.schema), table_id
)
)
# [END bigquery_load_from_file]
return table
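
A minimal, hypothetical way to exercise the new sample; the import path, CSV path, and table ID below are placeholders rather than anything defined in this PR:

# Hypothetical caller for load_table_file; adjust the import path, file path,
# and table ID to your own project and environment.
from load_table_file import load_table_file

table = load_table_file(
    "tests/data/people.csv", "your-project.your_dataset.your_table_name"
)
print("Loaded table with {} rows.".format(table.num_rows))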