Skip to content

Commit c6b3914

Browse files
authored
BigQuery: Moves BigQuery tutorial for Dataproc to python-docs-samples (GoogleCloudPlatform#1494)
1 parent 509e3f2 commit c6b3914

File tree

2 files changed

+123
-0
lines changed

2 files changed

+123
-0
lines changed
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2018 Google Inc. All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
18+
def run_natality_tutorial():
    # [START bigquery_query_natality_tutorial]
    """Create a Google BigQuery linear regression input table.

    In the code below, the following actions are taken:
    * A new dataset is created "natality_regression."
    * A query is run against the public dataset,
        bigquery-public-data.samples.natality, selecting only the data of
        interest to the regression, the output of which is stored in a new
        "regression_input" table.
    * The output table is moved over the wire to the user's default project via
        the built-in BigQuery Connector for Spark that bridges BigQuery and
        Cloud Dataproc.

    The function takes no arguments and returns nothing; it blocks until the
    query job has finished.
    """

    # Imported at function scope so the import sits inside the START/END
    # region together with the rest of the sample.
    from google.cloud import bigquery

    # Create a new Google BigQuery client using Google Cloud Platform project
    # defaults.
    client = bigquery.Client()

    # Prepare a reference to a new dataset for storing the query results.
    dataset_ref = client.dataset('natality_regression')
    dataset = bigquery.Dataset(dataset_ref)

    # Create the new BigQuery dataset.
    # NOTE(review): presumably this fails if "natality_regression" already
    # exists in the project -- confirm against the client library version.
    dataset = client.create_dataset(dataset)

    # In the new BigQuery dataset, create a reference to a new table for
    # storing the query results.
    table_ref = dataset.table('regression_input')

    # Configure the query job.
    job_config = bigquery.QueryJobConfig()

    # Set the destination table to the table reference created above.
    job_config.destination = table_ref

    # Set up a query in Standard SQL, which is the default for the BigQuery
    # Python client library.
    # The query selects the fields of interest and keeps only rows in which
    # every selected field is non-NULL.
    query = """
        SELECT
            weight_pounds, mother_age, father_age, gestation_weeks,
            weight_gain_pounds, apgar_5min
        FROM
            `bigquery-public-data.samples.natality`
        WHERE
            weight_pounds IS NOT NULL
            AND mother_age IS NOT NULL
            AND father_age IS NOT NULL
            AND gestation_weeks IS NOT NULL
            AND weight_gain_pounds IS NOT NULL
            AND apgar_5min IS NOT NULL
    """

    # Run the query.
    query_job = client.query(query, job_config=job_config)
    query_job.result()  # Waits for the query to finish
    # [END bigquery_query_natality_tutorial]
78+
79+
80+
# Run the tutorial only when this file is executed as a script, not when it
# is imported (the test module imports it and calls the function itself).
if __name__ == '__main__':
    run_natality_tutorial()
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright 2018 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from google.cloud import bigquery
16+
from google.cloud import exceptions
17+
18+
import natality_tutorial
19+
20+
21+
def dataset_exists(dataset, client):
    """Report whether ``client`` can fetch ``dataset``.

    Args:
        dataset: The dataset (or dataset reference) handed unchanged to
            ``client.get_dataset``.
        client: Object exposing ``get_dataset``; the test in this file
            passes a ``bigquery.Client``.

    Returns:
        True if ``client.get_dataset(dataset)`` returns normally, False if
        it raises ``exceptions.NotFound``.

    Any exception other than ``exceptions.NotFound`` propagates to the
    caller.
    """
    try:
        client.get_dataset(dataset)
    except exceptions.NotFound:
        return False
    return True
27+
28+
29+
def test_natality_tutorial():
    """Run the natality tutorial end to end and verify its output table.

    Checks that the ``natality_regression`` dataset does not exist
    beforehand, runs the tutorial, then asserts that the dataset exists and
    that its ``regression_input`` table has at least one row. The dataset is
    deleted afterwards whether or not the assertions pass.
    """
    client = bigquery.Client()
    dataset_ref = client.dataset('natality_regression')

    # Precondition. Checked outside the try/finally so that a dataset this
    # test did not create is never deleted by the cleanup below.
    assert not dataset_exists(dataset_ref, client)

    try:
        natality_tutorial.run_natality_tutorial()

        assert dataset_exists(dataset_ref, client)

        table = client.get_table(
            bigquery.Table(dataset_ref.table('regression_input')))
        assert table.num_rows > 0
    finally:
        # Always clean up: a leftover dataset would make every later run
        # fail on the precondition above.
        try:
            client.delete_dataset(dataset_ref, delete_contents=True)
        except exceptions.NotFound:
            # The tutorial failed before the dataset was created; there is
            # nothing to remove, and the original error should surface.
            pass

0 commit comments

Comments
 (0)