Commit ebcbfbe

CLN: Use to_dataframe to download query results. (#247)
* CLN: Use `to_dataframe` to download query results. This allows us to remove logic for parsing the schema and align with google-cloud-bigquery.
* Bumps the minimum google-cloud-bigquery version, because we need to use the new `dtypes` argument.
* Cast to correct dtype in empty dataframes.
* Improve the conda CI build to truly use dependencies from conda, not pip. Adds pydata-google-auth to conda deps.
1 parent f729a44 commit ebcbfbe
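The core pattern this commit adopts is sketched below against the public google-cloud-bigquery 1.9.0 API. The query and the dtype override are illustrative only, assuming application default credentials are available; they are not taken from the diff.

    from google.cloud import bigquery

    client = bigquery.Client()  # assumes application default credentials
    query_job = client.query(
        "SELECT name, number"
        " FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10"
    )
    rows_iter = query_job.result()

    # The `dtypes` argument to RowIterator.to_dataframe() is new in
    # google-cloud-bigquery 1.9.0, which is why this commit bumps the
    # minimum version: it lets the caller pin a pandas dtype per column
    # instead of post-processing the DataFrame.
    df = rows_iter.to_dataframe(dtypes={"number": "float64"})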

File tree

12 files changed: +178 additions, -102 deletions

benchmark/README.md

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+# pandas-gbq benchmarks
+
+This directory contains a few scripts which are useful for performance
+testing the pandas-gbq library. Use cProfile to time the script and see
+details about where time is spent. To avoid timing how long BigQuery takes to
+execute a query, run the benchmark twice to ensure the results are cached.
+
+## `read_gbq`
+
+Read a small table (a few KB).
+
+    python -m cProfile --sort=cumtime read_gbq_small_results.py
+
+Read a large-ish table (100+ MB).
+
+    python -m cProfile --sort=cumtime read_gbq_large_results.py
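As an aside, the same profiles can be saved and re-queried with the standard library's pstats module rather than dumped to stdout; the output filename here is arbitrary:

    python -m cProfile -o read_gbq_small.prof read_gbq_small_results.py
    python -c "import pstats; pstats.Stats('read_gbq_small.prof').sort_stats('cumtime').print_stats(15)"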

benchmark/read_gbq_large_results.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+import pandas_gbq
+
+# Select 163 MB worth of data, to time how long it takes to download large
+# result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
+    dialect="standard",
+)

benchmark/read_gbq_small_results.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+import pandas_gbq
+
+# Select a few KB worth of data, to time downloading small result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
+    dialect="standard",
+)

ci/requirements-2.7.pip

Lines changed: 1 addition & 1 deletion
@@ -2,5 +2,5 @@ mock
 pandas==0.17.1
 google-auth==1.4.1
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==0.32.0
+google-cloud-bigquery==1.9.0
 pydata-google-auth==0.1.2

ci/requirements-3.5.pip

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 pandas==0.19.0
 google-auth==1.4.1
 google-auth-oauthlib==0.0.1
-google-cloud-bigquery==0.32.0
+google-cloud-bigquery==1.9.0
 pydata-google-auth==0.1.2

ci/requirements-3.6-0.20.1.conda

Lines changed: 2 additions & 3 deletions
@@ -1,6 +1,5 @@
-google-auth
-google-auth-oauthlib
-google-cloud-bigquery==0.32.0
+pydata-google-auth
+google-cloud-bigquery==1.9.0
 pytest
 pytest-cov
 codecov

ci/run_conda.sh

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ fi
 
 REQ="ci/requirements-${PYTHON}-${PANDAS}"
 conda install -q --file "$REQ.conda";
-python setup.py develop
+python setup.py develop --no-deps
 
 # Run the tests
 $DIR/run_tests.sh

docs/source/changelog.rst

Lines changed: 18 additions & 0 deletions
@@ -1,6 +1,24 @@
 Changelog
 =========
 
+.. _changelog-0.10.0:
+
+0.10.0 / TBD
+------------
+
+Dependency updates
+~~~~~~~~~~~~~~~~~~
+
+- Update the minimum version of ``google-cloud-bigquery`` to 1.9.0.
+  (:issue:`247`)
+
+Internal changes
+~~~~~~~~~~~~~~~~
+
+- Use ``to_dataframe()`` from ``google-cloud-bigquery`` in the ``read_gbq()``
+  function. (:issue:`247`)
+
+
 .. _changelog-0.9.0:
 
 0.9.0 / 2019-01-11

pandas_gbq/gbq.py

Lines changed: 49 additions & 33 deletions
@@ -1,11 +1,9 @@
 import logging
 import time
 import warnings
-from collections import OrderedDict
 from datetime import datetime
 
 import numpy as np
-from pandas import DataFrame
 
 from pandas_gbq.exceptions import AccessDenied
 
@@ -37,7 +35,7 @@ def _check_google_client_version():
         raise ImportError("Could not import pkg_resources (setuptools).")
 
     # https://github.com/GoogleCloudPlatform/google-cloud-python/blob/master/bigquery/CHANGELOG.md
-    bigquery_minimum_version = pkg_resources.parse_version("0.32.0")
+    bigquery_minimum_version = pkg_resources.parse_version("1.9.0")
     BIGQUERY_INSTALLED_VERSION = pkg_resources.get_distribution(
         "google-cloud-bigquery"
     ).parsed_version
@@ -482,15 +480,16 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        result_rows = list(rows_iter)
-        total_rows = rows_iter.total_rows
-        schema = {
-            "fields": [field.to_api_repr() for field in rows_iter.schema]
-        }
 
-        logger.debug("Got {} rows.\n".format(total_rows))
+        schema_fields = [field.to_api_repr() for field in rows_iter.schema]
+        nullsafe_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
+        df = rows_iter.to_dataframe(dtypes=nullsafe_dtypes)
+
+        if df.empty:
+            df = _cast_empty_df_dtypes(schema_fields, df)
 
-        return schema, result_rows
+        logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
+        return df
 
     def load_data(
         self,
@@ -638,45 +637,62 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema):
         table.create(table_id, table_schema)
 
 
-def _parse_schema(schema_fields):
-    # see:
+def _bqschema_to_nullsafe_dtypes(schema_fields):
+    # Only specify dtype when the dtype allows nulls. Otherwise, use pandas's
+    # default dtype choice.
+    #
+    # See:
     # http://pandas.pydata.org/pandas-docs/dev/missing_data.html
     # #missing-data-casting-rules-and-indexing
     dtype_map = {
         "FLOAT": np.dtype(float),
+        # Even though TIMESTAMPs are timezone-aware in BigQuery, pandas doesn't
+        # support datetime64[ns, UTC] as dtype in DataFrame constructors. See:
+        # https://github.com/pandas-dev/pandas/issues/12513
         "TIMESTAMP": "datetime64[ns]",
         "TIME": "datetime64[ns]",
         "DATE": "datetime64[ns]",
         "DATETIME": "datetime64[ns]",
-        "BOOLEAN": bool,
-        "INTEGER": np.int64,
     }
 
+    dtypes = {}
     for field in schema_fields:
         name = str(field["name"])
         if field["mode"].upper() == "REPEATED":
-            yield name, object
-        else:
-            dtype = dtype_map.get(field["type"].upper())
-            yield name, dtype
+            continue
+
+        dtype = dtype_map.get(field["type"].upper())
+        if dtype:
+            dtypes[name] = dtype
 
+    return dtypes
 
-def _parse_data(schema, rows):
 
-    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
-    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
+def _cast_empty_df_dtypes(schema_fields, df):
+    """Cast any columns in an empty dataframe to correct type.
 
-    for column in df:
-        dtype = column_dtypes[column]
-        null_safe = (
-            df[column].notnull().all()
-            or dtype == float
-            or dtype == "datetime64[ns]"
+    In an empty dataframe, pandas cannot choose a dtype unless one is
+    explicitly provided. The _bqschema_to_nullsafe_dtypes() function only
+    provides dtypes when the dtype safely handles null values. This means
+    that empty int64 and boolean columns are incorrectly classified as
+    ``object``.
+    """
+    if not df.empty:
+        raise ValueError(
+            "DataFrame must be empty in order to cast non-nullsafe dtypes"
         )
-        if dtype and null_safe:
-            df[column] = df[column].astype(
-                column_dtypes[column], errors="ignore"
-            )
+
+    dtype_map = {"BOOLEAN": bool, "INTEGER": np.int64}
+
+    for field in schema_fields:
+        column = str(field["name"])
+        if field["mode"].upper() == "REPEATED":
+            continue
+
+        dtype = dtype_map.get(field["type"].upper())
+        if dtype:
+            df[column] = df[column].astype(dtype)
+
     return df
 
 
@@ -825,8 +841,8 @@ def read_gbq(
         credentials=credentials,
         private_key=private_key,
     )
-    schema, rows = connector.run_query(query, configuration=configuration)
-    final_df = _parse_data(schema, rows)
+
+    final_df = connector.run_query(query, configuration=configuration)
 
     # Reindex the DataFrame on the provided column
     if index_col is not None:
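To make the new helper's behavior concrete, here is a small illustrative call. The schema below is hypothetical, shaped like the dicts that `field.to_api_repr()` produces; only the null-safe TIMESTAMP column receives an explicit dtype, while STRING and INTEGER fall through to pandas defaults.

    from pandas_gbq.gbq import _bqschema_to_nullsafe_dtypes

    schema_fields = [  # hypothetical schema, for illustration only
        {"name": "name", "type": "STRING", "mode": "NULLABLE"},
        {"name": "year", "type": "INTEGER", "mode": "NULLABLE"},
        {"name": "created", "type": "TIMESTAMP", "mode": "NULLABLE"},
    ]

    dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
    # -> {'created': 'datetime64[ns]'}
    # STRING gets no entry (pandas's default object dtype already handles
    # nulls), and INTEGER is deliberately skipped because numpy's int64
    # cannot represent NULL values.

Passing this mapping to `rows_iter.to_dataframe(dtypes=dtypes)` is the call `run_query` now makes; `_cast_empty_df_dtypes` then repairs the int64 and bool columns in the zero-row case, where null-safety is moot.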

setup.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ def readme():
     "pydata-google-auth",
     "google-auth",
     "google-auth-oauthlib",
-    "google-cloud-bigquery>=0.32.0",
+    "google-cloud-bigquery>=1.9.0",
 ]
 
 extras = {"tqdm": "tqdm>=4.23.0"}
