Skip to content

Commit

Permalink
feat: add date, datetime, time, timestamp dtype to to_dataframe (#1547)
Browse files Browse the repository at this point in the history
  • Loading branch information
chelsea-lin authored Apr 18, 2023
1 parent 6458bbd commit 64e913d
Show file tree
Hide file tree
Showing 6 changed files with 494 additions and 38 deletions.
27 changes: 23 additions & 4 deletions google/cloud/bigquery/_pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,10 @@ def default_types_mapper(
int_dtype: Union[Any, None] = None,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
date_dtype: Union[Any, None] = None,
datetime_dtype: Union[Any, None] = None,
time_dtype: Union[Any, None] = None,
timestamp_dtype: Union[Any, None] = None,
):
"""Create a mapping from pyarrow types to pandas types.
Expand Down Expand Up @@ -321,13 +325,28 @@ def types_mapper(arrow_data_type):
elif (
# If date_as_object is True, we know some DATE columns are
# out-of-bounds of what is supported by pandas.
not date_as_object
date_dtype is not None
and not date_as_object
and pyarrow.types.is_date(arrow_data_type)
):
return db_dtypes.DateDtype()
return date_dtype

elif pyarrow.types.is_time(arrow_data_type):
return db_dtypes.TimeDtype()
elif (
datetime_dtype is not None
and pyarrow.types.is_timestamp(arrow_data_type)
and arrow_data_type.tz is None
):
return datetime_dtype

elif (
timestamp_dtype is not None
and pyarrow.types.is_timestamp(arrow_data_type)
and arrow_data_type.tz is not None
):
return timestamp_dtype

elif time_dtype is not None and pyarrow.types.is_time(arrow_data_type):
return time_dtype

return types_mapper

Expand Down
6 changes: 6 additions & 0 deletions google/cloud/bigquery/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@ class DefaultPandasDTypes(enum.Enum):
INT_DTYPE = object()
"""Specifies default integer dtype"""

DATE_DTYPE = object()
"""Specifies default date dtype"""

TIME_DTYPE = object()
"""Specifies default time dtype"""


class DestinationFormat(object):
"""The exported file format. The default value is :attr:`CSV`.
Expand Down
64 changes: 60 additions & 4 deletions google/cloud/bigquery/job/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@
except ImportError: # pragma: NO COVER
pandas = None

try:
import db_dtypes # type: ignore
except ImportError: # pragma: NO COVER
db_dtypes = None

if typing.TYPE_CHECKING: # pragma: NO COVER
# Assumption: type checks are only used by library developers and CI environments
# that have all optional dependencies installed, thus no conditional imports.
Expand Down Expand Up @@ -1637,6 +1642,10 @@ def to_dataframe(
int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE,
datetime_dtype: Union[Any, None] = None,
time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
timestamp_dtype: Union[Any, None] = None,
) -> "pandas.DataFrame":
"""Return a pandas DataFrame from a QueryJob
Expand Down Expand Up @@ -1697,7 +1706,7 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
.. versionadded:: 3.7.1
.. versionadded:: 3.8.0
int_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
Expand All @@ -1707,7 +1716,7 @@ def to_dataframe(
Integer types can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
.. versionadded:: 3.7.1
.. versionadded:: 3.8.0
float_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
Expand All @@ -1717,7 +1726,7 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
.. versionadded:: 3.7.1
.. versionadded:: 3.8.0
string_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
Expand All @@ -1727,7 +1736,50 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
.. versionadded:: 3.7.1
.. versionadded:: 3.8.0
date_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date
type, instead of relying on the default ``db_dtypes.DateDtype()``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
Date type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type
.. versionadded:: 3.10.0
datetime_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime
type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
Datetime type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type
.. versionadded:: 3.10.0
time_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time
type, instead of relying on the default ``db_dtypes.TimeDtype()``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("object")``. BigQuery Time type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type
.. versionadded:: 3.10.0
timestamp_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp
type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bound. BigQuery
Timestamp type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type
.. versionadded:: 3.10.0
Returns:
pandas.DataFrame:
Expand Down Expand Up @@ -1755,6 +1807,10 @@ def to_dataframe(
int_dtype=int_dtype,
float_dtype=float_dtype,
string_dtype=string_dtype,
date_dtype=date_dtype,
datetime_dtype=datetime_dtype,
time_dtype=time_dtype,
timestamp_dtype=timestamp_dtype,
)

# If changing the signature of this method, make sure to apply the same
Expand Down
128 changes: 106 additions & 22 deletions google/cloud/bigquery/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1935,6 +1935,10 @@ def to_dataframe(
int_dtype: Union[Any, None] = DefaultPandasDTypes.INT_DTYPE,
float_dtype: Union[Any, None] = None,
string_dtype: Union[Any, None] = None,
date_dtype: Union[Any, None] = DefaultPandasDTypes.DATE_DTYPE,
datetime_dtype: Union[Any, None] = None,
time_dtype: Union[Any, None] = DefaultPandasDTypes.TIME_DTYPE,
timestamp_dtype: Union[Any, None] = None,
) -> "pandas.DataFrame":
"""Create a pandas DataFrame by loading all pages of a query.
Expand Down Expand Up @@ -1999,7 +2003,7 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
.. versionadded:: 3.7.1
.. versionadded:: 3.8.0
int_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
Expand All @@ -2009,7 +2013,7 @@ def to_dataframe(
Integer types can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
.. versionadded:: 3.7.1
.. versionadded:: 3.8.0
float_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
Expand All @@ -2019,7 +2023,7 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
.. versionadded:: 3.7.1
.. versionadded:: 3.8.0
string_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
Expand All @@ -2029,7 +2033,50 @@ def to_dataframe(
type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
.. versionadded:: 3.7.1
.. versionadded:: 3.8.0
date_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.date32())``) to convert BigQuery Date
type, instead of relying on the default ``db_dtypes.DateDtype()``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
Date type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#date_type
.. versionadded:: 3.10.0
datetime_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.timestamp("us"))``) to convert BigQuery Datetime
type, instead of relying on the default ``numpy.dtype("datetime64[ns]")``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns]")`` or ``object`` if out of bound. BigQuery
Datetime type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#datetime_type
.. versionadded:: 3.10.0
time_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.time64("us"))``) to convert BigQuery Time
type, instead of relying on the default ``db_dtypes.TimeDtype()``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("object")``. BigQuery Time type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#time_type
.. versionadded:: 3.10.0
timestamp_dtype (Optional[pandas.Series.dtype, None]):
If set, indicate a pandas ExtensionDtype (e.g.
``pandas.ArrowDtype(pyarrow.timestamp("us", tz="UTC"))``) to convert BigQuery Timestamp
type, instead of relying on the default ``numpy.dtype("datetime64[ns, UTC]")``.
If you explicitly set the value to ``None``, then the data type will be
``numpy.dtype("datetime64[ns, UTC]")`` or ``object`` if out of bound. BigQuery
Timestamp type can be found at:
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp_type
.. versionadded:: 3.10.0
Returns:
pandas.DataFrame:
Expand Down Expand Up @@ -2059,6 +2106,9 @@ def to_dataframe(
if int_dtype is DefaultPandasDTypes.INT_DTYPE:
int_dtype = pandas.Int64Dtype()

if time_dtype is DefaultPandasDTypes.TIME_DTYPE:
time_dtype = db_dtypes.TimeDtype()

if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)

Expand All @@ -2071,6 +2121,24 @@ def to_dataframe(
if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)

if (
date_dtype is not None
and date_dtype is not DefaultPandasDTypes.DATE_DTYPE
and not hasattr(date_dtype, "__from_arrow__")
):
raise ValueError("date_dtype", _NO_SUPPORTED_DTYPE)

if datetime_dtype is not None and not hasattr(datetime_dtype, "__from_arrow__"):
raise ValueError("datetime_dtype", _NO_SUPPORTED_DTYPE)

if time_dtype is not None and not hasattr(time_dtype, "__from_arrow__"):
raise ValueError("time_dtype", _NO_SUPPORTED_DTYPE)

if timestamp_dtype is not None and not hasattr(
timestamp_dtype, "__from_arrow__"
):
raise ValueError("timestamp_dtype", _NO_SUPPORTED_DTYPE)

if dtypes is None:
dtypes = {}

Expand All @@ -2086,25 +2154,29 @@ def to_dataframe(
create_bqstorage_client=create_bqstorage_client,
)

# When converting date or timestamp values to nanosecond precision, the result
# can be out of pyarrow bounds. To avoid the error when converting to
# Pandas, we set the date_as_object or timestamp_as_object parameter to True,
# if necessary.
date_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be date32 or date64 (plus units).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if pyarrow.types.is_date(col.type)
)
# Default date dtype is `db_dtypes.DateDtype()` that could cause out of bounds error,
# when pyarrow converts date values to nanosecond precision. To avoid the error, we
# set the date_as_object parameter to True, if necessary.
date_as_object = False
if date_dtype is DefaultPandasDTypes.DATE_DTYPE:
date_dtype = db_dtypes.DateDtype()
date_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be date32 or date64 (plus units).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if pyarrow.types.is_date(col.type)
)

timestamp_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be datetime and timestamp (plus units and time zone).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if pyarrow.types.is_timestamp(col.type)
)
timestamp_as_object = False
if datetime_dtype is None and timestamp_dtype is None:
timestamp_as_object = not all(
self.__can_cast_timestamp_ns(col)
for col in record_batch
# Type can be datetime and timestamp (plus units and time zone).
# See: https://arrow.apache.org/docs/python/api/datatypes.html
if pyarrow.types.is_timestamp(col.type)
)

if len(record_batch) > 0:
df = record_batch.to_pandas(
Expand All @@ -2117,6 +2189,10 @@ def to_dataframe(
int_dtype=int_dtype,
float_dtype=float_dtype,
string_dtype=string_dtype,
date_dtype=date_dtype,
datetime_dtype=datetime_dtype,
time_dtype=time_dtype,
timestamp_dtype=timestamp_dtype,
),
)
else:
Expand Down Expand Up @@ -2317,6 +2393,10 @@ def to_dataframe(
int_dtype=None,
float_dtype=None,
string_dtype=None,
date_dtype=None,
datetime_dtype=None,
time_dtype=None,
timestamp_dtype=None,
) -> "pandas.DataFrame":
"""Create an empty dataframe.
Expand All @@ -2330,6 +2410,10 @@ def to_dataframe(
int_dtype (Any): Ignored. Added for compatibility with RowIterator.
float_dtype (Any): Ignored. Added for compatibility with RowIterator.
string_dtype (Any): Ignored. Added for compatibility with RowIterator.
date_dtype (Any): Ignored. Added for compatibility with RowIterator.
datetime_dtype (Any): Ignored. Added for compatibility with RowIterator.
time_dtype (Any): Ignored. Added for compatibility with RowIterator.
timestamp_dtype (Any): Ignored. Added for compatibility with RowIterator.
Returns:
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.
Expand Down
Loading

0 comments on commit 64e913d

Please sign in to comment.