Skip to content

Commit

Permalink
BUG: Retain timezone dtype with cut and qcut (pandas-dev#19890)
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored and jreback committed Mar 9, 2018
1 parent c730d08 commit cc1b934
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 37 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1019,6 +1019,7 @@ Reshaping
- Bug in :func:`DataFrame.iterrows`, which would infer strings not compliant with `ISO8601 <https://en.wikipedia.org/wiki/ISO_8601>`_ as datetimes (:issue:`19671`)
- Bug in :class:`Series` constructor with ``Categorical`` where a ``ValueError`` is not raised when an index of different length is given (:issue:`19342`)
- Bug in :meth:`DataFrame.astype` where column metadata is lost when converting to categorical or a dictionary of dtypes (:issue:`19920`)
- Bug in :func:`cut` and :func:`qcut` where timezone information was dropped (:issue:`19872`)

Other
^^^^^
Expand Down
30 changes: 22 additions & 8 deletions pandas/core/reshape/tile.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Quantilization functions and related stuff
"""
from functools import partial

from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.common import (
Expand All @@ -9,6 +10,7 @@
is_categorical_dtype,
is_datetime64_dtype,
is_timedelta64_dtype,
is_datetime64tz_dtype,
_ensure_int64)

import pandas.core.algorithms as algos
Expand Down Expand Up @@ -239,7 +241,8 @@ def _bins_to_cuts(x, bins, right=True, labels=None,
ids = _ensure_int64(bins.searchsorted(x, side=side))

if include_lowest:
ids[x == bins[0]] = 1
# Numpy 1.9 support: ensure this mask is a Numpy array
ids[np.asarray(x == bins[0])] = 1

na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
has_nas = na_mask.any()
Expand Down Expand Up @@ -284,12 +287,14 @@ def _coerce_to_type(x):
"""
dtype = None

if is_timedelta64_dtype(x):
x = to_timedelta(x)
dtype = np.timedelta64
if is_datetime64tz_dtype(x):
dtype = x.dtype
elif is_datetime64_dtype(x):
x = to_datetime(x)
dtype = np.datetime64
elif is_timedelta64_dtype(x):
x = to_timedelta(x)
dtype = np.timedelta64

if dtype is not None:
# GH 19768: force NaT to NaN during integer conversion
Expand All @@ -305,7 +310,7 @@ def _convert_bin_to_numeric_type(bins, dtype):
Parameters
----------
bins : list-liek of bins
bins : list-like of bins
dtype : dtype of data
Raises
Expand All @@ -318,7 +323,7 @@ def _convert_bin_to_numeric_type(bins, dtype):
bins = to_timedelta(bins).view(np.int64)
else:
raise ValueError("bins must be of timedelta64 dtype")
elif is_datetime64_dtype(dtype):
elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
if bins_dtype in ['datetime', 'datetime64']:
bins = to_datetime(bins).view(np.int64)
else:
Expand All @@ -333,7 +338,10 @@ def _format_labels(bins, precision, right=True,

closed = 'right' if right else 'left'

if is_datetime64_dtype(dtype):
if is_datetime64tz_dtype(dtype):
formatter = partial(Timestamp, tz=dtype.tz)
adjust = lambda x: x - Timedelta('1ns')
elif is_datetime64_dtype(dtype):
formatter = Timestamp
adjust = lambda x: x - Timedelta('1ns')
elif is_timedelta64_dtype(dtype):
Expand Down Expand Up @@ -372,7 +380,13 @@ def _preprocess_for_cut(x):
series_index = x.index
name = x.name

x = np.asarray(x)
# Check that the passed array is a Pandas or Numpy object
# We don't want to strip away a Pandas data-type here (e.g. datetimetz)
ndim = getattr(x, 'ndim', None)
if ndim is None:
x = np.asarray(x)
if x.ndim != 1:
raise ValueError("Input array must be 1 dimensional")

return x_is_series, series_index, name, x

Expand Down
108 changes: 79 additions & 29 deletions pandas/tests/reshape/test_tile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from pandas.compat import zip

from pandas import (Series, isna, to_datetime, DatetimeIndex,
from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index,
Timestamp, Interval, IntervalIndex, Categorical,
cut, qcut, date_range, NaT, TimedeltaIndex)
from pandas.tseries.offsets import Nano, Day
Expand Down Expand Up @@ -104,6 +104,12 @@ def test_cut_corner(self):

pytest.raises(ValueError, cut, [1, 2, 3], 0.5)

@pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))])
@pytest.mark.parametrize('cut_func', [cut, qcut])
def test_cut_not_1d_arg(self, arg, cut_func):
    # A scalar, a 2-D ndarray and a DataFrame are each passed as the data
    # argument; both binning functions are expected to raise ValueError.
    pytest.raises(ValueError, cut_func, arg, 2)

def test_cut_out_of_range_more(self):
# #1511
s = Series([0, -1, 0, 1, -3], name='x')
Expand Down Expand Up @@ -251,18 +257,6 @@ def test_qcut_nas(self):
result = qcut(arr, 4)
assert isna(result[:20]).all()

@pytest.mark.parametrize(
    's',
    [Series(DatetimeIndex(['20180101', NaT, '20180103'])),
     Series(TimedeltaIndex(['0 days', NaT, '2 days']))],
    ids=lambda x: str(x.dtype))
def test_qcut_nat(self, s):
    """GH 19768: qcut on datetime/timedelta data containing NaT."""
    lowest, highest = s[0], s[2]
    middle = highest - Day()
    # The first interval starts one nanosecond below the smallest value;
    # the NaT position is expected to come back as a missing category.
    bounds = [(lowest - Nano(), middle), np.nan, (middle, highest)]
    expected = Series(
        Categorical(IntervalIndex.from_tuples(bounds), ordered=True))
    tm.assert_series_equal(qcut(s, 2), expected)

def test_qcut_index(self):
result = qcut([0, 2], 2)
intervals = [Interval(-0.001, 1), Interval(1, 2)]
Expand Down Expand Up @@ -452,6 +446,37 @@ def test_single_bin(self):
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
    "array_1_writeable, array_2_writeable",
    [(True, True), (True, False), (False, False)])
def test_cut_read_only(self, array_1_writeable, array_2_writeable):
    """Issue 18773: cut must give equal results for read-only bin arrays."""
    def build_bins(writeable):
        # Same bin edges every time; only the writeable flag differs.
        edges = np.arange(0, 100, 10)
        edges.flags.writeable = writeable
        return edges

    data = np.arange(100)
    first = cut(data, build_bins(array_1_writeable))
    second = cut(data, build_bins(array_2_writeable))
    tm.assert_categorical_equal(first, second)


class TestDatelike(object):

@pytest.mark.parametrize(
    's',
    [Series(DatetimeIndex(['20180101', NaT, '20180103'])),
     Series(TimedeltaIndex(['0 days', NaT, '2 days']))],
    ids=lambda x: str(x.dtype))
def test_qcut_nat(self, s):
    """GH 19768: qcut on datetime/timedelta data containing NaT."""
    lowest, highest = s[0], s[2]
    middle = highest - Day()
    # The first interval starts one nanosecond below the smallest value;
    # the NaT position is expected to come back as a missing category.
    bounds = [(lowest - Nano(), middle), np.nan, (middle, highest)]
    expected = Series(
        Categorical(IntervalIndex.from_tuples(bounds), ordered=True))
    tm.assert_series_equal(qcut(s, 2), expected)

def test_datetime_cut(self):
# GH 14714
# testing for time data to be present as series
Expand Down Expand Up @@ -488,6 +513,47 @@ def test_datetime_cut(self):
result, bins = cut(data, 3, retbins=True)
tm.assert_series_equal(Series(result), expected)

@pytest.mark.parametrize('bins', [
    3, [Timestamp('2013-01-01 04:57:07.200000'),
        Timestamp('2013-01-01 21:00:00'),
        Timestamp('2013-01-02 13:00:00'),
        Timestamp('2013-01-03 05:00:00')]])
@pytest.mark.parametrize('box', [list, np.array, Index, Series])
def test_datetimetz_cut(self, bins, box):
    """GH 19872: cut on tz-aware data yields tz-aware interval edges."""
    tz = 'US/Eastern'
    s = Series(date_range('20130101', periods=3, tz=tz))
    # Explicit bin edges are wrapped in the parametrized container type;
    # an integer bin count is passed through unchanged.
    if not isinstance(bins, int):
        bins = box(bins)
    result = cut(s, bins)

    edges = [Timestamp(stamp, tz=tz) for stamp in (
        '2012-12-31 23:57:07.200000',
        '2013-01-01 16:00:00',
        '2013-01-02 08:00:00',
        '2013-01-03 00:00:00')]
    intervals = [Interval(lo, hi) for lo, hi in zip(edges[:-1], edges[1:])]
    expected = Series(IntervalIndex(intervals)).astype(CDT(ordered=True))
    tm.assert_series_equal(result, expected)

@pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)])
def test_datetimetz_qcut(self, bins):
    """GH 19872: qcut on tz-aware data yields tz-aware interval edges."""
    tz = 'US/Eastern'
    s = Series(date_range('20130101', periods=3, tz=tz))
    result = qcut(s, bins)

    # The lowest expected edge sits one nanosecond before the first
    # timestamp in the series.
    edges = [Timestamp(stamp, tz=tz) for stamp in (
        '2012-12-31 23:59:59.999999999',
        '2013-01-01 16:00:00',
        '2013-01-02 08:00:00',
        '2013-01-03 00:00:00')]
    intervals = [Interval(lo, hi) for lo, hi in zip(edges[:-1], edges[1:])]
    expected = Series(IntervalIndex(intervals)).astype(CDT(ordered=True))
    tm.assert_series_equal(result, expected)

def test_datetime_bin(self):
data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')]
bin_data = ['2012-12-12', '2012-12-14', '2012-12-16']
Expand Down Expand Up @@ -523,19 +589,3 @@ def f():
mask = result.isna()
tm.assert_numpy_array_equal(
mask, np.array([False, True, True, True, True]))

@pytest.mark.parametrize(
    "array_1_writeable, array_2_writeable",
    [(True, True), (True, False), (False, False)])
def test_cut_read_only(self, array_1_writeable, array_2_writeable):
    """Issue 18773: cut must give equal results for read-only bin arrays."""
    def build_bins(writeable):
        # Same bin edges every time; only the writeable flag differs.
        edges = np.arange(0, 100, 10)
        edges.flags.writeable = writeable
        return edges

    data = np.arange(100)
    first = cut(data, build_bins(array_1_writeable))
    second = cut(data, build_bins(array_2_writeable))
    tm.assert_categorical_equal(first, second)

0 comments on commit cc1b934

Please sign in to comment.