API: Add CalendarDay ('CD') offset (#22288)

pandas-dev · jschendel · Sep 22, 2018 · Jul 22, 2018 · Jul 22, 2018 · Sep 4, 2018
commit 2d21d9ba68a1ab6040fc7ca713c9da80808b655a
diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
@@ -369,7 +369,7 @@ In practice this becomes very cumbersome because we often need a very long
 index with a large number of timestamps. If we need timestamps on a regular
 frequency, we can use the :func:`date_range` and :func:`bdate_range` functions
 to create a ``DatetimeIndex``. The default frequency for ``date_range`` is a
-**calendar day** while the default for ``bdate_range`` is a **business day**:
+**day** while the default for ``bdate_range`` is a **business day**:
 
 .. ipython:: python
 
@@ -886,6 +886,27 @@ normalized after the function is applied.
    hour.apply(pd.Timestamp('2014-01-01 23:00'))
 
 
+.. _timeseries.dayvscalendarday:
+
+Day vs. CalendarDay
+~~~~~~~~~~~~~~~~~~~
+
+:class:`Day` (``'D'``) is a timedelta-like offset that respects absolute time
+arithmetic and is an alias for 24 :class:`Hour`. This offset is the default
+argument to many pandas time-related functions such as :func:`date_range` and :func:`timedelta_range`.
+
+:class:`CalendarDay` (``'CD'``) is a relativedelta-like offset that respects
+calendar time arithmetic. :class:`CalendarDay` is useful for preserving calendar day
+semantics with datetimes that have daylight saving time transitions, i.e. :class:`CalendarDay`
+will preserve the hour before the daylight saving time transition.
+
+.. ipython:: python
+
+   ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki')
+   ts + pd.offsets.Day(1)
+   ts + pd.offsets.CalendarDay(1)
+
+
 Parametric Offsets
 ~~~~~~~~~~~~~~~~~~
 
@@ -1176,7 +1197,8 @@ frequencies. We will refer to these aliases as *offset aliases*.
 
     "B", "business day frequency"
     "C", "custom business day frequency"
-    "D", "calendar day frequency"
+    "D", "day frequency"
+    "CD", "calendar day frequency"
     "W", "weekly frequency"
     "M", "month end frequency"
     "SM", "semi-month end frequency (15th and end of month)"

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -285,6 +285,46 @@ that the dates have been converted to UTC
 .. ipython:: python
     pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True)
 
+.. _whatsnew_0240.api_breaking.calendarday:
+
+CalendarDay Offset
+^^^^^^^^^^^^^^^^^^
+
+:class:`Day` and associated frequency alias ``'D'`` were documented to represent
+a calendar day; however, arithmetic and operations with :class:`Day` sometimes
+respected absolute time instead (i.e. ``Day(n)`` acted identically to ``Timedelta(days=n)``).
+
+*Previous Behavior*:
+
+.. code-block:: ipython
+
+
+    In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki')
+
+    # Respects calendar arithmetic
+    In [3]: pd.date_range(start=ts, freq='D', periods=3)
+    Out[3]:
+    DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00',
+                   '2016-11-01 00:00:00+02:00'],
+                  dtype='datetime64[ns, Europe/Helsinki]', freq='D')
+
+    # Respects absolute arithmetic
+    In [4]: ts + pd.tseries.frequencies.to_offset('D')
+    Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki')
+
+:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available
+and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'``
+will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`).
+See the :ref:`documentation here <timeseries.dayvscalendarday>` for more information.
+
+Addition with :class:`CalendarDay` across a daylight savings time transition:
+
+.. ipython:: python
+
+   ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki')
+   ts + pd.offsets.Day(1)
+   ts + pd.offsets.CalendarDay(1)
+
 .. _whatsnew_0240.api_breaking.period_end_time:
 
 Time values in ``dt.end_time`` and ``to_timestamp(how='end')``

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -32,7 +32,7 @@
 from pandas.core import ops
 
 from pandas.tseries.frequencies import to_offset
-from pandas.tseries.offsets import Tick, Day, generate_range
+from pandas.tseries.offsets import Tick, generate_range
 
 from pandas.core.arrays import datetimelike as dtl
 
@@ -239,56 +239,33 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
         start, end, _normalized = _maybe_normalize_endpoints(start, end,
                                                              normalize)
 
-        tz, inferred_tz = _infer_tz_from_endpoints(start, end, tz)
-
-        if hasattr(freq, 'delta') and freq != Day():
-            # sub-Day Tick
-            if inferred_tz is None and tz is not None:
-                # naive dates
-                if start is not None and start.tz is None:
-                    start = start.tz_localize(tz, ambiguous=False)
-
-                if end is not None and end.tz is None:
-                    end = end.tz_localize(tz, ambiguous=False)
-
-            if start and end:
-                if start.tz is None and end.tz is not None:
-                    start = start.tz_localize(end.tz, ambiguous=False)
-
-                if end.tz is None and start.tz is not None:
-                    end = end.tz_localize(start.tz, ambiguous=False)
-
+        tz, _ = _infer_tz_from_endpoints(start, end, tz)
+
+        if tz is not None:
+            # Localize the start and end arguments
+            start = _maybe_localize_point(
+                start, getattr(start, 'tz', None), start, freq, tz
+            )
+            end = _maybe_localize_point(
+                end, getattr(end, 'tz', None), end, freq, tz
+            )
+        if start and end:
+            # Make sure start and end have the same tz
+            start = _maybe_localize_point(
+                start, start.tz, end.tz, freq, tz
+            )
+            end = _maybe_localize_point(
+                end, end.tz, start.tz, freq, tz
+            )
+        if freq is not None:
             if cls._use_cached_range(freq, _normalized, start, end):
+                # Currently always False; never hit
+                # Should be reimplemented as a part of GH 17914
                 index = cls._cached_range(start, end, periods=periods,
                                           freq=freq)
             else:
                 index = _generate_regular_range(cls, start, end, periods, freq)
 
-        else:
-
-            if tz is not None:
-                # naive dates
-                if start is not None and start.tz is not None:
-                    start = start.replace(tzinfo=None)
-
-                if end is not None and end.tz is not None:
-                    end = end.replace(tzinfo=None)
-
-            if start and end:
-                if start.tz is None and end.tz is not None:
-                    end = end.replace(tzinfo=None)
-
-                if end.tz is None and start.tz is not None:
-                    start = start.replace(tzinfo=None)
-
-            if freq is not None:
-                if cls._use_cached_range(freq, _normalized, start, end):
-                    index = cls._cached_range(start, end, periods=periods,
-                                              freq=freq)
-                else:
-                    index = _generate_regular_range(cls, start, end,
-                                                    periods, freq)
-
                 if tz is not None and getattr(index, 'tz', None) is None:
                     arr = conversion.tz_localize_to_utc(
                         ensure_int64(index.values),
@@ -302,12 +279,12 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
                         start = start.tz_localize(tz).asm8
                     if end is not None:
                         end = end.tz_localize(tz).asm8
-            else:
-                # Create a linearly spaced date_range in local time
-                start = start.tz_localize(tz)
-                end = end.tz_localize(tz)
-                arr = np.linspace(start.value, end.value, periods)
-                index = cls._simple_new(arr.astype('M8[ns]'), freq=None, tz=tz)
+        else:
+            # Create a linearly spaced date_range in local time
+            arr = np.linspace(start.value, end.value, periods)
+            index = cls._simple_new(
+                arr.astype('M8[ns]', copy=False), freq=None, tz=tz
+            )
 
         if not left_closed and len(index) and index[0] == start:
             index = index[1:]
@@ -1256,10 +1233,10 @@ def _generate_regular_range(cls, start, end, periods, freq):
         data = cls._simple_new(data.view(_NS_DTYPE), None, tz=tz)
     else:
         tz = None
+        # start and end should have the same timezone by this point
         if isinstance(start, Timestamp):
             tz = start.tz
-
-        if isinstance(end, Timestamp):
+        elif isinstance(end, Timestamp):
             tz = end.tz
 
         xdr = generate_range(start=start, end=end,
@@ -1330,3 +1307,32 @@ def _maybe_normalize_endpoints(start, end, normalize):
             _normalized = _normalized and end.time() == _midnight
 
     return start, end, _normalized
+
+
+def _maybe_localize_point(ts, is_none, is_not_none, freq, tz):
+    """
+    Localize a start or end Timestamp via ``ts.tz_localize`` when ``is_none``
+    is None and ``is_not_none`` is not None; otherwise return it unchanged.
+
+    Parameters
+    ----------
+    ts : start or end Timestamp to potentially localize
+    is_none : value that must be None for localization (e.g. ``ts.tz``)
+    is_not_none : value that must not be None (e.g. the other endpoint's tz)
+    freq : Tick, DateOffset, or None; selects the tz passed to tz_localize
+    tz : str, timezone object or None; used when freq is a Tick or None
+
+    Returns
+    -------
+    ts : Timestamp, localized if the condition above held
+    """
+    # Localize to ``tz`` (with ambiguous=False) only if:
+    # 1) freq = a Timedelta-like frequency (Tick)
+    # 2) freq = None i.e. generating a linspaced range; else use tz=None
+    if isinstance(freq, Tick) or freq is None:
+        localize_args = {'tz': tz, 'ambiguous': False}
+    else:
+        localize_args = {'tz': None}
+    if is_none is None and is_not_none is not None:
+        ts = ts.tz_localize(**localize_args)
+    return ts
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -385,7 +385,10 @@ def _generate_range(cls, start, end, periods, name=None, freq=None,
 
     @classmethod
     def _use_cached_range(cls, freq, _normalized, start, end):
-        return _use_cached_range(freq, _normalized, start, end)
+        # Note: This always returns False
+        return (freq._should_cache() and
+                not (freq._normalize_cache and not _normalized) and
+                _naive_in_cache_range(start, end))
 
     def _convert_for_op(self, value):
         """ Convert value to be insertable to ndarray """
@@ -1580,7 +1583,7 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None,
         Right bound for generating dates.
     periods : integer, optional
         Number of periods to generate.
-    freq : str or DateOffset, default 'D' (calendar daily)
+    freq : str or DateOffset, default 'D'
         Frequency strings can have multiples, e.g. '5H'. See
         :ref:`here <timeseries.offset_aliases>` for a list of
         frequency aliases.
@@ -1861,17 +1864,7 @@ def _naive_in_cache_range(start, end):
     else:
         if start.tzinfo is not None or end.tzinfo is not None:
             return False
-        return _in_range(start, end, _CACHE_START, _CACHE_END)
-
-
-def _in_range(start, end, rng_start, rng_end):
-    return start > rng_start and end < rng_end
-
-
-def _use_cached_range(freq, _normalized, start, end):
-    return (freq._should_cache() and
-            not (freq._normalize_cache and not _normalized) and
-            _naive_in_cache_range(start, end))
+        return start > _CACHE_START and end < _CACHE_END
 
 
 def _time_to_micros(time):

diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -1052,7 +1052,7 @@ def interval_range(start=None, end=None, periods=None, freq=None,
     freq : numeric, string, or DateOffset, default None
         The length of each interval. Must be consistent with the type of start
         and end, e.g. 2 for numeric, or '5H' for datetime-like.  Default is 1
-        for numeric and 'D' (calendar daily) for datetime-like.
+        for numeric and 'D' for datetime-like.
     name : string, default None
         Name of the resulting IntervalIndex
     closed : {'left', 'right', 'both', 'neither'}, default 'right'

diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -840,7 +840,7 @@ def period_range(start=None, end=None, periods=None, freq='D', name=None):
         Right bound for generating periods
     periods : integer, default None
         Number of periods to generate
-    freq : string or DateOffset, default 'D' (calendar daily)
+    freq : string or DateOffset, default 'D'
         Frequency alias
     name : string, default None
         Name of the resulting PeriodIndex

diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py
@@ -737,7 +737,7 @@ def timedelta_range(start=None, end=None, periods=None, freq=None,
         Right bound for generating timedeltas
     periods : integer, default None
         Number of periods to generate
-    freq : string or DateOffset, default 'D' (calendar daily)
+    freq : string or DateOffset, default 'D'
         Frequency strings can have multiples, e.g. '5H'
     name : string, default None
         Name of the resulting TimedeltaIndex