Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: Datetime/Timestamp.normalize for timezone naive datetimes #23634

Merged
merged 19 commits into from
Nov 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion asv_bench/benchmarks/timestamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def time_microsecond(self, tz, freq):


class TimestampOps(object):
params = [None, 'US/Eastern']
params = [None, 'US/Eastern', 'UTC']
param_names = ['tz']

def setup(self, tz):
Expand All @@ -102,6 +102,9 @@ def time_replace_None(self, tz):
def time_to_pydatetime(self, tz):
self.ts.to_pydatetime()

def time_normalize(self, tz):
self.ts.normalize()


class TimestampAcrossDst(object):
def setup(self):
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1139,6 +1139,7 @@ Performance Improvements
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`)
- Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`)
- Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`)


.. _whatsnew_0240.docs:
Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/tslibs/ccalendar.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ DAYS_FULL = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
int_to_weekday = {num: name for num, name in enumerate(DAYS)}
weekday_to_int = {int_to_weekday[key]: key for key in int_to_weekday}

DAY_SECONDS = 86400
HOUR_SECONDS = 3600

# ----------------------------------------------------------------------


Expand Down
31 changes: 11 additions & 20 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ from cpython.datetime cimport (datetime, tzinfo,
PyDateTime_CheckExact, PyDateTime_IMPORT)
PyDateTime_IMPORT

from ccalendar import DAY_SECONDS, HOUR_SECONDS

from np_datetime cimport (check_dts_bounds,
npy_datetimestruct,
pandas_datetime_to_datetimestruct, _string_to_dts,
Expand All @@ -41,8 +43,6 @@ from nattype cimport NPY_NAT, checknull_with_nat
# ----------------------------------------------------------------------
# Constants

cdef int64_t DAY_NS = 86400000000000LL
cdef int64_t HOURS_NS = 3600000000000
NS_DTYPE = np.dtype('M8[ns]')
TD_DTYPE = np.dtype('m8[ns]')

Expand Down Expand Up @@ -875,6 +875,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
Py_ssize_t delta_idx_offset, delta_idx, pos_left, pos_right
int64_t *tdata
int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins
int64_t HOURS_NS = HOUR_SECONDS * 1000000000
ndarray[int64_t] result, result_a, result_b, dst_hours
npy_datetimestruct dts
bint infer_dst = False, is_dst = False, fill = False
Expand Down Expand Up @@ -931,10 +932,10 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
result_b[:] = NPY_NAT

idx_shifted_left = (np.maximum(0, trans.searchsorted(
vals - DAY_NS, side='right') - 1)).astype(np.int64)
vals - DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64)

idx_shifted_right = (np.maximum(0, trans.searchsorted(
vals + DAY_NS, side='right') - 1)).astype(np.int64)
vals + DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64)

for i in range(n):
val = vals[i]
Expand Down Expand Up @@ -1116,9 +1117,9 @@ def normalize_date(dt: object) -> datetime:
@cython.boundscheck(False)
def normalize_i8_timestamps(int64_t[:] stamps, object tz=None):
"""
Normalize each of the (nanosecond) timestamps in the given array by
rounding down to the beginning of the day (i.e. midnight). If `tz`
is not None, then this is midnight for this timezone.
Normalize each of the (nanosecond) timezone aware timestamps in the given
array by rounding down to the beginning of the day (i.e. midnight).
Midnight is taken in the timezone given by `tz`.
Parameters
----------
Expand All @@ -1130,21 +1131,11 @@ def normalize_i8_timestamps(int64_t[:] stamps, object tz=None):
result : int64 ndarray of normalized nanosecond timestamps
"""
cdef:
Py_ssize_t i, n = len(stamps)
npy_datetimestruct dts
Py_ssize_t n = len(stamps)
int64_t[:] result = np.empty(n, dtype=np.int64)

if tz is not None:
tz = maybe_get_tz(tz)
result = _normalize_local(stamps, tz)
else:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this case never reached?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

with nogil:
for i in range(n):
if stamps[i] == NPY_NAT:
result[i] = NPY_NAT
continue
dt64_to_dtstruct(stamps[i], &dts)
result[i] = _normalized_stamp(&dts)
tz = maybe_get_tz(tz)
result = _normalize_local(stamps, tz)

return result.base # .base to access underlying np.ndarray

Expand Down
5 changes: 3 additions & 2 deletions pandas/_libs/tslibs/fields.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ cimport numpy as cnp
from numpy cimport ndarray, int64_t, int32_t, int8_t
cnp.import_array()

from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL
from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL, DAY_SECONDS
from ccalendar cimport (get_days_in_month, is_leapyear, dayofweek,
get_week_of_year, get_day_of_year)
from np_datetime cimport (npy_datetimestruct, pandas_timedeltastruct,
Expand All @@ -36,7 +36,8 @@ def get_time_micros(ndarray[int64_t] dtindex):
cdef:
ndarray[int64_t] micros

micros = np.mod(dtindex, 86400000000000, dtype=np.int64) // 1000LL
micros = np.mod(dtindex, DAY_SECONDS * 1000000000, dtype=np.int64)
micros //= 1000LL
return micros


Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ cdef extern from "src/datetime/np_datetime_strings.h":
npy_datetimestruct *out,
int *out_local, int *out_tzoffset)


# ----------------------------------------------------------------------
# numpy object inspection

Expand Down
8 changes: 4 additions & 4 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ from util cimport (is_timedelta64_object, is_datetime64_object,
is_integer_object, is_float_object,
is_string_object)

from ccalendar import DAY_SECONDS

from np_datetime cimport (cmp_scalar, reverse_ops, td64_to_tdstruct,
pandas_timedeltastruct)

Expand All @@ -38,8 +40,6 @@ from offsets cimport to_offset
# ----------------------------------------------------------------------
# Constants

cdef int64_t DAY_NS = 86400000000000LL

# components named tuple
Components = collections.namedtuple('Components', [
'days', 'hours', 'minutes', 'seconds',
Expand Down Expand Up @@ -266,10 +266,10 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
m = 1000000000L * 2629746
p = 9
elif unit == 'W':
m = 1000000000L * 86400 * 7
m = 1000000000L * DAY_SECONDS * 7
p = 9
elif unit == 'D' or unit == 'd':
m = 1000000000L * 86400
m = 1000000000L * DAY_SECONDS
p = 9
elif unit == 'h':
m = 1000000000L * 3600
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ from util cimport (is_datetime64_object, is_timedelta64_object,
is_offset_object)

cimport ccalendar
from ccalendar import DAY_SECONDS
from conversion import tz_localize_to_utc, normalize_i8_timestamps
from conversion cimport (tz_convert_single, _TSObject,
convert_to_tsobject, convert_datetime_to_tsobject)
Expand Down Expand Up @@ -1285,6 +1286,10 @@ class Timestamp(_Timestamp):
Normalize Timestamp to midnight, preserving
tz information.
"""
if self.tz is None or is_utc(self.tz):
DAY_NS = DAY_SECONDS * 1000000000
normalized_value = self.value - (self.value % DAY_NS)
return Timestamp(normalized_value).tz_localize(self.tz)
normalized_value = normalize_i8_timestamps(
np.array([self.value], dtype='i8'), tz=self.tz)[0]
return Timestamp(normalized_value).tz_localize(self.tz)
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/timezones.pxd
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

cdef bint is_utc(object tz)
cpdef bint is_utc(object tz)
cdef bint is_tzlocal(object tz)

cdef bint treat_tz_as_pytz(object tz)
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/timezones.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ cdef int64_t NPY_NAT = get_nat()

# ----------------------------------------------------------------------

cdef inline bint is_utc(object tz):
cpdef inline bint is_utc(object tz):
return tz is UTC or isinstance(tz, _dateutil_tzutc)


Expand Down
11 changes: 9 additions & 2 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pandas._libs import lib, tslib
from pandas._libs.tslib import Timestamp, NaT, iNaT
from pandas._libs.tslibs import (
normalize_date,
ccalendar, normalize_date,
conversion, fields, timezones,
resolution as libresolution)

Expand Down Expand Up @@ -853,7 +853,14 @@ def normalize(self):
'2014-08-01 00:00:00+05:30'],
dtype='datetime64[ns, Asia/Calcutta]', freq=None)
"""
new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)
if self.tz is None or timezones.is_utc(self.tz):
not_null = self.notna()
DAY_NS = ccalendar.DAY_SECONDS * 1000000000
new_values = self.asi8.copy()
adjustment = (new_values[not_null] % DAY_NS)
new_values[not_null] = new_values[not_null] - adjustment
else:
new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)
return type(self)(new_values, freq='infer').tz_localize(self.tz)

def to_period(self, freq=None):
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/indexes/datetimes/test_scalar_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,12 @@ def test_normalize(self):
assert result.is_normalized
assert not rng.is_normalized

def test_normalize_nat(self):
dti = DatetimeIndex([pd.NaT, Timestamp('2018-01-01 01:00:00')])
result = dti.normalize()
expected = DatetimeIndex([pd.NaT, Timestamp('2018-01-01')])
tm.assert_index_equal(result, expected)


class TestDateTimeIndexToJulianDate(object):

Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/scalar/timestamp/test_unary_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,17 @@ def test_replace_dst_border(self):
expected = Timestamp('2013-11-3 03:00:00', tz='America/Chicago')
assert result == expected

# --------------------------------------------------------------
# Timestamp.normalize

@pytest.mark.parametrize('arg', ['2013-11-30', '2013-11-30 12:00:00'])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a normalize_nat test as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't define normalize for NaT.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could add one for Timestamp to mirror this (in a separate issue). It would probably just return NaT.

def test_normalize(self, tz_naive_fixture, arg):
tz = tz_naive_fixture
ts = Timestamp(arg, tz=tz)
result = ts.normalize()
expected = Timestamp('2013-11-30', tz=tz)
assert result == expected

# --------------------------------------------------------------

@td.skip_if_windows
Expand Down