Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

POC/ENH: infer resolution in array_to_datetime #55741

Merged
merged 15 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 61 additions & 4 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,15 @@ import numpy as np

cnp.import_array()

from pandas._libs.tslibs.dtypes cimport (
get_supported_reso,
npy_unit_to_abbrev,
)
from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
NPY_FR_ns,
check_dts_bounds,
get_datetime64_unit,
import_pandas_datetime,
npy_datetimestruct,
npy_datetimestruct_to_datetime,
Expand Down Expand Up @@ -438,6 +443,7 @@ cpdef array_to_datetime(
utc : bool, default False
indicator whether the dates should be UTC
creso : NPY_DATETIMEUNIT, default NPY_FR_ns
Set to NPY_FR_GENERIC to infer a resolution.

Returns
-------
Expand All @@ -461,14 +467,19 @@ cpdef array_to_datetime(
set out_tzoffset_vals = set()
tzinfo tz_out = None
cnp.flatiter it = cnp.PyArray_IterNew(values)
DatetimeParseState state = DatetimeParseState()
str reso_str
NPY_DATETIMEUNIT item_reso
bint infer_reso = creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC
DatetimeParseState state = DatetimeParseState(creso)
str abbrev

# specify error conditions
assert is_raise or is_ignore or is_coerce

reso_str = npy_unit_to_abbrev(creso)
result = np.empty((<object>values).shape, dtype=f"M8[{reso_str}]")
if infer_reso:
abbrev = "ns"
else:
abbrev = npy_unit_to_abbrev(creso)
result = np.empty((<object>values).shape, dtype=f"M8[{abbrev}]")
iresult = result.view("i8").ravel()

for i in range(n):
Expand All @@ -481,18 +492,37 @@ cpdef array_to_datetime(
iresult[i] = NPY_NAT

elif PyDateTime_Check(val):
if isinstance(val, _Timestamp):
item_reso = val._creso
else:
item_reso = NPY_DATETIMEUNIT.NPY_FR_us
state.update_creso(item_reso)
if infer_reso:
creso = state.creso
tz_out = state.process_datetime(val, tz_out, utc_convert)
iresult[i] = parse_pydatetime(val, &dts, creso=creso)

elif PyDate_Check(val):
item_reso = NPY_DATETIMEUNIT.NPY_FR_s
WillAyd marked this conversation as resolved.
Show resolved Hide resolved
state.update_creso(item_reso)
if infer_reso:
creso = state.creso
iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
check_dts_bounds(&dts, creso)

elif is_datetime64_object(val):
item_reso = get_supported_reso(get_datetime64_unit(val))
state.update_creso(item_reso)
if infer_reso:
creso = state.creso
iresult[i] = get_datetime64_nanos(val, creso)

elif is_integer_object(val) or is_float_object(val):
# these must be ns unit by-definition
item_reso = NPY_FR_ns
state.update_creso(item_reso)
if infer_reso:
creso = state.creso

if val != val or val == NPY_NAT:
iresult[i] = NPY_NAT
Expand All @@ -509,11 +539,20 @@ cpdef array_to_datetime(
if parse_today_now(val, &iresult[i], utc, creso):
# We can't _quite_ dispatch this to convert_str_to_tsobject
# bc there isn't a nice way to pass "utc"
item_reso = NPY_DATETIMEUNIT.NPY_FR_us
state.update_creso(item_reso)
if infer_reso:
creso = state.creso
continue

_ts = convert_str_to_tsobject(
val, None, unit="ns", dayfirst=dayfirst, yearfirst=yearfirst
)
item_reso = _ts.creso
state.update_creso(item_reso)
if infer_reso:
creso = state.creso

_ts.ensure_reso(creso, val)

iresult[i] = _ts.value
Expand Down Expand Up @@ -560,6 +599,24 @@ cpdef array_to_datetime(
else:
tz_offset = out_tzoffset_vals.pop()
tz_out = timezone(timedelta(seconds=tz_offset))

if infer_reso:
if state.creso_ever_changed:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what to do here, but my first read of state.creso_ever_changed left me confused about the scope of "ever". I am inferring from the way the function is currently written that "ever" refers to the lifetime of this function, but if we have parse states that can persist across multiple function calls, that terminology can get a little vague.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we have parse states that can persist across multiple function calls

we do not

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason to keep it as part of the state, then, instead of just local to the function?

To be clear, this is not a holdup on this PR for me; it is just a point to consider as this continues to evolve.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because it gets set inside a state-updating method that I don't want to duplicate in a bunch of places. Also, there are a couple of different places where we use DatetimeParseState, and I'm trying to iron out the kinks in behavior differences between them.

# We encountered mismatched resolutions, need to re-parse with
WillAyd marked this conversation as resolved.
Show resolved Hide resolved
# the correct one.
return array_to_datetime(
values,
errors=errors,
yearfirst=yearfirst,
dayfirst=dayfirst,
utc=utc,
creso=state.creso,
)

# Otherwise we can use the single reso that we encountered and avoid
# a second pass.
abbrev = npy_unit_to_abbrev(state.creso)
result = iresult.view(f"M8[{abbrev}]")
return result, tz_out


Expand Down
63 changes: 63 additions & 0 deletions pandas/tests/tslibs/test_array_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,13 @@
iNaT,
tslib,
)
from pandas._libs.tslibs.dtypes import NpyDatetimeUnit

from pandas import Timestamp
import pandas._testing as tm

creso_infer = NpyDatetimeUnit.NPY_FR_GENERIC.value


@pytest.mark.parametrize(
"data,expected",
Expand Down Expand Up @@ -203,3 +206,63 @@ def test_datetime_subclass(data, expected):

expected = np.array(expected, dtype="M8[ns]")
tm.assert_numpy_array_equal(result, expected)


class TestArrayToDatetimeResolutionInference:
    """Tests for the unit ``array_to_datetime`` returns when asked to infer it.

    Every test passes ``creso=creso_infer`` and compares the parsed array
    against a ``datetime64`` array built with the expected unit.
    """

    # TODO: tests that include tzs, ints

    @staticmethod
    def _parse_naive(values):
        """Parse ``values`` with resolution inference; require that no tz is returned."""
        parsed, tzinfo = tslib.array_to_datetime(values, creso=creso_infer)
        assert tzinfo is None
        return parsed

    def test_infer_homogeoneous_datetimes(self):
        # Only stdlib datetime objects: the expected unit is microseconds.
        stamp = datetime(2023, 10, 27, 18, 3, 5, 678000)
        values = np.array([stamp] * 3, dtype=object)

        parsed = self._parse_naive(values)

        expected = np.array([stamp] * 3, dtype="M8[us]")
        tm.assert_numpy_array_equal(parsed, expected)

    def test_infer_homogeoneous_date_objects(self):
        # Only date objects (plus a leading None): the expected unit is seconds.
        day = datetime(2023, 10, 27, 18, 3, 5, 678000).date()
        values = np.array([None, day, day, day], dtype=object)

        parsed = self._parse_naive(values)

        missing = np.datetime64("NaT")
        expected = np.array([missing, day, day, day], dtype="M8[s]")
        tm.assert_numpy_array_equal(parsed, expected)

    def test_infer_homogeoneous_dt64(self):
        # Millisecond datetime64 scalars are expected to keep their unit.
        stamp = datetime(2023, 10, 27, 18, 3, 5, 678000)
        scalar = np.datetime64(stamp, "ms")
        values = np.array([None, scalar, scalar, scalar], dtype=object)

        parsed = self._parse_naive(values)

        missing = np.datetime64("NaT")
        expected = np.array([missing, scalar, scalar, scalar], dtype="M8[ms]")
        tm.assert_numpy_array_equal(parsed, expected)

    def test_infer_homogeoneous_timestamps(self):
        # Timestamps converted to "ns" are expected to yield a nanosecond result.
        stamp = datetime(2023, 10, 27, 18, 3, 5, 678000)
        ts = Timestamp(stamp).as_unit("ns")
        values = np.array([None, ts, ts, ts], dtype=object)

        parsed = self._parse_naive(values)

        expected = np.array(
            [np.datetime64("NaT"), ts.asm8, ts.asm8, ts.asm8], dtype="M8[ns]"
        )
        tm.assert_numpy_array_equal(parsed, expected)

    def test_infer_homogeoneous_datetimes_strings(self):
        # Strings with six fractional digits: the expected unit is microseconds.
        text = "2023-10-27 18:03:05.678000"
        values = np.array([None, text, text, text], dtype=object)

        parsed = self._parse_naive(values)

        missing = np.datetime64("NaT")
        expected = np.array([missing, text, text, text], dtype="M8[us]")
        tm.assert_numpy_array_equal(parsed, expected)

    def test_infer_heterogeneous(self):
        # The same instant written with 6, 3 and 0 fractional digits, plus None;
        # the expected result uses microseconds regardless of element order.
        full = "2023-10-27 18:03:05.678000"
        millis_text = full[:-3]
        seconds_text = full[:-7]
        values = np.array([full, millis_text, seconds_text, None], dtype=object)

        forward = self._parse_naive(values)
        expected = np.array(values, dtype="M8[us]")
        tm.assert_numpy_array_equal(forward, expected)

        backward = self._parse_naive(values[::-1])
        tm.assert_numpy_array_equal(backward, expected[::-1])
Loading