Skip to content

BUG: to_datetime incorrect OverflowError #50533

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,7 @@ Categorical
Datetimelike
^^^^^^^^^^^^
- Bug in :func:`pandas.infer_freq`, raising ``TypeError`` when inferred on :class:`RangeIndex` (:issue:`47084`)
- Bug in :func:`to_datetime` incorrectly raising ``OverflowError`` with string arguments corresponding to large integers (:issue:`50533`)
- Bug in :func:`to_datetime` was raising on invalid offsets with ``errors='coerce'`` and ``infer_datetime_format=True`` (:issue:`48633`)
- Bug in :class:`DatetimeIndex` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``dtype`` or data (:issue:`48659`)
- Bug in subtracting a ``datetime`` scalar from :class:`DatetimeIndex` failing to retain the original ``freq`` attribute (:issue:`48818`)
Expand Down
7 changes: 7 additions & 0 deletions pandas/_libs/tslibs/parsing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ from pandas._libs.tslibs.nattype cimport (
c_NaT as NaT,
c_nat_strings as nat_strings,
)
from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
npy_datetimestruct,
Expand Down Expand Up @@ -298,6 +299,12 @@ def parse_datetime_string(
# following may be raised from dateutil
# TypeError: 'NoneType' object is not iterable
raise ValueError(f'Given date string "{date_string}" not likely a datetime')
except OverflowError as err:
# with e.g. "08335394550" dateutil raises when trying to pass
# year=8335394550 to datetime.replace
raise OutOfBoundsDatetime(
f'Parsing "{date_string}" to datetime overflows'
) from err

return dt

Expand Down
25 changes: 9 additions & 16 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,7 @@
TimeNonexistent,
npt,
)
from pandas.errors import (
OutOfBoundsDatetime,
PerformanceWarning,
)
from pandas.errors import PerformanceWarning
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_inclusive

Expand Down Expand Up @@ -2154,18 +2151,14 @@ def objects_to_datetime64ns(

flags = data.flags
order: Literal["F", "C"] = "F" if flags.f_contiguous else "C"
try:
result, tz_parsed = tslib.array_to_datetime(
data.ravel("K"),
errors=errors,
utc=utc,
dayfirst=dayfirst,
yearfirst=yearfirst,
)
result = result.reshape(data.shape, order=order)
except OverflowError as err:
# Exception is raised when a part of date is greater than 32 bit signed int
raise OutOfBoundsDatetime("Out of bounds nanosecond timestamp") from err
result, tz_parsed = tslib.array_to_datetime(
data.ravel("K"),
errors=errors,
utc=utc,
dayfirst=dayfirst,
yearfirst=yearfirst,
)
result = result.reshape(data.shape, order=order)

if tz_parsed is not None:
# We can take a shortcut since the datetime64 numpy array
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,29 @@ def test_to_datetime_parse_timezone_keeps_name(self):


class TestToDatetime:
@pytest.mark.filterwarnings("ignore:Could not infer format")
def test_to_datetime_overflow(self):
# Regression test for GH#50533: a digit-only string that gets parsed as an
# enormous year must surface as OutOfBoundsDatetime rather than leaking
# the underlying OverflowError.
# we should get an OutOfBoundsDatetime, NOT OverflowError
# TODO: Timestamp raises ValueError("could not convert string to Timestamp")
# can we make these more consistent?
arg = "08335394550"
# Expected message, used as the ``match`` pattern for pytest.raises below.
msg = 'Parsing "08335394550" to datetime overflows, at position 0'
# With no ``errors`` argument, both the scalar and the list-like input
# are expected to raise with the same message.
with pytest.raises(OutOfBoundsDatetime, match=msg):
to_datetime(arg)

with pytest.raises(OutOfBoundsDatetime, match=msg):
to_datetime([arg])

# errors="coerce": the unparseable value is expected to become NaT.
res = to_datetime(arg, errors="coerce")
assert res is NaT
res = to_datetime([arg], errors="coerce")
tm.assert_index_equal(res, Index([NaT]))

# errors="ignore": the input is expected to come back unchanged
# (a str for scalar input, an object-dtype Index for list input).
res = to_datetime(arg, errors="ignore")
assert isinstance(res, str) and res == arg
res = to_datetime([arg], errors="ignore")
tm.assert_index_equal(res, Index([arg], dtype=object))

def test_to_datetime_mixed_datetime_and_string(self):
# GH#47018 adapted old doctest with new behavior
d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1)))
Expand Down