Skip to content

Commit

Permalink
ENH: read_stata return non-nano (#55642)
Browse files Browse the repository at this point in the history
* ENH: read_stata return non-nano

* GH ref

* mypy fixup

* update doctest

* simplify

* avoid Series.view

* don't go through Series

* move whatsnew

* remove outdated whatsnew
  • Loading branch information
jbrockmendel authored Feb 2, 2024
1 parent 9c76d54 commit 4663edd
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 119 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ enhancement2
Other enhancements
^^^^^^^^^^^^^^^^^^
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the Stata format (:issue:`55642`)
-

.. ---------------------------------------------------------------------------
Expand Down
146 changes: 54 additions & 92 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
Timestamp,
isna,
to_datetime,
to_timedelta,
)
from pandas.core.frame import DataFrame
from pandas.core.indexes.base import Index
Expand Down Expand Up @@ -232,6 +231,7 @@


stata_epoch: Final = datetime(1960, 1, 1)
unix_epoch: Final = datetime(1970, 1, 1)


def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
Expand All @@ -256,7 +256,7 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
>>> dates = pd.Series([52])
>>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
0 1961-01-01
dtype: datetime64[ns]
dtype: datetime64[s]
Notes
-----
Expand All @@ -280,76 +280,51 @@ def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
date - ty
years since 0000
"""
MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
MAX_DAY_DELTA = (Timestamp.max - datetime(1960, 1, 1)).days
MIN_DAY_DELTA = (Timestamp.min - datetime(1960, 1, 1)).days
MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000

def convert_year_month_safe(year, month) -> Series:
"""
Convert year and month to datetimes, using pandas vectorized versions
when the date range falls within the range supported by pandas.
Otherwise it falls back to a slower but more robust method
using datetime.
"""
if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
return to_datetime(100 * year + month, format="%Y%m")
else:
index = getattr(year, "index", None)
return Series([datetime(y, m, 1) for y, m in zip(year, month)], index=index)

def convert_year_days_safe(year, days) -> Series:
"""
Converts year (e.g. 1999) and days since the start of the year to a
datetime or datetime64 Series
"""
if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
return to_datetime(year, format="%Y") + to_timedelta(days, unit="d")
else:
index = getattr(year, "index", None)
value = [
datetime(y, 1, 1) + timedelta(days=int(d)) for y, d in zip(year, days)
]
return Series(value, index=index)
if fmt.startswith(("%tc", "tc")):
# Delta ms relative to base
td = np.timedelta64(stata_epoch - unix_epoch, "ms")
res = np.array(dates._values, dtype="M8[ms]") + td
return Series(res, index=dates.index)

def convert_delta_safe(base, deltas, unit) -> Series:
"""
Convert base dates and deltas to datetimes, using pandas vectorized
versions if the deltas satisfy restrictions required to be expressed
as dates in pandas.
"""
index = getattr(deltas, "index", None)
if unit == "d":
if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
values = [base + timedelta(days=int(d)) for d in deltas]
return Series(values, index=index)
elif unit == "ms":
if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
values = [
base + timedelta(microseconds=(int(d) * 1000)) for d in deltas
]
return Series(values, index=index)
else:
raise ValueError("format not understood")
base = to_datetime(base)
deltas = to_timedelta(deltas, unit=unit)
return base + deltas
elif fmt.startswith(("%td", "td", "%d", "d")):
# Delta days relative to base
td = np.timedelta64(stata_epoch - unix_epoch, "D")
res = np.array(dates._values, dtype="M8[D]") + td
return Series(res, index=dates.index)

elif fmt.startswith(("%tm", "tm")):
# Delta months relative to base
ordinals = dates + (stata_epoch.year - unix_epoch.year) * 12
res = np.array(ordinals, dtype="M8[M]").astype("M8[s]")
return Series(res, index=dates.index)

elif fmt.startswith(("%tq", "tq")):
# Delta quarters relative to base
ordinals = dates + (stata_epoch.year - unix_epoch.year) * 4
res = np.array(ordinals, dtype="M8[3M]").astype("M8[s]")
return Series(res, index=dates.index)

elif fmt.startswith(("%th", "th")):
# Delta half-years relative to base
ordinals = dates + (stata_epoch.year - unix_epoch.year) * 2
res = np.array(ordinals, dtype="M8[6M]").astype("M8[s]")
return Series(res, index=dates.index)

elif fmt.startswith(("%ty", "ty")):
# Years -- not delta
ordinals = dates - 1970
res = np.array(ordinals, dtype="M8[Y]").astype("M8[s]")
return Series(res, index=dates.index)

# TODO(non-nano): If/when pandas supports more than datetime64[ns], this
# should be improved to use correct range, e.g. datetime64[Y] for yearly
bad_locs = np.isnan(dates)
has_bad_values = False
if bad_locs.any():
has_bad_values = True
dates._values[bad_locs] = 1.0 # Replace with NaT
dates = dates.astype(np.int64)

if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
base = stata_epoch
ms = dates
conv_dates = convert_delta_safe(base, ms, "ms")
elif fmt.startswith(("%tC", "tC")):
if fmt.startswith(("%tC", "tC")):
warnings.warn(
"Encountered %tC format. Leaving in Stata Internal Format.",
stacklevel=find_stack_level(),
Expand All @@ -358,33 +333,18 @@ def convert_delta_safe(base, deltas, unit) -> Series:
if has_bad_values:
conv_dates[bad_locs] = NaT
return conv_dates
# Delta days relative to base
elif fmt.startswith(("%td", "td", "%d", "d")):
base = stata_epoch
days = dates
conv_dates = convert_delta_safe(base, days, "d")
# does not count leap days - 7 days is a week.
# 52nd week may have more than 7 days
elif fmt.startswith(("%tw", "tw")):
year = stata_epoch.year + dates // 52
days = (dates % 52) * 7
conv_dates = convert_year_days_safe(year, days)
elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
year = stata_epoch.year + dates // 12
month = (dates % 12) + 1
conv_dates = convert_year_month_safe(year, month)
elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base
year = stata_epoch.year + dates // 4
quarter_month = (dates % 4) * 3 + 1
conv_dates = convert_year_month_safe(year, quarter_month)
elif fmt.startswith(("%th", "th")): # Delta half-years relative to base
year = stata_epoch.year + dates // 2
month = (dates % 2) * 6 + 1
conv_dates = convert_year_month_safe(year, month)
elif fmt.startswith(("%ty", "ty")): # Years -- not delta
year = dates
first_month = np.ones_like(dates)
conv_dates = convert_year_month_safe(year, first_month)
per_y = (year - 1970).array.view("Period[Y]")
per_d = per_y.asfreq("D", how="S")
per_d_shifted = per_d + days._values
per_s = per_d_shifted.asfreq("s", how="S")
conv_dates_arr = per_s.view("M8[s]")
conv_dates = Series(conv_dates_arr, index=dates.index)

else:
raise ValueError(f"Date fmt {fmt} not understood")

Expand All @@ -409,24 +369,26 @@ def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:
index = dates.index
NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
US_PER_DAY = NS_PER_DAY / 1000
MS_PER_DAY = NS_PER_DAY / 1_000_000

def parse_dates_safe(
dates: Series, delta: bool = False, year: bool = False, days: bool = False
):
d = {}
if lib.is_np_dtype(dates.dtype, "M"):
if delta:
time_delta = dates - Timestamp(stata_epoch).as_unit("ns")
d["delta"] = time_delta._values.view(np.int64) // 1000 # microseconds
time_delta = dates.dt.as_unit("ms") - Timestamp(stata_epoch).as_unit(
"ms"
)
d["delta"] = time_delta._values.view(np.int64)
if days or year:
date_index = DatetimeIndex(dates)
d["year"] = date_index._data.year
d["month"] = date_index._data.month
if days:
days_in_ns = dates._values.view(np.int64) - to_datetime(
d["year"], format="%Y"
)._values.view(np.int64)
d["days"] = days_in_ns // NS_PER_DAY
year_start = np.asarray(dates).astype("M8[Y]").astype(dates.dtype)
diff = dates - year_start
d["days"] = np.asarray(diff).astype("m8[D]").view("int64")

elif infer_dtype(dates, skipna=False) == "datetime":
if delta:
Expand Down Expand Up @@ -466,7 +428,7 @@ def g(x: datetime) -> int:

if fmt in ["%tc", "tc"]:
d = parse_dates_safe(dates, delta=True)
conv_dates = d.delta / 1000
conv_dates = d.delta
elif fmt in ["%tC", "tC"]:
warnings.warn(
"Stata Internal Format tC not supported.",
Expand All @@ -475,7 +437,7 @@ def g(x: datetime) -> int:
conv_dates = dates
elif fmt in ["%td", "td"]:
d = parse_dates_safe(dates, delta=True)
conv_dates = d.delta // US_PER_DAY
conv_dates = d.delta // MS_PER_DAY
elif fmt in ["%tw", "tw"]:
d = parse_dates_safe(dates, year=True, days=True)
conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7
Expand Down
Loading

0 comments on commit 4663edd

Please sign in to comment.