Skip to content

Commit 66ddd5e

Browse files
authored
PERF: Improve performance for arrow engine and dtype_backend=pyarrow for datetime conversion (#52548)
1 parent 7187e67 commit 66ddd5e

File tree

3 files changed

+29
-0
lines changed

3 files changed

+29
-0
lines changed

asv_bench/benchmarks/io/csv.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,4 +555,19 @@ def time_read_csv_index_col(self):
555555
read_csv(self.StringIO_input, index_col="a")
556556

557557

558+
class ReadCSVDatePyarrowEngine(StringIORewind):
    """Benchmark datetime parsing with the pyarrow engine and pyarrow dtypes."""

    def setup(self):
        # One-column CSV ("a") of 100k identical ISO dates; the rewindable
        # StringIO is rebuilt by the base class before each timing run.
        n_rows = 100_000
        payload = "2019-12-31\n" * n_rows
        self.StringIO_input = StringIO("a\n" + payload)

    def time_read_csv_index_col(self):
        # NOTE(review): method name looks copied from the neighbouring
        # index_col benchmark — this one exercises parse_dates, not
        # index_col. Kept as-is so the asv-reported benchmark name and
        # its history remain stable.
        read_csv(
            self.StringIO_input,
            parse_dates=["a"],
            engine="pyarrow",
            dtype_backend="pyarrow",
        )
571+
572+
558573
from ..pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v2.0.1.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ Bug fixes
2828
- Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`)
2929
- Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)
3030
- Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`)
31+
- Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set, causing a performance bottleneck in the process (:issue:`52546`)
3132

3233
.. ---------------------------------------------------------------------------
3334
.. _whatsnew_201.other:

pandas/io/parsers/base_parser.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
from pandas.core.dtypes.missing import isna
6262

6363
from pandas import (
64+
ArrowDtype,
6465
DatetimeIndex,
6566
StringDtype,
6667
)
@@ -867,6 +868,7 @@ def _do_date_conversions(
867868
self.index_names,
868869
names,
869870
keep_date_col=self.keep_date_col,
871+
dtype_backend=self.dtype_backend,
870872
)
871873

872874
return names, data
@@ -1203,6 +1205,7 @@ def _process_date_conversion(
12031205
index_names,
12041206
columns,
12051207
keep_date_col: bool = False,
1208+
dtype_backend=lib.no_default,
12061209
):
12071210
def _isindex(colspec):
12081211
return (isinstance(index_col, list) and colspec in index_col) or (
@@ -1228,6 +1231,16 @@ def _isindex(colspec):
12281231
colspec = orig_names[colspec]
12291232
if _isindex(colspec):
12301233
continue
1234+
elif dtype_backend == "pyarrow":
1235+
import pyarrow as pa
1236+
1237+
dtype = data_dict[colspec].dtype
1238+
if isinstance(dtype, ArrowDtype) and (
1239+
pa.types.is_timestamp(dtype.pyarrow_dtype)
1240+
or pa.types.is_date(dtype.pyarrow_dtype)
1241+
):
1242+
continue
1243+
12311244
# Pyarrow engine returns Series which we need to convert to
12321245
# numpy array before converter, its a no-op for other parsers
12331246
data_dict[colspec] = converter(

0 commit comments

Comments
 (0)