Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d0e7d86
ENH: Add arrow engine to to_csv
lithomas1 Jun 25, 2023
4b7f880
Merge branch 'main' of https://github.com/pandas-dev/pandas into arro…
lithomas1 Jul 22, 2023
8328120
pass more
lithomas1 Jul 23, 2023
f988f0d
Merge branch 'main' of https://github.com/pandas-dev/pandas into arro…
lithomas1 Aug 3, 2023
a889ebf
xfail everything
lithomas1 Aug 3, 2023
1f7ffea
revert unintentional change
lithomas1 Aug 3, 2023
faeed4c
fix typing and tests
lithomas1 Aug 3, 2023
47d48f1
green everything?
lithomas1 Aug 4, 2023
9a8d250
Merge branch 'main' into arrow-to-csv
lithomas1 Aug 4, 2023
ae9f87c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 4, 2023
c49309c
move option to end
lithomas1 Aug 8, 2023
74be30c
Merge branch 'main' into arrow-to-csv
lithomas1 Aug 9, 2023
d08991c
Merge branch 'main' into arrow-to-csv
lithomas1 Aug 18, 2023
08d9cf5
Merge branch 'main' into arrow-to-csv
lithomas1 Sep 24, 2023
8689109
Merge branch 'main' into arrow-to-csv
lithomas1 Nov 15, 2023
da13091
Update csvs.py
lithomas1 Nov 16, 2023
6345ab5
Update csvs.py
lithomas1 Nov 22, 2023
3948072
Merge branch 'main' into arrow-to-csv
lithomas1 Nov 22, 2023
bde1a2b
green and move whatsnew
lithomas1 Nov 22, 2023
cb5f6cd
updates
lithomas1 Nov 26, 2023
a9d3cc4
Merge branch 'main' of github.com:pandas-dev/pandas into arrow-to-csv
lithomas1 Nov 26, 2023
3d95a92
address code review
lithomas1 Nov 26, 2023
968b4bb
Merge branch 'main' into arrow-to-csv
lithomas1 Nov 28, 2023
c527ea3
Merge branch 'main' of github.com:pandas-dev/pandas into arrow-to-csv
lithomas1 Dec 7, 2023
ba451e1
fix tests
lithomas1 Dec 7, 2023
2999ebb
Merge branch 'main' into arrow-to-csv
lithomas1 Jan 15, 2024
68fd1e4
Merge branch 'main' into arrow-to-csv
swt2c Sep 5, 2025
8ff04f7
Move whatsnew entry to v3.0.0
swt2c Sep 5, 2025
84b4e59
Update versionadded to 3.0.0
swt2c Sep 5, 2025
5d6305e
No need to support pyarrow < 11 anymore
swt2c Sep 5, 2025
7da6613
Fixup test
swt2c Sep 5, 2025
7370008
Add escapechar to unsupported options
swt2c Sep 5, 2025
a157861
Sort whatsnew
swt2c Sep 5, 2025
b19c5a3
Fix type ignore
swt2c Sep 6, 2025
8a13c4b
Hopefully fix test_to_csv_single_level_multi_index on Windows
swt2c Sep 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ Other enhancements
- Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`).
- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`)
- Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`)
- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`)
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
- Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`)
- Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`62038`)
Expand Down
26 changes: 24 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3717,6 +3717,7 @@ def to_csv(
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
engine: str = "python",
) -> str: ...

@overload
Expand Down Expand Up @@ -3744,6 +3745,7 @@ def to_csv(
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
engine: str = "python",
) -> None: ...

@final
Expand All @@ -3762,7 +3764,7 @@ def to_csv(
header: bool | list[str] = True,
index: bool = True,
index_label: IndexLabel | None = None,
mode: str = "w",
mode: str | None = None,
encoding: str | None = None,
compression: CompressionOptions = "infer",
quoting: int | None = None,
Expand All @@ -3775,6 +3777,7 @@ def to_csv(
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
engine: str = "python",
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Expand Down Expand Up @@ -3807,14 +3810,17 @@ def to_csv(
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.
mode : {{'w', 'x', 'a'}}, default 'w'
mode : {{'w', 'x', 'a'}}, default 'w' (Python engine) or 'wb' (Pyarrow engine)
Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
the file opening. Typical values include:

- 'w', truncate the file first.
- 'x', exclusive creation, failing if the file already exists.
- 'a', append to the end of file if it exists.

.. note::
The pyarrow engine can only handle binary buffers.

encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
Expand Down Expand Up @@ -3862,6 +3868,16 @@ def to_csv(

{storage_options}

engine : str, default 'python'
The engine to use. Available options are "pyarrow" or "python".
The pyarrow engine requires the pyarrow library to be installed
and is generally faster than the python engine.

However, the python engine may be more feature complete than the
pyarrow engine.

.. versionadded:: 3.0.0

Returns
-------
None or str
Expand Down Expand Up @@ -3925,8 +3941,14 @@ def to_csv(
decimal=decimal,
)

if mode is None:
mode = "w"
if engine == "pyarrow":
mode += "b"

return DataFrameRenderer(formatter).to_csv(
path_or_buf,
engine=engine,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
Expand Down
104 changes: 94 additions & 10 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,21 @@
Sequence,
)
import csv as csvlib
import io
import os
from typing import (
IO,
TYPE_CHECKING,
Any,
AnyStr,
cast,
)

import numpy as np

from pandas._libs import writers as libwriters
from pandas._typing import SequenceNotStr
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.generic import (
Expand Down Expand Up @@ -60,6 +64,7 @@ def __init__(
self,
formatter: DataFrameFormatter,
path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
engine: str = "python",
sep: str = ",",
cols: Sequence[Hashable] | None = None,
index_label: IndexLabel | None = None,
Expand All @@ -81,6 +86,7 @@ def __init__(
self.obj = self.fmt.frame

self.filepath_or_buffer = path_or_buf
self.engine = engine
self.encoding = encoding
self.compression: CompressionOptions = compression
self.mode = mode
Expand Down Expand Up @@ -247,6 +253,11 @@ def save(self) -> None:
"""
Create the writer & save.
"""
if self.engine == "pyarrow" and (
"b" not in self.mode or isinstance(self.filepath_or_buffer, io.TextIOBase)
):
raise ValueError("The pyarrow engine can only open files in binary mode.")

# apply compression and byte/text conversion
with get_handle(
self.filepath_or_buffer,
Expand All @@ -255,26 +266,99 @@ def save(self) -> None:
errors=self.errors,
compression=self.compression,
storage_options=self.storage_options,
# pyarrow engine exclusively writes bytes
is_text=self.engine == "python",
) as handles:
# Note: self.encoding is irrelevant here
# error: Argument "quoting" to "writer" has incompatible type "int";
# expected "Literal[0, 1, 2, 3]"

# This is a mypy bug?
# error: Cannot infer type argument 1 of "_save" of "CSVFormatter" [misc]
self._save(handles.handle) # type: ignore[misc]

def _save_pyarrow(self, handle: IO[AnyStr]) -> None:
    """
    Write ``self.obj`` to ``handle`` as CSV via ``pyarrow.csv.write_csv``.

    Parameters
    ----------
    handle : IO[AnyStr]
        Open binary buffer to write to (the pyarrow engine writes bytes only).

    Raises
    ------
    ValueError
        If an option the pyarrow writer cannot honor was set to a
        non-default value, or if a quotechar other than ``"`` was given.
    TypeError
        If quoting is enabled but ``quotechar`` is ``None``.
    NotImplementedError
        If the ``quoting`` value has no pyarrow equivalent.
    """
    pa = import_optional_dependency("pyarrow")
    pa_csv = import_optional_dependency("pyarrow.csv")

    # pyarrow's CSV writer hard-codes '"' as the quote character.
    if self.quotechar is not None and self.quotechar != '"':
        raise ValueError('The pyarrow engine only supports " as a quotechar.')

    # Options pyarrow.csv.WriteOptions cannot express; reject non-default
    # values rather than silently diverging from the python engine's output.
    unsupported_options = [
        # each tuple is (option value, default, option name)
        (self.decimal, ".", "decimal"),
        (self.float_format, None, "float_format"),
        (self.na_rep, "", "na_rep"),
        (self.date_format, None, "date_format"),  # was misspelled "date_foramt"
        (self.lineterminator, os.linesep, "lineterminator"),
        (self.encoding, None, "encoding"),
        (self.errors, "strict", "errors"),
        (self.escapechar, None, "escapechar"),
    ]

    for opt_val, default, option in unsupported_options:
        if opt_val != default:
            raise ValueError(
                f"The {option} option is not supported with the pyarrow engine."
            )

    # Convert index to column and rename name to empty string
    # since we serialize the index as basically a column with no name
    # TODO: this won't work for multi-indexes (without names)
    obj = self.obj
    if self.index:
        new_names = [
            label if label is not None else "" for label in self.obj.index.names
        ]
        obj = self.obj.reset_index(names=new_names)

    table = pa.Table.from_pandas(obj)

    # Map quoting arg to pyarrow equivalents
    if self.quoting == csvlib.QUOTE_MINIMAL:
        pa_quoting = "needed"
    elif self.quotechar is None:
        raise TypeError("quotechar must be set if quoting enabled")
    elif self.quoting == csvlib.QUOTE_ALL:
        # TODO: Is this a 1-1 mapping?
        # This doesn't quote nulls, check if Python does this
        pa_quoting = "all_valid"
    elif self.quoting == csvlib.QUOTE_NONE:
        pa_quoting = "none"
    else:
        raise NotImplementedError(
            f"Quoting option {self.quoting} is not supported with engine='pyarrow'"
        )

    kwargs: dict[str, Any] = {
        "include_header": self._need_to_save_header,
        "batch_size": self.chunksize,
    }
    kwargs["delimiter"] = self.sep
    kwargs["quoting_style"] = pa_quoting

    write_options = pa_csv.WriteOptions(**kwargs)
    pa_csv.write_csv(table, handle, write_options)

def _save(self, handle: IO[AnyStr]) -> None:
if self.engine == "pyarrow":
self._save_pyarrow(handle)
else:
self.writer = csvlib.writer(
handles.handle,
# error: Argument of type "IO[AnyStr@_save]" cannot be assigned
# to parameter "csvfile" of type "SupportsWrite[str]"
# in function "writer"
# error: Argument "quoting" to "writer" has incompatible type "int";
# expected "Literal[0, 1, 2, 3]"
handle, # type: ignore[arg-type]
lineterminator=self.lineterminator,
delimiter=self.sep,
quoting=self.quoting, # type: ignore[arg-type]
doublequote=self.doublequote,
escapechar=self.escapechar,
quotechar=self.quotechar,
)

self._save()

def _save(self) -> None:
if self._need_to_save_header:
self._save_header()
self._save_body()
if self._need_to_save_header:
self._save_header()
self._save_body()

def _save_header(self) -> None:
if not self.has_mi_columns or self._has_aliases:
Expand Down
16 changes: 13 additions & 3 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,18 @@
from csv import QUOTE_NONE
from decimal import Decimal
from functools import partial
from io import StringIO
from io import (
BytesIO,
StringIO,
)
import math
import re
from shutil import get_terminal_size
from typing import (
TYPE_CHECKING,
Any,
Final,
Union,
cast,
)

Expand Down Expand Up @@ -977,6 +981,7 @@ def to_string(
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
engine: str = "python",
encoding: str | None = None,
sep: str = ",",
columns: Sequence[Hashable] | None = None,
Expand All @@ -1000,12 +1005,13 @@ def to_csv(

if path_or_buf is None:
created_buffer = True
path_or_buf = StringIO()
path_or_buf = StringIO() if engine == "python" else BytesIO()
else:
created_buffer = False

csv_formatter = CSVFormatter(
path_or_buf=path_or_buf,
engine=engine,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
Expand All @@ -1026,8 +1032,12 @@ def to_csv(
csv_formatter.save()

if created_buffer:
assert isinstance(path_or_buf, StringIO)
path_or_buf = cast(Union[BytesIO, StringIO], path_or_buf)
content = path_or_buf.getvalue()
if isinstance(content, bytes):
# Need to decode into string since the
# pyarrow engine only writes binary data
content = content.decode("utf-8")
path_or_buf.close()
return content

Expand Down
Loading
Loading