Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
d0e7d86
ENH: Add arrow engine to to_csv
lithomas1 Jun 25, 2023
4b7f880
Merge branch 'main' of https://github.com/pandas-dev/pandas into arro…
lithomas1 Jul 22, 2023
8328120
pass more
lithomas1 Jul 23, 2023
f988f0d
Merge branch 'main' of https://github.com/pandas-dev/pandas into arro…
lithomas1 Aug 3, 2023
a889ebf
xfail everything
lithomas1 Aug 3, 2023
1f7ffea
revert unintentional change
lithomas1 Aug 3, 2023
faeed4c
fix typing and tests
lithomas1 Aug 3, 2023
47d48f1
green everything?
lithomas1 Aug 4, 2023
9a8d250
Merge branch 'main' into arrow-to-csv
lithomas1 Aug 4, 2023
ae9f87c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 4, 2023
c49309c
move option to end
lithomas1 Aug 8, 2023
74be30c
Merge branch 'main' into arrow-to-csv
lithomas1 Aug 9, 2023
d08991c
Merge branch 'main' into arrow-to-csv
lithomas1 Aug 18, 2023
08d9cf5
Merge branch 'main' into arrow-to-csv
lithomas1 Sep 24, 2023
8689109
Merge branch 'main' into arrow-to-csv
lithomas1 Nov 15, 2023
da13091
Update csvs.py
lithomas1 Nov 16, 2023
6345ab5
Update csvs.py
lithomas1 Nov 22, 2023
3948072
Merge branch 'main' into arrow-to-csv
lithomas1 Nov 22, 2023
bde1a2b
green and move whatsnew
lithomas1 Nov 22, 2023
cb5f6cd
updates
lithomas1 Nov 26, 2023
a9d3cc4
Merge branch 'main' of github.com:pandas-dev/pandas into arrow-to-csv
lithomas1 Nov 26, 2023
3d95a92
address code review
lithomas1 Nov 26, 2023
968b4bb
Merge branch 'main' into arrow-to-csv
lithomas1 Nov 28, 2023
c527ea3
Merge branch 'main' of github.com:pandas-dev/pandas into arrow-to-csv
lithomas1 Dec 7, 2023
ba451e1
fix tests
lithomas1 Dec 7, 2023
2999ebb
Merge branch 'main' into arrow-to-csv
lithomas1 Jan 15, 2024
68fd1e4
Merge branch 'main' into arrow-to-csv
swt2c Sep 5, 2025
8ff04f7
Move whatsnew entry to v3.0.0
swt2c Sep 5, 2025
84b4e59
Update versionadded to 3.0.0
swt2c Sep 5, 2025
5d6305e
No need to support pyarrow < 11 anymore
swt2c Sep 5, 2025
7da6613
Fixup test
swt2c Sep 5, 2025
7370008
Add escapechar to unsupported options
swt2c Sep 5, 2025
a157861
Sort whatsnew
swt2c Sep 5, 2025
b19c5a3
Fix type ignore
swt2c Sep 6, 2025
8a13c4b
Hopefully fix test_to_csv_single_level_multi_index on Windows
swt2c Sep 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ Other enhancements
- Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`).
- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`)
- Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`)
- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`)
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
- Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`)
- Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`62038`)
Expand Down
26 changes: 24 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3717,6 +3717,7 @@ def to_csv(
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
engine: str = "python",
) -> str: ...

@overload
Expand Down Expand Up @@ -3744,6 +3745,7 @@ def to_csv(
decimal: str = ...,
errors: OpenFileErrors = ...,
storage_options: StorageOptions = ...,
engine: str = "python",
) -> None: ...

@final
Expand All @@ -3762,7 +3764,7 @@ def to_csv(
header: bool | list[str] = True,
index: bool = True,
index_label: IndexLabel | None = None,
mode: str = "w",
mode: str | None = None,
encoding: str | None = None,
compression: CompressionOptions = "infer",
quoting: int | None = None,
Expand All @@ -3775,6 +3777,7 @@ def to_csv(
decimal: str = ".",
errors: OpenFileErrors = "strict",
storage_options: StorageOptions | None = None,
engine: str = "python",
) -> str | None:
r"""
Write object to a comma-separated values (csv) file.
Expand Down Expand Up @@ -3807,14 +3810,17 @@ def to_csv(
sequence should be given if the object uses MultiIndex. If
False do not print fields for index names. Use index_label=False
for easier importing in R.
mode : {{'w', 'x', 'a'}}, default 'w'
mode : {{'w', 'x', 'a'}}, default 'w' (Python engine) or 'wb' (Pyarrow engine)
Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
the file opening. Typical values include:

- 'w', truncate the file first.
- 'x', exclusive creation, failing if the file already exists.
- 'a', append to the end of file if it exists.

.. note::
The pyarrow engine can only handle binary buffers.

encoding : str, optional
A string representing the encoding to use in the output file,
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
Expand Down Expand Up @@ -3862,6 +3868,16 @@ def to_csv(

{storage_options}

engine : str, default 'python'
The engine to use. Available options are "pyarrow" or "python".
The pyarrow engine requires the pyarrow library to be installed
and is generally faster than the python engine.

However, the python engine may be more feature complete than the
pyarrow engine.

.. versionadded:: 3.0.0

Returns
-------
None or str
Expand Down Expand Up @@ -3925,8 +3941,14 @@ def to_csv(
decimal=decimal,
)

if mode is None:
mode = "w"
if engine == "pyarrow":
mode += "b"

return DataFrameRenderer(formatter).to_csv(
path_or_buf,
engine=engine,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
Expand Down
104 changes: 94 additions & 10 deletions pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,21 @@
Sequence,
)
import csv as csvlib
import io
import os
from typing import (
IO,
TYPE_CHECKING,
Any,
AnyStr,
cast,
)

import numpy as np

from pandas._libs import writers as libwriters
from pandas._typing import SequenceNotStr
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.generic import (
Expand Down Expand Up @@ -60,6 +64,7 @@ def __init__(
self,
formatter: DataFrameFormatter,
path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "",
engine: str = "python",
sep: str = ",",
cols: Sequence[Hashable] | None = None,
index_label: IndexLabel | None = None,
Expand All @@ -81,6 +86,7 @@ def __init__(
self.obj = self.fmt.frame

self.filepath_or_buffer = path_or_buf
self.engine = engine
self.encoding = encoding
self.compression: CompressionOptions = compression
self.mode = mode
Expand Down Expand Up @@ -247,6 +253,11 @@ def save(self) -> None:
"""
Create the writer & save.
"""
if self.engine == "pyarrow" and (
"b" not in self.mode or isinstance(self.filepath_or_buffer, io.TextIOBase)
):
raise ValueError("The pyarrow engine can only open files in binary mode.")

# apply compression and byte/text conversion
with get_handle(
self.filepath_or_buffer,
Expand All @@ -255,26 +266,99 @@ def save(self) -> None:
errors=self.errors,
compression=self.compression,
storage_options=self.storage_options,
# pyarrow engine exclusively writes bytes
is_text=self.engine == "python",
) as handles:
# Note: self.encoding is irrelevant here
# error: Argument "quoting" to "writer" has incompatible type "int";
# expected "Literal[0, 1, 2, 3]"

# This is a mypy bug?
# error: Cannot infer type argument 1 of "_save" of "CSVFormatter" [misc]
self._save(handles.handle) # type: ignore[misc]

def _save_pyarrow(self, handle: IO[AnyStr]) -> None:
    """
    Write ``self.obj`` to ``handle`` as CSV via ``pyarrow.csv.write_csv``.

    Parameters
    ----------
    handle : IO[AnyStr]
        Open binary buffer to write to (the pyarrow engine writes bytes only).

    Raises
    ------
    ValueError
        If an option the pyarrow writer cannot honor was set to a
        non-default value, or if a quotechar other than ``"`` was given.
    TypeError
        If quoting is enabled but ``quotechar`` is ``None``.
    NotImplementedError
        If the ``quoting`` value has no pyarrow equivalent.
    """
    pa = import_optional_dependency("pyarrow")
    pa_csv = import_optional_dependency("pyarrow.csv")

    # pyarrow's CSV writer hard-codes '"' as the quote character.
    if self.quotechar is not None and self.quotechar != '"':
        raise ValueError('The pyarrow engine only supports " as a quotechar.')

    # Options pyarrow.csv.WriteOptions cannot express; reject non-default
    # values rather than silently diverging from the python engine's output.
    unsupported_options = [
        # each tuple is (option value, default, option name)
        (self.decimal, ".", "decimal"),
        (self.float_format, None, "float_format"),
        (self.na_rep, "", "na_rep"),
        (self.date_format, None, "date_format"),  # was misspelled "date_foramt"
        (self.lineterminator, os.linesep, "lineterminator"),
        (self.encoding, None, "encoding"),
        (self.errors, "strict", "errors"),
        (self.escapechar, None, "escapechar"),
    ]

    for opt_val, default, option in unsupported_options:
        if opt_val != default:
            raise ValueError(
                f"The {option} option is not supported with the pyarrow engine."
            )

    # Convert index to column and rename name to empty string
    # since we serialize the index as basically a column with no name
    # TODO: this won't work for multi-indexes (without names)
    obj = self.obj
    if self.index:
        new_names = [
            label if label is not None else "" for label in self.obj.index.names
        ]
        obj = self.obj.reset_index(names=new_names)

    table = pa.Table.from_pandas(obj)

    # Map quoting arg to pyarrow equivalents
    if self.quoting == csvlib.QUOTE_MINIMAL:
        pa_quoting = "needed"
    elif self.quotechar is None:
        raise TypeError("quotechar must be set if quoting enabled")
    elif self.quoting == csvlib.QUOTE_ALL:
        # TODO: Is this a 1-1 mapping?
        # This doesn't quote nulls, check if Python does this
        pa_quoting = "all_valid"
    elif self.quoting == csvlib.QUOTE_NONE:
        pa_quoting = "none"
    else:
        raise NotImplementedError(
            f"Quoting option {self.quoting} is not supported with engine='pyarrow'"
        )

    kwargs: dict[str, Any] = {
        "include_header": self._need_to_save_header,
        "batch_size": self.chunksize,
    }
    kwargs["delimiter"] = self.sep
    kwargs["quoting_style"] = pa_quoting

    write_options = pa_csv.WriteOptions(**kwargs)
    pa_csv.write_csv(table, handle, write_options)

def _save(self, handle: IO[AnyStr]) -> None:
if self.engine == "pyarrow":
self._save_pyarrow(handle)
else:
self.writer = csvlib.writer(
handles.handle,
# error: Argument of type "IO[AnyStr@_save]" cannot be assigned
# to parameter "csvfile" of type "SupportsWrite[str]"
# in function "writer"
# error: Argument "quoting" to "writer" has incompatible type "int";
# expected "Literal[0, 1, 2, 3]"
handle, # type: ignore[arg-type]
lineterminator=self.lineterminator,
delimiter=self.sep,
quoting=self.quoting, # type: ignore[arg-type]
doublequote=self.doublequote,
escapechar=self.escapechar,
quotechar=self.quotechar,
)

self._save()

def _save(self) -> None:
if self._need_to_save_header:
self._save_header()
self._save_body()
if self._need_to_save_header:
self._save_header()
self._save_body()

def _save_header(self) -> None:
if not self.has_mi_columns or self._has_aliases:
Expand Down
16 changes: 13 additions & 3 deletions pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,18 @@
from csv import QUOTE_NONE
from decimal import Decimal
from functools import partial
from io import StringIO
from io import (
BytesIO,
StringIO,
)
import math
import re
from shutil import get_terminal_size
from typing import (
TYPE_CHECKING,
Any,
Final,
Union,
cast,
)

Expand Down Expand Up @@ -977,6 +981,7 @@ def to_string(
def to_csv(
self,
path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
engine: str = "python",
encoding: str | None = None,
sep: str = ",",
columns: Sequence[Hashable] | None = None,
Expand All @@ -1000,12 +1005,13 @@ def to_csv(

if path_or_buf is None:
created_buffer = True
path_or_buf = StringIO()
path_or_buf = StringIO() if engine == "python" else BytesIO()
else:
created_buffer = False

csv_formatter = CSVFormatter(
path_or_buf=path_or_buf,
engine=engine,
lineterminator=lineterminator,
sep=sep,
encoding=encoding,
Expand All @@ -1026,8 +1032,12 @@ def to_csv(
csv_formatter.save()

if created_buffer:
assert isinstance(path_or_buf, StringIO)
path_or_buf = cast(Union[BytesIO, StringIO], path_or_buf)
content = path_or_buf.getvalue()
if isinstance(content, bytes):
# Need to decode into string since the
# pyarrow engine only writes binary data
content = content.decode("utf-8")
path_or_buf.close()
return content

Expand Down
Loading
Loading