Skip to content

Backport PR #56587 on branch 2.2.x (ENH: support the Arrow PyCapsule Interface on pandas.DataFrame (export)) #56944

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pandas/compat/_optional.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,8 @@ def import_optional_dependency(
The imported module, when found and the version is correct.
None is returned when the package is not found and `errors`
is False, or when the package's version is too old and `errors`
is ``'warn'``.
is ``'warn'`` or ``'ignore'``.
"""

assert errors in {"warn", "raise", "ignore"}

package_name = INSTALL_MAPPING.get(name)
Expand Down Expand Up @@ -163,5 +162,7 @@ def import_optional_dependency(
return None
elif errors == "raise":
raise ImportError(msg)
else:
return None

return module
27 changes: 27 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -987,6 +987,33 @@ def __dataframe_consortium_standard__(
)
return convert_to_standard_compliant_dataframe(self, api_version=api_version)

def __arrow_c_stream__(self, requested_schema=None):
    """
    Export the pandas DataFrame as an Arrow C stream PyCapsule.

    The conversion is delegated to pyarrow through
    ``pyarrow.Table.from_pandas`` and follows its default handling of the
    index, i.e. the index is stored as a column except for RangeIndex.
    This conversion is not necessarily zero-copy.

    Parameters
    ----------
    requested_schema : PyCapsule, default None
        The schema to which the dataframe should be cast, passed as a
        PyCapsule containing a C ArrowSchema representation of the
        requested schema.

    Returns
    -------
    PyCapsule
    """
    pyarrow = import_optional_dependency("pyarrow", min_version="14.0.0")
    # A requested schema arrives as a capsule; turn it into a pyarrow.Schema
    # before handing it to ``Table.from_pandas``.
    schema = (
        None
        if requested_schema is None
        else pyarrow.Schema._import_from_c_capsule(requested_schema)
    )
    return pyarrow.Table.from_pandas(self, schema=schema).__arrow_c_stream__()

# ----------------------------------------------------------------------

@property
Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/frame/test_arrow_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import ctypes

import pytest

import pandas.util._test_decorators as td

import pandas as pd

pa = pytest.importorskip("pyarrow")


@td.skip_if_no("pyarrow", min_version="14.0")
def test_dataframe_arrow_interface():
    """Check the Arrow C stream export of a DataFrame and its use by pyarrow."""
    frame = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})

    # The dunder must hand back a PyCapsule named "arrow_array_stream".
    stream_capsule = frame.__arrow_c_stream__()
    capsule_is_valid = ctypes.pythonapi.PyCapsule_IsValid(
        ctypes.py_object(stream_capsule), b"arrow_array_stream"
    )
    assert capsule_is_valid == 1

    # Conversion without an explicit schema.
    converted = pa.table(frame)
    reference = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    assert converted.equals(reference)

    # Conversion with an explicit target schema (column "a" requested as int8).
    target_schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
    converted = pa.table(frame, schema=target_schema)
    reference = reference.cast(target_schema)
    assert converted.equals(reference)


@td.skip_if_no("pyarrow", min_version="15.0")
def test_dataframe_to_arrow():
    """
    Consume a DataFrame through ``pyarrow.RecordBatchReader.from_stream``.

    The stream is read with and without an explicit schema and the
    resulting data is compared against a directly constructed pyarrow Table.
    """
    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})

    # ``from_stream`` returns a RecordBatchReader, which has no ``equals``
    # method; materialise it into a Table with ``read_all`` before comparing.
    table = pa.RecordBatchReader.from_stream(df).read_all()
    expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    assert table.equals(expected)

    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
    table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
    expected = expected.cast(schema)
    assert table.equals(expected)
14 changes: 14 additions & 0 deletions pandas/tests/test_optional_dependency.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,20 @@ def test_bad_version(monkeypatch):
result = import_optional_dependency("fakemodule")
assert result is module

with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"):
import_optional_dependency("fakemodule", min_version="1.1.0")

with tm.assert_produces_warning(UserWarning):
result = import_optional_dependency(
"fakemodule", errors="warn", min_version="1.1.0"
)
assert result is None

result = import_optional_dependency(
"fakemodule", errors="ignore", min_version="1.1.0"
)
assert result is None


def test_submodule(monkeypatch):
# Create a fake module with a submodule
Expand Down