-
-
Notifications
You must be signed in to change notification settings - Fork 18.8k
ENH: Basis for a StringDtype using Arrow #35259
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
4c2e37a
d477ee7
206f493
d58dba6
7a9e2c3
ffc4c0f
c1305ab
13a42f7
decd022
3145e44
e22b348
4b8108c
2446562
a0dcc85
5c42173
28c3ef2
4044d4c
1740524
e9bb36f
8ad120b
34bf57d
f92241e
c09382d
bac64c1
0956147
963e1cf
87b8e67
1ed0585
fa954f7
82b84bf
b1a3032
08d34f4
ae49807
2e5d4c7
c8318cc
1a200a2
e10be80
c1d3087
34f563d
f5fc4fd
a5a7c85
f651563
f5419b9
3af5ce0
bdf4ad2
e044c7f
c5625a8
50889fb
0e1773b
7bb9574
fc45ef7
51d7d0a
bd76a75
3cf5c91
07239a0
9a7cfc5
2ba0dcd
97c56e2
d6d3543
ab40dce
d71a895
f342b62
3d05c89
b3c6347
26bca25
9579444
88094a7
ba0cee8
6709ac3
11388b4
eb284e7
27ce19a
9b70709
6757feb
460ea38
7bee5e2
91f3763
36b662a
7a9ef9c
5db8788
c76c39f
87b7863
24a782d
353bff9
be93947
11eb08f
52440a7
bd05c2c
27c8de5
b6713e9
125cb6f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,254 @@ | ||
from typing import TYPE_CHECKING, Tuple, Type, Union | ||
|
||
import pyarrow as pa | ||
|
||
from pandas._libs import missing as libmissing | ||
|
||
from pandas.core.dtypes.base import ExtensionDtype | ||
from pandas.core.dtypes.dtypes import register_extension_dtype | ||
|
||
from pandas.core.arrays.base import ExtensionArray | ||
|
||
if TYPE_CHECKING: | ||
import numpy as np | ||
|
||
|
||
@register_extension_dtype
class ArrowStringDtype(ExtensionDtype):
    """
    Extension dtype for string data in a ``pyarrow.ChunkedArray``.

    .. versionadded:: 1.1.0

    .. warning::

       ArrowStringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.ArrowStringDtype()
    ArrowStringDtype
    """

    name = "arrow_string"

    #: ArrowStringDtype.na_value uses pandas.NA
    na_value = libmissing.NA

    @property
    def type(self) -> Type[str]:
        """
        The scalar type for the array: ``str``.
        """
        return str

    @classmethod
    def construct_array_type(cls) -> Type["ArrowStringArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return ArrowStringArray

    def __hash__(self) -> int:
        # The dtype takes no parameters, so every instance compares equal
        # (see ``__eq__``) and must therefore share a single hash value.
        return hash("ArrowStringDtype")

    def __repr__(self) -> str:
        return "ArrowStringDtype"

    def __from_arrow__(
        self, array: Union["pa.Array", "pa.ChunkedArray"]
    ) -> "ArrowStringArray":
        """
        Construct ArrowStringArray from pyarrow Array/ChunkedArray.
        """
        return ArrowStringArray(array)

    def __eq__(self, other) -> bool:
        """Check whether 'other' is equal to self.

        By default, 'other' is considered equal if
        * it's a string matching 'self.name'.
        * it's an instance of this type.

        Parameters
        ----------
        other : Any

        Returns
        -------
        bool
        """
        if isinstance(other, str):
            # Compare against ``name`` rather than repeating the literal so
            # the string alias and the class attribute cannot drift apart.
            return other == self.name
        return isinstance(other, ArrowStringDtype)
|
||
|
||
class ArrowStringArray(ExtensionArray):
    """
    Extension array for string data in a ``pyarrow.ChunkedArray``.

    .. versionadded:: 1.1.0

    .. warning::

       ArrowStringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray
        The array of data. Must have Arrow string type.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    array
        The recommended function for creating an ArrowStringArray.
    Series.str
        The string methods are available on Series backed by
        an ArrowStringArray.

    Notes
    -----
    ArrowStringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="arrow_string")
    <ArrowStringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: arrow_string
    """

    def __init__(self, values: Union["pa.Array", "pa.ChunkedArray"]) -> None:
        """
        Wrap ``values`` as the backing ``pyarrow.ChunkedArray``.

        Raises
        ------
        ValueError
            If ``values`` is neither a ``pyarrow.Array`` nor a
            ``pyarrow.ChunkedArray``, or if its Arrow type is not string.
        """
        if isinstance(values, pa.Array):
            # Normalise to a single-chunk ChunkedArray so the rest of the
            # class only has to deal with one container type.
            values = pa.chunked_array([values])
        elif not isinstance(values, pa.ChunkedArray):
            raise ValueError(f"Unsupported type '{type(values)}' for ArrowStringArray")

        # The dtype always advertises itself as a string dtype, so refuse
        # to wrap Arrow data of any other type.
        if not pa.types.is_string(values.type):
            raise ValueError(
                "ArrowStringArray requires a PyArrow (chunked) array of string type"
            )
        self.data = values

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy: bool = False):
        """
        Build an array from a sequence of scalars via ``pa.array``.

        ``dtype`` and ``copy`` are accepted but not used by this
        implementation.
        """
        return cls(pa.array(scalars, type=pa.string()))

    @property
    def dtype(self) -> ArrowStringDtype:
        """
        An instance of 'ArrowStringDtype'.
        """
        return ArrowStringDtype()

    def __array__(self, *args, **kwargs) -> "np.ndarray":
        """Correctly construct numpy arrays when passed to `np.asarray()`."""
        return self.data.__array__(*args, **kwargs)

    def __arrow_array__(self, type=None):
        """Convert myself to a pyarrow Array or ChunkedArray."""
        # NOTE(review): a reviewer asked for type annotations here ("if you
        # can type"); the comment is truncated in this view.
        # ``type`` is currently ignored: the stored data is returned as-is.
        return self.data

    @property
    def size(self) -> int:
        """
        Return the number of elements in this array.

        Returns
        -------
        size : int
        """
        return len(self)

    @property
    def shape(self) -> Tuple[int]:
        """Return the shape of the data."""
        # This may be patched by pandas to support pseudo-2D operations.
        return (len(self),)

    @property
    def ndim(self) -> int:
        """Return the number of dimensions of the underlying data."""
        return 1

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self.data)

    @classmethod
    def _from_sequence_of_strings(cls, strings, dtype=None, copy: bool = False):
        """
        Build an array from a sequence of strings.

        Delegates to ``_from_sequence`` with the same arguments.
        """
        # NOTE(review): a reviewer asked to type the input args as much as
        # possible.
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    # def _values_for_factorize(self):
    #     arr = self._ndarray.copy()
    #     mask = self.isna()
    #     arr[mask] = -1
    #     return arr, -1

    def __setitem__(self, key, value) -> None:
        """Item assignment is not implemented yet."""
        raise NotImplementedError("__setitem__")

    def fillna(self, value=None, method=None, limit=None):
        """Filling missing values is not implemented yet."""
        # NOTE(review): the review thread on this method mentions something
        # available starting with pyarrow 1.0 and the fletcher
        # implementation as a starting point; the comments are truncated in
        # this view, so confirm against the original discussion.
        raise NotImplementedError("fillna")

    # def astype(self, dtype, copy=True):
    #     dtype = pandas_dtype(dtype)
    #     if isinstance(dtype, StringDtype):
    #         if copy:
    #             return self.copy()
    #         return self
    #     elif isinstance(dtype, _IntegerDtype):
    #         arr = self._ndarray.copy()
    #         mask = self.isna()
    #         arr[mask] = 0
    #         values = arr.astype(dtype.numpy_dtype)
    #         return IntegerArray(values, mask, copy=False)

    #     return super().astype(dtype, copy)

    def _reduce(self, name: str, skipna: bool = True, **kwargs):
        """
        Dispatch the reduction ``name`` to the method of the same name.

        Only "min" and "max" are dispatched; any other reduction raises
        ``TypeError``.
        """
        if name in ["min", "max"]:
            # NOTE(review): no ``min``/``max`` method is defined on this
            # class in the code visible here -- confirm the base class
            # provides them, otherwise this raises AttributeError.
            return getattr(self, name)(skipna=skipna)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    # def value_counts(self, dropna=False):
    #     from pandas import value_counts

    #     return value_counts(self._ndarray, dropna=dropna).astype("Int64")

    @property
    def nbytes(self) -> int:
        """
        The number of bytes needed to store this object in memory.
        """
        # A chunk may report ``None`` for a buffer slot; those slots are
        # skipped rather than counted.
        return sum(
            buf.size
            for chunk in self.data.chunks
            for buf in chunk.buffers()
            if buf is not None
        )
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
import string | ||
|
||
import numpy as np | ||
import pytest | ||
|
||
import pandas as pd | ||
from pandas.core.arrays.string_arrow import ArrowStringArray, ArrowStringDtype | ||
from pandas.tests.extension import base | ||
|
||
|
||
@pytest.fixture
def dtype():
    """Provide a freshly constructed ``ArrowStringDtype`` instance."""
    arrow_dtype = ArrowStringDtype()
    return arrow_dtype
|
||
|
||
@pytest.fixture
def data():
    """Length-100 array of random ASCII letters whose first two values differ."""
    letters = list(string.ascii_letters)

    # Keep drawing until the first two elements are distinct.
    while True:
        sample = np.random.choice(letters, size=100)
        if sample[0] != sample[1]:
            break

    return ArrowStringArray._from_sequence(sample)
|
||
|
||
@pytest.fixture
def data_missing():
    """Length 2 array with [NA, Valid]"""
    # TODO(ARROW-9407): Accept pd.NA in Arrow
    values = [pd.NA, "A"]
    return ArrowStringArray._from_sequence(values)
|
||
|
||
@pytest.fixture
def data_for_sorting():
    """Length 3 array holding "B", "C", "A" in that order."""
    values = ["B", "C", "A"]
    return ArrowStringArray._from_sequence(values)
|
||
|
||
@pytest.fixture
def data_missing_for_sorting():
    """Length 3 array holding "B", NA, "A" in that order."""
    # TODO(ARROW-9407): Accept pd.NA in Arrow
    values = ["B", pd.NA, "A"]
    return ArrowStringArray._from_sequence(values)
|
||
|
||
@pytest.fixture
def na_value():
    """The missing-value marker used by these tests: ``pd.NA``."""
    # TODO(ARROW-9407): Accept pd.NA in Arrow
    missing = pd.NA
    return missing
|
||
|
||
@pytest.fixture
def data_for_grouping():
    """Length 8 array laid out as [B, B, NA, NA, A, A, B, C]."""
    # TODO(ARROW-9407): Accept pd.NA in Arrow
    values = ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]
    return ArrowStringArray._from_sequence(values)
|
||
|
||
class TestDtype(base.BaseDtypeTests):
    """Inherits every test from ``base.BaseDtypeTests``; nothing is overridden."""
|
||
|
||
class TestInterface(base.BaseInterfaceTests):
    """Inherits every test from ``base.BaseInterfaceTests``; nothing is overridden."""
|
||
|
||
# class TestConstructors(base.BaseConstructorsTests): | ||
# pass | ||
|
||
|
||
# class TestReshaping(base.BaseReshapingTests): | ||
# pass | ||
|
||
|
||
# class TestGetitem(base.BaseGetitemTests): | ||
# pass | ||
|
||
|
||
# class TestSetitem(base.BaseSetitemTests): | ||
# pass | ||
|
||
|
||
# class TestMissing(base.BaseMissingTests): | ||
# pass | ||
simonjayhawkins marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
|
||
# class TestNoReduce(base.BaseNoReduceTests): | ||
# @pytest.mark.parametrize("skipna", [True, False]) | ||
# def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): | ||
# op_name = all_numeric_reductions | ||
# | ||
# if op_name in ["min", "max"]: | ||
# return None | ||
# | ||
# s = pd.Series(data) | ||
# with pytest.raises(TypeError): | ||
# getattr(s, op_name)(skipna=skipna) | ||
|
||
|
||
# class TestMethods(base.BaseMethodsTests): | ||
# @pytest.mark.skip(reason="returns nullable") | ||
# def test_value_counts(self, all_data, dropna): | ||
# return super().test_value_counts(all_data, dropna) | ||
|
||
|
||
# class TestCasting(base.BaseCastingTests): | ||
# pass | ||
|
||
|
||
# class TestComparisonOps(base.BaseComparisonOpsTests): | ||
# def _compare_other(self, s, data, op_name, other): | ||
# result = getattr(s, op_name)(other) | ||
# expected = getattr(s.astype(object), op_name)(other).astype("boolean") | ||
# self.assert_series_equal(result, expected) | ||
|
||
# def test_compare_scalar(self, data, all_compare_operators): | ||
# op_name = all_compare_operators | ||
# s = pd.Series(data) | ||
# self._compare_other(s, data, op_name, "abc") | ||
|
||
|
||
# class TestParsing(base.BaseParsingTests): | ||
# pass | ||
|
||
|
||
# class TestPrinting(base.BasePrintingTests): | ||
# pass | ||
|
||
|
||
# class TestGroupBy(base.BaseGroupbyTests): | ||
# pass |
Uh oh!
There was an error while loading. Please reload this page.