Skip to content

REF: put EA concat logic in _concat_arrays #33535

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
Next Next commit
REF: implement _concat_arrays, _concat_same_dtype in some EAs
  • Loading branch information
jbrockmendel committed Apr 13, 2020
commit 78ec7562dba8e4a607f7804167eb10e506605b7c
79 changes: 76 additions & 3 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ensure_platform_int,
is_categorical_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_dict_like,
is_dtype_equal,
is_extension_array_dtype,
Expand Down Expand Up @@ -2348,10 +2349,82 @@ def _can_hold_na(self):
return True

@classmethod
def _concat_same_type(self, to_concat):
from pandas.core.dtypes.concat import concat_categorical
def _concat_same_type(cls, to_concat):
return cls._concat_arrays(to_concat)
# TODO: lock down stricter behavior?

return concat_categorical(to_concat)
@classmethod
def _concat_same_dtype(
    cls,
    to_concat,
    axis: int = 0,
    sort_categories: bool = False,
    ignore_order: bool = False,
):
    """
    Like _concat_same_type, but with the added restriction of matching dtypes.

    Parameters
    ----------
    to_concat : sequence of Categorical
        Assumed (per the method's contract) to all share the dtype of
        ``to_concat[0]``.
    axis : int, default 0
        Present for API compatibility; 1-D Categoricals concatenate along 0.
    sort_categories : bool, default False
        Sort the resulting categories.  Not allowed together with ordered
        input unless ``ignore_order`` is also True.
    ignore_order : bool, default False
        Drop orderedness from the result.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        If ``sort_categories=True`` is combined with ordered input while
        ``ignore_order=False``.
    """
    first = to_concat[0]
    categories = first.categories
    ordered = first.ordered

    if all(first.categories.equals(other.categories) for other in to_concat[1:]):
        # identical categories - fastpath: codes are directly comparable
        new_codes = np.concatenate([c.codes for c in to_concat])
    else:
        # same dtype but categories in a different order: remap every
        # other array's codes onto the first's category ordering
        codes = [first.codes] + [
            recode_for_categories(other.codes, other.categories, first.categories)
            for other in to_concat[1:]
        ]
        new_codes = np.concatenate(codes)

    if sort_categories and not ignore_order and ordered:
        raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

    if sort_categories and not categories.is_monotonic_increasing:
        categories = categories.sort_values()
        indexer = categories.get_indexer(first.categories)

        # remap codes to the sorted category positions; -1 stays NA
        new_codes = take_1d(indexer, new_codes, fill_value=-1)

    if ignore_order:
        ordered = False

    return cls(new_codes, categories=categories, ordered=ordered, fastpath=True)

@classmethod
def _concat_arrays(cls, to_concat, axis: int = 0):
    """
    Concatenate a possibly-heterogeneous sequence of arrays, at least one
    of which is expected to be categorical.

    If every input is categorical and all dtypes match, the result stays
    categorical (via union_categoricals); otherwise all inputs are coerced
    to 1-D ndarrays and concatenated with concat_compat.

    Parameters
    ----------
    to_concat : sequence of arrays
    axis : int, default 0
        With axis=1 the 1-D result is reshaped to (1, N); see the
        TODO(EA2D) note below.

    Returns
    -------
    Categorical or ndarray
    """
    from pandas.core.dtypes.concat import concat_compat, union_categoricals

    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

    # validate the categories
    if len(categoricals) == len(to_concat):
        # all inputs categorical; when all dtypes are identical we can
        # combine without losing categorical-ness
        first = to_concat[0]
        if all(first.is_dtype_equal(other) for other in to_concat[1:]):
            return union_categoricals(categoricals)

    # mixed / mismatched inputs: extract the categoricals & coerce each
    # entry to a 1-D ndarray, going through object for tz-aware datetimes
    coerced = []
    for x in to_concat:
        if is_categorical_dtype(x.dtype):
            coerced.append(x._internal_get_values())
        elif is_datetime64tz_dtype(x.dtype):
            coerced.append(np.asarray(x.astype(object)))
        else:
            coerced.append(np.asarray(x).ravel())

    result = concat_compat(coerced)
    if axis == 1:
        # TODO(EA2D): this is a kludge for 1D EAs
        result = result.reshape(1, len(result))
    return result

def isin(self, values):
"""
Expand Down
13 changes: 13 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1023,6 +1023,19 @@ def _concat_same_type(cls, to_concat):

return cls(data, sparse_index=sp_index, fill_value=fill_value)

@classmethod
def _concat_arrays(cls, to_concat, axis: int = 0):
    """
    Concatenate a sequence of arrays into one SparseArray, wrapping any
    dense entries as sparse using the first sparse entry's fill_value.
    """
    sparse_fill_values = [
        arr.fill_value for arr in to_concat if isinstance(arr, cls)
    ]
    fill_value = sparse_fill_values[0]

    # TODO: Fix join unit generation so we aren't passed this.
    wrapped = []
    for arr in to_concat:
        if isinstance(arr, cls):
            wrapped.append(arr)
        else:
            wrapped.append(cls(arr.squeeze(), fill_value=fill_value))

    return cls._concat_same_type(wrapped)

def astype(self, dtype=None, copy=True):
"""
Change the dtype of a SparseArray.
Expand Down
88 changes: 11 additions & 77 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,18 @@ def is_nonempty(x) -> bool:
if "category" in typs:
# this must be prior to concat_datetime,
# to support Categorical + datetime-like
return concat_categorical(to_concat, axis=axis)
from pandas import Categorical

return Categorical._concat_arrays(to_concat, axis=axis)

elif _contains_datetime or "timedelta" in typs or _contains_period:
return concat_datetime(to_concat, axis=axis, typs=typs)

# these are mandated to handle empties as well
elif "sparse" in typs:
return _concat_sparse(to_concat, axis=axis, typs=typs)
from pandas.core.arrays import SparseArray

return SparseArray._concat_arrays(to_concat, axis=axis)

all_empty = not len(non_empties)
single_dtype = len({x.dtype for x in to_concat}) == 1
Expand Down Expand Up @@ -165,30 +169,9 @@ def concat_categorical(to_concat, axis: int = 0):
# we could have object blocks and categoricals here
# if we only have a single categoricals then combine everything
# else its a non-compat categorical
categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
from pandas import Categorical

# validate the categories
if len(categoricals) != len(to_concat):
pass
else:
# when all categories are identical
first = to_concat[0]
if all(first.is_dtype_equal(other) for other in to_concat[1:]):
return union_categoricals(categoricals)

# extract the categoricals & coerce to object if needed
to_concat = [
x._internal_get_values()
if is_categorical_dtype(x.dtype)
else np.asarray(x).ravel()
if not is_datetime64tz_dtype(x)
else np.asarray(x.astype(object))
for x in to_concat
]
result = concat_compat(to_concat)
if axis == 1:
result = result.reshape(1, len(result))
return result
return Categorical._concat_arrays(to_concat, axis=axis)


def union_categoricals(
Expand Down Expand Up @@ -318,28 +301,10 @@ def _maybe_unwrap(x):
ordered = False
if all(first.is_dtype_equal(other) for other in to_union[1:]):
# identical categories - fastpath
categories = first.categories
ordered = first.ordered

if all(first.categories.equals(other.categories) for other in to_union[1:]):
new_codes = np.concatenate([c.codes for c in to_union])
else:
codes = [first.codes] + [
recode_for_categories(other.codes, other.categories, first.categories)
for other in to_union[1:]
]
new_codes = np.concatenate(codes)

if sort_categories and not ignore_order and ordered:
raise TypeError("Cannot use sort_categories=True with ordered Categoricals")

if sort_categories and not categories.is_monotonic_increasing:
categories = categories.sort_values()
indexer = categories.get_indexer(first.categories)

from pandas.core.algorithms import take_1d
return Categorical._concat_same_dtype(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also move the whole of union_categoricals into the categorical array module?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that would be my preference too, but trying to keep the already-broad scope/diff limited

to_union, sort_categories=sort_categories, ignore_order=ignore_order,
)

new_codes = take_1d(indexer, new_codes, fill_value=-1)
elif ignore_order or all(not c.ordered for c in to_union):
# different categories - union and recode
cats = first.categories.append([c.categories for c in to_union[1:]])
Expand Down Expand Up @@ -454,34 +419,3 @@ def _concat_datetimetz(to_concat, name=None):
return sample._concat_same_dtype(to_concat, name=name)
elif isinstance(sample, ABCDatetimeArray):
return sample._concat_same_type(to_concat)


def _concat_sparse(to_concat, axis=0, typs=None):
"""
provide concatenation of an sparse/dense array of arrays each of which is a
single dtype

Parameters
----------
to_concat : array of arrays
axis : axis to provide concatenation
typs : set of to_concat dtypes

Returns
-------
a single array, preserving the combined dtypes
"""
from pandas.core.arrays import SparseArray

fill_values = [x.fill_value for x in to_concat if isinstance(x, SparseArray)]
fill_value = fill_values[0]

# TODO: Fix join unit generation so we aren't passed this.
to_concat = [
x
if isinstance(x, SparseArray)
else SparseArray(x.squeeze(), fill_value=fill_value)
for x in to_concat
]

return SparseArray._concat_same_type(to_concat)