Skip to content

REGR: preserve reindexed array object (instead of creating new array) for concat with all-NA array #47762

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
REGR: preserve array object for concat with all-NA array
  • Loading branch information
jorisvandenbossche committed Jul 17, 2022
commit e89624fcf2b96f5b4c72e1cc03dc12b1e0d791ca
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ including other versions of pandas.
Fixed regressions
~~~~~~~~~~~~~~~~~
- Fixed regression in :func:`concat` materializing :class:`Index` during sorting even if :class:`Index` was already sorted (:issue:`47501`)
- Fixed regression in :func:`concat` or :func:`merge` handling of all-NaN ExtensionArrays with custom attributes (:issue:`47762`)
-

.. ---------------------------------------------------------------------------
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,11 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
if len(values) and values[0] is None:
fill_value = None

if is_datetime64tz_dtype(empty_dtype):
if blk_dtype == empty_dtype and self.indexers:
# avoid creating new empty array if we already have an array
# with correct dtype that can be reindexed
pass
elif is_datetime64tz_dtype(empty_dtype):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this didn't need to change? (just a leftover from the code move?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, good catch

i8values = np.full(self.shape, fill_value.value)
return DatetimeArray(i8values, dtype=empty_dtype)

Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/extension/array_with_attr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from pandas.tests.extension.array_with_attr.array import (
FloatAttrArray,
FloatAttrDtype,
make_data,
)

# Names re-exported from the ``array`` module of this test package.
__all__ = ["FloatAttrArray", "FloatAttrDtype", "make_data"]
88 changes: 88 additions & 0 deletions pandas/tests/extension/array_with_attr/array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""
Test extension array that has custom attribute information (not stored on the dtype).

"""
from __future__ import annotations

import numbers

import numpy as np

from pandas._typing import type_t

from pandas.core.dtypes.base import ExtensionDtype

import pandas as pd
from pandas.core.arrays import ExtensionArray


class FloatAttrDtype(ExtensionDtype):
    """
    Dtype for :class:`FloatAttrArray`, a float64-backed test extension array.

    The dtype carries no parameters of its own; the custom attribute being
    tested is stored on the array instance, not on the dtype.
    """

    # Scalars are floats: the backing data is float64 and the NA value is NaN.
    type = float
    name = "float_attr"
    na_value = np.nan

    @classmethod
    def construct_array_type(cls) -> type_t[FloatAttrArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
            The :class:`FloatAttrArray` class.
        """
        return FloatAttrArray


class FloatAttrArray(ExtensionArray):
    """
    Test extension array backed by a float64 NumPy array plus a custom ``attr``.

    ``attr`` is stored on the array instance (not on the dtype). Indexing with
    a non-integer indexer, ``take``, ``copy`` and ``_concat_same_type`` all
    pass it on to the array they return.

    Parameters
    ----------
    values : np.ndarray
        Backing data; must be a NumPy array of float64 dtype.
    attr : object, optional
        Arbitrary value carried alongside the data.

    Raises
    ------
    TypeError
        If ``values`` is not a NumPy array of float64 dtype.
    """

    dtype = FloatAttrDtype()
    __array_priority__ = 1000

    def __init__(self, values, attr=None) -> None:
        if not isinstance(values, np.ndarray) or values.dtype != "float64":
            raise TypeError("Need to pass a numpy array of float64 dtype as values")
        self.data = values
        self.attr = attr

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """
        Build an array from a sequence of scalars; ``attr`` is left as ``None``.

        ``dtype`` is accepted for interface compatibility and is not used.
        """
        if copy:
            data = np.array(scalars, dtype="float64", copy=True)
        else:
            # NumPy >= 2 treats ``np.array(..., copy=False)`` as "never copy"
            # and raises when a copy is unavoidable (e.g. for a list input);
            # ``np.asarray`` copies only when needed on every NumPy version.
            data = np.asarray(scalars, dtype="float64")
        return cls(data)

    def __getitem__(self, item):
        """Return a scalar for an integer ``item``, otherwise a new array with the same ``attr``."""
        if isinstance(item, numbers.Integral):
            return self.data[item]
        # slice, list-like, mask
        item = pd.api.indexers.check_array_indexer(self, item)
        return type(self)(self.data[item], self.attr)

    def __len__(self) -> int:
        return len(self.data)

    def isna(self):
        """Return a boolean mask marking the NaN entries of the backing data."""
        return np.isnan(self.data)

    def take(self, indexer, allow_fill=False, fill_value=None):
        """
        Take elements by position, returning a new array with the same ``attr``.

        When ``allow_fill`` is true and no ``fill_value`` is given, the dtype's
        ``na_value`` is used as the fill value.
        """
        from pandas.api.extensions import take

        data = self.data
        if allow_fill and fill_value is None:
            fill_value = self.dtype.na_value

        result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill)
        return type(self)(result, self.attr)

    def copy(self):
        """Return a new array with copied data and the same ``attr`` object."""
        return type(self)(self.data.copy(), self.attr)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate the backing data; the result takes ``attr`` from the first array."""
        data = np.concatenate([x.data for x in to_concat])
        attr = to_concat[0].attr if len(to_concat) else None
        return cls(data, attr)


def make_data():
    """Return a float64 NumPy array holding the values 0.0 through 99.0."""
    count = 100
    values = np.arange(count, dtype="float64")
    return values
33 changes: 33 additions & 0 deletions pandas/tests/extension/array_with_attr/test_array_with_attr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.array_with_attr import (
FloatAttrArray,
FloatAttrDtype,
make_data,
)


@pytest.fixture
def dtype():
    """Provide a fresh ``FloatAttrDtype`` instance."""
    instance = FloatAttrDtype()
    return instance


@pytest.fixture
def data():
    """Provide a ``FloatAttrArray`` wrapping ``make_data()``, with ``attr`` left at its default."""
    values = make_data()
    return FloatAttrArray(values)


def test_concat_with_all_na():
    """Check that ``merge`` keeps the custom ``attr`` of an all-NaN extension column."""
    # https://github.com/pandas-dev/pandas/issues/28840
    # The column array is entirely NaN; after merging, the resulting "col"
    # column must still expose the ``attr`` value of the original array.
    arr = FloatAttrArray(np.array([np.nan, np.nan], dtype="float64"), attr="test")
    df1 = pd.DataFrame({"col": arr, "key": [0, 1]})
    df2 = pd.DataFrame({"key": [0, 1], "col2": [1, 2]})

    result = pd.merge(df1, df2, on="key")
    expected = pd.DataFrame({"col": arr, "key": [0, 1], "col2": [1, 2]})
    tm.assert_frame_equal(result, expected)
    assert result["col"].array.attr == "test"