Skip to content

BUG: DataFrame.astype(series) with duplicate columns #44417

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Nov 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,7 @@ Styler

Other
^^^^^
- Bug in :meth:`DataFrame.astype` with non-unique columns and a :class:`Series` ``dtype`` argument (:issue:`44417`)
- Bug in :meth:`CustomBusinessMonthBegin.__add__` (:meth:`CustomBusinessMonthEnd.__add__`) not applying the extra ``offset`` parameter when beginning (end) of the target month is already a business day (:issue:`41356`)
- Bug in :meth:`RangeIndex.union` with another ``RangeIndex`` with matching (even) ``step`` and starts differing by strictly less than ``step / 2`` (:issue:`44019`)
- Bug in :meth:`RangeIndex.difference` with ``sort=None`` and ``step<0`` failing to sort (:issue:`44085`)
Expand Down
20 changes: 14 additions & 6 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5826,14 +5826,22 @@ def astype(
"Only a column name can be used for the "
"key in a dtype mappings argument."
)

# GH#44417 cast to Series so we can use .iat below, which will be
# robust in case we have duplicate column names
from pandas import Series

dtype_ser = Series(dtype, dtype=object)
dtype_ser = dtype_ser.reindex(self.columns, fill_value=None, copy=False)

results = []
for col_name, col in self.items():
if col_name in dtype:
results.append(
col.astype(dtype=dtype[col_name], copy=copy, errors=errors)
)
for i, (col_name, col) in enumerate(self.items()):
cdt = dtype_ser.iat[i]
if isna(cdt):
res_col = col.copy() if copy else col
else:
res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
results.append(res_col)

elif is_extension_array_dtype(dtype) and self.ndim > 1:
# GH 18099/22869: columnwise conversion to extension dtype
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -992,7 +992,7 @@ def _wrap_applied_output(
result = self.obj._constructor(
index=self.grouper.result_index, columns=data.columns
)
result = result.astype(data.dtypes.to_dict(), copy=False)
result = result.astype(data.dtypes, copy=False)
return result

# GH12824
Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,26 @@ def test_astype_duplicate_col(self):
expected = concat([a1_str, b, a2_str], axis=1)
tm.assert_frame_equal(result, expected)

def test_astype_duplicate_col_series_arg(self):
# GH#44417
vals = np.random.randn(3, 4)
df = DataFrame(vals, columns=["A", "B", "C", "A"])
dtypes = df.dtypes
dtypes.iloc[0] = str
dtypes.iloc[2] = "Float64"

result = df.astype(dtypes)
expected = DataFrame(
{
0: vals[:, 0].astype(str),
1: vals[:, 1],
2: pd.array(vals[:, 2], dtype="Float64"),
3: vals[:, 3],
}
)
expected.columns = df.columns
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"dtype",
[
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2031,6 +2031,16 @@ def get_result():
tm.assert_equal(result, expected)


def test_empty_groupby_apply_nonunique_columns():
    # GH#44417 - an identity ``apply`` on an empty frame with duplicate
    # column labels must preserve every column's dtype
    frame = DataFrame(np.random.randn(0, 4))
    frame[3] = frame[3].astype(np.int64)
    frame.columns = [0, 1, 2, 0]
    grouped = frame.groupby(frame[1])
    result = grouped.apply(lambda g: g)
    assert (result.dtypes == frame.dtypes).all()


def test_tuple_as_grouping():
# https://github.com/pandas-dev/pandas/issues/18314
df = DataFrame(
Expand Down