diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 960b205c49c617..dd06bade2a2038 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1639,6 +1639,7 @@ Conversion - Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) - Bug in :meth:`DataFrame.clip` in which column types are not preserved and casted to float (:issue:`24162`) - Bug in :meth:`DataFrame.clip` when order of columns of dataframes doesn't match, result observed is wrong in numeric values (:issue:`20911`) +- Bug in :meth:`DataFrame.astype` where converting to an extension dtype when duplicate column names are present causes a ``RecursionError`` (:issue:`24704`) Strings ^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1e6ae716606179..a0ee9cb253fef0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5670,9 +5670,10 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs): results.append(results.append(col.copy() if copy else col)) elif is_extension_array_dtype(dtype) and self.ndim > 1: - # GH 18099: columnwise conversion to categorical - # and extension dtype - results = (self[col].astype(dtype, copy=copy) for col in self) + # GH 18099/22869: columnwise conversion to extension dtype + # GH 24704: use iloc to handle duplicate column names + results = (self.iloc[:, i].astype(dtype, copy=copy) + for i in range(len(self.columns))) else: # else, only a single dtype is given diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 70de148dd8fd28..a9f8ab47b16de2 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -709,6 +709,17 @@ def test_astype_extension_dtypes_1d(self, dtype): tm.assert_frame_equal(df.astype(dtype), expected1) tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1) + @pytest.mark.parametrize("dtype", ['category', 'Int64']) + def test_astype_extension_dtypes_duplicate_col(self, dtype): + # GH 24704 + a1 = Series([0, np.nan, 4], name='a') + a2 = Series([np.nan, 3, 5], name='a') + df = concat([a1, a2], axis=1) + + result = df.astype(dtype) + expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1) + assert_frame_equal(result, expected) + @pytest.mark.parametrize('dtype', [ {100: 'float64', 200: 'uint64'}, 'category', 'float64']) def test_astype_column_metadata(self, dtype):