From f90753b69b0923adb1625d03787bcedd45cd533e Mon Sep 17 00:00:00 2001
From: tylerriccio33 <83321774+tylerriccio33@users.noreply.github.com>
Date: Mon, 15 Jul 2024 02:44:59 -0400
Subject: [PATCH] fix(python): Raise for overlapping index/column names in pandas dataframes post string coercion (#17628)

---
Review notes (text between the three-dash line and the diffstat is not part
of the commit message and is ignored when the patch is applied):

* The regression test added to test_from_pandas.py is named
  `test_index_not_silently_excluded`. Without the `test_` prefix pytest's
  default discovery does not collect the function, so the new overlap check
  would never actually be exercised.
* NOTE(review): `pandas_to_pydf` calls `_check_pandas_columns(data)` before
  `include_index` is consulted, so a pandas index whose name overlaps a
  column name raises even when the index is not being converted. Confirm
  that this is intended.
* The rename does not add or remove lines, so the hunk line counts and the
  diffstat below are unchanged.

 .../polars/_utils/construction/dataframe.py   | 28 ++++++++++++++-----
 .../tests/unit/interop/test_from_pandas.py    | 17 +++++++++--
 2 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/py-polars/polars/_utils/construction/dataframe.py b/py-polars/polars/_utils/construction/dataframe.py
index d41fd0da3529..8c94a0e31e78 100644
--- a/py-polars/polars/_utils/construction/dataframe.py
+++ b/py-polars/polars/_utils/construction/dataframe.py
@@ -1029,6 +1029,26 @@ def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFr
     return df._df
 
 
+def _check_pandas_columns(data: pd.DataFrame) -> None:
+    """Check pandas dataframe columns can be converted to polars."""
+    stringified_cols: set[str] = {str(col) for col in data.columns}
+    stringified_index: set[str] = {str(idx) for idx in data.index.names}
+
+    non_unique_cols: bool = len(stringified_cols) < len(data.columns)
+    non_unique_indices: bool = len(stringified_index) < len(data.index.names)
+    if non_unique_cols or non_unique_indices:
+        msg = (
+            "Pandas dataframe contains non-unique indices and/or column names. "
+            "Polars dataframes require unique string names for columns."
+        )
+        raise ValueError(msg)
+
+    overlapping_cols_and_indices: set[str] = stringified_cols & stringified_index
+    if len(overlapping_cols_and_indices) > 0:
+        msg = "Pandas indices and column names must not overlap."
+        raise ValueError(msg)
+
+
 def pandas_to_pydf(
     data: pd.DataFrame,
     schema: SchemaDefinition | None = None,
@@ -1040,13 +1060,7 @@ def pandas_to_pydf(
     include_index: bool = False,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from a pandas DataFrame."""
-    stringified_cols = {str(col) for col in data.columns}
-    if len(stringified_cols) < len(data.columns):
-        msg = (
-            "Polars dataframes must have unique string column names."
-            "Please check your pandas dataframe for duplicates."
-        )
-        raise ValueError(msg)
+    _check_pandas_columns(data)
 
     convert_index = include_index and not _pandas_has_default_index(data)
     if not convert_index and all(
diff --git a/py-polars/tests/unit/interop/test_from_pandas.py b/py-polars/tests/unit/interop/test_from_pandas.py
index 7a49a139b163..624ab851c5df 100644
--- a/py-polars/tests/unit/interop/test_from_pandas.py
+++ b/py-polars/tests/unit/interop/test_from_pandas.py
@@ -15,6 +15,13 @@
     from polars._typing import PolarsDataType
 
 
+def test_index_not_silently_excluded() -> None:
+    ddict = {"a": [1, 2, 3], "b": [4, 5, 6]}
+    df = pd.DataFrame(ddict, index=pd.Index([7, 8, 9], name="a"))
+    with pytest.raises(ValueError, match="indices and column names must not overlap"):
+        pl.from_pandas(df, include_index=True)
+
+
 def test_from_pandas() -> None:
     df = pd.DataFrame(
         {
@@ -174,13 +181,19 @@ def test_from_pandas_include_indexes() -> None:
 
 def test_duplicate_cols_diff_types() -> None:
     df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["0", 0, "1", 1])
-    with pytest.raises(ValueError, match="Polars dataframes must have unique string"):
+    with pytest.raises(
+        ValueError,
+        match="Pandas dataframe contains non-unique indices and/or column names",
+    ):
         pl.from_pandas(df)
 
 
 def test_from_pandas_duplicated_columns() -> None:
     df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["a", "b", "c", "b"])
-    with pytest.raises(ValueError, match="Polars dataframes must have unique string"):
+    with pytest.raises(
+        ValueError,
+        match="Pandas dataframe contains non-unique indices and/or column names",
+    ):
         pl.from_pandas(df)
 
 