From f90753b69b0923adb1625d03787bcedd45cd533e Mon Sep 17 00:00:00 2001
From: tylerriccio33 <83321774+tylerriccio33@users.noreply.github.com>
Date: Mon, 15 Jul 2024 02:44:59 -0400
Subject: [PATCH] fix(python): Raise for overlapping index/column names in
 pandas dataframes post string coercion (#17628)

---
 .../polars/_utils/construction/dataframe.py   | 28 ++++++++++++++-----
 .../tests/unit/interop/test_from_pandas.py    | 17 +++++++++--
 2 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/py-polars/polars/_utils/construction/dataframe.py b/py-polars/polars/_utils/construction/dataframe.py
index d41fd0da3529..8c94a0e31e78 100644
--- a/py-polars/polars/_utils/construction/dataframe.py
+++ b/py-polars/polars/_utils/construction/dataframe.py
@@ -1029,6 +1029,26 @@ def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFr
     return df._df
 
 
+def _check_pandas_columns(data: pd.DataFrame) -> None:
+    """Raise ValueError if stringified column/index names would clash in polars."""
+    columns, index_levels = list(data.columns), list(data.index.names)
+    col_names = set(map(str, columns))
+    index_names = set(map(str, index_levels))
+
+    # A set smaller than its source means two names coerced to the same string.
+    if len(col_names) < len(columns) or len(index_names) < len(index_levels):
+        msg = (
+            "Pandas dataframe contains non-unique indices and/or column names. "
+            "Polars dataframes require unique string names for columns."
+        )
+        raise ValueError(msg)
+
+    # NOTE(review): index names are checked even if they will not be converted.
+    if not col_names.isdisjoint(index_names):
+        msg = "Pandas indices and column names must not overlap."
+        raise ValueError(msg)
+
+
 def pandas_to_pydf(
     data: pd.DataFrame,
     schema: SchemaDefinition | None = None,
@@ -1040,13 +1060,7 @@ def pandas_to_pydf(
     include_index: bool = False,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from a pandas DataFrame."""
-    stringified_cols = {str(col) for col in data.columns}
-    if len(stringified_cols) < len(data.columns):
-        msg = (
-            "Polars dataframes must have unique string column names."
-            "Please check your pandas dataframe for duplicates."
-        )
-        raise ValueError(msg)
+    _check_pandas_columns(data)
 
     convert_index = include_index and not _pandas_has_default_index(data)
     if not convert_index and all(
diff --git a/py-polars/tests/unit/interop/test_from_pandas.py b/py-polars/tests/unit/interop/test_from_pandas.py
index 7a49a139b163..624ab851c5df 100644
--- a/py-polars/tests/unit/interop/test_from_pandas.py
+++ b/py-polars/tests/unit/interop/test_from_pandas.py
@@ -15,6 +15,13 @@
     from polars._typing import PolarsDataType
 
 
+def test_index_not_silently_excluded() -> None:
+    """An index whose name clashes with a column must raise, not be dropped."""
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=pd.Index([7, 8], name="a"))
+    with pytest.raises(ValueError, match="indices and column names must not overlap"):
+        pl.from_pandas(df, include_index=True)
+
+
 def test_from_pandas() -> None:
     df = pd.DataFrame(
         {
@@ -174,13 +181,19 @@ def test_from_pandas_include_indexes() -> None:
 
 def test_duplicate_cols_diff_types() -> None:
     df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["0", 0, "1", 1])
-    with pytest.raises(ValueError, match="Polars dataframes must have unique string"):
+    with pytest.raises(  # "0"/0 and "1"/1 collide once names are coerced to str
+        ValueError,
+        match="Pandas dataframe contains non-unique indices and/or column names",
+    ):
         pl.from_pandas(df)
 
 
 def test_from_pandas_duplicated_columns() -> None:
     df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["a", "b", "c", "b"])
-    with pytest.raises(ValueError, match="Polars dataframes must have unique string"):
+    with pytest.raises(  # column name "b" is listed twice
+        ValueError,
+        match="Pandas dataframe contains non-unique indices and/or column names",
+    ):
         pl.from_pandas(df)