Skip to content

Commit

Permalink
fix(python): Raise for overlapping index/column names in pandas dataframes post string coercion (#17628)
Browse files Browse the repository at this point in the history
  • Loading branch information
tylerriccio33 authored Jul 15, 2024
1 parent 7f5d9c7 commit f90753b
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 9 deletions.
28 changes: 21 additions & 7 deletions py-polars/polars/_utils/construction/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1029,6 +1029,26 @@ def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFr
return df._df


def _check_pandas_columns(data: pd.DataFrame, *, include_index: bool = True) -> None:
    """
    Check pandas dataframe columns can be converted to polars.

    Column labels and index level names are compared after ``str()`` coercion,
    so labels that are distinct in pandas (e.g. ``0`` and ``"0"``) count as
    duplicates here.

    Parameters
    ----------
    data
        The pandas DataFrame to validate.
    include_index
        Whether the index level names take part in the validation. Pass
        ``False`` when the index is going to be discarded, so that a name
        shared between an index level and a column (or several unnamed index
        levels) does not reject an otherwise convertible frame. The default,
        ``True``, validates the index names unconditionally.

    Raises
    ------
    ValueError
        If the stringified column names are not unique, if the stringified
        index names are not unique (only when `include_index` is set), or if
        a stringified index name equals a stringified column name (only when
        `include_index` is set).
    """
    stringified_cols: set[str] = {str(col) for col in data.columns}
    # Unnamed index levels have the name `None`, which stringifies to "None";
    # skip the index entirely when it is not going to be converted.
    stringified_index: set[str] = (
        {str(idx) for idx in data.index.names} if include_index else set()
    )

    # A set smaller than its source means two labels collapsed to one string.
    non_unique_cols: bool = len(stringified_cols) < len(data.columns)
    non_unique_indices: bool = include_index and (
        len(stringified_index) < len(data.index.names)
    )
    if non_unique_cols or non_unique_indices:
        msg = (
            "Pandas dataframe contains non-unique indices and/or column names. "
            "Polars dataframes require unique string names for columns."
        )
        raise ValueError(msg)

    # Index levels become regular columns, so they may not reuse a column name.
    if stringified_cols & stringified_index:
        msg = "Pandas indices and column names must not overlap."
        raise ValueError(msg)


def pandas_to_pydf(
data: pd.DataFrame,
schema: SchemaDefinition | None = None,
Expand All @@ -1040,13 +1060,7 @@ def pandas_to_pydf(
include_index: bool = False,
) -> PyDataFrame:
"""Construct a PyDataFrame from a pandas DataFrame."""
stringified_cols = {str(col) for col in data.columns}
if len(stringified_cols) < len(data.columns):
msg = (
"Polars dataframes must have unique string column names."
"Please check your pandas dataframe for duplicates."
)
raise ValueError(msg)
_check_pandas_columns(data)

convert_index = include_index and not _pandas_has_default_index(data)
if not convert_index and all(
Expand Down
17 changes: 15 additions & 2 deletions py-polars/tests/unit/interop/test_from_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@
from polars._typing import PolarsDataType


def test_index_not_silently_excluded() -> None:
    """Converting with the index included must raise if its name duplicates a column."""
    # The `test_` prefix is required for pytest's default discovery to collect
    # this function; without it the check never runs.
    ddict = {"a": [1, 2, 3], "b": [4, 5, 6]}
    # The index is named "a", the same as the first column.
    df = pd.DataFrame(ddict, index=pd.Index([7, 8, 9], name="a"))
    with pytest.raises(ValueError, match="indices and column names must not overlap"):
        pl.from_pandas(df, include_index=True)


def test_from_pandas() -> None:
df = pd.DataFrame(
{
Expand Down Expand Up @@ -174,13 +181,19 @@ def test_from_pandas_include_indexes() -> None:

def test_duplicate_cols_diff_types() -> None:
    """Column labels that differ only by type must be rejected on conversion."""
    # "0"/0 and "1"/1 are distinct labels in pandas but identical as strings.
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["0", 0, "1", 1])
    with pytest.raises(
        ValueError,
        match="Pandas dataframe contains non-unique indices and/or column names",
    ):
        pl.from_pandas(df)


def test_from_pandas_duplicated_columns() -> None:
    """A pandas frame with a repeated column label must be rejected on conversion."""
    # The label "b" appears twice.
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["a", "b", "c", "b"])
    with pytest.raises(
        ValueError,
        match="Pandas dataframe contains non-unique indices and/or column names",
    ):
        pl.from_pandas(df)


Expand Down

0 comments on commit f90753b

Please sign in to comment.