Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(python): Raise for overlapping index/column names in pandas dataframes post string coercion #17628

Merged
merged 1 commit into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions py-polars/polars/_utils/construction/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1029,6 +1029,26 @@ def to_frame_chunk(values: list[Any], schema: SchemaDefinition | None) -> DataFr
return df._df


def _check_pandas_columns(data: pd.DataFrame) -> None:
"""Check pandas dataframe columns can be converted to polars."""
stringified_cols: set[str] = {str(col) for col in data.columns}
stringified_index: set[str] = {str(idx) for idx in data.index.names}

non_unique_cols: bool = len(stringified_cols) < len(data.columns)
non_unique_indices: bool = len(stringified_index) < len(data.index.names)
if non_unique_cols or non_unique_indices:
msg = (
"Pandas dataframe contains non-unique indices and/or column names. "
"Polars dataframes require unique string names for columns."
)
raise ValueError(msg)

overlapping_cols_and_indices: set[str] = stringified_cols & stringified_index
if len(overlapping_cols_and_indices) > 0:
msg = "Pandas indices and column names must not overlap."
raise ValueError(msg)


def pandas_to_pydf(
data: pd.DataFrame,
schema: SchemaDefinition | None = None,
Expand All @@ -1040,13 +1060,7 @@ def pandas_to_pydf(
include_index: bool = False,
) -> PyDataFrame:
"""Construct a PyDataFrame from a pandas DataFrame."""
stringified_cols = {str(col) for col in data.columns}
if len(stringified_cols) < len(data.columns):
msg = (
"Polars dataframes must have unique string column names."
"Please check your pandas dataframe for duplicates."
)
raise ValueError(msg)
_check_pandas_columns(data)

convert_index = include_index and not _pandas_has_default_index(data)
if not convert_index and all(
Expand Down
17 changes: 15 additions & 2 deletions py-polars/tests/unit/interop/test_from_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@
from polars._typing import PolarsDataType


def test_index_not_silently_excluded() -> None:
    """A named index that shares its name with a column must raise on conversion."""
    ddict = {"a": [1, 2, 3], "b": [4, 5, 6]}
    # The index is deliberately named "a", the same as the first column.
    df = pd.DataFrame(ddict, index=pd.Index([7, 8, 9], name="a"))
    with pytest.raises(ValueError, match="indices and column names must not overlap"):
        pl.from_pandas(df, include_index=True)


# Original name kept as an alias. Without the ``test_`` prefix pytest's default
# discovery never collected this check, so the prefixed name is the one that runs.
index_not_silently_excluded = test_index_not_silently_excluded


def test_from_pandas() -> None:
df = pd.DataFrame(
{
Expand Down Expand Up @@ -174,13 +181,19 @@ def test_from_pandas_include_indexes() -> None:

def test_duplicate_cols_diff_types() -> None:
    """Column labels that collide once stringified (``"0"`` vs ``0``) must raise."""
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["0", 0, "1", 1])
    # Only the current error message is expected; the earlier
    # "Polars dataframes must have unique string" expectation is obsolete.
    with pytest.raises(
        ValueError,
        match="Pandas dataframe contains non-unique indices and/or column names",
    ):
        pl.from_pandas(df)


def test_from_pandas_duplicated_columns() -> None:
    """A pandas frame with a repeated column label (``"b"`` twice) must raise."""
    df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["a", "b", "c", "b"])
    # Only the current error message is expected; the earlier
    # "Polars dataframes must have unique string" expectation is obsolete.
    with pytest.raises(
        ValueError,
        match="Pandas dataframe contains non-unique indices and/or column names",
    ):
        pl.from_pandas(df)


Expand Down