From c6ba62c2a35cbfcc142b87404c00085d52319995 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Wed, 27 Mar 2024 22:21:41 +0100 Subject: [PATCH] fix(python): Propagate strictness in `from_dicts` (#15344) --- .../polars/_utils/construction/dataframe.py | 16 +++- py-polars/src/dataframe/construction.rs | 92 ++++++++++--------- .../tests/unit/constructors/test_dataframe.py | 13 +++ py-polars/tests/unit/interop/test_interop.py | 4 +- 4 files changed, 79 insertions(+), 46 deletions(-) diff --git a/py-polars/polars/_utils/construction/dataframe.py b/py-polars/polars/_utils/construction/dataframe.py index 81f100b12ffd..4e231e7a5028 100644 --- a/py-polars/polars/_utils/construction/dataframe.py +++ b/py-polars/polars/_utils/construction/dataframe.py @@ -555,7 +555,7 @@ def _sequence_of_sequence_to_pydf( if unpack_nested: dicts = [nt_unpack(d) for d in data] pydf = PyDataFrame.from_dicts( - dicts, infer_schema_length=infer_schema_length + dicts, strict=strict, infer_schema_length=infer_schema_length ) else: pydf = PyDataFrame.from_rows( @@ -675,6 +675,7 @@ def _sequence_of_dict_to_pydf( data, dicts_schema, schema_overrides, + strict=strict, infer_schema_length=infer_schema_length, ) @@ -774,7 +775,9 @@ def _sequence_of_dataclasses_to_pydf( ) if unpack_nested: dicts = [asdict(md) for md in data] - pydf = PyDataFrame.from_dicts(dicts, infer_schema_length=infer_schema_length) + pydf = PyDataFrame.from_dicts( + dicts, strict=strict, infer_schema_length=infer_schema_length + ) else: rows = [astuple(dc) for dc in data] pydf = PyDataFrame.from_rows( @@ -823,7 +826,9 @@ def _sequence_of_pydantic_models_to_pydf( if old_pydantic else [md.model_dump(mode="python") for md in data] ) - pydf = PyDataFrame.from_dicts(dicts, infer_schema_length=infer_schema_length) + pydf = PyDataFrame.from_dicts( + dicts, strict=strict, infer_schema_length=infer_schema_length + ) elif len(model_fields) > 50: # 'from_rows' is the faster codepath for models with a lot of fields... 
@@ -836,7 +841,10 @@ def _sequence_of_pydantic_models_to_pydf( # ...and 'from_dicts' is faster otherwise dicts = [md.__dict__ for md in data] pydf = PyDataFrame.from_dicts( - dicts, schema=overrides, infer_schema_length=infer_schema_length + dicts, + schema=overrides, + strict=strict, + infer_schema_length=infer_schema_length, ) if overrides: diff --git a/py-polars/src/dataframe/construction.rs b/py-polars/src/dataframe/construction.rs index 3eeb0fe9fc60..723d02919af5 100644 --- a/py-polars/src/dataframe/construction.rs +++ b/py-polars/src/dataframe/construction.rs @@ -3,6 +3,7 @@ use pyo3::prelude::*; use super::*; use crate::arrow_interop; +use crate::conversion::any_value::py_object_to_any_value; use crate::conversion::{vec_extract_wrapped, Wrap}; #[pymethods] @@ -20,24 +21,20 @@ impl PyDataFrame { } #[staticmethod] - #[pyo3(signature = (data, schema=None, schema_overrides=None, infer_schema_length=None))] + #[pyo3(signature = (data, schema=None, schema_overrides=None, strict=true, infer_schema_length=None))] pub fn from_dicts( py: Python, data: &PyAny, schema: Option<Wrap<Schema>>, schema_overrides: Option<Wrap<Schema>>, + strict: bool, infer_schema_length: Option<usize>, ) -> PyResult<Self> { let schema = schema.map(|wrap| wrap.0); let schema_overrides = schema_overrides.map(|wrap| wrap.0); - // If given, read dict fields in schema order. 
- let mut schema_columns = PlIndexSet::new(); - if let Some(ref s) = schema { - schema_columns.extend(s.iter_names().map(|n| n.to_string())) - } - - let (rows, names) = dicts_to_rows(data, infer_schema_length, schema_columns)?; + let names = get_schema_names(data, schema.as_ref(), infer_schema_length)?; + let rows = dicts_to_rows(data, &names, strict)?; let schema = schema.or_else(|| { Some(columns_names_to_empty_schema( @@ -138,48 +135,61 @@ where Schema::from_iter(fields) } -fn dicts_to_rows( - records: &PyAny, - infer_schema_len: Option<usize>, - schema_columns: PlIndexSet<String>, -) -> PyResult<(Vec<Row>, Vec<String>)> { - let infer_schema_len = infer_schema_len - .map(|n| std::cmp::max(1, n)) - .unwrap_or(usize::MAX); - let len = records.len()?; - - let key_names = { - if !schema_columns.is_empty() { - schema_columns - } else { - let mut inferred_keys = PlIndexSet::new(); - for d in records.iter()?.take(infer_schema_len) { - let d = d?; - let d = d.downcast::<PyDict>()?; - let keys = d.keys(); - for name in keys { - let name = name.extract::<String>()?; - inferred_keys.insert(name); - } - } - inferred_keys - } - }; +fn dicts_to_rows<'a>(data: &'a PyAny, names: &'a [String], strict: bool) -> PyResult<Vec<Row<'a>>> { + let len = data.len()?; let mut rows = Vec::with_capacity(len); - - for d in records.iter()? { + for d in data.iter()? { let d = d?; let d = d.downcast::<PyDict>()?; - let mut row = Vec::with_capacity(key_names.len()); - for k in key_names.iter() { + let mut row = Vec::with_capacity(names.len()); + for k in names.iter() { let val = match d.get_item(k)? { None => AnyValue::Null, - Some(val) => val.extract::<Wrap<AnyValue>>()?.0, + Some(val) => py_object_to_any_value(val, strict)?, }; row.push(val) } rows.push(Row(row)) } - Ok((rows, key_names.into_iter().collect())) + Ok(rows) +} + +/// Either read the given schema, or infer the schema names from the data. 
+fn get_schema_names( + data: &PyAny, + schema: Option<&Schema>, + infer_schema_length: Option<usize>, +) -> PyResult<Vec<String>> { + if let Some(schema) = schema { + Ok(schema.iter_names().map(|n| n.to_string()).collect()) + } else { + infer_schema_names_from_data(data, infer_schema_length) + } +} + +/// Infer schema names from an iterable of dictionaries. +/// +/// The resulting schema order is determined by the order in which the names are encountered in +/// the data. +fn infer_schema_names_from_data( + data: &PyAny, + infer_schema_length: Option<usize>, +) -> PyResult<Vec<String>> { + let data_len = data.len()?; + let infer_schema_length = infer_schema_length + .map(|n| std::cmp::max(1, n)) + .unwrap_or(data_len); + + let mut names = PlIndexSet::new(); + for d in data.iter()?.take(infer_schema_length) { + let d = d?; + let d = d.downcast::<PyDict>()?; + let keys = d.keys(); + for name in keys { + let name = name.extract::<String>()?; + names.insert(name); + } + } + Ok(names.into_iter().collect()) } diff --git a/py-polars/tests/unit/constructors/test_dataframe.py b/py-polars/tests/unit/constructors/test_dataframe.py index 445745623ca2..2a88802c0304 100644 --- a/py-polars/tests/unit/constructors/test_dataframe.py +++ b/py-polars/tests/unit/constructors/test_dataframe.py @@ -126,3 +126,16 @@ def test_df_init_from_series_strict() -> None: def test_df_init_rows_overrides_non_existing() -> None: with pytest.raises(pl.SchemaError, match="nonexistent column"): pl.DataFrame([{"a": 1, "b": 2}], schema_overrides={"c": pl.Int8}) + + +# https://github.com/pola-rs/polars/issues/15245 +def test_df_init_nested_mixed_types() -> None: + data = [{"key": [{"value": 1}, {"value": 1.0}]}] + + with pytest.raises(TypeError, match="unexpected value"): + pl.DataFrame(data, strict=True) + + df = pl.DataFrame(data, strict=False) + + assert df.schema == {"key": pl.List(pl.Struct({"value": pl.Float64}))} + assert df.to_dicts() == [{"key": [{"value": 1.0}, {"value": 1.0}]}] diff --git a/py-polars/tests/unit/interop/test_interop.py 
b/py-polars/tests/unit/interop/test_interop.py index 062a68a02b0b..630530f457be 100644 --- a/py-polars/tests/unit/interop/test_interop.py +++ b/py-polars/tests/unit/interop/test_interop.py @@ -308,7 +308,9 @@ def test_from_dicts() -> None: def test_from_dict_no_inference() -> None: schema = {"a": pl.String} data = [{"a": "aa"}] - pl.from_dicts(data, schema_overrides=schema, infer_schema_length=0) + df = pl.from_dicts(data, schema_overrides=schema, infer_schema_length=0) + assert df.schema == schema + assert df.to_dicts() == data def test_from_dicts_schema_override() -> None: