Skip to content

Commit

Permalink
fix: prune empty chunks before set operations (pola-rs#13898)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Jan 22, 2024
1 parent ccc30b7 commit 8fbf46a
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 0 deletions.
17 changes: 17 additions & 0 deletions crates/polars-core/src/chunked_array/ops/chunkops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,23 @@ impl<T: PolarsDataType> ChunkedArray<T> {
};
self.slice(-(len as i64), len)
}

/// Remove empty chunks.
///
/// Every chunk with a length of zero is dropped, including a leading one.
/// If all chunks are empty, only the first chunk is kept so that the array
/// always retains at least one chunk.
pub fn prune_empty_chunks(&mut self) {
    // SAFETY: NOTE(review) - `chunks_mut` is declared outside this view, so its exact
    // contract should be confirmed. Only zero-length chunks are removed here, so the
    // total number of values held by the array does not change.
    unsafe {
        let chunks = self.chunks_mut();
        if chunks.iter().any(|arr| arr.len() > 0) {
            // At least one chunk holds data: keep exactly the non-empty chunks.
            chunks.retain(|arr| arr.len() > 0);
        } else {
            // Everything is empty: always keep at least one chunk.
            chunks.truncate(1);
        }
    }
}
}

#[cfg(feature = "object")]
Expand Down
4 changes: 4 additions & 0 deletions crates/polars-ops/src/chunked_array/list/sets.rs
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,10 @@ pub fn list_set_operation(
b = b.rechunk();
}

// Empty chunks must be pruned first; otherwise the kernel indexes out of bounds.
a.prune_empty_chunks();
b.prune_empty_chunks();

// We use the unsafe variant because we want to keep the type of the nested logical types.
unsafe {
arity::try_binary_unchecked_same_type(
Expand Down
15 changes: 15 additions & 0 deletions py-polars/tests/unit/operations/test_sets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import polars as pl


def test_set_intersection_13765() -> None:
    """Regression test for issue 13765.

    Runs ``list.set_intersection`` on list columns produced by a cross join
    followed by a filter, and checks both that the operation completes and
    that it returns the expected intersection.
    """
    df = pl.DataFrame(
        {
            "a": pl.Series([[1], [1]], dtype=pl.List(pl.UInt32)),
            "f": pl.Series([1, 2], dtype=pl.UInt32),
        }
    )

    df = df.join(df, how="cross", suffix="_other")
    # Only the rows whose left-hand "f" equals 1 survive the filter.
    df = df.filter(pl.col("f") == 1)

    result = df.select(pl.col("a").list.set_intersection("a_other")).to_dict(
        as_series=False
    )
    # Every remaining row intersects [1] with [1].
    assert result == {"a": [[1], [1]]}

0 comments on commit 8fbf46a

Please sign in to comment.