Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Delay selection expansion #18011

Merged
merged 1 commit into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 4 additions & 9 deletions crates/polars-lazy/src/frame/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -426,12 +426,9 @@ impl LazyFrame {
fn _drop<I, T>(self, columns: I, strict: bool) -> Self
where
I: IntoIterator<Item = T>,
T: AsRef<str>,
T: Into<Selector>,
{
let to_drop = columns
.into_iter()
.map(|s| s.as_ref().to_string())
.collect::<PlHashSet<_>>();
let to_drop = columns.into_iter().map(|c| c.into()).collect();

let opt_state = self.get_opt_state();
let lp = self.get_plan_builder().drop(to_drop, strict).build();
Expand All @@ -444,11 +441,10 @@ impl LazyFrame {
///
/// Any given columns that are not in the schema will give a [`PolarsError::ColumnNotFound`]
/// error while materializing the [`LazyFrame`].
#[inline]
pub fn drop<I, T>(self, columns: I) -> Self
where
I: IntoIterator<Item = T>,
T: AsRef<str>,
T: Into<Selector>,
{
self._drop(columns, true)
}
Expand All @@ -458,11 +454,10 @@ impl LazyFrame {
/// and let the projection pushdown optimize away the unneeded columns.
///
/// If a column name does not exist in the schema, it will quietly be ignored.
#[inline]
pub fn drop_no_validate<I, T>(self, columns: I) -> Self
where
I: IntoIterator<Item = T>,
T: AsRef<str>,
T: Into<Selector>,
{
self._drop(columns, false)
}
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-plan/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ use polars_core::prelude::*;
use polars_core::series::ops::NullBehavior;
use polars_core::series::IsSorted;
use polars_core::utils::try_get_supertype;
pub(crate) use selector::Selector;
pub use selector::Selector;
#[cfg(feature = "dtype-struct")]
pub use struct_::*;
pub use udf::UserDefinedFunction;
Expand Down
27 changes: 25 additions & 2 deletions crates/polars-plan/src/dsl/selector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ pub enum Selector {
}

impl Selector {
#[cfg(feature = "meta")]
pub(crate) fn new(e: Expr) -> Self {
pub fn new(e: Expr) -> Self {
Self::Root(Box::new(e))
}
}
Expand Down Expand Up @@ -56,3 +55,27 @@ impl Sub for Selector {
Selector::Sub(Box::new(self), Box::new(rhs))
}
}

impl From<&str> for Selector {
fn from(value: &str) -> Self {
Selector::new(col(value))
}
}

impl From<String> for Selector {
fn from(value: String) -> Self {
Selector::new(col(value.as_ref()))
}
}

impl From<ColumnName> for Selector {
fn from(value: ColumnName) -> Self {
Selector::new(Expr::Column(value))
}
}

impl From<Expr> for Selector {
fn from(value: Expr) -> Self {
Selector::new(value)
}
}
2 changes: 1 addition & 1 deletion crates/polars-plan/src/plans/builder_dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ impl DslBuilder {
.into()
}

pub fn drop(self, to_drop: PlHashSet<String>, strict: bool) -> Self {
pub fn drop(self, to_drop: Vec<Selector>, strict: bool) -> Self {
self.map_private(DslFunction::Drop(DropFunction { to_drop, strict }))
}

Expand Down
4 changes: 4 additions & 0 deletions crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use polars_io::path_utils::{expand_paths_hive, expanded_from_single_directory};

use super::stack_opt::ConversionOptimizer;
use super::*;
use crate::plans::conversion::expr_expansion::expand_selectors;

fn expand_expressions(
input: Node,
Expand Down Expand Up @@ -646,6 +647,9 @@ pub fn to_alp_impl(
return run_conversion(lp, lp_arena, expr_arena, convert, "fill_nan");
},
DslFunction::Drop(DropFunction { to_drop, strict }) => {
let to_drop = expand_selectors(to_drop, &input_schema, &[])?;
let to_drop = to_drop.iter().map(|s| s.as_ref()).collect::<PlHashSet<_>>();

if strict {
for col_name in to_drop.iter() {
polars_ensure!(
Expand Down
88 changes: 63 additions & 25 deletions crates/polars-plan/src/plans/conversion/expr_expansion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -840,32 +840,70 @@ fn replace_selector(expr: Expr, schema: &Schema, keys: &[Expr]) -> PolarsResult<
let mut swapped = Selector::Root(Box::new(Expr::Wildcard));
std::mem::swap(&mut s, &mut swapped);

let mut members = PlIndexSet::new();
replace_selector_inner(swapped, &mut members, &mut vec![], schema, keys)?;

if members.len() <= 1 {
Ok(Expr::Columns(
members
.into_iter()
.map(|e| {
let Expr::Column(name) = e else {
unreachable!()
};
name
})
.collect(),
))
} else {
// Ensure that multiple columns returned from combined/nested selectors remain in schema order
let selected = schema
.iter_fields()
.map(|field| ColumnName::from(field.name().as_ref()))
.filter(|field_name| members.contains(&Expr::Column(field_name.clone())))
.collect();

Ok(Expr::Columns(selected))
}
let cols = expand_selector(swapped, schema, keys)?;
Ok(Expr::Columns(cols))
},
e => Ok(e),
})
}

pub(super) fn expand_selectors(
s: Vec<Selector>,
schema: &Schema,
keys: &[Expr],
) -> PolarsResult<Arc<[ColumnName]>> {
let mut columns = vec![];

for s in s {
match s {
Selector::Root(e) => match *e {
Expr::Column(name) => columns.push(name),
Expr::Columns(names) => columns.extend_from_slice(names.as_ref()),
Expr::Selector(s) => {
let names = expand_selector(s, schema, keys)?;
columns.extend_from_slice(names.as_ref());
},
e => {
let names = expand_selector(Selector::new(e), schema, keys)?;
columns.extend_from_slice(names.as_ref());
},
},
other => {
let names = expand_selector(other, schema, keys)?;
columns.extend_from_slice(names.as_ref());
},
}
}

Ok(Arc::from(columns))
}

pub(super) fn expand_selector(
s: Selector,
schema: &Schema,
keys: &[Expr],
) -> PolarsResult<Arc<[ColumnName]>> {
let mut members = PlIndexSet::new();
replace_selector_inner(s, &mut members, &mut vec![], schema, keys)?;

if members.len() <= 1 {
Ok(members
.into_iter()
.map(|e| {
let Expr::Column(name) = e else {
unreachable!()
};
name
})
.collect())
} else {
// Ensure that multiple columns returned from combined/nested selectors remain in schema order
let selected = schema
.iter_fields()
.map(|field| ColumnName::from(field.name().as_ref()))
.filter(|field_name| members.contains(&Expr::Column(field_name.clone())))
.collect();

Ok(selected)
}
}
2 changes: 1 addition & 1 deletion crates/polars-plan/src/plans/functions/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ pub enum DslFunction {
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct DropFunction {
/// Columns that are going to be dropped
pub(crate) to_drop: PlHashSet<String>,
pub(crate) to_drop: Vec<Selector>,
/// If `true`, performs a check for each item in `to_drop` against the schema. Returns an
/// `ColumnNotFound` error if the column does not exist in the schema.
pub(crate) strict: bool,
Expand Down
1 change: 1 addition & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ polars-stream = { workspace = true }

ahash = { workspace = true }
arboard = { workspace = true, optional = true }
bytemuck = { workspace = true }
ciborium = { workspace = true }
either = { workspace = true }
itoa = { workspace = true }
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4719,7 +4719,7 @@ def drop(
│ 8.0 │
└─────┘
"""
drop_cols = _expand_selectors(self, *columns)
drop_cols = parse_into_list_of_expressions(*columns)
return self._from_pyldf(self._ldf.drop(drop_cols, strict=strict))

def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> LazyFrame:
Expand Down
4 changes: 4 additions & 0 deletions py-polars/polars/selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ def is_selector(obj: Any) -> bool:
return isinstance(obj, _selector_proxy_) and hasattr(obj, "_attrs")


# TODO: Don't use this. It collects a schema.
# This should all go to IR conversion.
def expand_selector(
target: DataFrame | LazyFrame | Mapping[str, PolarsDataType],
selector: SelectorType | Expr,
Expand Down Expand Up @@ -188,6 +190,8 @@ def expand_selector(
return tuple(target.select(selector).collect_schema())


# TODO: Don't use this. It collects a schema.
# This should all go to IR conversion.
def _expand_selectors(frame: DataFrame | LazyFrame, *items: Any) -> list[Any]:
"""
Internal function that expands any selectors to column names in the given input.
Expand Down
20 changes: 18 additions & 2 deletions py-polars/src/expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ mod serde;
mod string;
mod r#struct;

use std::mem::ManuallyDrop;

use polars::lazy::dsl::Expr;
use pyo3::prelude::*;

Expand All @@ -35,7 +37,14 @@ pub(crate) trait ToExprs {
impl ToExprs for Vec<PyExpr> {
fn to_exprs(self) -> Vec<Expr> {
// SAFETY: repr is transparent.
unsafe { std::mem::transmute(self) }
unsafe {
let length = self.len();
let capacity = self.capacity();
let mut manual_drop_vec = ManuallyDrop::new(self);
let vec_ptr: *mut PyExpr = manual_drop_vec.as_mut_ptr();
let ptr: *mut Expr = vec_ptr as *mut Expr;
Vec::from_raw_parts(ptr, length, capacity)
}
}
}

Expand All @@ -46,6 +55,13 @@ pub(crate) trait ToPyExprs {
impl ToPyExprs for Vec<Expr> {
fn to_pyexprs(self) -> Vec<PyExpr> {
// SAFETY: repr is transparent.
unsafe { std::mem::transmute(self) }
unsafe {
let length = self.len();
let capacity = self.capacity();
let mut manual_drop_vec = ManuallyDrop::new(self);
let vec_ptr: *mut Expr = manual_drop_vec.as_mut_ptr();
let ptr: *mut PyExpr = vec_ptr as *mut PyExpr;
Vec::from_raw_parts(ptr, length, capacity)
}
}
}
3 changes: 2 additions & 1 deletion py-polars/src/lazyframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1162,8 +1162,9 @@ impl PyLazyFrame {
.into()
}

fn drop(&self, columns: Vec<String>, strict: bool) -> Self {
fn drop(&self, columns: Vec<PyExpr>, strict: bool) -> Self {
let ldf = self.ldf.clone();
let columns = columns.to_exprs();
if strict {
ldf.drop(columns)
} else {
Expand Down