perf: Use bitmask to filter Parquet predicate-pushdown items (#17993)
coastalwhite authored Aug 5, 2024
1 parent fd00ee6 commit ca6d46c
Showing 27 changed files with 505 additions and 155 deletions.
crates/polars-expr/src/expressions/mod.rs (4 additions, 0 deletions)
@@ -613,6 +613,10 @@ impl PhysicalIoExpr for PhysicalIoHelper {
         self.expr.evaluate(df, &state)
     }
 
+    fn live_variables(&self) -> Option<Vec<Arc<str>>> {
+        Some(expr_to_leaf_column_names(self.expr.as_expression()?))
+    }
+
     #[cfg(feature = "parquet")]
     fn as_stats_evaluator(&self) -> Option<&dyn polars_io::predicates::StatsEvaluator> {
         self.expr.as_stats_evaluator()
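The new live_variables() hook reports the leaf column names a predicate actually reads. As an illustration only, here is a minimal, self-contained Rust sketch of how a reader could use such a hook to split a file's columns into those needed to evaluate the predicate and those that can wait until the row mask is known; the PredicateColumns trait, ExamplePredicate, and split_columns below are hypothetical stand-ins, not the polars PhysicalIoExpr API.

use std::sync::Arc;

// Trimmed-down stand-in for the hook added above: a predicate expression
// reports which leaf columns it reads, or None if that cannot be determined.
trait PredicateColumns {
    fn live_variables(&self) -> Option<Vec<Arc<str>>>;
}

// Hypothetical predicate along the lines of col("a") > 3: it only reads "a".
struct ExamplePredicate;

impl PredicateColumns for ExamplePredicate {
    fn live_variables(&self) -> Option<Vec<Arc<str>>> {
        Some(vec![Arc::from("a")])
    }
}

// Split the file's columns into "live" columns (needed to evaluate the
// predicate up front) and "deferred" columns (decoded later under the mask).
fn split_columns<'a>(
    schema: &[&'a str],
    predicate: &dyn PredicateColumns,
) -> (Vec<&'a str>, Vec<&'a str>) {
    match predicate.live_variables() {
        Some(live) => schema
            .iter()
            .copied()
            .partition(|c| live.iter().any(|l| &**l == *c)),
        // Unknown live set: treat every column as live (no prefiltering).
        None => (schema.to_vec(), Vec::new()),
    }
}

fn main() {
    let schema = ["a", "b", "c"];
    let (live, deferred) = split_columns(&schema, &ExamplePredicate);
    assert_eq!(live, ["a"]);
    assert_eq!(deferred, ["b", "c"]);
}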
crates/polars-io/src/parquet/read/mmap.rs (2 additions, 2 deletions)
@@ -65,7 +65,7 @@ fn _mmap_single_column<'a>(
 pub(super) fn to_deserializer<'a>(
     columns: Vec<(&ColumnChunkMetaData, MemSlice)>,
     field: Field,
-    num_rows: usize,
+    filter: Option<Filter>,
 ) -> PolarsResult<ArrayIter<'a>> {
     let (columns, types): (Vec<_>, Vec<_>) = columns
         .into_iter()
@@ -87,5 +87,5 @@ pub(super) fn to_deserializer<'a>(
         })
         .unzip();
 
-    column_iter_to_arrays(columns, types, field, Some(Filter::new_limited(num_rows)))
+    column_iter_to_arrays(columns, types, field, filter)
 }
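With the num_rows argument replaced by an optional Filter, the column deserializer can now be driven by an arbitrary row selection rather than only a leading row count. The following self-contained sketch illustrates the difference; the RowFilter enum and apply function are illustrative stand-ins, not the polars-parquet Filter type.

// Illustrative stand-in for a row filter: either "first n rows" (what the
// old num_rows argument could express) or a per-row bitmask (what the
// prefiltered strategy produces from the predicate).
enum RowFilter {
    Limit(usize),
    Mask(Vec<bool>),
}

// Keep only the rows selected by the filter; None means keep everything.
fn apply<T: Copy>(values: &[T], filter: Option<&RowFilter>) -> Vec<T> {
    match filter {
        None => values.to_vec(),
        Some(RowFilter::Limit(n)) => values.iter().copied().take(*n).collect(),
        Some(RowFilter::Mask(mask)) => values
            .iter()
            .zip(mask)
            .filter_map(|(v, keep)| keep.then_some(*v))
            .collect(),
    }
}

fn main() {
    let decoded = [10, 20, 30, 40];
    // A limit can only keep a leading run of rows...
    assert_eq!(apply(&decoded, Some(&RowFilter::Limit(2))), vec![10, 20]);
    // ...while a mask lets the predicate skip arbitrary rows.
    let mask = RowFilter::Mask(vec![true, false, false, true]);
    assert_eq!(apply(&decoded, Some(&mask)), vec![10, 40]);
}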
crates/polars-io/src/parquet/read/options.rs (8 additions, 0 deletions)
@@ -18,6 +18,14 @@ pub enum ParallelStrategy {
     Columns,
     /// Parallelize over the row groups
     RowGroups,
+    /// First evaluates the pushed-down predicates in parallel and determines a mask of which rows
+    /// to read. Then, it parallelizes over both the columns and the row groups while filtering out
+    /// rows that do not need to be read. This can provide significant speedups for large files
+    /// (i.e. many row-groups) with a predicate that filters clustered rows or filters heavily. In
+    /// other cases, this may slow down the scan compared to other strategies.
+    ///
+    /// If no predicate is given, this falls back to [`ParallelStrategy::Auto`].
+    Prefiltered,
     /// Automatically determine over which unit to parallelize
     /// This will choose the most occurring unit.
     #[default]
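For context, here is a hedged usage sketch showing how a caller might opt into the new strategy from the Rust API. It assumes polars-lazy's LazyFrame::scan_parquet together with a ScanArgsParquet struct exposing a parallel: ParallelStrategy field, as in the polars API around this release; the data.parquet path and the filter expression are placeholders.

use polars::prelude::*;

fn main() -> PolarsResult<()> {
    // Ask the Parquet scan to prefilter rows; with no predicate pushed down
    // this falls back to ParallelStrategy::Auto.
    let args = ScanArgsParquet {
        parallel: ParallelStrategy::Prefiltered,
        ..Default::default()
    };

    // The pushed-down predicate below is what the prefilter evaluates first
    // to build a row bitmask before the remaining columns are decoded.
    let df = LazyFrame::scan_parquet("data.parquet", args)?
        .filter(col("a").gt(lit(3)))
        .collect()?;

    println!("{df}");
    Ok(())
}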
