perf: Use bitmask to filter Parquet predicate-pushdown items (#17993)
coastalwhite authored Aug 5, 2024
1 parent fd00ee6 commit ca6d46c
Showing 27 changed files with 505 additions and 155 deletions.
crates/polars-expr/src/expressions/mod.rs (4 additions, 0 deletions)
@@ -613,6 +613,10 @@ impl PhysicalIoExpr for PhysicalIoHelper {
         self.expr.evaluate(df, &state)
     }
 
+    fn live_variables(&self) -> Option<Vec<Arc<str>>> {
+        Some(expr_to_leaf_column_names(self.expr.as_expression()?))
+    }
+
     #[cfg(feature = "parquet")]
     fn as_stats_evaluator(&self) -> Option<&dyn polars_io::predicates::StatsEvaluator> {
         self.expr.as_stats_evaluator()
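The new live_variables() hook reports the leaf column names a predicate actually reads. As an illustration only, here is a minimal, self-contained Rust sketch of how a reader could use such a hook to split a file's columns into those needed to evaluate the predicate and those that can wait until the row mask is known; the PredicateColumns trait, ExamplePredicate, and split_columns below are hypothetical stand-ins, not the polars PhysicalIoExpr API.

use std::sync::Arc;

// Trimmed-down stand-in for the hook added above: a predicate expression
// reports which leaf columns it reads, or None if that cannot be determined.
trait PredicateColumns {
    fn live_variables(&self) -> Option<Vec<Arc<str>>>;
}

// Hypothetical predicate along the lines of col("a") > 3: it only reads "a".
struct ExamplePredicate;

impl PredicateColumns for ExamplePredicate {
    fn live_variables(&self) -> Option<Vec<Arc<str>>> {
        Some(vec![Arc::from("a")])
    }
}

// Split the file's columns into "live" columns (needed to evaluate the
// predicate up front) and "deferred" columns (decoded later under the mask).
fn split_columns<'a>(
    schema: &[&'a str],
    predicate: &dyn PredicateColumns,
) -> (Vec<&'a str>, Vec<&'a str>) {
    match predicate.live_variables() {
        Some(live) => schema
            .iter()
            .copied()
            .partition(|c| live.iter().any(|l| &**l == *c)),
        // Unknown live set: treat every column as live (no prefiltering).
        None => (schema.to_vec(), Vec::new()),
    }
}

fn main() {
    let schema = ["a", "b", "c"];
    let (live, deferred) = split_columns(&schema, &ExamplePredicate);
    assert_eq!(live, ["a"]);
    assert_eq!(deferred, ["b", "c"]);
}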
crates/polars-io/src/parquet/read/mmap.rs (2 additions, 2 deletions)
@@ -65,7 +65,7 @@ fn _mmap_single_column<'a>(
 pub(super) fn to_deserializer<'a>(
     columns: Vec<(&ColumnChunkMetaData, MemSlice)>,
     field: Field,
-    num_rows: usize,
+    filter: Option<Filter>,
 ) -> PolarsResult<ArrayIter<'a>> {
     let (columns, types): (Vec<_>, Vec<_>) = columns
         .into_iter()
@@ -87,5 +87,5 @@ pub(super) fn to_deserializer<'a>(
         })
         .unzip();
 
-    column_iter_to_arrays(columns, types, field, Some(Filter::new_limited(num_rows)))
+    column_iter_to_arrays(columns, types, field, filter)
 }
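With the num_rows argument replaced by an optional Filter, the column deserializer can now be driven by an arbitrary row selection rather than only a leading row count. The following self-contained sketch illustrates the difference; the RowFilter enum and apply function are illustrative stand-ins, not the polars-parquet Filter type.

// Illustrative stand-in for a row filter: either "first n rows" (what the
// old num_rows argument could express) or a per-row bitmask (what the
// prefiltered strategy produces from the predicate).
enum RowFilter {
    Limit(usize),
    Mask(Vec<bool>),
}

// Keep only the rows selected by the filter; None means keep everything.
fn apply<T: Copy>(values: &[T], filter: Option<&RowFilter>) -> Vec<T> {
    match filter {
        None => values.to_vec(),
        Some(RowFilter::Limit(n)) => values.iter().copied().take(*n).collect(),
        Some(RowFilter::Mask(mask)) => values
            .iter()
            .zip(mask)
            .filter_map(|(v, keep)| keep.then_some(*v))
            .collect(),
    }
}

fn main() {
    let decoded = [10, 20, 30, 40];
    // A limit can only keep a leading run of rows...
    assert_eq!(apply(&decoded, Some(&RowFilter::Limit(2))), vec![10, 20]);
    // ...while a mask lets the predicate skip arbitrary rows.
    let mask = RowFilter::Mask(vec![true, false, false, true]);
    assert_eq!(apply(&decoded, Some(&mask)), vec![10, 40]);
}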
crates/polars-io/src/parquet/read/options.rs (8 additions, 0 deletions)
@@ -18,6 +18,14 @@ pub enum ParallelStrategy {
     Columns,
     /// Parallelize over the row groups
     RowGroups,
+    /// First evaluates the pushed-down predicates in parallel and determines a mask of which rows
+    /// to read. Then, it parallelizes over both the columns and the row groups while filtering out
+    /// rows that do not need to be read. This can provide significant speedups for large files
+    /// (i.e. many row-groups) with a predicate that filters clustered rows or filters heavily. In
+    /// other cases, this may slow down the scan compared to other strategies.
+    ///
+    /// If no predicate is given, this falls back to [`ParallelStrategy::Auto`].
+    Prefiltered,
     /// Automatically determine over which unit to parallelize
     /// This will choose the most occurring unit.
     #[default]
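For context, here is a hedged usage sketch showing how a caller might opt into the new strategy from the Rust API. It assumes polars-lazy's LazyFrame::scan_parquet together with a ScanArgsParquet struct exposing a parallel: ParallelStrategy field, as in the polars API around this release; the data.parquet path and the filter expression are placeholders.

use polars::prelude::*;

fn main() -> PolarsResult<()> {
    // Ask the Parquet scan to prefilter rows; with no predicate pushed down
    // this falls back to ParallelStrategy::Auto.
    let args = ScanArgsParquet {
        parallel: ParallelStrategy::Prefiltered,
        ..Default::default()
    };

    // The pushed-down predicate below is what the prefilter evaluates first
    // to build a row bitmask before the remaining columns are decoded.
    let df = LazyFrame::scan_parquet("data.parquet", args)?
        .filter(col("a").gt(lit(3)))
        .collect()?;

    println!("{df}");
    Ok(())
}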
