fix: Parquet predicate pushdown for lit(_) != (#19246)

pola-rs · Oct 16, 2024 · 109c404 · 109c404
1 parent d89fdcd
commit 109c404
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 22 deletions.
diff --git a/crates/polars-expr/src/expressions/apply.rs b/crates/polars-expr/src/expressions/apply.rs
@@ -525,14 +525,6 @@ fn apply_multiple_elementwise<'a>(
 impl StatsEvaluator for ApplyExpr {
     fn should_read(&self, stats: &BatchStats) -> PolarsResult<bool> {
         let read = self.should_read_impl(stats)?;
-        if ExecutionState::new().verbose() {
-            if read {
-                eprintln!("parquet file must be read, statistics not sufficient for predicate.")
-            } else {
-                eprintln!("parquet file can be skipped, the statistics were sufficient to apply the predicate.")
-            }
-        }
-
         Ok(read)
     }
 }

diff --git a/crates/polars-expr/src/expressions/binary.rs b/crates/polars-expr/src/expressions/binary.rs
@@ -351,7 +351,7 @@ mod stats {
         use ChunkCompareIneq as C;
         match op {
             Operator::Eq => apply_operator_stats_eq(min_max, literal),
-            Operator::NotEq => apply_operator_stats_eq(min_max, literal),
+            Operator::NotEq => apply_operator_stats_neq(min_max, literal),
             Operator::Gt => {
                 // Literal is bigger than max value, selection needs all rows.
                 C::gt(literal, min_max).map(|ca| ca.any()).unwrap_or(false)
@@ -454,10 +454,6 @@ mod stats {
 
     impl StatsEvaluator for BinaryExpr {
         fn should_read(&self, stats: &BatchStats) -> PolarsResult<bool> {
-            if std::env::var("POLARS_NO_PARQUET_STATISTICS").is_ok() {
-                return Ok(true);
-            }
-
             use Operator::*;
             match (
                 self.left.as_stats_evaluator(),

diff --git a/crates/polars-io/src/parquet/read/predicates.rs b/crates/polars-io/src/parquet/read/predicates.rs
@@ -1,3 +1,4 @@
+use polars_core::config;
 use polars_core::prelude::*;
 use polars_parquet::read::statistics::{deserialize, Statistics};
 use polars_parquet::read::RowGroupMetadata;
@@ -50,18 +51,38 @@ pub fn read_this_row_group(
     md: &RowGroupMetadata,
     schema: &ArrowSchema,
 ) -> PolarsResult<bool> {
+    if std::env::var("POLARS_NO_PARQUET_STATISTICS").is_ok() {
+        return Ok(true);
+    }
+
+    let mut should_read = true;
+
     if let Some(pred) = predicate {
         if let Some(pred) = pred.as_stats_evaluator() {
             if let Some(stats) = collect_statistics(md, schema)? {
-                let should_read = pred.should_read(&stats);
+                let pred_result = pred.should_read(&stats);
+
                 // a parquet file may not have statistics of all columns
-                if matches!(should_read, Ok(false)) {
-                    return Ok(false);
-                } else if !matches!(should_read, Err(PolarsError::ColumnNotFound(_))) {
-                    let _ = should_read?;
+                match pred_result {
+                    Err(PolarsError::ColumnNotFound(errstr)) => {
+                        return Err(PolarsError::ColumnNotFound(errstr))
+                    },
+                    Ok(false) => should_read = false,
+                    _ => {},
                 }
             }
         }
+
+        if config::verbose() {
+            if should_read {
+                eprintln!(
+                    "parquet row group must be read, statistics not sufficient for predicate."
+                );
+            } else {
+                eprintln!("parquet row group can be skipped, the statistics were sufficient to apply the predicate.");
+            }
+        }
     }
-    Ok(true)
+
+    Ok(should_read)
 }
diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py
@@ -235,12 +235,12 @@ def test_parquet_is_in_statistics(monkeypatch: Any, capfd: Any, tmp_path: Path)
 
     captured = capfd.readouterr().err
     assert (
-        "parquet file must be read, statistics not sufficient for predicate."
+        "parquet row group must be read, statistics not sufficient for predicate."
         in captured
     )
     assert (
-        "parquet file can be skipped, the statistics were sufficient"
-        " to apply the predicate." in captured
+        "parquet row group can be skipped, the statistics were sufficient to apply the predicate."
+        in captured
     )