1515// specific language governing permissions and limitations
1616// under the License.
1717
18- //! This module contains code to prune "containers" of row groups
19- //! based on statistics prior to execution. This can lead to
20- //! significant performance improvements by avoiding the need
21- //! to evaluate a plan on entire containers (e.g. an entire file)
18+ //! [`PruningPredicate`] to apply filter [`Expr`] to prune "containers"
19+ //! based on statistics (e.g. Parquet Row Groups)
2220//!
23- //! For example, DataFusion uses this code to prune (skip) row groups
24- //! while reading parquet files if it can be determined from the
25- //! predicate that nothing in the row group can match.
26- //!
27- //! This code can also be used by other systems to prune other
28- //! entities (e.g. entire files) if the statistics are known via some
29- //! other source (e.g. a catalog)
30-
21+ //! [`Expr`]: crate::prelude::Expr
3122use std:: collections:: HashSet ;
3223use std:: convert:: TryFrom ;
3324use std:: sync:: Arc ;
@@ -53,18 +44,21 @@ use datafusion_physical_expr::utils::collect_columns;
5344use datafusion_physical_expr:: { expressions as phys_expr, PhysicalExprRef } ;
5445use log:: trace;
5546
56- /// Interface to pass statistics information to [`PruningPredicate`]
47+ /// Interface to pass statistics (min/max/nulls) information to [`PruningPredicate`].
5748///
58- /// Returns statistics for containers / files of data in Arrays.
49+ /// Returns statistics for containers / files as Arrow [`ArrayRef`], so the
50+ /// evaluation happens once on a single `RecordBatch`, amortizing the overhead
51+ /// of evaluating of the predicate. This is important when pruning 1000s of
52+ /// containers which often happens in analytic systems.
5953///
60- /// For example, for the following three files with a single column
54+ /// For example, for the following three files with a single column `a`:
6155/// ```text
6256/// file1: column a: min=5, max=10
6357/// file2: column a: No stats
6458/// file2: column a: min=20, max=30
6559/// ```
6660///
67- /// PruningStatistics should return:
61+ /// PruningStatistics would return:
6862///
6963/// ```text
7064/// min_values("a") -> Some([5, Null, 20])
@@ -91,10 +85,44 @@ pub trait PruningStatistics {
9185 fn null_counts ( & self , column : & Column ) -> Option < ArrayRef > ;
9286}
9387
94- /// Evaluates filter expressions on statistics in order to
95- /// prune data containers (e.g. parquet row group)
88+ /// Evaluates filter expressions on statistics, rather than the actual data. If
89+ /// no rows could possibly pass the filter entire containers can be "pruned"
90+ /// (skipped), without reading any actual data, leading to significant
91+ /// performance improvements.
92+ ///
93+ /// [`PruningPredicate`]s are used to prune (avoid scanning) Parquet Row Groups
94+ /// based on the min/max values found in the Parquet metadata. If the
95+ /// `PruningPredicate` can guarantee that no rows in the Row Group match the
96+ /// filter, the entire Row Group is skipped during query execution.
97+ ///
98+ /// Note that this API is designed to be general, as it works:
99+ ///
100+ /// 1. Arbitrary expressions expressions (including user defined functions)
101+ ///
102+ /// 2. Anything that implements the [`PruningStatistics`] trait, not just
103+ /// Parquet metadata, allowing it to be used by other systems to prune entities
104+ /// (e.g. entire files) if the statistics are known via some other source, such
105+ /// as a catalog.
106+ ///
107+ /// # Example
108+ ///
109+ /// Given an expression like `x = 5` and statistics for 3 containers (Row
110+ /// Groups, files, etc) `A`, `B`, and `C`:
111+ ///
112+ /// ```text
113+ /// A: {x_min = 0, x_max = 4}
114+ /// B: {x_min = 2, x_max = 10}
115+ /// C: {x_min = 5, x_max = 8}
116+ /// ```
117+ ///
118+ /// Applying the `PruningPredicate` will concludes that `A` can be pruned:
96119///
97- /// See [`PruningPredicate::try_new`] for more information.
120+ /// ```text
121+ /// A: false (no rows could possibly match x = 5)
122+ /// B: true (rows might match x = 5)
123+ /// C: true (rows might match x = 5)
124+ /// ```
125+ /// See [`PruningPredicate::try_new`] and [`PruningPredicate::prune`] for more information.
98126#[ derive( Debug , Clone ) ]
99127pub struct PruningPredicate {
100128 /// The input schema against which the predicate will be evaluated
@@ -146,17 +174,14 @@ impl PruningPredicate {
146174 ///
147175 /// `true`: There MAY be rows that match the predicate
148176 ///
149- /// `false`: There are no rows that could match the predicate
177+ /// `false`: There are no rows that could possibly match the predicate
150178 ///
151- /// Note this function takes a slice of statistics as a parameter
152- /// to amortize the cost of the evaluation of the predicate
153- /// against a single record batch.
154- ///
155- /// Note: the predicate passed to `prune` should be simplified as
179+ /// Note: the predicate passed to `prune` should already be simplified as
156180 /// much as possible (e.g. this pass doesn't handle some
157181 /// expressions like `b = false`, but it does handle the
158- /// simplified version `b`. The predicates are simplified via the
159- /// ConstantFolding optimizer pass
182+ /// simplified version `b`. See [`ExprSimplifier`] to simplify expressions.
183+ ///
184+ /// [`ExprSimplifier`]: crate::optimizer::simplify_expressions::ExprSimplifier
160185 pub fn prune < S : PruningStatistics > ( & self , statistics : & S ) -> Result < Vec < bool > > {
161186 // build a RecordBatch that contains the min/max values in the
162187 // appropriate statistics columns
@@ -909,7 +934,7 @@ fn build_statistics_expr(
909934 _ => {
910935 return plan_err ! (
911936 "expressions other than (neq, eq, gt, gteq, lt, lteq) are not supported"
912- )
937+ ) ;
913938 }
914939 } ;
915940 Ok ( statistics_expr)
0 commit comments