Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate expression manipulation functions into datafusion_optimizer #3809

Merged
merged 4 commits into from
Oct 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions benchmarks/src/bin/parquet_filter_pushdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,14 @@ use datafusion::datasource::object_store::ObjectStoreUrl;
use datafusion::execution::context::ExecutionProps;
use datafusion::logical_expr::{lit, or, Expr};
use datafusion::logical_plan::ToDFSchema;
use datafusion::optimizer::utils::combine_filters_disjunctive;
use datafusion::physical_expr::create_physical_expr;
use datafusion::physical_plan::collect;
use datafusion::physical_plan::file_format::{
FileScanConfig, ParquetExec, ParquetScanOptions,
};
use datafusion::physical_plan::filter::FilterExec;
use datafusion::prelude::{col, combine_filters, SessionConfig, SessionContext};
use datafusion::prelude::{col, SessionConfig, SessionContext};
use object_store::path::Path;
use object_store::ObjectMeta;
use parquet::arrow::ArrowWriter;
Expand Down Expand Up @@ -143,7 +144,7 @@ async fn run_benchmarks(
col("response_status").eq(lit(403_u16)),
)),
// Many filters
combine_filters(&[
combine_filters_disjunctive(&[
col("request_method").not_eq(lit("GET")),
col("response_status").eq(lit(400_u16)),
// TODO this fails in the FilterExec with Error: Internal("The type of Dictionary(Int32, Utf8) = Utf8 of binary physical should be same")
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/file_format/parquet.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use arrow::datatypes::SchemaRef;
use async_trait::async_trait;
use bytes::{BufMut, BytesMut};
use datafusion_common::DataFusionError;
use datafusion_optimizer::utils::combine_filters;
use hashbrown::HashMap;
use object_store::{ObjectMeta, ObjectStore};
use parquet::arrow::parquet_to_arrow_schema;
Expand All @@ -40,7 +41,6 @@ use crate::arrow::array::{
use crate::arrow::datatypes::{DataType, Field};
use crate::datasource::{create_max_min_accs, get_col_stats};
use crate::error::Result;
use crate::logical_plan::combine_filters;
use crate::logical_plan::Expr;
use crate::physical_plan::expressions::{MaxAccumulator, MinAccumulator};
use crate::physical_plan::file_format::{ParquetExec, SchemaAdapter};
Expand Down
9 changes: 4 additions & 5 deletions datafusion/core/src/logical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ pub use datafusion_common::{
pub use datafusion_expr::{
abs, acos, and, approx_distinct, approx_percentile_cont, array, ascii, asin, atan,
atan2, avg, bit_length, btrim, call_fn, case, cast, ceil, character_length, chr,
coalesce, col, combine_filters, concat, concat_expr, concat_ws, concat_ws_expr, cos,
count, count_distinct, create_udaf, create_udf, date_part, date_trunc, digest,
exists, exp, expr_rewriter,
coalesce, col, concat, concat_expr, concat_ws, concat_ws_expr, cos, count,
count_distinct, create_udaf, create_udf, date_part, date_trunc, digest, exists, exp,
expr_rewriter,
expr_rewriter::{
normalize_col, normalize_col_with_schemas, normalize_cols, replace_col,
rewrite_sort_cols_by_aggs, unnormalize_col, unnormalize_cols, ExprRewritable,
Expand All @@ -53,6 +53,5 @@ pub use datafusion_expr::{
reverse, right, round, rpad, rtrim, scalar_subquery, sha224, sha256, sha384, sha512,
signum, sin, split_part, sqrt, starts_with, strpos, substr, sum, tan, to_hex,
to_timestamp_micros, to_timestamp_millis, to_timestamp_seconds, translate, trim,
trunc, unalias, upper, when, Expr, ExprSchemable, Literal, Operator,
trunc, upper, when, Expr, ExprSchemable, Literal, Operator,
};
pub use datafusion_optimizer::expr_simplifier::SimplifyInfo;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as the datafusion_optimizer is publically exported there is no reason to also re-export it here as well

3 changes: 2 additions & 1 deletion datafusion/core/src/physical_plan/file_format/row_filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ use arrow::record_batch::RecordBatch;
use datafusion_common::{Column, DataFusionError, Result, ScalarValue, ToDFSchema};
use datafusion_expr::expr_rewriter::{ExprRewritable, ExprRewriter, RewriteRecursion};

use datafusion_expr::{uncombine_filter, Expr};
use datafusion_expr::Expr;
use datafusion_optimizer::utils::uncombine_filter;
use datafusion_physical_expr::execution_props::ExecutionProps;
use datafusion_physical_expr::{create_physical_expr, PhysicalExpr};
use parquet::arrow::arrow_reader::{ArrowPredicate, RowFilter};
Expand Down
3 changes: 2 additions & 1 deletion datafusion/core/src/physical_plan/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ use crate::logical_expr::{
TableScan, Window,
};
use crate::logical_plan::{
unalias, unnormalize_cols, CrossJoin, DFSchema, Expr, LogicalPlan,
unnormalize_cols, CrossJoin, DFSchema, Expr, LogicalPlan,
Partitioning as LogicalPartitioning, PlanType, Repartition, ToStringifiedPlan, Union,
UserDefinedLogicalNode,
};
Expand Down Expand Up @@ -63,6 +63,7 @@ use datafusion_common::ScalarValue;
use datafusion_expr::expr::GroupingSet;
use datafusion_expr::utils::{expand_wildcard, expr_to_columns};
use datafusion_expr::WindowFrameUnits;
use datafusion_optimizer::utils::unalias;
use datafusion_physical_expr::expressions::Literal;
use datafusion_sql::utils::window_expr_common_partition_keys;
use futures::future::BoxFuture;
Expand Down
4 changes: 2 additions & 2 deletions datafusion/core/tests/simplification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ use datafusion::logical_plan::ExprSchemable;
use datafusion::{
error::Result,
execution::context::ExecutionProps,
logical_plan::{DFSchema, Expr, SimplifyInfo},
logical_plan::{DFSchema, Expr},
prelude::*,
};
use datafusion_optimizer::expr_simplifier::ExprSimplifier;
use datafusion_optimizer::expr_simplifier::{ExprSimplifier, SimplifyInfo};

/// In order to simplify expressions, DataFusion must have information
/// about the expressions.
Expand Down
130 changes: 0 additions & 130 deletions datafusion/expr/src/expr_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -468,60 +468,6 @@ pub fn when(when: Expr, then: Expr) -> CaseBuilder {
CaseBuilder::new(None, vec![when], vec![then], None)
}

/// Combines an array of filter expressions into a single filter expression
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code was moved

/// consisting of the input filter expressions joined with logical AND.
/// Returns None if the filters array is empty.
pub fn combine_filters(filters: &[Expr]) -> Option<Expr> {
if filters.is_empty() {
return None;
}
let combined_filter = filters
.iter()
.skip(1)
.fold(filters[0].clone(), |acc, filter| and(acc, filter.clone()));
Some(combined_filter)
}

/// Take combined filter (multiple boolean expressions ANDed together)
/// and break down into distinct filters. This should be the inverse of
/// `combine_filters`
pub fn uncombine_filter(filter: Expr) -> Vec<Expr> {
match filter {
Expr::BinaryExpr {
left,
op: Operator::And,
right,
} => {
let mut exprs = uncombine_filter(*left);
exprs.extend(uncombine_filter(*right));
exprs
}
expr => {
vec![expr]
}
}
}

/// Combines an array of filter expressions into a single filter expression
/// consisting of the input filter expressions joined with logical OR.
/// Returns None if the filters array is empty.
pub fn combine_filters_disjunctive(filters: &[Expr]) -> Option<Expr> {
if filters.is_empty() {
return None;
}

filters.iter().cloned().reduce(or)
}

/// Recursively un-alias an expressions
#[inline]
pub fn unalias(expr: Expr) -> Expr {
match expr {
Expr::Alias(sub_expr, _) => unalias(*sub_expr),
_ => expr,
}
}

/// Creates a new UDF with a specific signature and specific return type.
/// This is a helper function to create a new UDF.
/// The function `create_udf` returns a subset of all possible `ScalarFunction`:
Expand Down Expand Up @@ -582,7 +528,6 @@ pub fn call_fn(name: impl AsRef<str>, args: Vec<Expr>) -> Result<Expr> {
#[cfg(test)]
mod test {
use super::*;
use arrow::datatypes::{Field, Schema};

#[test]
fn filter_is_null_and_is_not_null() {
Expand Down Expand Up @@ -737,79 +682,4 @@ mod test {
unreachable!();
}
}

#[test]
fn combine_zero_filters() {
let result = combine_filters(&[]);
assert_eq!(result, None);
}

#[test]
fn combine_one_filter() {
let filter = binary_expr(col("c1"), Operator::Lt, lit(1));
let result = combine_filters(&[filter.clone()]);
assert_eq!(result, Some(filter));
}

#[test]
fn combine_multiple_filters() {
let filter1 = binary_expr(col("c1"), Operator::Lt, lit(1));
let filter2 = binary_expr(col("c2"), Operator::Lt, lit(2));
let filter3 = binary_expr(col("c3"), Operator::Lt, lit(3));
let result =
combine_filters(&[filter1.clone(), filter2.clone(), filter3.clone()]);
assert_eq!(result, Some(and(and(filter1, filter2), filter3)));
}

fn assert_predicates(actual: Vec<Expr>, expected: Vec<Expr>) {
assert_eq!(
actual.len(),
expected.len(),
"Predicates are not equal, found {} predicates but expected {}",
actual.len(),
expected.len()
);

for expr in expected.into_iter() {
assert!(
actual.contains(&expr),
"Predicates are not equal, predicate {:?} not found in {:?}",
expr,
actual
);
}
}

#[test]
fn test_uncombine_filter() {
let _schema = Schema::new(vec![
Field::new("a", DataType::Utf8, true),
Field::new("b", DataType::Utf8, true),
Field::new("c", DataType::Utf8, true),
]);

let expr = col("a").eq(lit("s"));
let actual = uncombine_filter(expr);

assert_predicates(actual, vec![col("a").eq(lit("s"))]);
}

#[test]
fn test_uncombine_filter_recursively() {
let _schema = Schema::new(vec![
Field::new("a", DataType::Utf8, true),
Field::new("b", DataType::Utf8, true),
Field::new("c", DataType::Utf8, true),
]);

let expr = and(col("a"), col("b"));
let actual = uncombine_filter(expr);

assert_predicates(actual, vec![col("a"), col("b")]);

let expr = col("a").and(col("b")).or(col("c"));
let actual = uncombine_filter(expr.clone());

assert_predicates(actual, vec![expr]);
}
}
5 changes: 3 additions & 2 deletions datafusion/optimizer/src/decorrelate_where_exists.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@
// under the License.

use crate::utils::{
exprs_to_join_cols, find_join_exprs, split_conjunction, verify_not_disjunction,
combine_filters, exprs_to_join_cols, find_join_exprs, split_conjunction,
verify_not_disjunction,
};
use crate::{utils, OptimizerConfig, OptimizerRule};
use datafusion_common::{context, plan_err, DataFusionError};
use datafusion_expr::logical_plan::{Filter, JoinType, Subquery};
use datafusion_expr::{combine_filters, Expr, LogicalPlan, LogicalPlanBuilder};
use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder};
use std::sync::Arc;

/// Optimizer rule for rewriting subquery filters to joins
Expand Down
6 changes: 3 additions & 3 deletions datafusion/optimizer/src/decorrelate_where_in.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@
// under the License.

use crate::utils::{
alias_cols, exprs_to_join_cols, find_join_exprs, merge_cols, only_or_err,
split_conjunction, swap_table, verify_not_disjunction,
alias_cols, combine_filters, exprs_to_join_cols, find_join_exprs, merge_cols,
only_or_err, split_conjunction, swap_table, verify_not_disjunction,
};
use crate::{utils, OptimizerConfig, OptimizerRule};
use datafusion_common::context;
use datafusion_expr::logical_plan::{Filter, JoinType, Projection, Subquery};
use datafusion_expr::{combine_filters, Expr, LogicalPlan, LogicalPlanBuilder};
use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder};
use log::debug;
use std::sync::Arc;

Expand Down
4 changes: 2 additions & 2 deletions datafusion/optimizer/src/scalar_subquery_to_join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@
// under the License.

use crate::utils::{
exprs_to_join_cols, find_join_exprs, only_or_err, split_conjunction,
combine_filters, exprs_to_join_cols, find_join_exprs, only_or_err, split_conjunction,
verify_not_disjunction,
};
use crate::{utils, OptimizerConfig, OptimizerRule};
use datafusion_common::{context, plan_err, Column, Result};
use datafusion_expr::logical_plan::{Filter, JoinType, Limit, Subquery};
use datafusion_expr::{combine_filters, Expr, LogicalPlan, LogicalPlanBuilder, Operator};
use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder, Operator};
use log::debug;
use std::sync::Arc;

Expand Down
Loading