Skip to content

Commit c6656f2

Browse files
UBarney and alamb authored
Implement predicate pruning for not like expressions (#14567)
* Implement predicate pruning for not like expressions * add split_constant_prefix * Update datafusion/physical-optimizer/src/pruning.rs Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * add more testcase --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent a104661 commit c6656f2

File tree

2 files changed

+228
-13
lines changed

2 files changed

+228
-13
lines changed

datafusion/core/tests/fuzz_cases/pruning.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,27 @@ async fn test_utf8_not_like_prefix() {
110110
.await;
111111
}
112112

113+
#[tokio::test]
114+
async fn test_utf8_not_like_ecsape() {
115+
Utf8Test::new(|value| col("a").not_like(lit(format!("\\%{}%", value))))
116+
.run()
117+
.await;
118+
}
119+
113120
#[tokio::test]
114121
async fn test_utf8_not_like_suffix() {
115122
Utf8Test::new(|value| col("a").not_like(lit(format!("{}%", value))))
116123
.run()
117124
.await;
118125
}
119126

127+
#[tokio::test]
128+
async fn test_utf8_not_like_suffix_one() {
129+
Utf8Test::new(|value| col("a").not_like(lit(format!("{}_", value))))
130+
.run()
131+
.await;
132+
}
133+
120134
/// Fuzz testing for UTF8 predicate pruning
121135
/// The basic idea is that query results should always be the same with or without stats/pruning
122136
/// If we get this right we at least guarantee that there are no incorrect results

datafusion/physical-optimizer/src/pruning.rs

Lines changed: 214 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1590,6 +1590,7 @@ fn build_statistics_expr(
15901590
)),
15911591
))
15921592
}
1593+
Operator::NotLikeMatch => build_not_like_match(expr_builder)?,
15931594
Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| {
15941595
plan_datafusion_err!(
15951596
"LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1639,19 @@ fn build_statistics_expr(
16381639
Ok(statistics_expr)
16391640
}
16401641

1642+
/// returns the string literal of the scalar value if it is a string
1643+
fn unpack_string(s: &ScalarValue) -> Option<&str> {
1644+
s.try_as_str().flatten()
1645+
}
1646+
1647+
fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
1648+
if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
1649+
let s = unpack_string(lit.value())?;
1650+
return Some(s);
1651+
}
1652+
None
1653+
}
1654+
16411655
/// Convert `column LIKE literal` where P is a constant prefix of the literal
16421656
/// to a range check on the column: `P <= column && column < P'`, where P' is the
16431657
/// lowest string after all P* strings.
@@ -1650,19 +1664,6 @@ fn build_like_match(
16501664
// column LIKE '%foo%' => min <= '' && '' <= max => true
16511665
// column LIKE 'foo' => min <= 'foo' && 'foo' <= max
16521666

1653-
/// returns the string literal of the scalar value if it is a string
1654-
fn unpack_string(s: &ScalarValue) -> Option<&str> {
1655-
s.try_as_str().flatten()
1656-
}
1657-
1658-
fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
1659-
if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
1660-
let s = unpack_string(lit.value())?;
1661-
return Some(s);
1662-
}
1663-
None
1664-
}
1665-
16661667
// TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
16671668
// this may involve building the physical expressions that call lower() and upper()
16681669
let min_column_expr = expr_builder.min_column_expr().ok()?;
@@ -1710,6 +1711,80 @@ fn build_like_match(
17101711
Some(combined)
17111712
}
17121713

1714+
// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`.
1715+
//
1716+
// The intuition is that if both `col_min` and `col_max` begin with `const_prefix` that means
1717+
// **all** data in this row group begins with `const_prefix` as well (and therefore the predicate
1718+
// looking for rows that don't begin with `const_prefix` can never be true)
1719+
fn build_not_like_match(
1720+
expr_builder: &mut PruningExpressionBuilder<'_>,
1721+
) -> Result<Arc<dyn PhysicalExpr>> {
1722+
// col NOT LIKE 'const_prefix%' -> !(col_min LIKE 'const_prefix%' && col_max LIKE 'const_prefix%') -> (col_min NOT LIKE 'const_prefix%' || col_max NOT LIKE 'const_prefix%')
1723+
1724+
let min_column_expr = expr_builder.min_column_expr()?;
1725+
let max_column_expr = expr_builder.max_column_expr()?;
1726+
1727+
let scalar_expr = expr_builder.scalar_expr();
1728+
1729+
let pattern = extract_string_literal(scalar_expr).ok_or_else(|| {
1730+
plan_datafusion_err!("cannot extract literal from NOT LIKE expression")
1731+
})?;
1732+
1733+
let (const_prefix, remaining) = split_constant_prefix(pattern);
1734+
if const_prefix.is_empty() || remaining != "%" {
1735+
// we can not handle `%` at the beginning or in the middle of the pattern
1736+
// Example: For pattern "foo%bar", the row group might include values like
1737+
// ["foobar", "food", "foodbar"], making it unsafe to prune.
1738+
// Even if the min/max values in the group (e.g., "foobar" and "foodbar")
1739+
// match the pattern, intermediate values like "food" may not
1740+
// match the full pattern "foo%bar", making pruning unsafe.
1741+
// (truncating `foo%bar` to `foo%` has the same problem)
1742+
1743+
// we can not handle pattern containing `_`
1744+
// Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
1745+
// which means not every row is guaranteed to match the pattern.
1746+
return Err(plan_datafusion_err!(
1747+
"NOT LIKE expressions only support constant_prefix+wildcard`%`"
1748+
));
1749+
}
1750+
1751+
let min_col_not_like_epxr = Arc::new(phys_expr::LikeExpr::new(
1752+
true,
1753+
false,
1754+
Arc::clone(&min_column_expr),
1755+
Arc::clone(scalar_expr),
1756+
));
1757+
1758+
let max_col_not_like_expr = Arc::new(phys_expr::LikeExpr::new(
1759+
true,
1760+
false,
1761+
Arc::clone(&max_column_expr),
1762+
Arc::clone(scalar_expr),
1763+
));
1764+
1765+
Ok(Arc::new(phys_expr::BinaryExpr::new(
1766+
min_col_not_like_epxr,
1767+
Operator::Or,
1768+
max_col_not_like_expr,
1769+
)))
1770+
}
1771+
1772+
/// Splits a LIKE `pattern` at its first unescaped wildcard (`%` or `_`).
///
/// Returns `(constant_prefix, remaining)`, both borrowed from `pattern`:
///
/// * `constant_prefix` is everything before the first unescaped wildcard. It is
///   a raw slice of the pattern, so any escape sequences in it (e.g. `\%`) are
///   still present in their escaped form. It may be empty.
/// * `remaining` starts at that wildcard and runs to the end of the pattern. It
///   is empty when the pattern contains no unescaped wildcard.
///
/// A backslash escapes the character that follows it, whatever that character
/// is. Escape state is tracked while scanning, so in `a\\%` (an escaped
/// backslash followed by `%`) the `%` is correctly recognised as a wildcard
/// instead of being mistaken for an escaped literal.
///
/// Examples (shown as raw pattern text):
///
/// * `foo%bar` => (`foo`, `%bar`)
/// * `A\%%`    => (`A\%`, `%`)
/// * `a\\%`    => (`a\\`, `%`)
/// * `foo`     => (`foo`, ``)
fn split_constant_prefix(pattern: &str) -> (&str, &str) {
    let mut chars = pattern.char_indices();
    while let Some((idx, ch)) = chars.next() {
        match ch {
            // `\` escapes the next character: consume it so that it is never
            // interpreted as a wildcard (or as another escape character).
            // A trailing `\` has nothing to escape and is simply kept.
            '\\' => {
                chars.next();
            }
            // `idx` comes from `char_indices`, so it is always a char boundary
            '%' | '_' => return pattern.split_at(idx),
            _ => {}
        }
    }
    (pattern, "")
}
1787+
17131788
/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
17141789
/// This makes it so that the returned string will always compare greater than the input string
17151790
/// or any other string with the same prefix.
@@ -4061,6 +4136,132 @@ mod tests {
40614136
prune_with_expr(expr, &schema, &statistics, expected_ret);
40624137
}
40634138

4139+
#[test]
4140+
fn prune_utf8_not_like_one() {
4141+
let (schema, statistics) = utf8_setup();
4142+
4143+
let expr = col("s1").not_like(lit("A\u{10ffff}_"));
4144+
#[rustfmt::skip]
4145+
let expected_ret = &[
4146+
// s1 ["A", "Z"] ==> some rows could pass (must keep)
4147+
true,
4148+
// s1 ["A", "L"] ==> some rows could pass (must keep)
4149+
true,
4150+
// s1 ["N", "Z"] ==> some rows could pass (must keep)
4151+
true,
4152+
// s1 ["M", "M"] ==> some rows could pass (must keep)
4153+
true,
4154+
// s1 [NULL, NULL] ==> unknown (must keep)
4155+
true,
4156+
// s1 ["A", NULL] ==> some rows could pass (must keep)
4157+
true,
4158+
// s1 ["", "A"] ==> some rows could pass (must keep)
4159+
true,
4160+
// s1 ["", ""] ==> some rows could pass (must keep)
4161+
true,
4162+
// s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4163+
true,
4164+
// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row matches, but (min, max) may have been truncated
4165+
// the original (min, max) may be ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}"), so the container must be kept
4166+
true,
4167+
];
4168+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4169+
}
4170+
4171+
#[test]
4172+
fn prune_utf8_not_like_many() {
4173+
let (schema, statistics) = utf8_setup();
4174+
4175+
let expr = col("s1").not_like(lit("A\u{10ffff}%"));
4176+
#[rustfmt::skip]
4177+
let expected_ret = &[
4178+
// s1 ["A", "Z"] ==> some rows could pass (must keep)
4179+
true,
4180+
// s1 ["A", "L"] ==> some rows could pass (must keep)
4181+
true,
4182+
// s1 ["N", "Z"] ==> some rows could pass (must keep)
4183+
true,
4184+
// s1 ["M", "M"] ==> some rows could pass (must keep)
4185+
true,
4186+
// s1 [NULL, NULL] ==> unknown (must keep)
4187+
true,
4188+
// s1 ["A", NULL] ==> some rows could pass (must keep)
4189+
true,
4190+
// s1 ["", "A"] ==> some rows could pass (must keep)
4191+
true,
4192+
// s1 ["", ""] ==> some rows could pass (must keep)
4193+
true,
4194+
// s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4195+
true,
4196+
// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match
4197+
false,
4198+
];
4199+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4200+
4201+
let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}"));
4202+
#[rustfmt::skip]
4203+
let expected_ret = &[
4204+
// s1 ["A", "Z"] ==> some rows could pass (must keep)
4205+
true,
4206+
// s1 ["A", "L"] ==> some rows could pass (must keep)
4207+
true,
4208+
// s1 ["N", "Z"] ==> some rows could pass (must keep)
4209+
true,
4210+
// s1 ["M", "M"] ==> some rows could pass (must keep)
4211+
true,
4212+
// s1 [NULL, NULL] ==> unknown (must keep)
4213+
true,
4214+
// s1 ["A", NULL] ==> some rows could pass (must keep)
4215+
true,
4216+
// s1 ["", "A"] ==> some rows could pass (must keep)
4217+
true,
4218+
// s1 ["", ""] ==> some rows could pass (must keep)
4219+
true,
4220+
// s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4221+
true,
4222+
// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4223+
true,
4224+
];
4225+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4226+
4227+
let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}_"));
4228+
#[rustfmt::skip]
4229+
let expected_ret = &[
4230+
// s1 ["A", "Z"] ==> some rows could pass (must keep)
4231+
true,
4232+
// s1 ["A", "L"] ==> some rows could pass (must keep)
4233+
true,
4234+
// s1 ["N", "Z"] ==> some rows could pass (must keep)
4235+
true,
4236+
// s1 ["M", "M"] ==> some rows could pass (must keep)
4237+
true,
4238+
// s1 [NULL, NULL] ==> unknown (must keep)
4239+
true,
4240+
// s1 ["A", NULL] ==> some rows could pass (must keep)
4241+
true,
4242+
// s1 ["", "A"] ==> some rows could pass (must keep)
4243+
true,
4244+
// s1 ["", ""] ==> some rows could pass (must keep)
4245+
true,
4246+
// s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4247+
true,
4248+
// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4249+
true,
4250+
];
4251+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4252+
4253+
let expr = col("s1").not_like(lit("A\\%%"));
4254+
let statistics = TestStatistics::new().with(
4255+
"s1",
4256+
ContainerStats::new_utf8(
4257+
vec![Some("A%a"), Some("A")],
4258+
vec![Some("A%c"), Some("A")],
4259+
),
4260+
);
4261+
let expected_ret = &[false, true];
4262+
prune_with_expr(expr, &schema, &statistics, expected_ret);
4263+
}
4264+
40644265
#[test]
40654266
fn test_rewrite_expr_to_prunable() {
40664267
let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);

0 commit comments

Comments
 (0)