Skip to content

Commit c6285bf

Browse files
committed
Expand LIKE simplification
- cover expressions known not to be null
- cover NULL pattern
- cover repeated '%%' in pattern
1 parent f4798a1 commit c6285bf

File tree

5 files changed

+215
-91
lines changed

5 files changed

+215
-91
lines changed

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/optimizer/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ itertools = { workspace = true }
4848
log = { workspace = true }
4949
paste = "1.0.14"
5050
regex-syntax = "0.8.0"
51+
regex = "1.11.0"
5152

5253
[dev-dependencies]
5354
arrow-buffer = { workspace = true }

datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs

Lines changed: 196 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ use crate::analyzer::type_coercion::TypeCoercionRewriter;
4949
use crate::simplify_expressions::guarantees::GuaranteeRewriter;
5050
use crate::simplify_expressions::regex::simplify_regex_expr;
5151
use crate::simplify_expressions::SimplifyInfo;
52+
use regex::Regex;
5253

5354
use super::inlist_simplifier::ShortenInListSimplifier;
5455
use super::utils::*;
@@ -1470,34 +1471,54 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> {
14701471
}) => Transformed::yes(simplify_regex_expr(left, op, right)?),
14711472

14721473
// Rules for Like
1473-
Expr::Like(Like {
1474-
expr,
1475-
pattern,
1476-
negated,
1477-
escape_char: _,
1478-
case_insensitive: _,
1479-
}) if matches!(
1480-
pattern.as_ref(),
1481-
Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%"
1482-
) || matches!(
1483-
pattern.as_ref(),
1484-
Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%"
1485-
) || matches!(
1486-
pattern.as_ref(),
1487-
Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%"
1488-
) =>
1489-
{
1490-
// exp LIKE '%' is
1491-
// - when exp is not NULL, it's true
1492-
// - when exp is NULL, it's NULL
1493-
// exp NOT LIKE '%' is
1494-
// - when exp is not NULL, it's false
1495-
// - when exp is NULL, it's NULL
1496-
Transformed::yes(Expr::Case(Case {
1497-
expr: Some(Box::new(Expr::IsNotNull(expr))),
1498-
when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))],
1499-
else_expr: None,
1500-
}))
1474+
Expr::Like(like) => {
    // Effective escape character for this LIKE expression. When no explicit
    // ESCAPE is given, backslash is treated as the implicit escape character
    // (NOTE(review): this matches the behaviour of the underlying LIKE kernel;
    // confirm if that kernel ever changes its default).
    let escape_char = like.escape_char.unwrap_or('\\');
    match as_string_scalar(&like.pattern) {
        Some((data_type, pattern_str)) => {
            match pattern_str {
                // expr [NOT] [I]LIKE NULL is always NULL
                None => return Ok(Transformed::yes(lit_bool_null())),
                Some(pattern_str) if pattern_str == "%" => {
                    // exp LIKE '%' is
                    //   - when exp is not NULL, it's true
                    //   - when exp is NULL, it's NULL
                    // exp NOT LIKE '%' is
                    //   - when exp is not NULL, it's false
                    //   - when exp is NULL, it's NULL
                    let result_for_non_null = lit(!like.negated);
                    Transformed::yes(if !info.nullable(&like.expr)? {
                        // The expression can never be NULL, so the NULL
                        // branch is unnecessary and the result is a constant.
                        result_for_non_null
                    } else {
                        Expr::Case(Case {
                            expr: Some(Box::new(Expr::IsNotNull(like.expr))),
                            when_then_expr: vec![(
                                Box::new(lit(true)),
                                Box::new(result_for_non_null),
                            )],
                            else_expr: None,
                        })
                    })
                }
                Some(pattern_str)
                    if pattern_str.contains("%%")
                        // Collapsing is only safe when no '%' can be escaped,
                        // i.e. the pattern does not contain the escape
                        // character at all (e.g. `\%%` must stay untouched).
                        // TODO support more complete unescaping
                        && !pattern_str.contains(escape_char) =>
                {
                    // Repeated wildcards are redundant:
                    // exp LIKE 'a%%b' --> exp LIKE 'a%b'
                    let simplified_pattern = Regex::new("%%+")
                        .expect("\"%%+\" is a valid constant regex")
                        .replace_all(pattern_str, "%")
                        .to_string();
                    Transformed::yes(Expr::Like(Like {
                        pattern: Box::new(to_string_scalar(
                            data_type,
                            Some(simplified_pattern),
                        )),
                        ..like
                    }))
                }
                Some(_pattern_str) => Transformed::no(Expr::Like(like)),
            }
        }
        // The pattern is not a string literal: nothing to simplify.
        None => Transformed::no(Expr::Like(like)),
    }
}
15021523

15031524
// a is not null/unknown --> true (if a is not nullable)
@@ -1696,6 +1717,24 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> {
16961717
}
16971718
}
16981719

1720+
fn as_string_scalar(expr: &Expr) -> Option<(DataType, &Option<String>)> {
1721+
match expr {
1722+
Expr::Literal(ScalarValue::Utf8(s)) => Some((DataType::Utf8, s)),
1723+
Expr::Literal(ScalarValue::LargeUtf8(s)) => Some((DataType::LargeUtf8, s)),
1724+
Expr::Literal(ScalarValue::Utf8View(s)) => Some((DataType::Utf8View, s)),
1725+
_ => None,
1726+
}
1727+
}
1728+
1729+
fn to_string_scalar(data_type: DataType, value: Option<String>) -> Expr {
1730+
match data_type {
1731+
DataType::Utf8 => Expr::Literal(ScalarValue::Utf8(value)),
1732+
DataType::LargeUtf8 => Expr::Literal(ScalarValue::LargeUtf8(value)),
1733+
DataType::Utf8View => Expr::Literal(ScalarValue::Utf8View(value)),
1734+
_ => unreachable!(),
1735+
}
1736+
}
1737+
16991738
fn has_common_conjunction(lhs: &Expr, rhs: &Expr) -> bool {
17001739
let lhs_set: HashSet<&Expr> = iter_conjunction(lhs).collect();
17011740
iter_conjunction(rhs).any(|e| lhs_set.contains(&e) && !e.is_volatile())
@@ -2810,10 +2849,16 @@ mod tests {
28102849
);
28112850

28122851
// single character
2813-
assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%"));
2852+
assert_change(
2853+
regex_match(col("c1"), lit("x")),
2854+
like(col("c1"), lit("%x%")),
2855+
);
28142856

28152857
// single word
2816-
assert_change(regex_match(col("c1"), lit("foo")), like(col("c1"), "%foo%"));
2858+
assert_change(
2859+
regex_match(col("c1"), lit("foo")),
2860+
like(col("c1"), lit("%foo%")),
2861+
);
28172862

28182863
// regular expressions that match an exact literal
28192864
assert_change(regex_match(col("c1"), lit("^$")), col("c1").eq(lit("")));
@@ -2900,44 +2945,50 @@ mod tests {
29002945
assert_no_change(regex_match(col("c1"), lit("$foo^")));
29012946

29022947
// regular expressions that match a partial literal
2903-
assert_change(regex_match(col("c1"), lit("^foo")), like(col("c1"), "foo%"));
2904-
assert_change(regex_match(col("c1"), lit("foo$")), like(col("c1"), "%foo"));
2948+
assert_change(
2949+
regex_match(col("c1"), lit("^foo")),
2950+
like(col("c1"), lit("foo%")),
2951+
);
2952+
assert_change(
2953+
regex_match(col("c1"), lit("foo$")),
2954+
like(col("c1"), lit("%foo")),
2955+
);
29052956
assert_change(
29062957
regex_match(col("c1"), lit("^foo|bar$")),
2907-
like(col("c1"), "foo%").or(like(col("c1"), "%bar")),
2958+
like(col("c1"), lit("foo%")).or(like(col("c1"), lit("%bar"))),
29082959
);
29092960

29102961
// OR-chain
29112962
assert_change(
29122963
regex_match(col("c1"), lit("foo|bar|baz")),
2913-
like(col("c1"), "%foo%")
2914-
.or(like(col("c1"), "%bar%"))
2915-
.or(like(col("c1"), "%baz%")),
2964+
like(col("c1"), lit("%foo%"))
2965+
.or(like(col("c1"), lit("%bar%")))
2966+
.or(like(col("c1"), lit("%baz%"))),
29162967
);
29172968
assert_change(
29182969
regex_match(col("c1"), lit("foo|x|baz")),
2919-
like(col("c1"), "%foo%")
2920-
.or(like(col("c1"), "%x%"))
2921-
.or(like(col("c1"), "%baz%")),
2970+
like(col("c1"), lit("%foo%"))
2971+
.or(like(col("c1"), lit("%x%")))
2972+
.or(like(col("c1"), lit("%baz%"))),
29222973
);
29232974
assert_change(
29242975
regex_not_match(col("c1"), lit("foo|bar|baz")),
2925-
not_like(col("c1"), "%foo%")
2926-
.and(not_like(col("c1"), "%bar%"))
2927-
.and(not_like(col("c1"), "%baz%")),
2976+
not_like(col("c1"), lit("%foo%"))
2977+
.and(not_like(col("c1"), lit("%bar%")))
2978+
.and(not_like(col("c1"), lit("%baz%"))),
29282979
);
29292980
// both anchored expressions (translated to equality) and unanchored
29302981
assert_change(
29312982
regex_match(col("c1"), lit("foo|^x$|baz")),
2932-
like(col("c1"), "%foo%")
2983+
like(col("c1"), lit("%foo%"))
29332984
.or(col("c1").eq(lit("x")))
2934-
.or(like(col("c1"), "%baz%")),
2985+
.or(like(col("c1"), lit("%baz%"))),
29352986
);
29362987
assert_change(
29372988
regex_not_match(col("c1"), lit("foo|^bar$|baz")),
2938-
not_like(col("c1"), "%foo%")
2989+
not_like(col("c1"), lit("%foo%"))
29392990
.and(col("c1").not_eq(lit("bar")))
2940-
.and(not_like(col("c1"), "%baz%")),
2991+
.and(not_like(col("c1"), lit("%baz%"))),
29412992
);
29422993
// Too many patterns (MAX_REGEX_ALTERNATIONS_EXPANSION)
29432994
assert_no_change(regex_match(col("c1"), lit("foo|bar|baz|blarg|bozo|etc")));
@@ -2987,41 +3038,41 @@ mod tests {
29873038
})
29883039
}
29893040

2990-
fn like(expr: Expr, pattern: &str) -> Expr {
3041+
fn like(expr: Expr, pattern: impl Into<Expr>) -> Expr {
29913042
Expr::Like(Like {
29923043
negated: false,
29933044
expr: Box::new(expr),
2994-
pattern: Box::new(lit(pattern)),
3045+
pattern: Box::new(pattern.into()),
29953046
escape_char: None,
29963047
case_insensitive: false,
29973048
})
29983049
}
29993050

3000-
fn not_like(expr: Expr, pattern: &str) -> Expr {
3051+
fn not_like(expr: Expr, pattern: impl Into<Expr>) -> Expr {
30013052
Expr::Like(Like {
30023053
negated: true,
30033054
expr: Box::new(expr),
3004-
pattern: Box::new(lit(pattern)),
3055+
pattern: Box::new(pattern.into()),
30053056
escape_char: None,
30063057
case_insensitive: false,
30073058
})
30083059
}
30093060

3010-
fn ilike(expr: Expr, pattern: &str) -> Expr {
3061+
fn ilike(expr: Expr, pattern: impl Into<Expr>) -> Expr {
30113062
Expr::Like(Like {
30123063
negated: false,
30133064
expr: Box::new(expr),
3014-
pattern: Box::new(lit(pattern)),
3065+
pattern: Box::new(pattern.into()),
30153066
escape_char: None,
30163067
case_insensitive: true,
30173068
})
30183069
}
30193070

3020-
fn not_ilike(expr: Expr, pattern: &str) -> Expr {
3071+
fn not_ilike(expr: Expr, pattern: impl Into<Expr>) -> Expr {
30213072
Expr::Like(Like {
30223073
negated: true,
30233074
expr: Box::new(expr),
3024-
pattern: Box::new(lit(pattern)),
3075+
pattern: Box::new(pattern.into()),
30253076
escape_char: None,
30263077
case_insensitive: true,
30273078
})
@@ -3633,31 +3684,112 @@ mod tests {
36333684

36343685
#[test]
36353686
fn test_like_and_ilke() {
3636-
// LIKE '%'
3637-
let expr = like(col("c1"), "%");
3687+
let null = lit(ScalarValue::Utf8(None));
3688+
3689+
// expr [NOT] [I]LIKE NULL
3690+
let expr = like(col("c1"), null.clone());
3691+
assert_eq!(simplify(expr), lit_bool_null());
3692+
3693+
let expr = not_like(col("c1"), null.clone());
3694+
assert_eq!(simplify(expr), lit_bool_null());
3695+
3696+
let expr = ilike(col("c1"), null.clone());
3697+
assert_eq!(simplify(expr), lit_bool_null());
3698+
3699+
let expr = not_ilike(col("c1"), null.clone());
3700+
assert_eq!(simplify(expr), lit_bool_null());
3701+
3702+
// expr [NOT] [I]LIKE '%'
3703+
let expr = like(col("c1"), lit("%"));
3704+
assert_eq!(simplify(expr), if_not_null(col("c1"), true));
3705+
3706+
let expr = not_like(col("c1"), lit("%"));
3707+
assert_eq!(simplify(expr), if_not_null(col("c1"), false));
3708+
3709+
let expr = ilike(col("c1"), lit("%"));
3710+
assert_eq!(simplify(expr), if_not_null(col("c1"), true));
3711+
3712+
let expr = not_ilike(col("c1"), lit("%"));
3713+
assert_eq!(simplify(expr), if_not_null(col("c1"), false));
3714+
3715+
// expr [NOT] [I]LIKE '%%'
3716+
let expr = like(col("c1"), lit("%%"));
36383717
assert_eq!(simplify(expr), if_not_null(col("c1"), true));
36393718

3640-
let expr = not_like(col("c1"), "%");
3719+
let expr = not_like(col("c1"), lit("%%"));
36413720
assert_eq!(simplify(expr), if_not_null(col("c1"), false));
36423721

3643-
let expr = ilike(col("c1"), "%");
3722+
let expr = ilike(col("c1"), lit("%%"));
36443723
assert_eq!(simplify(expr), if_not_null(col("c1"), true));
36453724

3646-
let expr = not_ilike(col("c1"), "%");
3725+
let expr = not_ilike(col("c1"), lit("%%"));
36473726
assert_eq!(simplify(expr), if_not_null(col("c1"), false));
36483727

3649-
// null_constant LIKE '%'
3728+
// not_null_expr [NOT] [I]LIKE '%'
3729+
let expr = like(col("c1_non_null"), lit("%"));
3730+
assert_eq!(simplify(expr), lit(true));
3731+
3732+
let expr = not_like(col("c1_non_null"), lit("%"));
3733+
assert_eq!(simplify(expr), lit(false));
3734+
3735+
let expr = ilike(col("c1_non_null"), lit("%"));
3736+
assert_eq!(simplify(expr), lit(true));
3737+
3738+
let expr = not_ilike(col("c1_non_null"), lit("%"));
3739+
assert_eq!(simplify(expr), lit(false));
3740+
3741+
// not_null_expr [NOT] [I]LIKE '%%'
3742+
let expr = like(col("c1_non_null"), lit("%%"));
3743+
assert_eq!(simplify(expr), lit(true));
3744+
3745+
let expr = not_like(col("c1_non_null"), lit("%%"));
3746+
assert_eq!(simplify(expr), lit(false));
3747+
3748+
let expr = ilike(col("c1_non_null"), lit("%%"));
3749+
assert_eq!(simplify(expr), lit(true));
3750+
3751+
let expr = not_ilike(col("c1_non_null"), lit("%%"));
3752+
assert_eq!(simplify(expr), lit(false));
3753+
3754+
// null_constant [NOT] [I]LIKE '%'
3755+
let expr = like(null.clone(), lit("%"));
3756+
assert_eq!(simplify(expr), lit_bool_null());
3757+
3758+
let expr = not_like(null.clone(), lit("%"));
3759+
assert_eq!(simplify(expr), lit_bool_null());
3760+
3761+
let expr = ilike(null.clone(), lit("%"));
3762+
assert_eq!(simplify(expr), lit_bool_null());
3763+
3764+
let expr = not_ilike(null, lit("%"));
3765+
assert_eq!(simplify(expr), lit_bool_null());
3766+
3767+
// null_constant [NOT] [I]LIKE '%%'
3768+
let null = lit(ScalarValue::Utf8(None));
3769+
let expr = like(null.clone(), lit("%%"));
3770+
assert_eq!(simplify(expr), lit_bool_null());
3771+
3772+
let expr = not_like(null.clone(), lit("%%"));
3773+
assert_eq!(simplify(expr), lit_bool_null());
3774+
3775+
let expr = ilike(null.clone(), lit("%%"));
3776+
assert_eq!(simplify(expr), lit_bool_null());
3777+
3778+
let expr = not_ilike(null, lit("%%"));
3779+
assert_eq!(simplify(expr), lit_bool_null());
3780+
3781+
// null_constant [NOT] [I]LIKE 'a%'
36503782
let null = lit(ScalarValue::Utf8(None));
3651-
let expr = like(null.clone(), "%");
3783+
let expr = like(null.clone(), lit("a%"));
36523784
assert_eq!(simplify(expr), lit_bool_null());
36533785

3654-
let expr = not_like(null.clone(), "%");
3786+
let expr = not_like(null.clone(), lit("a%"));
36553787
assert_eq!(simplify(expr), lit_bool_null());
36563788

3657-
let expr = ilike(null.clone(), "%");
3789+
let expr = ilike(null.clone(), lit("a%"));
36583790
assert_eq!(simplify(expr), lit_bool_null());
36593791

3660-
let expr = not_ilike(null, "%");
3792+
let expr = not_ilike(null, lit("a%"));
36613793
assert_eq!(simplify(expr), lit_bool_null());
36623794
}
36633795

datafusion/sqllogictest/test_files/string/init_data.slt.part

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18+
# TODO (https://github.com/apache/datafusion/issues/12637): add a row with '%%' pattern
1819
statement ok
1920
create table test_source as values
2021
('Andrew', 'X', 'datafusion📊🔥', '🔥'),

0 commit comments

Comments
 (0)