Skip to content

Commit be55e3f

Browse files
committed
Fix incorrect ... LIKE '%' simplification with NULLs (apache#13259)
* Fix incorrect `... LIKE '%'` simplification `expr LIKE '%'` was previously simplified to `true`, but the expression returns `NULL` when `expr` is null. The conversion was conditional on `!is_null(expr)` which means "is not always true, i.e. is not a null literal". This commit adds correct simplification logic. It additionally expands the rule coverage to include string view (Utf8View) and large string (LargeUtf8). This allows writing shared test cases even despite `utf8_view LIKE '%'` returning incorrect results at execution time (tracked by apache#12637). I.e. the simplification masks the bug for cases where pattern is statically known. * fixup! Fix incorrect `... LIKE '%'` simplification * fix tests (re review comments)
1 parent 6a8f88f commit be55e3f

File tree

3 files changed

+109
-16
lines changed

3 files changed

+109
-16
lines changed

datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1476,13 +1476,28 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> {
14761476
negated,
14771477
escape_char: _,
14781478
case_insensitive: _,
1479-
}) if !is_null(&expr)
1480-
&& matches!(
1481-
pattern.as_ref(),
1482-
Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%"
1483-
) =>
1479+
}) if matches!(
1480+
pattern.as_ref(),
1481+
Expr::Literal(ScalarValue::Utf8(Some(pattern_str))) if pattern_str == "%"
1482+
) || matches!(
1483+
pattern.as_ref(),
1484+
Expr::Literal(ScalarValue::LargeUtf8(Some(pattern_str))) if pattern_str == "%"
1485+
) || matches!(
1486+
pattern.as_ref(),
1487+
Expr::Literal(ScalarValue::Utf8View(Some(pattern_str))) if pattern_str == "%"
1488+
) =>
14841489
{
1485-
Transformed::yes(lit(!negated))
1490+
// exp LIKE '%' is
1491+
// - when exp is not NULL, it's true
1492+
// - when exp is NULL, it's NULL
1493+
// exp NOT LIKE '%' is
1494+
// - when exp is not NULL, it's false
1495+
// - when exp is NULL, it's NULL
1496+
Transformed::yes(Expr::Case(Case {
1497+
expr: Some(Box::new(Expr::IsNotNull(expr))),
1498+
when_then_expr: vec![(Box::new(lit(true)), Box::new(lit(!negated)))],
1499+
else_expr: None,
1500+
}))
14861501
}
14871502

14881503
// a is not null/unknown --> true (if a is not nullable)
@@ -2777,10 +2792,22 @@ mod tests {
27772792
assert_no_change(regex_match(col("c1"), lit("f_o")));
27782793

27792794
// empty cases
2780-
assert_change(regex_match(col("c1"), lit("")), lit(true));
2781-
assert_change(regex_not_match(col("c1"), lit("")), lit(false));
2782-
assert_change(regex_imatch(col("c1"), lit("")), lit(true));
2783-
assert_change(regex_not_imatch(col("c1"), lit("")), lit(false));
2795+
assert_change(
2796+
regex_match(col("c1"), lit("")),
2797+
if_not_null(col("c1"), true),
2798+
);
2799+
assert_change(
2800+
regex_not_match(col("c1"), lit("")),
2801+
if_not_null(col("c1"), false),
2802+
);
2803+
assert_change(
2804+
regex_imatch(col("c1"), lit("")),
2805+
if_not_null(col("c1"), true),
2806+
);
2807+
assert_change(
2808+
regex_not_imatch(col("c1"), lit("")),
2809+
if_not_null(col("c1"), false),
2810+
);
27842811

27852812
// single character
27862813
assert_change(regex_match(col("c1"), lit("x")), like(col("c1"), "%x%"));
@@ -3606,20 +3633,20 @@ mod tests {
36063633

36073634
#[test]
36083635
fn test_like_and_ilke() {
3609-
// test non-null values
3636+
// LIKE '%'
36103637
let expr = like(col("c1"), "%");
3611-
assert_eq!(simplify(expr), lit(true));
3638+
assert_eq!(simplify(expr), if_not_null(col("c1"), true));
36123639

36133640
let expr = not_like(col("c1"), "%");
3614-
assert_eq!(simplify(expr), lit(false));
3641+
assert_eq!(simplify(expr), if_not_null(col("c1"), false));
36153642

36163643
let expr = ilike(col("c1"), "%");
3617-
assert_eq!(simplify(expr), lit(true));
3644+
assert_eq!(simplify(expr), if_not_null(col("c1"), true));
36183645

36193646
let expr = not_ilike(col("c1"), "%");
3620-
assert_eq!(simplify(expr), lit(false));
3647+
assert_eq!(simplify(expr), if_not_null(col("c1"), false));
36213648

3622-
// test null values
3649+
// null_constant LIKE '%'
36233650
let null = lit(ScalarValue::Utf8(None));
36243651
let expr = like(null.clone(), "%");
36253652
assert_eq!(simplify(expr), lit_bool_null());
@@ -4043,4 +4070,12 @@ mod tests {
40434070
);
40444071
}
40454072
}
4073+
4074+
fn if_not_null(expr: Expr, then: bool) -> Expr {
4075+
Expr::Case(Case {
4076+
expr: Some(expr.is_not_null().into()),
4077+
when_then_expr: vec![(lit(true).into(), lit(then).into())],
4078+
else_expr: None,
4079+
})
4080+
}
40464081
}

datafusion/sqllogictest/test_files/string/string_literal.slt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -821,3 +821,11 @@ query TT
821821
select ' ', '|'
822822
----
823823
|
824+
825+
query BBB
826+
SELECT
827+
NULL LIKE '%',
828+
'' LIKE '%',
829+
'a' LIKE '%'
830+
----
831+
NULL true true

datafusion/sqllogictest/test_files/string/string_query.slt.part

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,56 @@ NULL NULL NULL NULL NULL
873873
#Raphael datafusionДатаФусион false false false false
874874
#NULL NULL NULL NULL NULL NULL
875875

876+
# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections
877+
query TTBB
878+
SELECT ascii_1, unicode_1,
879+
ascii_1 LIKE '%' AS ascii_1_like_percent,
880+
unicode_1 LIKE '%' AS unicode_1_like_percent
881+
-- ascii_1 LIKE '%%' AS ascii_1_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
882+
-- unicode_1 LIKE '%%' AS unicode_1_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
883+
FROM test_basic_operator
884+
----
885+
Andrew datafusion📊🔥 true true
886+
Xiangpeng datafusion数据融合 true true
887+
Raphael datafusionДатаФусион true true
888+
under_score un iść core true true
889+
percent pan Tadeusz ma iść w kąt true true
890+
(empty) (empty) true true
891+
NULL NULL NULL NULL
892+
NULL NULL NULL NULL
893+
894+
# TODO (https://github.com/apache/datafusion/issues/12637) uncomment additional test projections
895+
query TTBB
896+
SELECT ascii_1, unicode_1,
897+
ascii_1 NOT LIKE '%' AS ascii_1_not_like_percent,
898+
unicode_1 NOT LIKE '%' AS unicode_1_not_like_percent
899+
-- ascii_1 NOT LIKE '%%' AS ascii_1_not_like_percent_percent, -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
900+
-- unicode_1 NOT LIKE '%%' AS unicode_1_not_like_percent_percent -- TODO enable after fixing https://github.com/apache/datafusion/issues/12637
901+
FROM test_basic_operator
902+
----
903+
Andrew datafusion📊🔥 false false
904+
Xiangpeng datafusion数据融合 false false
905+
Raphael datafusionДатаФусион false false
906+
under_score un iść core false false
907+
percent pan Tadeusz ma iść w kąt false false
908+
(empty) (empty) false false
909+
NULL NULL NULL NULL
910+
NULL NULL NULL NULL
911+
912+
query T
913+
SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 LIKE '%'
914+
----
915+
Andrew
916+
Xiangpeng
917+
Raphael
918+
under_score
919+
percent
920+
(empty)
921+
922+
query T
923+
SELECT ascii_1 FROM test_basic_operator WHERE ascii_1 NOT LIKE '%'
924+
----
925+
876926
# Test pattern without wildcard characters
877927
query TTBBBB
878928
select ascii_1, unicode_1,

0 commit comments

Comments
 (0)