@@ -1590,6 +1590,13 @@ fn build_statistics_expr(
15901590 ) ) ,
15911591 ) )
15921592 }
1593+ Operator :: NotLikeMatch => {
1594+ build_not_like_match ( expr_builder) . ok_or_else ( || {
1595+ plan_datafusion_err ! (
1596+ "The NOT LIKE expression with wildcards is only supported at the end of the pattern"
1597+ )
1598+ } ) ?
1599+ }
15931600 Operator :: LikeMatch => build_like_match ( expr_builder) . ok_or_else ( || {
15941601 plan_datafusion_err ! (
15951602 "LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1645,19 @@ fn build_statistics_expr(
16381645 Ok ( statistics_expr)
16391646}
16401647
1648+ /// returns the string literal of the scalar value if it is a string
1649+ fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1650+ s. try_as_str ( ) . flatten ( )
1651+ }
1652+
1653+ fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1654+ if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1655+ let s = unpack_string ( lit. value ( ) ) ?;
1656+ return Some ( s) ;
1657+ }
1658+ None
1659+ }
1660+
16411661/// Convert `column LIKE literal` where P is a constant prefix of the literal
16421662/// to a range check on the column: `P <= column && column < P'`, where P' is the
16431663/// lowest string after all P* strings.
@@ -1650,19 +1670,6 @@ fn build_like_match(
16501670 // column LIKE '%foo%' => min <= '' && '' <= max => true
16511671 // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
16521672
1653- /// returns the string literal of the scalar value if it is a string
1654- fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1655- s. try_as_str ( ) . flatten ( )
1656- }
1657-
1658- fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1659- if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1660- let s = unpack_string ( lit. value ( ) ) ?;
1661- return Some ( s) ;
1662- }
1663- None
1664- }
1665-
16661673 // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
16671674 // this may involve building the physical expressions that call lower() and upper()
16681675 let min_column_expr = expr_builder. min_column_expr ( ) . ok ( ) ?;
@@ -1710,6 +1717,56 @@ fn build_like_match(
17101717 Some ( combined)
17111718}
17121719
1720+ // For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo).
1721+ fn build_not_like_match (
1722+ expr_builder : & mut PruningExpressionBuilder < ' _ > ,
1723+ ) -> Option < Arc < dyn PhysicalExpr > > {
1724+ // col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
1725+
1726+ let min_column_expr = expr_builder. min_column_expr ( ) . ok ( ) ?;
1727+ let max_column_expr = expr_builder. max_column_expr ( ) . ok ( ) ?;
1728+
1729+ let scalar_expr = expr_builder. scalar_expr ( ) ;
1730+
1731+ let pattern = extract_string_literal ( scalar_expr) ?;
1732+
1733+ let chars: Vec < char > = pattern. chars ( ) . collect ( ) ;
1734+ for i in 0 ..chars. len ( ) - 1 {
1735+ // Check if current char is a wildcard and is not escaped with backslash
1736+ if ( chars[ i] == '%' || chars[ i] == '_' ) && ( i == 0 || chars[ i - 1 ] != '\\' ) {
1737+ // Example: For pattern "foo%bar", the row group might include values like
1738+ // ["foobar", "food", "foodbar"], making it unsafe to prune.
1739+ return None ;
1740+ }
1741+ }
1742+
1743+ if chars. last ( ) == Some ( & '_' ) && ( chars. len ( ) > 1 && chars[ chars. len ( ) - 2 ] != '\\' ) {
1744+ // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
1745+ // which means not every row is guaranteed to match the pattern.
1746+ return None ;
1747+ }
1748+
1749+ let min_col_not_like_epxr = Arc :: new ( phys_expr:: LikeExpr :: new (
1750+ true ,
1751+ false ,
1752+ Arc :: clone ( & min_column_expr) ,
1753+ Arc :: clone ( scalar_expr) ,
1754+ ) ) ;
1755+
1756+ let max_col_not_like_expr = Arc :: new ( phys_expr:: LikeExpr :: new (
1757+ true ,
1758+ false ,
1759+ Arc :: clone ( & max_column_expr) ,
1760+ Arc :: clone ( scalar_expr) ,
1761+ ) ) ;
1762+
1763+ Some ( Arc :: new ( phys_expr:: BinaryExpr :: new (
1764+ min_col_not_like_epxr,
1765+ Operator :: Or ,
1766+ max_col_not_like_expr,
1767+ ) ) )
1768+ }
1769+
17131770/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
17141771/// This makes it so that the returned string will always compare greater than the input string
17151772/// or any other string with the same prefix.
@@ -4061,6 +4118,106 @@ mod tests {
40614118 prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
40624119 }
40634120
// Pruning check for `s1 NOT LIKE <pattern ending in the single-character wildcard '_'>`
// against the containers returned by `utf8_setup()`.
// Every container is expected to be kept (all entries of `expected_ret` are `true`).
4121+ #[ test]
4122+ fn prune_utf8_not_like_one ( ) {
4123+ let ( schema, statistics) = utf8_setup ( ) ;
4124+
4125+ let expr = col ( "s1" ) . not_like ( lit ( "A\u{10ffff} _" ) ) ;
4126+ #[ rustfmt:: skip]
4127+ let expected_ret = & [
4128+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
4129+ true ,
4130+ // s1 ["A", "L"] ==> some rows could pass (must keep)
4131+ true ,
4132+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
4133+ true ,
4134+ // s1 ["M", "M"] ==> some rows could pass (must keep)
4135+ true ,
4136+ // s1 [NULL, NULL] ==> unknown (must keep)
4137+ true ,
4138+ // s1 ["A", NULL] ==> some rows could pass (must keep)
4139+ true ,
4140+ // s1 ["", "A"] ==> some rows could pass (must keep)
4141+ true ,
4142+ // s1 ["", ""] ==> some rows could pass (must keep)
4143+ true ,
4144+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4145+ true ,
4146+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row appears to match, but (min, max) may have been truncated (must keep);
4147+ // the original (min, max) may be ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}")
4148+ true ,
4149+ ] ;
4150+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4151+ }
4152+
// Pruning checks for `s1 NOT LIKE <pattern>` where the pattern uses the multi-character
// wildcard `%`. Three patterns are exercised in turn: `%` as the last character,
// `%` in the middle of the pattern, and a backslash-preceded `%` followed by a trailing `%`.
4153+ #[ test]
4154+ fn prune_utf8_not_like_many ( ) {
4155+ let ( schema, statistics) = utf8_setup ( ) ;
4156+
// Case 1: `%` is the last character of the pattern. Only the last container is expected to be pruned.
4157+ let expr = col ( "s1" ) . not_like ( lit ( "A\u{10ffff} %" ) ) ;
4158+ #[ rustfmt:: skip]
4159+ let expected_ret = & [
4160+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
4161+ true ,
4162+ // s1 ["A", "L"] ==> some rows could pass (must keep)
4163+ true ,
4164+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
4165+ true ,
4166+ // s1 ["M", "M"] ==> some rows could pass (must keep)
4167+ true ,
4168+ // s1 [NULL, NULL] ==> unknown (must keep)
4169+ true ,
4170+ // s1 ["A", NULL] ==> some rows could pass (must keep)
4171+ true ,
4172+ // s1 ["", "A"] ==> some rows could pass (must keep)
4173+ true ,
4174+ // s1 ["", ""] ==> some rows could pass (must keep)
4175+ true ,
4176+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4177+ true ,
4178+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match
4179+ false ,
4180+ ] ;
4181+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4182+
// Case 2: `%` appears in the middle of the pattern. Every container is expected to be kept.
4183+ let expr = col ( "s1" ) . not_like ( lit ( "A\u{10ffff} %\u{10ffff} " ) ) ;
4184+ #[ rustfmt:: skip]
4185+ let expected_ret = & [
4186+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
4187+ true ,
4188+ // s1 ["A", "L"] ==> some rows could pass (must keep)
4189+ true ,
4190+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
4191+ true ,
4192+ // s1 ["M", "M"] ==> some rows could pass (must keep)
4193+ true ,
4194+ // s1 [NULL, NULL] ==> unknown (must keep)
4195+ true ,
4196+ // s1 ["A", NULL] ==> some rows could pass (must keep)
4197+ true ,
4198+ // s1 ["", "A"] ==> some rows could pass (must keep)
4199+ true ,
4200+ // s1 ["", ""] ==> some rows could pass (must keep)
4201+ true ,
4202+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4203+ true ,
4204+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4205+ true ,
4206+ ] ;
4207+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4208+
// Case 3: a backslash precedes the first `%`, and the pattern ends with a second `%`.
// Uses its own two-container statistics instead of `utf8_setup()`'s
// (presumably the first vector holds the minimums and the second the maximums -- confirm
// against `ContainerStats::new_utf8`).
4209+ let expr = col ( "s1" ) . not_like ( lit ( "A\\ %%" ) ) ;
4210+ let statistics = TestStatistics :: new ( ) . with (
4211+ "s1" ,
4212+ ContainerStats :: new_utf8 (
4213+ vec ! [ Some ( "A%a" ) , Some ( "A" ) ] ,
4214+ vec ! [ Some ( "A%c" ) , Some ( "A" ) ] ,
4215+ ) ,
4216+ ) ;
// Expected: the first container ("A%a", "A%c") is pruned, the second ("A", "A") is kept.
4217+ let expected_ret = & [ false , true ] ;
4218+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4219+ }
4220+
40644221 #[ test]
40654222 fn test_rewrite_expr_to_prunable ( ) {
40664223 let schema = Schema :: new ( vec ! [ Field :: new( "a" , DataType :: Int32 , true ) ] ) ;
0 commit comments