@@ -1590,6 +1590,7 @@ fn build_statistics_expr(
15901590 ) ) ,
15911591 ) )
15921592 }
1593+ Operator :: NotLikeMatch => build_not_like_match ( expr_builder) ?,
15931594 Operator :: LikeMatch => build_like_match ( expr_builder) . ok_or_else ( || {
15941595 plan_datafusion_err ! (
15951596 "LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1639,19 @@ fn build_statistics_expr(
16381639 Ok ( statistics_expr)
16391640}
16401641
1642+ /// returns the string literal of the scalar value if it is a string
1643+ fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1644+ s. try_as_str ( ) . flatten ( )
1645+ }
1646+
1647+ fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1648+ if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1649+ let s = unpack_string ( lit. value ( ) ) ?;
1650+ return Some ( s) ;
1651+ }
1652+ None
1653+ }
1654+
16411655/// Convert `column LIKE literal` where P is a constant prefix of the literal
16421656/// to a range check on the column: `P <= column && column < P'`, where P' is the
16431657/// lowest string after all P* strings.
@@ -1650,19 +1664,6 @@ fn build_like_match(
16501664 // column LIKE '%foo%' => min <= '' && '' <= max => true
16511665 // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
16521666
1653- /// returns the string literal of the scalar value if it is a string
1654- fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1655- s. try_as_str ( ) . flatten ( )
1656- }
1657-
1658- fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1659- if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1660- let s = unpack_string ( lit. value ( ) ) ?;
1661- return Some ( s) ;
1662- }
1663- None
1664- }
1665-
16661667 // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
16671668 // this may involve building the physical expressions that call lower() and upper()
16681669 let min_column_expr = expr_builder. min_column_expr ( ) . ok ( ) ?;
@@ -1710,6 +1711,66 @@ fn build_like_match(
17101711 Some ( combined)
17111712}
17121713
1714+ // For predicate `col NOT LIKE 'foo%'`, we rewrite it as `(col_min NOT LIKE 'foo%' OR col_max NOT LIKE 'foo%')`. If both col_min and col_max have the prefix foo, we skip the entire row group (as we can be certain that all data in this row group has the prefix foo).
1715+ fn build_not_like_match (
1716+ expr_builder : & mut PruningExpressionBuilder < ' _ > ,
1717+ ) -> Result < Arc < dyn PhysicalExpr > > {
1718+ // col NOT LIKE 'prefix%' -> !(col_min LIKE 'prefix%' && col_max LIKE 'prefix%') -> (col_min NOT LIKE 'prefix%' || col_max NOT LIKE 'prefix%')
1719+
1720+ let min_column_expr = expr_builder. min_column_expr ( ) ?;
1721+ let max_column_expr = expr_builder. max_column_expr ( ) ?;
1722+
1723+ let scalar_expr = expr_builder. scalar_expr ( ) ;
1724+
1725+ let pattern = extract_string_literal ( scalar_expr) . ok_or_else ( || {
1726+ plan_datafusion_err ! ( "cannot extract literal from NOT LIKE expression" )
1727+ } ) ?;
1728+
1729+ let chars: Vec < char > = pattern. chars ( ) . collect ( ) ;
1730+ for i in 0 ..chars. len ( ) - 1 {
1731+ // Check if current char is a wildcard and is not escaped with backslash
1732+ if ( chars[ i] == '%' || chars[ i] == '_' ) && ( i == 0 || chars[ i - 1 ] != '\\' ) {
1733+ // Example: For pattern "foo%bar", the row group might include values like
1734+ // ["foobar", "food", "foodbar"], making it unsafe to prune.
1735+ // Even if the min/max values in the group (e.g., "foobar" and "foodbar")
1736+ // match the pattern, intermediate values like "food" may not
1737+ // match the full pattern "foo%bar", making pruning unsafe.
1738+ // (truncate foo%bar to foo% have same problem)
1739+ return Err ( plan_datafusion_err ! (
1740+ "NOT LIKE expressions with unescaped wildcards ('%' or '_') at the beginning or middle of the pattern are not supported"
1741+ ) ) ;
1742+ }
1743+ }
1744+
1745+ if chars. last ( ) == Some ( & '_' ) && ( chars. len ( ) > 1 && chars[ chars. len ( ) - 2 ] != '\\' ) {
1746+ // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
1747+ // which means not every row is guaranteed to match the pattern.
1748+ return Err ( plan_datafusion_err ! (
1749+ "NOT LIKE expressions with unescaped '_' at the end of the pattern are not supported"
1750+ ) ) ;
1751+ }
1752+
1753+ let min_col_not_like_epxr = Arc :: new ( phys_expr:: LikeExpr :: new (
1754+ true ,
1755+ false ,
1756+ Arc :: clone ( & min_column_expr) ,
1757+ Arc :: clone ( scalar_expr) ,
1758+ ) ) ;
1759+
1760+ let max_col_not_like_expr = Arc :: new ( phys_expr:: LikeExpr :: new (
1761+ true ,
1762+ false ,
1763+ Arc :: clone ( & max_column_expr) ,
1764+ Arc :: clone ( scalar_expr) ,
1765+ ) ) ;
1766+
1767+ Ok ( Arc :: new ( phys_expr:: BinaryExpr :: new (
1768+ min_col_not_like_epxr,
1769+ Operator :: Or ,
1770+ max_col_not_like_expr,
1771+ ) ) )
1772+ }
1773+
17131774/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
17141775/// This makes it so that the returned string will always compare greater than the input string
17151776/// or any other string with the same prefix.
@@ -4061,6 +4122,132 @@ mod tests {
40614122 prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
40624123 }
40634124
/// `NOT LIKE` with a trailing single-character wildcard can never be used
/// for pruning, so every container must be kept.
#[test]
fn prune_utf8_not_like_one() {
    let (schema, statistics) = utf8_setup();

    // NOTE(review): the space inside the pattern literal looks like an
    // extraction artifact of this listing; confirm against the upstream file.
    let expr = col("s1").not_like(lit("A\u{10ffff} _"));

    // Containers, in order (all must be kept):
    //   s1 ["A", "Z"]       ==> some rows could pass
    //   s1 ["A", "L"]       ==> some rows could pass
    //   s1 ["N", "Z"]       ==> some rows could pass
    //   s1 ["M", "M"]       ==> some rows could pass
    //   s1 [NULL, NULL]     ==> unknown
    //   s1 ["A", NULL]      ==> some rows could pass
    //   s1 ["", "A"]        ==> some rows could pass
    //   s1 ["", ""]         ==> some rows could pass
    //   s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass
    //   s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row
    //       matches, but (min, max) may have been truncated; the original
    //       (min, max) could be ("A\u{10ffff}\u{10ffff}\u{10ffff}",
    //       "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}"), so it is kept too.
    let keep_all = [true; 10];
    prune_with_expr(expr, &schema, &statistics, &keep_all);
}
4156+
/// Exercises `NOT LIKE` pruning for a prefix pattern, a pattern with a
/// wildcard in the middle, a wildcard-free pattern, and a pattern whose
/// prefix contains an escaped `%`.
#[test]
fn prune_utf8_not_like_many() {
    let (schema, statistics) = utf8_setup();

    // NOTE(review): the spaces inside the pattern literals below look like
    // extraction artifacts of this listing; confirm against the upstream file.
    //
    // Each expectation array follows the container order of `utf8_setup`:
    //   0: s1 ["A", "Z"]
    //   1: s1 ["A", "L"]
    //   2: s1 ["N", "Z"]
    //   3: s1 ["M", "M"]
    //   4: s1 [NULL, NULL]
    //   5: s1 ["A", NULL]
    //   6: s1 ["", "A"]
    //   7: s1 ["", ""]
    //   8: s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"]
    //   9: s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"]
    let cases: [(&str, [bool; 10]); 3] = [
        (
            // Prefix pattern: only container 9 has no row that can match.
            "A\u{10ffff} %",
            [
                true,  // 0: some rows could pass (must keep)
                true,  // 1: some rows could pass (must keep)
                true,  // 2: some rows could pass (must keep)
                true,  // 3: some rows could pass (must keep)
                true,  // 4: unknown (must keep)
                true,  // 5: some rows could pass (must keep)
                true,  // 6: some rows could pass (must keep)
                true,  // 7: some rows could pass (must keep)
                true,  // 8: some rows could pass (must keep)
                false, // 9: no row match
            ],
        ),
        (
            // Wildcard in the middle: nothing may be pruned.
            "A\u{10ffff} %\u{10ffff} ",
            [
                true, // 0: some rows could pass (must keep)
                true, // 1: some rows could pass (must keep)
                true, // 2: some rows could pass (must keep)
                true, // 3: some rows could pass (must keep)
                true, // 4: unknown (must keep)
                true, // 5: some rows could pass (must keep)
                true, // 6: some rows could pass (must keep)
                true, // 7: some rows could pass (must keep)
                true, // 8: some rows could pass (must keep)
                true, // 9: some rows could pass (must keep)
            ],
        ),
        (
            // No wildcard: only the container holding exactly "M" is pruned.
            "M",
            [
                true,  // 0: some rows could pass (must keep)
                true,  // 1: some rows could pass (must keep)
                true,  // 2: some rows could pass (must keep)
                false, // 3: no row match
                true,  // 4: unknown (must keep)
                true,  // 5: some rows could pass (must keep)
                true,  // 6: some rows could pass (must keep)
                true,  // 7: some rows could pass (must keep)
                true,  // 8: some rows could pass (must keep)
                true,  // 9: some rows could pass (must keep)
            ],
        ),
    ];

    for (pattern, expected) in &cases {
        let expr = col("s1").not_like(lit(*pattern));
        prune_with_expr(expr, &schema, &statistics, expected);
    }

    // Escaped '%' inside the prefix, followed by a trailing wildcard.
    let escaped_expr = col("s1").not_like(lit("A\\ %%"));
    let escaped_stats = TestStatistics::new().with(
        "s1",
        ContainerStats::new_utf8(
            vec![Some("A%a"), Some("A")],
            vec![Some("A%c"), Some("A")],
        ),
    );
    let escaped_expected = [false, true];
    prune_with_expr(escaped_expr, &schema, &escaped_stats, &escaped_expected);
}
4250+
40644251 #[ test]
40654252 fn test_rewrite_expr_to_prunable ( ) {
40664253 let schema = Schema :: new ( vec ! [ Field :: new( "a" , DataType :: Int32 , true ) ] ) ;
0 commit comments