@@ -1590,6 +1590,7 @@ fn build_statistics_expr(
15901590 ) ) ,
15911591 ) )
15921592 }
1593+ Operator :: NotLikeMatch => build_not_like_match ( expr_builder) ?,
15931594 Operator :: LikeMatch => build_like_match ( expr_builder) . ok_or_else ( || {
15941595 plan_datafusion_err ! (
15951596 "LIKE expression with wildcard at the beginning is not supported"
@@ -1638,6 +1639,19 @@ fn build_statistics_expr(
16381639 Ok ( statistics_expr)
16391640}
16401641
1642+ /// returns the string literal of the scalar value if it is a string
1643+ fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1644+ s. try_as_str ( ) . flatten ( )
1645+ }
1646+
1647+ fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1648+ if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1649+ let s = unpack_string ( lit. value ( ) ) ?;
1650+ return Some ( s) ;
1651+ }
1652+ None
1653+ }
1654+
16411655/// Convert `column LIKE literal` where P is a constant prefix of the literal
16421656/// to a range check on the column: `P <= column && column < P'`, where P' is the
16431657/// lowest string after all P* strings.
@@ -1650,19 +1664,6 @@ fn build_like_match(
16501664 // column LIKE '%foo%' => min <= '' && '' <= max => true
16511665 // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
16521666
1653- /// returns the string literal of the scalar value if it is a string
1654- fn unpack_string ( s : & ScalarValue ) -> Option < & str > {
1655- s. try_as_str ( ) . flatten ( )
1656- }
1657-
1658- fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & str > {
1659- if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1660- let s = unpack_string ( lit. value ( ) ) ?;
1661- return Some ( s) ;
1662- }
1663- None
1664- }
1665-
16661667 // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
16671668 // this may involve building the physical expressions that call lower() and upper()
16681669 let min_column_expr = expr_builder. min_column_expr ( ) . ok ( ) ?;
@@ -1710,6 +1711,80 @@ fn build_like_match(
17101711 Some ( combined)
17111712}
17121713
1714+ // For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`.
1715+ //
1716+ // The intuition is that if both `col_min` and `col_max` begin with `const_prefix` that means
1717+ // **all** data in this row group begins with `const_prefix` as well (and therefore the predicate
1718+ // looking for rows that don't begin with `const_prefix` can never be true)
1719+ fn build_not_like_match (
1720+ expr_builder : & mut PruningExpressionBuilder < ' _ > ,
1721+ ) -> Result < Arc < dyn PhysicalExpr > > {
1722+ // col NOT LIKE 'const_prefix%' -> !(col_min LIKE 'const_prefix%' && col_max LIKE 'const_prefix%') -> (col_min NOT LIKE 'const_prefix%' || col_max NOT LIKE 'const_prefix%')
1723+
1724+ let min_column_expr = expr_builder. min_column_expr ( ) ?;
1725+ let max_column_expr = expr_builder. max_column_expr ( ) ?;
1726+
1727+ let scalar_expr = expr_builder. scalar_expr ( ) ;
1728+
1729+ let pattern = extract_string_literal ( scalar_expr) . ok_or_else ( || {
1730+ plan_datafusion_err ! ( "cannot extract literal from NOT LIKE expression" )
1731+ } ) ?;
1732+
1733+ let ( const_prefix, remaining) = split_constant_prefix ( pattern) ;
1734+ if const_prefix. is_empty ( ) || remaining != "%" {
1735+ // we can not handle `%` at the beginning or in the middle of the pattern
1736+ // Example: For pattern "foo%bar", the row group might include values like
1737+ // ["foobar", "food", "foodbar"], making it unsafe to prune.
1738+ // Even if the min/max values in the group (e.g., "foobar" and "foodbar")
1739+ // match the pattern, intermediate values like "food" may not
1740+ // match the full pattern "foo%bar", making pruning unsafe.
1741+ // (truncate foo%bar to foo% have same problem)
1742+
1743+ // we can not handle pattern containing `_`
1744+ // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
1745+ // which means not every row is guaranteed to match the pattern.
1746+ return Err ( plan_datafusion_err ! (
1747+ "NOT LIKE expressions only support constant_prefix+wildcard`%`"
1748+ ) ) ;
1749+ }
1750+
1751+ let min_col_not_like_epxr = Arc :: new ( phys_expr:: LikeExpr :: new (
1752+ true ,
1753+ false ,
1754+ Arc :: clone ( & min_column_expr) ,
1755+ Arc :: clone ( scalar_expr) ,
1756+ ) ) ;
1757+
1758+ let max_col_not_like_expr = Arc :: new ( phys_expr:: LikeExpr :: new (
1759+ true ,
1760+ false ,
1761+ Arc :: clone ( & max_column_expr) ,
1762+ Arc :: clone ( scalar_expr) ,
1763+ ) ) ;
1764+
1765+ Ok ( Arc :: new ( phys_expr:: BinaryExpr :: new (
1766+ min_col_not_like_epxr,
1767+ Operator :: Or ,
1768+ max_col_not_like_expr,
1769+ ) ) )
1770+ }
1771+
/// Splits a LIKE pattern at its first *unescaped* wildcard (`%` or `_`).
///
/// Returns `(prefix, remaining)` where `prefix` is the constant part before
/// the wildcard and `remaining` starts at the wildcard. Either part may be
/// empty; when the pattern has no unescaped wildcard, the whole pattern is
/// returned as the prefix and `remaining` is `""`.
///
/// The prefix is a slice of the input, so escape sequences in it (e.g. `\%`)
/// are kept verbatim rather than unescaped.
///
/// A backslash escapes exactly the next character, including another
/// backslash. So in `A\\%` the `%` is a real wildcard (the two backslashes
/// form one literal backslash), while in `A\%` it is a literal `%`.
fn split_constant_prefix(pattern: &str) -> (&str, &str) {
    let mut chars = pattern.char_indices();
    while let Some((idx, ch)) = chars.next() {
        match ch {
            // Consume the escaped character so it is never taken for a
            // wildcard, and so an escaped backslash cannot escape what follows.
            '\\' => {
                chars.next();
            }
            // `idx` comes from `char_indices`, so it is always a char boundary.
            '%' | '_' => return pattern.split_at(idx),
            _ => {}
        }
    }
    (pattern, "")
}
1787+
17131788/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
17141789/// This makes it so that the returned string will always compare greater than the input string
17151790/// or any other string with the same prefix.
@@ -4061,6 +4136,132 @@ mod tests {
40614136 prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
40624137 }
40634138
4139+ #[ test]
4140+ fn prune_utf8_not_like_one ( ) {
4141+ let ( schema, statistics) = utf8_setup ( ) ;
4142+
// NOT LIKE with the single-character wildcard `_`: every container below is
// expected to be kept (all entries are `true`).
4143+ let expr = col ( "s1" ) . not_like ( lit ( "A\u{10ffff} _" ) ) ;
4144+ #[ rustfmt:: skip]
4145+ let expected_ret = & [
4146+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
4147+ true ,
4148+ // s1 ["A", "L"] ==> some rows could pass (must keep)
4149+ true ,
4150+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
4151+ true ,
4152+ // s1 ["M", "M"] ==> some rows could pass (must keep)
4153+ true ,
4154+ // s1 [NULL, NULL] ==> unknown (must keep)
4155+ true ,
4156+ // s1 ["A", NULL] ==> some rows could pass (must keep)
4157+ true ,
4158+ // s1 ["", "A"] ==> some rows could pass (must keep)
4159+ true ,
4160+ // s1 ["", ""] ==> some rows could pass (must keep)
4161+ true ,
4162+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4163+ true ,
4164+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row appears to match, but (min, max) may be truncated:
4165+ // the original (min, max) may be ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}"), so the container must be kept
4166+ true ,
4167+ ] ;
4168+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4169+ }
4170+
4171+ #[ test]
4172+ fn prune_utf8_not_like_many ( ) {
4173+ let ( schema, statistics) = utf8_setup ( ) ;
4174+
// Case 1: constant prefix followed by a single trailing `%`. Only the last
// container is expected to be pruned.
4175+ let expr = col ( "s1" ) . not_like ( lit ( "A\u{10ffff} %" ) ) ;
4176+ #[ rustfmt:: skip]
4177+ let expected_ret = & [
4178+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
4179+ true ,
4180+ // s1 ["A", "L"] ==> some rows could pass (must keep)
4181+ true ,
4182+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
4183+ true ,
4184+ // s1 ["M", "M"] ==> some rows could pass (must keep)
4185+ true ,
4186+ // s1 [NULL, NULL] ==> unknown (must keep)
4187+ true ,
4188+ // s1 ["A", NULL] ==> some rows could pass (must keep)
4189+ true ,
4190+ // s1 ["", "A"] ==> some rows could pass (must keep)
4191+ true ,
4192+ // s1 ["", ""] ==> some rows could pass (must keep)
4193+ true ,
4194+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4195+ true ,
4196+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match
4197+ false ,
4198+ ] ;
4199+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4200+
// Case 2: `%` in the middle of the pattern (more text follows the wildcard).
// Every container is expected to be kept.
4201+ let expr = col ( "s1" ) . not_like ( lit ( "A\u{10ffff} %\u{10ffff} " ) ) ;
4202+ #[ rustfmt:: skip]
4203+ let expected_ret = & [
4204+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
4205+ true ,
4206+ // s1 ["A", "L"] ==> some rows could pass (must keep)
4207+ true ,
4208+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
4209+ true ,
4210+ // s1 ["M", "M"] ==> some rows could pass (must keep)
4211+ true ,
4212+ // s1 [NULL, NULL] ==> unknown (must keep)
4213+ true ,
4214+ // s1 ["A", NULL] ==> some rows could pass (must keep)
4215+ true ,
4216+ // s1 ["", "A"] ==> some rows could pass (must keep)
4217+ true ,
4218+ // s1 ["", ""] ==> some rows could pass (must keep)
4219+ true ,
4220+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4221+ true ,
4222+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4223+ true ,
4224+ ] ;
4225+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4226+
// Case 3: `%` in the middle and a trailing `_`. Every container is expected
// to be kept.
4227+ let expr = col ( "s1" ) . not_like ( lit ( "A\u{10ffff} %\u{10ffff} _" ) ) ;
4228+ #[ rustfmt:: skip]
4229+ let expected_ret = & [
4230+ // s1 ["A", "Z"] ==> some rows could pass (must keep)
4231+ true ,
4232+ // s1 ["A", "L"] ==> some rows could pass (must keep)
4233+ true ,
4234+ // s1 ["N", "Z"] ==> some rows could pass (must keep)
4235+ true ,
4236+ // s1 ["M", "M"] ==> some rows could pass (must keep)
4237+ true ,
4238+ // s1 [NULL, NULL] ==> unknown (must keep)
4239+ true ,
4240+ // s1 ["A", NULL] ==> some rows could pass (must keep)
4241+ true ,
4242+ // s1 ["", "A"] ==> some rows could pass (must keep)
4243+ true ,
4244+ // s1 ["", ""] ==> some rows could pass (must keep)
4245+ true ,
4246+ // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4247+ true ,
4248+ // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
4249+ true ,
4250+ ] ;
4251+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4252+
// Case 4: an escaped `%` inside the prefix followed by the `%` wildcard, run
// against its own two-container statistics:
//   s1 ["A%a", "A%c"] ==> expected to be pruned (false)
//   s1 ["A", "A"]     ==> expected to be kept (true)
4253+ let expr = col ( "s1" ) . not_like ( lit ( "A\\ %%" ) ) ;
4254+ let statistics = TestStatistics :: new ( ) . with (
4255+ "s1" ,
4256+ ContainerStats :: new_utf8 (
4257+ vec ! [ Some ( "A%a" ) , Some ( "A" ) ] ,
4258+ vec ! [ Some ( "A%c" ) , Some ( "A" ) ] ,
4259+ ) ,
4260+ ) ;
4261+ let expected_ret = & [ false , true ] ;
4262+ prune_with_expr ( expr, & schema, & statistics, expected_ret) ;
4263+ }
4264+
40644265 #[ test]
40654266 fn test_rewrite_expr_to_prunable ( ) {
40664267 let schema = Schema :: new ( vec ! [ Field :: new( "a" , DataType :: Int32 , true ) ] ) ;
0 commit comments