Skip to content

Commit 0deef04

Browse files
authored
[Kernel][Expressions] Add support for LIKE expression (#3103)
## Description Add SQL `LIKE` expression support to the Kernel's list of supported expressions, along with a default implementation. Addresses part of #2539 (where `STARTS_WITH` can be expressed as `LIKE 'str%'`). ## How was this patch tested? Added unit tests. Signed-off-by: Krishnan Paranji Ravi <krishna.pr@gmail.com>
1 parent 35c7536 commit 0deef04

File tree

7 files changed

+444
-4
lines changed

7 files changed

+444
-4
lines changed

kernel/kernel-api/src/main/java/io/delta/kernel/expressions/Predicate.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,12 @@
103103
* <li>Since version: 3.2.0</li>
104104
* </ul>
105105
* </li>
106+
* <li>Name: <code>LIKE</code>
107+
* <ul>
108+
* <li>SQL semantic: <code>expr LIKE expr</code></li>
109+
* <li>Since version: 3.3.0</li>
110+
* </ul>
111+
* </li>
106112
* </ol>
107113
*
108114
* @since 3.0.0

kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/DefaultEngineErrors.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,15 @@ public static UnsupportedOperationException unsupportedExpressionException(
3939
reason);
4040
return new UnsupportedOperationException(message);
4141
}
42+
43+
/**
44+
* Exception class for invalid escape sequence used in input for LIKE expressions
45+
* @param pattern the invalid pattern
46+
* @param index character index of occurrence of the offending escape in the pattern
47+
*/
48+
public static IllegalArgumentException invalidEscapeSequence(String pattern, int index) {
49+
return new IllegalArgumentException(
50+
format("LIKE expression has invalid escape sequence '%s' at index %d",
51+
pattern, index));
52+
}
4253
}

kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/DefaultExpressionEvaluator.java

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import java.util.stream.Collectors;
2121
import static java.lang.String.format;
2222
import static java.util.Objects.requireNonNull;
23+
import static java.util.stream.Collectors.toList;
2324

2425
import io.delta.kernel.data.ColumnVector;
2526
import io.delta.kernel.data.ColumnarBatch;
@@ -31,8 +32,6 @@
3132
import static io.delta.kernel.internal.util.ExpressionUtils.getRight;
3233
import static io.delta.kernel.internal.util.ExpressionUtils.getUnaryChild;
3334
import static io.delta.kernel.internal.util.Preconditions.checkArgument;
34-
35-
3635
import io.delta.kernel.defaults.internal.data.vector.DefaultBooleanVector;
3736
import io.delta.kernel.defaults.internal.data.vector.DefaultConstantVector;
3837
import static io.delta.kernel.defaults.internal.DefaultEngineErrors.unsupportedExpressionException;
@@ -280,6 +279,21 @@ ExpressionTransformResult visitCoalesce(ScalarExpression coalesce) {
280279
);
281280
}
282281

282+
@Override
283+
ExpressionTransformResult visitLike(final Predicate like) {
284+
List<ExpressionTransformResult> children =
285+
like.getChildren().stream()
286+
.map(this::visit)
287+
.collect(toList());
288+
Predicate transformedExpression =
289+
LikeExpressionEvaluator.validateAndTransform(
290+
like,
291+
children.stream().map(e -> e.expression).collect(toList()),
292+
children.stream().map(e -> e.outputType).collect(toList()));
293+
294+
return new ExpressionTransformResult(transformedExpression, BooleanType.BOOLEAN);
295+
}
296+
283297
private Predicate validateIsPredicate(
284298
Expression baseExpression,
285299
ExpressionTransformResult result) {
@@ -560,6 +574,15 @@ ColumnVector visitCoalesce(ScalarExpression coalesce) {
560574
);
561575
}
562576

577+
@Override
578+
ColumnVector visitLike(final Predicate like) {
579+
List<Expression> children = like.getChildren();
580+
return LikeExpressionEvaluator.eval(
581+
children.stream()
582+
.map(this::visit)
583+
.collect(toList()));
584+
}
585+
563586
/**
564587
* Utility method to evaluate inputs to the binary input expression. Also validates the
565588
* evaluated expression result {@link ColumnVector}s are of the same size.

kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/ExpressionVisitor.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ abstract class ExpressionVisitor<R> {
5959

6060
/** Visits a scalar expression named {@code COALESCE}. */
abstract R visitCoalesce(ScalarExpression ifNull);
6161

62+
/**
 * Visits a {@code LIKE} predicate. Scalar expressions named {@code LIKE} are wrapped in a
 * {@link Predicate} with the same name and children before being passed to this method.
 */
abstract R visitLike(Predicate predicate);
63+
6264
final R visit(Expression expression) {
6365
if (expression instanceof PartitionValueExpression) {
6466
return visitPartitionValue((PartitionValueExpression) expression);
@@ -105,6 +107,8 @@ private R visitScalarExpression(ScalarExpression expression) {
105107
return visitIsNull(new Predicate(name, children));
106108
case "COALESCE":
107109
return visitCoalesce(expression);
110+
case "LIKE":
111+
return visitLike(new Predicate(name, children));
108112
default:
109113
throw new UnsupportedOperationException(
110114
String.format("Scalar expression `%s` is not supported.", name));
@@ -114,8 +118,8 @@ private R visitScalarExpression(ScalarExpression expression) {
114118
private static Predicate elemAsPredicate(List<Expression> expressions, int index) {
115119
if (expressions.size() <= index) {
116120
throw new RuntimeException(
117-
String.format("Trying to access invalid entry (%d) in list %s", index,
118-
expressions.stream().map(Object::toString).collect(joining(","))));
121+
String.format("Trying to access invalid entry (%d) in list %s", index,
122+
expressions.stream().map(Object::toString).collect(joining(","))));
119123
}
120124
Expression elemExpression = expressions.get(index);
121125
if (!(elemExpression instanceof Predicate)) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
/*
2+
* Copyright (2023) The Delta Lake Project Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package io.delta.kernel.defaults.internal.expressions;
17+
18+
import java.util.ArrayList;
19+
import java.util.Arrays;
20+
import java.util.List;
21+
import java.util.Objects;
22+
import java.util.regex.Pattern;
23+
24+
import io.delta.kernel.data.ColumnVector;
25+
import io.delta.kernel.expressions.Expression;
26+
import io.delta.kernel.expressions.Literal;
27+
import io.delta.kernel.expressions.Predicate;
28+
import io.delta.kernel.types.BooleanType;
29+
import io.delta.kernel.types.DataType;
30+
import io.delta.kernel.types.StringType;
31+
import io.delta.kernel.internal.util.Utils;
32+
33+
import static io.delta.kernel.defaults.internal.DefaultEngineErrors.invalidEscapeSequence;
34+
import static io.delta.kernel.defaults.internal.DefaultEngineErrors.unsupportedExpressionException;
35+
36+
/**
37+
* Utility methods to evaluate {@code like} expression.
38+
*/
39+
public class LikeExpressionEvaluator {
40+
private LikeExpressionEvaluator() {
41+
}
42+
43+
static Predicate validateAndTransform(
44+
Predicate like,
45+
List<Expression> childrenExpressions,
46+
List<DataType> childrenOutputTypes) {
47+
int size = childrenExpressions.size();
48+
if (size < 2 || size > 3) {
49+
throw unsupportedExpressionException(like,
50+
"Invalid number of inputs to LIKE expression. " +
51+
"Example usage: LIKE(column, 'test%'), LIKE(column, 'test\\[%', '\\')");
52+
}
53+
54+
Expression left = childrenExpressions.get(0);
55+
DataType leftOutputType = childrenOutputTypes.get(0);
56+
Expression right = childrenExpressions.get(1);
57+
DataType rightOutputType = childrenOutputTypes.get(1);
58+
Expression escapeCharExpr = size == 3 ? childrenExpressions.get(2) : null;
59+
DataType escapeCharOutputType = size == 3 ? childrenOutputTypes.get(2) : null;
60+
61+
if (!(StringType.STRING.equivalent(leftOutputType)
62+
&& StringType.STRING.equivalent(rightOutputType))) {
63+
throw unsupportedExpressionException(like,
64+
"LIKE is only supported for string type expressions");
65+
}
66+
67+
if (escapeCharExpr != null &&
68+
(!(escapeCharExpr instanceof Literal &&
69+
StringType.STRING.equivalent(escapeCharOutputType)))) {
70+
throw unsupportedExpressionException(like,
71+
"LIKE expects escape token expression to be a literal of String type");
72+
}
73+
74+
Literal literal = (Literal) escapeCharExpr;
75+
if (literal != null &&
76+
literal.getValue().toString().length() != 1) {
77+
throw unsupportedExpressionException(like,
78+
"LIKE expects escape token to be a single character");
79+
}
80+
81+
List<Expression> children = new ArrayList<>(Arrays.asList(left, right));
82+
if(Objects.nonNull(escapeCharExpr)) {
83+
children.add(escapeCharExpr);
84+
}
85+
return new Predicate(like.getName(), children);
86+
}
87+
88+
static ColumnVector eval(List<ColumnVector> children) {
89+
final char DEFAULT_ESCAPE_CHAR = '\\';
90+
91+
return new ColumnVector() {
92+
final ColumnVector escapeCharVector =
93+
children.size() == 3 ?
94+
children.get(2) :
95+
null;
96+
final ColumnVector left = children.get(0);
97+
final ColumnVector right = children.get(1);
98+
99+
Character escapeChar = null;
100+
101+
public void initEscapeCharIfRequired() {
102+
if (escapeChar == null) {
103+
escapeChar =
104+
escapeCharVector != null && !escapeCharVector.getString(0).isEmpty() ?
105+
escapeCharVector.getString(0).charAt(0) :
106+
DEFAULT_ESCAPE_CHAR;
107+
}
108+
}
109+
110+
@Override
111+
public DataType getDataType() {
112+
return BooleanType.BOOLEAN;
113+
}
114+
115+
@Override
116+
public int getSize() {
117+
return left.getSize();
118+
}
119+
120+
@Override
121+
public void close() {
122+
Utils.closeCloseables(left, right);
123+
}
124+
125+
@Override
126+
public boolean getBoolean(int rowId) {
127+
initEscapeCharIfRequired();
128+
return isLike(left.getString(rowId), right.getString(rowId), escapeChar);
129+
}
130+
131+
@Override
132+
public boolean isNullAt(int rowId) {
133+
return left.isNullAt(rowId) || right.isNullAt(rowId);
134+
}
135+
136+
public boolean isLike(String input, String pattern, char escape) {
137+
if (!Objects.isNull(input) && !Objects.isNull(pattern)) {
138+
String regex = escapeLikeRegex(pattern, escape);
139+
return input.matches(regex);
140+
}
141+
return false;
142+
}
143+
};
144+
}
145+
146+
/**
147+
* utility method to convert a predicate pattern to a java regex
148+
* @param pattern the pattern used in the expression
149+
* @param escape escape character to use
150+
* @return java regex
151+
*/
152+
private static String escapeLikeRegex(String pattern, char escape) {
153+
final int len = pattern.length();
154+
final StringBuilder javaPattern = new StringBuilder(len + len);
155+
for (int i = 0; i < len; i++) {
156+
char c = pattern.charAt(i);
157+
158+
if (c == escape) {
159+
if (i == (pattern.length() - 1)) {
160+
throw invalidEscapeSequence(pattern, i);
161+
}
162+
char nextChar = pattern.charAt(i + 1);
163+
if ((nextChar == '_')
164+
|| (nextChar == '%')
165+
|| (nextChar == escape)) {
166+
javaPattern.append(Pattern.quote(Character.toString(nextChar)));
167+
i++;
168+
} else {
169+
throw invalidEscapeSequence(pattern, i);
170+
}
171+
} else if (c == '_') {
172+
javaPattern.append('.');
173+
} else if (c == '%') {
174+
javaPattern.append(".*");
175+
} else {
176+
javaPattern.append(Pattern.quote(Character.toString(c)));
177+
}
178+
179+
}
180+
return "(?s)" + javaPattern;
181+
}
182+
}

0 commit comments

Comments
 (0)