Skip to content

Commit db42558

Browse files
ilicmarkodbvkorukanti
authored andcommitted
[3.2][Kernel] Change string comparing from UTF16 to UTF8 (#3611)
Changed string comparing from UTF16 to UTF8. This fixes comparison issues around the characters with surrogate pairs. Tests added to `DefaultExpressionEvaluatorSuite.scala`
1 parent 755c962 commit db42558

File tree

2 files changed

+79
-2
lines changed

2 files changed

+79
-2
lines changed

kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/DefaultExpressionUtils.java

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
package io.delta.kernel.defaults.internal.expressions;
1717

1818
import java.math.BigDecimal;
19+
import java.nio.charset.StandardCharsets;
1920
import java.util.Comparator;
2021
import java.util.List;
2122
import java.util.function.Function;
@@ -35,6 +36,19 @@
3536
class DefaultExpressionUtils {
3637
private DefaultExpressionUtils() {}
3738

39+
static final Comparator<String> STRING_COMPARATOR = (leftOp, rightOp) -> {
40+
byte[] leftBytes = leftOp.getBytes(StandardCharsets.UTF_8);
41+
byte[] rightBytes = rightOp.getBytes(StandardCharsets.UTF_8);
42+
int i = 0;
43+
while (i < leftBytes.length && i < rightBytes.length) {
44+
if (leftBytes[i] != rightBytes[i]) {
45+
return Byte.toUnsignedInt(leftBytes[i]) - Byte.toUnsignedInt(rightBytes[i]);
46+
}
47+
i++;
48+
}
49+
return Integer.compare(leftBytes.length, rightBytes.length);
50+
};
51+
3852
/**
3953
* Utility method that calculates the nullability result from given two vectors. Result is
4054
* null if at least one side is a null.
@@ -186,10 +200,10 @@ static void compareDouble(ColumnVector left, ColumnVector right, int[] result) {
186200
}
187201

188202
static void compareString(ColumnVector left, ColumnVector right, int[] result) {
189-
Comparator<String> comparator = Comparator.naturalOrder();
190203
for (int rowId = 0; rowId < left.getSize(); rowId++) {
191204
if (!left.isNullAt(rowId) && !right.isNullAt(rowId)) {
192-
result[rowId] = comparator.compare(left.getString(rowId), right.getString(rowId));
205+
result[rowId] =
206+
STRING_COMPARATOR.compare(left.getString(rowId), right.getString(rowId));
193207
}
194208
}
195209
}

kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/expressions/DefaultExpressionEvaluatorSuite.scala

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,9 @@ class DefaultExpressionEvaluatorSuite extends AnyFunSuite with ExpressionSuiteBa
308308
}
309309

310310
test("evaluate expression: comparators (=, <, <=, >, >=)") {
311+
val ASCII_MAX_CHARACTER = '\u007F'
312+
val UTF8_MAX_CHARACTER = new String(Character.toChars(Character.MAX_CODE_POINT))
313+
311314
// Literals for each data type from the data type value range, used as inputs to comparator
312315
// (small, big, small, null)
313316
val literals = Seq(
@@ -332,6 +335,66 @@ class DefaultExpressionEvaluatorSuite extends AnyFunSuite with ExpressionSuiteBa
332335
),
333336
(ofDate(-12123), ofDate(123123), ofDate(-12123), ofNull(DateType.DATE)),
334337
(ofString("apples"), ofString("oranges"), ofString("apples"), ofNull(StringType.STRING)),
338+
(ofString(""), ofString("a"), ofString(""), ofNull(StringType.STRING)),
339+
(ofString("abc"), ofString("abc0"), ofString("abc"), ofNull(StringType.STRING)),
340+
(ofString("abc"), ofString("abcd"), ofString("abc"), ofNull(StringType.STRING)),
341+
(ofString("abc"), ofString("abd"), ofString("abc"), ofNull(StringType.STRING)),
342+
(
343+
ofString("Abcabcabc"),
344+
ofString("aBcabcabc"),
345+
ofString("Abcabcabc"),
346+
ofNull(StringType.STRING)
347+
),
348+
(
349+
ofString("abcabcabC"),
350+
ofString("abcabcabc"),
351+
ofString("abcabcabC"),
352+
ofNull(StringType.STRING)
353+
),
354+
// scalastyle:off nonascii
355+
(ofString("abc"), ofString("世界"), ofString("abc"), ofNull(StringType.STRING)),
356+
(ofString("世界"), ofString("你好"), ofString("世界"), ofNull(StringType.STRING)),
357+
(ofString("你好122"), ofString("你好123"), ofString("你好122"), ofNull(StringType.STRING)),
358+
(ofString("A"), ofString("Ā"), ofString("A"), ofNull(StringType.STRING)),
359+
(ofString("»"), ofString("î"), ofString("»"), ofNull(StringType.STRING)),
360+
(ofString(""), ofString("🌼"), ofString(""), ofNull(StringType.STRING)),
361+
(
362+
ofString("abcdef🚀"),
363+
ofString(s"abcdef$UTF8_MAX_CHARACTER"),
364+
ofString("abcdef🚀"),
365+
ofNull(StringType.STRING)
366+
),
367+
(
368+
ofString("abcde�abcdef�abcdef�abcdef"),
369+
ofString(s"abcde�$ASCII_MAX_CHARACTER"),
370+
ofString("abcde�abcdef�abcdef�abcdef"),
371+
ofNull(StringType.STRING)
372+
),
373+
(
374+
ofString("abcde�abcdef�abcdef�abcdef"),
375+
ofString(s"abcde�$ASCII_MAX_CHARACTER"),
376+
ofString("abcde�abcdef�abcdef�abcdef"),
377+
ofNull(StringType.STRING)
378+
),
379+
(
380+
ofString("����"),
381+
ofString(s"��$UTF8_MAX_CHARACTER"),
382+
ofString("����"),
383+
ofNull(StringType.STRING)
384+
),
385+
(
386+
ofString(s"a${UTF8_MAX_CHARACTER}d"),
387+
ofString(s"a$UTF8_MAX_CHARACTER$ASCII_MAX_CHARACTER"),
388+
ofString(s"a${UTF8_MAX_CHARACTER}d"),
389+
ofNull(StringType.STRING)
390+
),
391+
(
392+
ofString("abcdefghijklm💞😉💕\n🥀🌹💐🌺🌷🌼🌻🌷🥀"),
393+
ofString(s"abcdefghijklm💞😉💕\n🥀🌹💐🌺🌷🌼$UTF8_MAX_CHARACTER"),
394+
ofString("abcdefghijklm💞😉💕\n🥀🌹💐🌺🌷🌼🌻🌷🥀"),
395+
ofNull(StringType.STRING)
396+
),
397+
// scalastyle:on nonascii
335398
(
336399
ofBinary("apples".getBytes()),
337400
ofBinary("oranges".getBytes()),

0 commit comments

Comments
 (0)