From 1afded07ed218999ec006dc3da92ca55350e6dd0 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Fri, 14 Jul 2017 20:16:04 -0700 Subject: [PATCH] [SPARK-21344][SQL] BinaryType comparison does signed byte array comparison ## What changes were proposed in this pull request? This PR fixes a wrong comparison for `BinaryType`. This PR enables unsigned comparison and unsigned prefix generation for an array for `BinaryType`. Previous implementations uses signed operations. ## How was this patch tested? Added a test suite in `OrderingSuite`. Author: Kazuaki Ishizaki Closes #18571 from kiszk/SPARK-21344. (cherry picked from commit ac5d5d795909061a17e056696cf0ef87d9e65dd1) Signed-off-by: gatorsmile --- .../apache/spark/unsafe/types/ByteArray.java | 2 +- .../unsafe/sort/PrefixComparatorsSuite.scala | 17 ++++++++++++++++- .../spark/sql/catalyst/util/TypeUtils.scala | 4 +++- .../catalyst/expressions/OrderingSuite.scala | 19 +++++++++++++++++++ .../resources/sql-tests/inputs/comparator.sql | 3 +++ .../sql-tests/results/comparator.sql.out | 18 ++++++++++++++++++ 6 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/comparator.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/comparator.sql.out diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java index 3ced2094f5e6b..7ced13d357237 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/ByteArray.java @@ -44,7 +44,7 @@ public static long getPrefix(byte[] bytes) { final int minLen = Math.min(bytes.length, 8); long p = 0; for (int i = 0; i < minLen; ++i) { - p |= (128L + Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i)) + p |= ((long) Platform.getByte(bytes, Platform.BYTE_ARRAY_OFFSET + i) & 0xff) << (56 - 8 * i); } return p; diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala index b4083230b4ac5..712a1ad62f6a4 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala @@ -62,7 +62,9 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks { def compareBinary(x: Array[Byte], y: Array[Byte]): Int = { for (i <- 0 until x.length; if i < y.length) { - val res = x(i).compare(y(i)) + val v1 = x(i) & 0xff + val v2 = y(i) & 0xff + val res = v1 - v2 if (res != 0) return res } x.length - y.length @@ -79,6 +81,19 @@ class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks { (prefixComparisonResult > 0 && compareBinary(x, y) > 0)) } + val binaryRegressionTests = Seq( + (Array[Byte](1), Array[Byte](-1)), + (Array[Byte](1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, -1)), + (Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, -1)), + (Array[Byte](1), Array[Byte](1, 1, 1, 1)), + (Array[Byte](1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, 1)), + (Array[Byte](-1), Array[Byte](-1, -1, -1, -1)), + (Array[Byte](-1, -1, -1, -1, -1), Array[Byte](-1, -1, -1, -1, -1, -1, -1, -1, -1)) + ) + binaryRegressionTests.foreach { case (b1, b2) => + testPrefixComparison(b1, b2) + } + // scalastyle:off val regressionTests = Table( ("s1", "s2"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala index 7101ca5a17de9..45225779bffcb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TypeUtils.scala @@ -70,7 +70,9 @@ object TypeUtils { def compareBinary(x: Array[Byte], y: Array[Byte]): Int = { for (i <- 0 until x.length; if i < y.length) { - val res = x(i).compareTo(y(i)) + val v1 = x(i) & 0xff + val v2 = y(i) & 0xff + val res = v1 - v2 if (res != 0) return res } x.length - y.length diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/OrderingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/OrderingSuite.scala index 190fab5d249bb..aa61ba2bff2bb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/OrderingSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/OrderingSuite.scala @@ -137,4 +137,23 @@ class OrderingSuite extends SparkFunSuite with ExpressionEvalHelper { // verify that we can support up to 5000 ordering comparisons, which should be sufficient GenerateOrdering.generate(Array.fill(5000)(sortOrder)) } + + test("SPARK-21344: BinaryType comparison does signed byte array comparison") { + val data = Seq( + (Array[Byte](1), Array[Byte](-1)), + (Array[Byte](1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, -1)), + (Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, 1), Array[Byte](1, 1, 1, 1, 1, 1, 1, 1, -1)) + ) + data.foreach { case (b1, b2) => + val rowOrdering = InterpretedOrdering.forSchema(Seq(BinaryType)) + val genOrdering = GenerateOrdering.generate( + BoundReference(0, BinaryType, nullable = true).asc :: Nil) + val rowType = StructType(StructField("b", BinaryType, nullable = true) :: Nil) + val toCatalyst = CatalystTypeConverters.createToCatalystConverter(rowType) + val rowB1 = toCatalyst(Row(b1)).asInstanceOf[InternalRow] + val rowB2 = toCatalyst(Row(b2)).asInstanceOf[InternalRow] + assert(rowOrdering.compare(rowB1, rowB2) < 0) + assert(genOrdering.compare(rowB1, rowB2) < 0) + } + } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/comparator.sql b/sql/core/src/test/resources/sql-tests/inputs/comparator.sql new file mode 100644 index 0000000000000..3e2447723e576 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/comparator.sql @@ -0,0 +1,3 @@ +-- binary type +select x'00' < x'0f'; +select x'00' < x'ff'; diff --git a/sql/core/src/test/resources/sql-tests/results/comparator.sql.out b/sql/core/src/test/resources/sql-tests/results/comparator.sql.out new file mode 100644 index 0000000000000..afc7b5448b7b6 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/comparator.sql.out @@ -0,0 +1,18 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 2 + + +-- !query 0 +select x'00' < x'0f' +-- !query 0 schema +struct<(X'00' < X'0F'):boolean> +-- !query 0 output +true + + +-- !query 1 +select x'00' < x'ff' +-- !query 1 schema +struct<(X'00' < X'FF'):boolean> +-- !query 1 output +true