Skip to content

Commit 1c5117b

Browse files
yaooqinndongjoon-hyun
authored andcommitted
[SPARK-37820][SQL] Replace ApacheCommonBase64 with JavaBase64 for string funcs
### What changes were proposed in this pull request? Replace the Apache Commons Codec Base64 (ApacheCommonBase64) with the JDK's native support (https://docs.oracle.com/javase/8/docs/api/java/util/Base64.html) for Base-64 encode/decode. According to its documentation, ApacheCommonBase64 follows http://www.ietf.org/rfc/rfc2045.txt with url-safe mode off, so we choose `java.util.Base64.Decoder#RFC2045` ### Why are the changes needed? 1. Performance gain - http://java-performance.info/base64-encoding-and-decoding-performance/ - benchmarks run against JDK 8/11 show it is 2-5x faster 2. Reduce dependencies after we replace other related places ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Existing StringExpressionSuite - benchmarks Closes #35110 from yaooqinn/SPARK-37820. Authored-by: Kent Yao <yao@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
1 parent df74f6f commit 1c5117b

File tree

5 files changed

+251
-6
lines changed

5 files changed

+251
-6
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,12 @@ package org.apache.spark.sql.catalyst.expressions
1919

2020
import java.net.{URI, URISyntaxException}
2121
import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
22+
import java.util.{Base64 => JBase64}
2223
import java.util.{HashMap, Locale, Map => JMap}
2324
import java.util.regex.Pattern
2425

2526
import scala.collection.mutable.ArrayBuffer
2627

27-
import org.apache.commons.codec.binary.{Base64 => CommonsBase64}
28-
2928
import org.apache.spark.sql.catalyst.InternalRow
3029
import org.apache.spark.sql.catalyst.analysis.{ExpressionBuilder, FunctionRegistry, TypeCheckResult}
3130
import org.apache.spark.sql.catalyst.expressions.codegen._
@@ -2345,13 +2344,13 @@ case class Base64(child: Expression)
23452344
override def inputTypes: Seq[DataType] = Seq(BinaryType)

/**
 * Encodes the binary input as a single-line Base64 string.
 *
 * The basic JDK encoder is used deliberately. The Commons Codec call this code used before,
 * `encodeBase64(byte[])`, never chunked its output, whereas the JDK MIME encoder inserts a
 * CRLF after every 76 output characters. Using the MIME encoder would therefore change the
 * result for any input longer than 57 bytes. The basic encoder uses the same alphabet and
 * padding and never emits line separators.
 *
 * @param bytes the non-null input value, expected to be an `Array[Byte]`
 * @return the Base64 text as a `UTF8String`
 */
protected override def nullSafeEval(bytes: Any): Any = {
  UTF8String.fromBytes(JBase64.getEncoder.encode(bytes.asInstanceOf[Array[Byte]]))
}

/**
 * Generates Java code equivalent to [[nullSafeEval]]. It must use the same (basic, unchunked)
 * encoder so that interpreted and generated evaluation return identical strings.
 */
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
  nullSafeCodeGen(ctx, ev, (child) => {
    s"""${ev.value} = UTF8String.fromBytes(
          ${classOf[JBase64].getName}.getEncoder().encode($child));
     """})
}
23572356

@@ -2377,12 +2376,12 @@ case class UnBase64(child: Expression)
23772376
override def inputTypes: Seq[DataType] = Seq(StringType)

/**
 * Decodes a Base64 string into its binary form using the JDK MIME decoder.
 *
 * @param string the non-null input value, expected to be a `UTF8String`
 * @return the decoded bytes
 */
protected override def nullSafeEval(string: Any): Any = {
  val encoded = string.asInstanceOf[UTF8String].toString
  JBase64.getMimeDecoder.decode(encoded)
}

/**
 * Generates Java code that performs the same MIME decoding as [[nullSafeEval]].
 */
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
  // Resolved once here; the generated source refers to the JDK class by its full name.
  val base64Class = classOf[JBase64].getName
  nullSafeCodeGen(ctx, ev, (child) => {
    s"""
       ${ev.value} = $base64Class.getMimeDecoder().decode($child.toString());
     """})
}
23882387

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
2+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
3+
encode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
4+
------------------------------------------------------------------------------------------------------------------------
5+
java 4000 4121 204 5.0 200.0 1.0X
6+
apache 34197 34280 71 0.6 1709.9 0.1X
7+
8+
OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
9+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
10+
encode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
11+
------------------------------------------------------------------------------------------------------------------------
12+
java 4696 4761 62 4.3 234.8 1.0X
13+
apache 35117 35342 262 0.6 1755.9 0.1X
14+
15+
OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
16+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
17+
encode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
18+
------------------------------------------------------------------------------------------------------------------------
19+
java 6059 6192 120 3.3 303.0 1.0X
20+
apache 36995 37108 109 0.5 1849.8 0.2X
21+
22+
OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
23+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
24+
encode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
25+
------------------------------------------------------------------------------------------------------------------------
26+
java 6993 7032 52 2.9 349.6 1.0X
27+
apache 37686 37888 198 0.5 1884.3 0.2X
28+
29+
OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
30+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
31+
decode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
32+
------------------------------------------------------------------------------------------------------------------------
33+
java 5322 5503 162 3.8 266.1 1.0X
34+
apache 35180 35391 195 0.6 1759.0 0.2X
35+
36+
OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
37+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
38+
decode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
39+
------------------------------------------------------------------------------------------------------------------------
40+
java 6780 6814 38 2.9 339.0 1.0X
41+
apache 35161 35279 102 0.6 1758.1 0.2X
42+
43+
OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
44+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
45+
decode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
46+
------------------------------------------------------------------------------------------------------------------------
47+
java 8941 9068 130 2.2 447.1 1.0X
48+
apache 41628 41704 122 0.5 2081.4 0.2X
49+
50+
OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
51+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
52+
decode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
53+
------------------------------------------------------------------------------------------------------------------------
54+
java 10248 10336 77 2.0 512.4 1.0X
55+
apache 42702 42732 47 0.5 2135.1 0.2X
56+
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
2+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
3+
encode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
4+
------------------------------------------------------------------------------------------------------------------------
5+
java 3787 3862 75 5.3 189.3 1.0X
6+
apache 28972 29107 153 0.7 1448.6 0.1X
7+
8+
OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
9+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
10+
encode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
11+
------------------------------------------------------------------------------------------------------------------------
12+
java 4732 4741 8 4.2 236.6 1.0X
13+
apache 31133 31330 230 0.6 1556.6 0.2X
14+
15+
OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
16+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
17+
encode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
18+
------------------------------------------------------------------------------------------------------------------------
19+
java 5928 5940 11 3.4 296.4 1.0X
20+
apache 31932 31981 47 0.6 1596.6 0.2X
21+
22+
OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
23+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
24+
encode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
25+
------------------------------------------------------------------------------------------------------------------------
26+
java 6290 6312 36 3.2 314.5 1.0X
27+
apache 33568 33677 107 0.6 1678.4 0.2X
28+
29+
OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
30+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
31+
decode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
32+
------------------------------------------------------------------------------------------------------------------------
33+
java 5087 5162 67 3.9 254.4 1.0X
34+
apache 30471 30598 161 0.7 1523.6 0.2X
35+
36+
OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
37+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
38+
decode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
39+
------------------------------------------------------------------------------------------------------------------------
40+
java 6362 6384 22 3.1 318.1 1.0X
41+
apache 32436 32560 107 0.6 1621.8 0.2X
42+
43+
OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
44+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
45+
decode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
46+
------------------------------------------------------------------------------------------------------------------------
47+
java 8808 8812 5 2.3 440.4 1.0X
48+
apache 37324 37537 215 0.5 1866.2 0.2X
49+
50+
OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
51+
Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
52+
decode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
53+
------------------------------------------------------------------------------------------------------------------------
54+
java 9904 9915 11 2.0 495.2 1.0X
55+
apache 39963 40190 215 0.5 1998.2 0.2X
56+
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
2+
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
3+
encode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
4+
------------------------------------------------------------------------------------------------------------------------
5+
java 5408 5970 745 3.7 270.4 1.0X
6+
apache 35038 35285 216 0.6 1751.9 0.2X
7+
8+
OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
9+
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
10+
encode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
11+
------------------------------------------------------------------------------------------------------------------------
12+
java 5950 6191 209 3.4 297.5 1.0X
13+
apache 37222 37440 191 0.5 1861.1 0.2X
14+
15+
OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
16+
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
17+
encode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
18+
------------------------------------------------------------------------------------------------------------------------
19+
java 7472 7815 363 2.7 373.6 1.0X
20+
apache 40215 40300 143 0.5 2010.7 0.2X
21+
22+
OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
23+
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
24+
encode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
25+
------------------------------------------------------------------------------------------------------------------------
26+
java 9548 9721 296 2.1 477.4 1.0X
27+
apache 40876 41011 143 0.5 2043.8 0.2X
28+
29+
OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
30+
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
31+
decode for 1: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
32+
------------------------------------------------------------------------------------------------------------------------
33+
java 6835 7203 624 2.9 341.8 1.0X
34+
apache 37065 37202 184 0.5 1853.3 0.2X
35+
36+
OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
37+
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
38+
decode for 3: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
39+
------------------------------------------------------------------------------------------------------------------------
40+
java 8151 8292 187 2.5 407.5 1.0X
41+
apache 39188 39455 262 0.5 1959.4 0.2X
42+
43+
OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
44+
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
45+
decode for 5: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
46+
------------------------------------------------------------------------------------------------------------------------
47+
java 11225 11582 429 1.8 561.2 1.0X
48+
apache 42835 42987 145 0.5 2141.8 0.3X
49+
50+
OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
51+
Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
52+
decode for 7: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
53+
------------------------------------------------------------------------------------------------------------------------
54+
java 13722 13987 301 1.5 686.1 1.0X
55+
apache 44221 44443 354 0.5 2211.0 0.3X
56+
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.execution.benchmark
19+
20+
import org.apache.spark.benchmark.Benchmark
21+
22+
/**
23+
* Benchmark for measuring perf of different Base64 implementations
24+
* To run this benchmark:
25+
* {{{
26+
* 1. without sbt:
27+
* bin/spark-submit --class <this class> --jars <spark core test jar> <sql core test jar>
28+
* 2. build/sbt "sql/test:runMain <this class>"
29+
* 3. generate result:
30+
* SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
31+
* Results will be written to "benchmarks/Base64Benchmark-results.txt".
32+
* }}}
33+
*/
34+
object Base64Benchmark extends SqlBasedBenchmark {
  import spark.implicits._

  // Number of rows pushed through every benchmark case.
  private val N = 20L * 1000 * 1000

  /**
   * Builds N rows, each holding "Spark" repeated `len` times, and applies `f` to the bytes of
   * every row. The result of `f` is discarded.
   */
  private def doEncode(len: Int, f: Array[Byte] => Array[Byte]): Unit = {
    val rows = spark.range(N).map(_ => "Spark" * len)
    rows.foreach { text =>
      f(text.getBytes)
      ()
    }
  }

  /**
   * Builds N rows of Base64-encoded "Spark" * `len` and applies `f` to every encoded row.
   * The result of `f` is discarded.
   */
  private def doDecode(len: Int, f: Array[Byte] => Array[Byte]): Unit = {
    val encodedRows = spark.range(N).map(_ => "Spark" * len).map { text =>
      // Every candidate decoder is fed input produced by the same JDK MIME encoder.
      java.util.Base64.getMimeEncoder.encode(text.getBytes)
    }
    encodedRows.foreach { encoded =>
      f(encoded)
      ()
    }
  }

  /**
   * Runs the encode benchmarks for each payload length, then the decode benchmarks, comparing
   * the JDK implementation with the Apache Commons Codec one.
   */
  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    val lengths = Seq(1, 3, 5, 7)

    // All benchmarks of one kind are built first and only then executed.
    val encodeBenchmarks = for (len <- lengths) yield {
      val bm = new Benchmark(s"encode for $len", N, output = output)
      bm.addCase("java", 3) { _ =>
        doEncode(len, bytes => java.util.Base64.getMimeEncoder().encode(bytes))
      }
      bm.addCase("apache", 3) { _ =>
        doEncode(len, org.apache.commons.codec.binary.Base64.encodeBase64)
      }
      bm
    }
    encodeBenchmarks.foreach(_.run())

    val decodeBenchmarks = for (len <- lengths) yield {
      val bm = new Benchmark(s"decode for $len", N, output = output)
      bm.addCase("java", 3) { _ =>
        doDecode(len, bytes => java.util.Base64.getMimeDecoder.decode(bytes))
      }
      bm.addCase("apache", 3) { _ =>
        doDecode(len, org.apache.commons.codec.binary.Base64.decodeBase64)
      }
      bm
    }
    decodeBenchmarks.foreach(_.run())
  }
}

0 commit comments

Comments
 (0)