Skip to content

Commit 452c1b6

Browse files
yaooqinn authored and LuciferYang committed
[SPARK-48551][SQL] Perf improvement for escapePathName
### What changes were proposed in this pull request?

This PR improves the performance of escapePathName. The algorithm is briefly described as follows:

- If a path contains no special characters, we return the original string itself instead of creating a new StringBuilder and appending char by char.
- If a path contains special characters, we locate the index (IDX) of the first special character, initialize the StringBuilder with the range [0, IDX) of the original string, and then escape characters as two-digit hexadecimal values where necessary, starting from IDX.
- An optimized char-to-hex function replaces `String.format`.

In short, this adds a fast path for storage paths (or their parts) that do not require escaping, to avoid creating a StringBuilder and appending per character.

### Why are the changes needed?

Performance improvement for hotspots.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- New tests in ExternalCatalogUtilsSuite
- Benchmark results (9x faster)

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #46894 from yaooqinn/SPARK-48551.

Authored-by: Kent Yao <yao@apache.org>
Signed-off-by: yangjie01 <yangjie01@baidu.com>
1 parent 53d65fd commit 452c1b6

File tree

5 files changed

+168
-14
lines changed

5 files changed

+168
-14
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
================================================================================================
2+
Escape
3+
================================================================================================
4+
5+
OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure
6+
AMD EPYC 7763 64-Core Processor
7+
Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
8+
------------------------------------------------------------------------------------------------------------------------
9+
Legacy 7128 7146 8 0.1 7127.9 1.0X
10+
New 790 795 5 1.3 789.7 9.0X
11+
12+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
================================================================================================
2+
Escape
3+
================================================================================================
4+
5+
OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure
6+
AMD EPYC 7763 64-Core Processor
7+
Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
8+
------------------------------------------------------------------------------------------------------------------------
9+
Legacy 6719 6726 6 0.1 6719.3 1.0X
10+
New 735 744 21 1.4 735.3 9.1X
11+
12+

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ object ExternalCatalogUtils {
4040
// The following string escaping code is mainly copied from Hive (o.a.h.h.common.FileUtils).
4141
//////////////////////////////////////////////////////////////////////////////////////////////////
4242

43-
val charToEscape = {
43+
final val (charToEscape, sizeOfCharToEscape) = {
4444
val bitSet = new java.util.BitSet(128)
4545

4646
/**
@@ -60,28 +60,42 @@ object ExternalCatalogUtils {
6060
Array(' ', '<', '>', '|').foreach(bitSet.set(_))
6161
}
6262

63-
bitSet
63+
(bitSet, bitSet.size)
6464
}
6565

66-
def needsEscaping(c: Char): Boolean = {
67-
c < charToEscape.size() && charToEscape.get(c)
66+
private final val HEX_CHARS = "0123456789ABCDEF".toCharArray
67+
68+
@inline final def needsEscaping(c: Char): Boolean = {
69+
c < sizeOfCharToEscape && charToEscape.get(c)
6870
}
6971

7072
def escapePathName(path: String): String = {
71-
val builder = new StringBuilder()
72-
path.foreach { c =>
73-
if (needsEscaping(c)) {
74-
builder.append('%')
75-
builder.append(f"${c.asInstanceOf[Int]}%02X")
76-
} else {
77-
builder.append(c)
73+
if (path == null || path.isEmpty) {
74+
return path
75+
}
76+
val length = path.length
77+
var firstIndex = 0
78+
while (firstIndex < length && !needsEscaping(path.charAt(firstIndex))) {
79+
firstIndex += 1
80+
}
81+
if (firstIndex == length) {
82+
path
83+
} else {
84+
val sb = new java.lang.StringBuilder(length + 16)
85+
if (firstIndex != 0) sb.append(path, 0, firstIndex)
86+
while(firstIndex < length) {
87+
val c = path.charAt(firstIndex)
88+
if (needsEscaping(c)) {
89+
sb.append('%').append(HEX_CHARS((c & 0xF0) >> 4)).append(HEX_CHARS(c & 0x0F))
90+
} else {
91+
sb.append(c)
92+
}
93+
firstIndex += 1
7894
}
95+
sb.toString
7996
}
80-
81-
builder.toString()
8297
}
8398

84-
8599
def unescapePathName(path: String): String = {
86100
val sb = new StringBuilder
87101
var i = 0
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.spark.sql.catalyst
18+
19+
import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
20+
import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
21+
22+
/**
23+
* Benchmark for path escaping
24+
* To run this benchmark:
25+
* {{{
26+
* 1. without sbt:
27+
* bin/spark-submit --class <this class> --jars <spark core test jar> <spark catalyst test jar>
28+
* 2. build/sbt "catalyst/Test/runMain <this class>"
29+
* 3. generate result:
30+
* SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "catalyst/Test/runMain <this class>"
31+
* Results will be written to "benchmarks/EscapePathBenchmark-results.txt".
32+
* }}}
33+
*/
34+
object EscapePathBenchmark extends BenchmarkBase {
35+
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
36+
val N = 1000000
37+
runBenchmark("Escape") {
38+
val benchmark = new Benchmark("Escape Tests", N, 10, output = output)
39+
val paths = Seq(
40+
"https://issues.apache.org/jira/browse/SPARK-48551",
41+
"https...issues.apache.org/jira/browse/SPARK-48551",
42+
"https...issues.apache.org.jira/browse/SPARK-48551",
43+
"https...issues.apache.org.jira.browse/SPARK-48551",
44+
"https...issues.apache.org.jira.browse.SPARK-48551")
45+
benchmark.addCase("Legacy") { _ =>
46+
(1 to N).foreach(_ => paths.foreach(escapePathNameLegacy))
47+
}
48+
49+
benchmark.addCase("New") { _ =>
50+
(1 to N).foreach(_ => {
51+
paths.foreach(ExternalCatalogUtils.escapePathName)
52+
})
53+
}
54+
benchmark.run()
55+
}
56+
}
57+
58+
/**
59+
* Legacy implementation of escapePathName before Spark 4.0
60+
*/
61+
def escapePathNameLegacy(path: String): String = {
62+
val builder = new StringBuilder()
63+
path.foreach { c =>
64+
if (ExternalCatalogUtils.needsEscaping(c)) {
65+
builder.append('%')
66+
builder.append(f"${c.asInstanceOf[Int]}%02X")
67+
} else {
68+
builder.append(c)
69+
}
70+
}
71+
72+
builder.toString()
73+
}
74+
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.catalyst.catalog
19+
20+
import org.apache.spark.SparkFunSuite
21+
import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName
22+
23+
class ExternalCatalogUtilsSuite extends SparkFunSuite {
24+
25+
test("SPARK-48551: escapePathName") {
26+
ExternalCatalogUtils.charToEscape.stream().toArray.map(_.asInstanceOf[Char]).foreach { c =>
27+
// Check parity with old conversion technique:
28+
assert(escapePathName(c.toString) === "%" + f"$c%02X",
29+
s"wrong escaping for $c")
30+
}
31+
assert(escapePathName("") === "")
32+
assert(escapePathName(" ") === " ")
33+
assert(escapePathName("\n") === "%0A")
34+
assert(escapePathName("a b") === "a b")
35+
assert(escapePathName("a:b") === "a%3Ab")
36+
assert(escapePathName(":ab") === "%3Aab")
37+
assert(escapePathName("ab:") === "ab%3A")
38+
assert(escapePathName("a%b") === "a%25b")
39+
assert(escapePathName("a,b") === "a,b")
40+
assert(escapePathName("a/b") === "a%2Fb")
41+
}
42+
}

0 commit comments

Comments
 (0)