Skip to content

Commit 69a912e

Browse files
committed
always do char padding in predicates
1 parent 3cd35f8 commit 69a912e

File tree

4 files changed

+70
-13
lines changed

4 files changed

+70
-13
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4603,6 +4603,14 @@ object SQLConf {
46034603
.booleanConf
46044604
.createWithDefault(true)
46054605

4606+
val LEGACY_NO_CHAR_PADDING_IN_PREDICATE = buildConf("spark.sql.legacy.noCharPaddingInPredicate")
4607+
.internal()
4608+
.doc("When true, Spark will not apply char type padding for CHAR type columns in string " +
4609+
s"comparison predicates, when '${READ_SIDE_CHAR_PADDING.key}' is false.")
4610+
.version("4.0.0")
4611+
.booleanConf
4612+
.createWithDefault(false)
4613+
46064614
val CLI_PRINT_HEADER =
46074615
buildConf("spark.sql.cli.print.header")
46084616
.doc("When set to true, spark-sql CLI prints the names of the columns in query output.")

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ApplyCharTypePadding.scala

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.rules.Rule
2424
import org.apache.spark.sql.catalyst.trees.TreePattern.{BINARY_COMPARISON, IN}
2525
import org.apache.spark.sql.catalyst.util.CharVarcharUtils
2626
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
27+
import org.apache.spark.sql.internal.SQLConf
2728
import org.apache.spark.sql.types.{CharType, Metadata, StringType}
2829
import org.apache.spark.unsafe.types.UTF8String
2930

@@ -66,9 +67,10 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {
6667
r.copy(dataCols = cleanedDataCols, partitionCols = cleanedPartCols)
6768
})
6869
}
69-
paddingForStringComparison(newPlan)
70+
paddingForStringComparison(newPlan, padCharCol = false)
7071
} else {
71-
paddingForStringComparison(plan)
72+
paddingForStringComparison(
73+
plan, padCharCol = !conf.getConf(SQLConf.LEGACY_NO_CHAR_PADDING_IN_PREDICATE))
7274
}
7375
}
7476

@@ -90,7 +92,7 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {
9092
}
9193
}
9294

93-
private def paddingForStringComparison(plan: LogicalPlan): LogicalPlan = {
95+
private def paddingForStringComparison(plan: LogicalPlan, padCharCol: Boolean): LogicalPlan = {
9496
plan.resolveOperatorsUpWithPruning(_.containsAnyPattern(BINARY_COMPARISON, IN)) {
9597
case operator => operator.transformExpressionsUpWithPruning(
9698
_.containsAnyPattern(BINARY_COMPARISON, IN)) {
@@ -99,12 +101,12 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {
99101
// String literal is treated as char type when it's compared to a char type column.
100102
// We should pad the shorter one to the longer length.
101103
case b @ BinaryComparison(e @ AttrOrOuterRef(attr), lit) if lit.foldable =>
102-
padAttrLitCmp(e, attr.metadata, lit).map { newChildren =>
104+
padAttrLitCmp(e, attr.metadata, padCharCol, lit).map { newChildren =>
103105
b.withNewChildren(newChildren)
104106
}.getOrElse(b)
105107

106108
case b @ BinaryComparison(lit, e @ AttrOrOuterRef(attr)) if lit.foldable =>
107-
padAttrLitCmp(e, attr.metadata, lit).map { newChildren =>
109+
padAttrLitCmp(e, attr.metadata, padCharCol, lit).map { newChildren =>
108110
b.withNewChildren(newChildren.reverse)
109111
}.getOrElse(b)
110112

@@ -117,9 +119,10 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {
117119
val literalCharLengths = literalChars.map(_.numChars())
118120
val targetLen = (length +: literalCharLengths).max
119121
Some(i.copy(
120-
value = addPadding(e, length, targetLen),
122+
value = addPadding(e, length, targetLen, alwaysPad = padCharCol),
121123
list = list.zip(literalCharLengths).map {
122-
case (lit, charLength) => addPadding(lit, charLength, targetLen)
124+
case (lit, charLength) =>
125+
addPadding(lit, charLength, targetLen, alwaysPad = false)
123126
} ++ nulls.map(Literal.create(_, StringType))))
124127
case _ => None
125128
}.getOrElse(i)
@@ -162,6 +165,7 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {
162165
private def padAttrLitCmp(
163166
expr: Expression,
164167
metadata: Metadata,
168+
padCharCol: Boolean,
165169
lit: Expression): Option[Seq[Expression]] = {
166170
if (expr.dataType == StringType) {
167171
CharVarcharUtils.getRawType(metadata).flatMap {
@@ -174,7 +178,14 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {
174178
if (length < stringLitLen) {
175179
Some(Seq(StringRPad(expr, Literal(stringLitLen)), lit))
176180
} else if (length > stringLitLen) {
177-
Some(Seq(expr, StringRPad(lit, Literal(length))))
181+
val paddedExpr = if (padCharCol) {
182+
StringRPad(expr, Literal(length))
183+
} else {
184+
expr
185+
}
186+
Some(Seq(paddedExpr, StringRPad(lit, Literal(length))))
187+
} else if (padCharCol) {
188+
Some(Seq(StringRPad(expr, Literal(length)), lit))
178189
} else {
179190
None
180191
}
@@ -186,7 +197,15 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {
186197
}
187198
}
188199

189-
private def addPadding(expr: Expression, charLength: Int, targetLength: Int): Expression = {
190-
if (targetLength > charLength) StringRPad(expr, Literal(targetLength)) else expr
200+
private def addPadding(
201+
expr: Expression,
202+
charLength: Int,
203+
targetLength: Int,
204+
alwaysPad: Boolean): Expression = {
205+
if (targetLength > charLength) {
206+
StringRPad(expr, Literal(targetLength))
207+
} else if (alwaysPad) {
208+
StringRPad(expr, Literal(charLength))
209+
} else expr
191210
}
192211
}

sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -942,6 +942,34 @@ class FileSourceCharVarcharTestSuite extends CharVarcharTestSuite with SharedSpa
942942
}
943943
}
944944
}
945+
946+
test("SPARK-48498: always do char padding in predicates") {
947+
import testImplicits._
948+
withSQLConf(SQLConf.READ_SIDE_CHAR_PADDING.key -> "false") {
949+
withTempPath { dir =>
950+
withTable("t") {
951+
Seq(
952+
"12" -> "12",
953+
"12" -> "12 ",
954+
"12 " -> "12",
955+
"12 " -> "12 "
956+
).toDF("c1", "c2").write.format(format).save(dir.toString)
957+
sql(s"CREATE TABLE t (c1 CHAR(3), c2 STRING) USING $format LOCATION '$dir'")
958+
// Comparing CHAR column with STRING column directly compares the stored value.
959+
checkAnswer(
960+
sql("SELECT c1 = c2 FROM t"),
961+
Seq(Row(true), Row(false), Row(false), Row(true))
962+
)
963+
// No matter the CHAR type value is padded or not in the storage, we should always pad it
964+
// before comparison with STRING literals.
965+
checkAnswer(
966+
sql("SELECT c1 = '12', c1 = '12 ', c1 = '12 ' FROM t WHERE c2 = '12'"),
967+
Seq(Row(true, true, true), Row(true, true, true))
968+
)
969+
}
970+
}
971+
}
972+
}
945973
}
946974

947975
class DSV2CharVarcharTestSuite extends CharVarcharTestSuite

sql/core/src/test/scala/org/apache/spark/sql/PlanStabilitySuite.scala

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -256,9 +256,11 @@ trait PlanStabilitySuite extends DisableAdaptiveExecutionSuite {
256256
protected def testQuery(tpcdsGroup: String, query: String, suffix: String = ""): Unit = {
257257
val queryString = resourceToString(s"$tpcdsGroup/$query.sql",
258258
classLoader = Thread.currentThread().getContextClassLoader)
259-
// Disable char/varchar read-side handling for better performance.
260-
withSQLConf(SQLConf.READ_SIDE_CHAR_PADDING.key -> "false",
261-
SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "10MB") {
259+
withSQLConf(
260+
// Disable char/varchar read-side handling for better performance.
261+
SQLConf.READ_SIDE_CHAR_PADDING.key -> "false",
262+
SQLConf.LEGACY_NO_CHAR_PADDING_IN_PREDICATE.key -> "true",
263+
SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "10MB") {
262264
val qe = sql(queryString).queryExecution
263265
val plan = qe.executedPlan
264266
val explain = normalizeLocation(normalizeIds(qe.explainString(FormattedMode)))

0 commit comments

Comments
 (0)