Skip to content

[SPARK-48498][SQL][FOLLOWUP] do padding for char-char comparison #47412

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -240,14 +240,14 @@ object CharVarcharUtils extends Logging with SparkCharVarcharUtils {
* attributes. When comparing two char type columns/fields, we need to pad the shorter one to
* the longer length.
*/
def addPaddingInStringComparison(attrs: Seq[Attribute]): Seq[Expression] = {
def addPaddingInStringComparison(attrs: Seq[Attribute], alwaysPad: Boolean): Seq[Expression] = {
val rawTypes = attrs.map(attr => getRawType(attr.metadata))
if (rawTypes.exists(_.isEmpty)) {
attrs
} else {
val typeWithTargetCharLength = rawTypes.map(_.get).reduce(typeWithWiderCharLength)
attrs.zip(rawTypes.map(_.get)).map { case (attr, rawType) =>
padCharToTargetLength(attr, rawType, typeWithTargetCharLength).getOrElse(attr)
padCharToTargetLength(attr, rawType, typeWithTargetCharLength, alwaysPad).getOrElse(attr)
}
}
}
Expand All @@ -270,9 +270,10 @@ object CharVarcharUtils extends Logging with SparkCharVarcharUtils {
private def padCharToTargetLength(
expr: Expression,
rawType: DataType,
typeWithTargetCharLength: DataType): Option[Expression] = {
typeWithTargetCharLength: DataType,
alwaysPad: Boolean): Option[Expression] = {
(rawType, typeWithTargetCharLength) match {
case (CharType(len), CharType(target)) if target > len =>
case (CharType(len), CharType(target)) if alwaysPad || target > len =>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need padding if len = target?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we don't need to. This is the existing logic. My change is always pad as the CHAR value may not be padded to its declared length.

Some(StringRPad(expr, Literal(target)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall be the bigger number here?


case (StructType(fields), StructType(targets)) =>
Expand All @@ -283,7 +284,8 @@ object CharVarcharUtils extends Logging with SparkCharVarcharUtils {
while (i < fields.length) {
val field = fields(i)
val fieldExpr = GetStructField(expr, i, Some(field.name))
val padded = padCharToTargetLength(fieldExpr, field.dataType, targets(i).dataType)
val padded = padCharToTargetLength(
fieldExpr, field.dataType, targets(i).dataType, alwaysPad)
needPadding = padded.isDefined
createStructExprs += Literal(field.name)
createStructExprs += padded.getOrElse(fieldExpr)
Expand All @@ -293,7 +295,7 @@ object CharVarcharUtils extends Logging with SparkCharVarcharUtils {

case (ArrayType(et, containsNull), ArrayType(target, _)) =>
val param = NamedLambdaVariable("x", replaceCharVarcharWithString(et), containsNull)
padCharToTargetLength(param, et, target).map { padded =>
padCharToTargetLength(param, et, target, alwaysPad).map { padded =>
val func = LambdaFunction(padded, Seq(param))
ArrayTransform(expr, func)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {
case (_, _: OuterReference) => Seq(right)
case _ => Nil
}
val newChildren = CharVarcharUtils.addPaddingInStringComparison(Seq(left, right))
val newChildren = CharVarcharUtils.addPaddingInStringComparison(
Seq(left, right), padCharCol)
if (outerRefs.nonEmpty) {
b.withNewChildren(newChildren.map(_.transform {
case a: Attribute if outerRefs.exists(_.semanticEquals(a)) => OuterReference(a)
Expand All @@ -148,7 +149,7 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {

case i @ In(e @ AttrOrOuterRef(attr), list) if list.forall(_.isInstanceOf[Attribute]) =>
val newChildren = CharVarcharUtils.addPaddingInStringComparison(
attr +: list.map(_.asInstanceOf[Attribute]))
attr +: list.map(_.asInstanceOf[Attribute]), padCharCol)
if (e.isInstanceOf[OuterReference]) {
i.copy(
value = newChildren.head.transform {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -960,25 +960,45 @@ class FileSourceCharVarcharTestSuite extends CharVarcharTestSuite with SharedSpa
import testImplicits._
withSQLConf(SQLConf.READ_SIDE_CHAR_PADDING.key -> "false") {
withTempPath { dir =>
withTable("t") {
withTable("t1", "t2") {
Seq(
"12" -> "12",
"12" -> "12 ",
"12 " -> "12",
"12 " -> "12 "
).toDF("c1", "c2").write.format(format).save(dir.toString)
sql(s"CREATE TABLE t (c1 CHAR(3), c2 STRING) USING $format LOCATION '$dir'")

sql(s"CREATE TABLE t1 (c1 CHAR(3), c2 STRING) USING $format LOCATION '$dir'")
// Comparing CHAR column with STRING column directly compares the stored value.
checkAnswer(
sql("SELECT c1 = c2 FROM t"),
sql("SELECT c1 = c2 FROM t1"),
Seq(Row(true), Row(false), Row(false), Row(true))
)
checkAnswer(
sql("SELECT c1 IN (c2) FROM t1"),
Seq(Row(true), Row(false), Row(false), Row(true))
)
// No matter the CHAR type value is padded or not in the storage, we should always pad it
// before comparison with STRING literals.
checkAnswer(
sql("SELECT c1 = '12', c1 = '12 ', c1 = '12 ' FROM t WHERE c2 = '12'"),
sql("SELECT c1 = '12', c1 = '12 ', c1 = '12 ' FROM t1 WHERE c2 = '12'"),
Seq(Row(true, true, true), Row(true, true, true))
)
checkAnswer(
sql("SELECT c1 IN ('12'), c1 IN ('12 '), c1 IN ('12 ') FROM t1 WHERE c2 = '12'"),
Seq(Row(true, true, true), Row(true, true, true))
)

sql(s"CREATE TABLE t2 (c1 CHAR(3), c2 CHAR(5)) USING $format LOCATION '$dir'")
// Comparing CHAR column with CHAR column compares the padded values.
checkAnswer(
sql("SELECT c1 = c2, c2 = c1 FROM t2"),
Seq(Row(true, true), Row(true, true), Row(true, true), Row(true, true))
)
checkAnswer(
sql("SELECT c1 IN (c2), c2 IN (c1) FROM t2"),
Seq(Row(true, true), Row(true, true), Row(true, true), Row(true, true))
)
}
}
}
Expand Down