
Commit 2814293

shujingyang-db authored and dongjoon-hyun committed
[SPARK-45962][SQL] Remove treatEmptyValuesAsNulls and use nullValue option instead in XML
### What changes were proposed in this pull request?

Remove treatEmptyValuesAsNulls and use the nullValue option instead in XML.

### Why are the changes needed?

Today, we offer two available options to handle null values. To enhance user clarity and simplify usage, we propose consolidating these into a single option. We recommend retaining the nullValue option due to its broader semantic scope.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Unit tests

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #43852 from shujingyang-db/treatEmptyValue.

Authored-by: Shujing Yang <shujing.yang@databricks.com>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
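For readers migrating code, here is a minimal before/after sketch of the option change, assuming an active SparkSession `spark`; the row tag and path are placeholders, not values from this PR:

```scala
// Before this change (option now removed):
//   spark.read
//     .option("rowTag", "ROW")
//     .option("treatEmptyValuesAsNulls", "true")
//     .xml("/path/to/data.xml")

// After this change, setting nullValue to the empty string covers the
// same behavior: empty elements and attributes parse as null.
val df = spark.read
  .option("rowTag", "ROW")      // placeholder row tag
  .option("nullValue", "")      // "" is treated as null
  .xml("/path/to/data.xml")     // placeholder path
```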
1 parent a7147c8 · commit 2814293

File tree

5 files changed: +14 -17 lines changed


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala

Lines changed: 6 additions & 7 deletions
@@ -183,8 +183,8 @@ class StaxXmlParser(
     (parser.peek, dataType) match {
       case (_: StartElement, dt: DataType) => convertComplicatedType(dt, attributes)
       case (_: EndElement, _: StringType) =>
-        // Empty. It's null if these are explicitly treated as null, or "" is the null value
-        if (options.treatEmptyValuesAsNulls || options.nullValue == "") {
+        // Empty. It's null if "" is the null value
+        if (options.nullValue == "") {
           null
         } else {
           UTF8String.fromString("")
@@ -224,7 +224,8 @@
     parser.peek match {
       case _: StartElement => convertComplicatedType(dataType, attributes)
       case _: EndElement if data.isEmpty => null
-      case _: EndElement if options.treatEmptyValuesAsNulls => null
+      // treat empty values as null
+      case _: EndElement if options.nullValue == "" => null
       case _: EndElement => convertTo(data, dataType)
       case _ => convertField(parser, dataType, attributes)
     }
@@ -444,8 +445,7 @@
   private def castTo(
       datum: String,
       castType: DataType): Any = {
-    if ((datum == options.nullValue) ||
-      (options.treatEmptyValuesAsNulls && datum == "")) {
+    if (datum == options.nullValue || datum == null) {
       null
     } else {
       castType match {
@@ -493,8 +493,7 @@
     } else {
       datum
     }
-    if ((value == options.nullValue) ||
-      (options.treatEmptyValuesAsNulls && value == "")) {
+    if (value == options.nullValue || value == null) {
       null
     } else {
       dataType match {
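The consolidated check in `castTo` can be read in isolation; below is a minimal standalone sketch under simplified, hypothetical types (`SimpleXmlOptions` and `castToOrNull` are illustrative names, not the real Spark classes):

```scala
// Minimal sketch of the consolidated null check (hypothetical types).
final case class SimpleXmlOptions(nullValue: String)

def castToOrNull(datum: String, options: SimpleXmlOptions): Option[String] =
  // A single comparison against nullValue now covers both the explicit
  // null token and, when nullValue == "", the empty-string case that
  // treatEmptyValuesAsNulls used to handle separately.
  if (datum == null || datum == options.nullValue) None
  else Some(datum)
```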

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParserUtils.scala

Lines changed: 1 addition & 1 deletion
@@ -96,7 +96,7 @@ object StaxXmlParserUtils {
     attributes.map { attr =>
       val key = options.attributePrefix + getName(attr.getName, options)
       val value = attr.getValue match {
-        case v if options.treatEmptyValuesAsNulls && v.trim.isEmpty => null
+        case v if (options.nullValue == "") && v.trim.isEmpty => null
         case v => v
       }
       key -> value

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
     parser.peek match {
       case _: StartElement => inferObject(parser)
       case _: EndElement if data.isEmpty => NullType
-      case _: EndElement if options.treatEmptyValuesAsNulls => NullType
+      case _: EndElement if options.nullValue == "" => NullType
       case _: EndElement => StringType
       case _ => inferField(parser)
     }
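The inference rule changed by this hunk can be summarized on its own; here is a hedged sketch assuming only the Catalyst type imports shown (the function is illustrative, not the real XmlInferSchema API):

```scala
import org.apache.spark.sql.types.{DataType, NullType, StringType}

// Illustrative rule for an empty element encountered at an EndElement:
// it infers NullType only when the configured nullValue is the empty
// string; otherwise the empty element remains a StringType.
def inferEmptyElement(nullValue: String): DataType =
  if (nullValue == "") NullType else StringType
```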

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlOptions.scala

Lines changed: 0 additions & 2 deletions
@@ -86,7 +86,6 @@ class XmlOptions(
   val samplingRatio = parameters.get(SAMPLING_RATIO).map(_.toDouble).getOrElse(1.0)
   require(samplingRatio > 0, s"$SAMPLING_RATIO ($samplingRatio) should be greater than 0")
   val excludeAttributeFlag = getBool(EXCLUDE_ATTRIBUTE, false)
-  val treatEmptyValuesAsNulls = getBool(TREAT_EMPTY_VALUE_AS_NULLS, false)
   val attributePrefix =
     parameters.getOrElse(ATTRIBUTE_PREFIX, XmlOptions.DEFAULT_ATTRIBUTE_PREFIX)
   val valueTag = parameters.getOrElse(VALUE_TAG, XmlOptions.DEFAULT_VALUE_TAG)
@@ -188,7 +187,6 @@ object XmlOptions extends DataSourceOptions {
   val DECLARATION = newOption("declaration")
   val ARRAY_ELEMENT_NAME = newOption("arrayElementName")
   val EXCLUDE_ATTRIBUTE = newOption("excludeAttribute")
-  val TREAT_EMPTY_VALUE_AS_NULLS = newOption("treatEmptyValuesAsNulls")
   val ATTRIBUTE_PREFIX = newOption("attributePrefix")
   val VALUE_TAG = newOption("valueTag")
   val NULL_VALUE = newOption("nullValue")

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala

Lines changed: 6 additions & 6 deletions
@@ -77,7 +77,7 @@ class XmlSuite extends QueryTest with SharedSparkSession {
   test("DSL test with xml having unbalanced datatypes") {
     val results = spark.read
       .option("rowTag", "ROW")
-      .option("treatEmptyValuesAsNulls", "true")
+      .option("nullValue", "")
       .option("multiLine", "true")
       .xml(getTestResourcePath(resDir + "gps-empty-field.xml"))

@@ -440,7 +440,7 @@
     assert(getLines(xmlFile).count(_.contains("<foo>")) === 2)
   }

-  test("DSL save with nullValue and treatEmptyValuesAsNulls") {
+  test("DSL save with nullValue") {
     val copyFilePath = getEmptyTempDir().resolve("books-copy.xml")

     val books = spark.read
@@ -452,7 +452,7 @@

     val booksCopy = spark.read
       .option("rowTag", "book")
-      .option("treatEmptyValuesAsNulls", "true")
+      .option("nullValue", "")
       .xml(copyFilePath.toString)

     assert(booksCopy.count() === books.count())
@@ -741,7 +741,7 @@
       field("age", IntegerType))
     val results = spark.read.schema(schema)
       .option("rowTag", "ROW")
-      .option("treatEmptyValuesAsNulls", true)
+      .option("nullValue", "")
       .xml(getTestResourcePath(resDir + "null-numbers.xml"))
       .select("name", "age")
       .collect()
@@ -950,10 +950,10 @@
       "requirement failed: 'valueTag' and 'attributePrefix' options should not be the same.")
   }

-  test("nullValue and treatEmptyValuesAsNulls test") {
+  test("nullValue test") {
     val resultsOne = spark.read
       .option("rowTag", "ROW")
-      .option("treatEmptyValuesAsNulls", "true")
+      .option("nullValue", "")
       .xml(getTestResourcePath(resDir + "gps-empty-field.xml"))
     assert(resultsOne.selectExpr("extensions.TrackPointExtension").head().getStruct(0) !== null)
     assert(resultsOne.selectExpr("extensions.TrackPointExtension")
