Commit 8ab6ae3

iRakson authored and cloud-fan committed
[SPARK-30790] The dataType of map() should be map<null,null>
### What changes were proposed in this pull request?

`spark.sql("select map()")` returns `{}`. After these changes it will return `map<null,null>`.

### Why are the changes needed?

After the changes introduced by #27521, it is important to maintain consistency when using `map()`.

### Does this PR introduce any user-facing change?

Yes. `map()` will now give `map<null,null>` instead of `{}`.

### How was this patch tested?

UT added. The migration guide was updated as well.

Closes #27542 from iRakson/SPARK-30790.

Authored-by: iRakson <raksonrakesh@gmail.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
(cherry picked from commit 926e3a1)
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 8298173 commit 8ab6ae3
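
For context, here is a minimal spark-shell sketch of the user-facing change described above. It assumes a running `SparkSession` named `spark`; the schema comments are paraphrased expectations, not captured output.

```scala
import org.apache.spark.sql.functions.map

// New Spark 3.0 default: map() with no arguments is typed map<null,null>.
spark.range(1).select(map()).printSchema()
// expect a MapType whose key and value types are both null (NullType)

// The legacy flag touched by this patch restores the 2.4 typing, map<string,string>:
spark.conf.set("spark.sql.legacy.createEmptyCollectionUsingStringType", "true")
spark.range(1).select(map()).printSchema()
// expect a MapType with string keys and string values
```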

5 files changed: +36 -20 lines changed


docs/sql-migration-guide.md

Lines changed: 1 addition & 1 deletion

@@ -216,7 +216,7 @@ license: |
 
 - Since Spark 3.0, the `size` function returns `NULL` for the `NULL` input. In Spark version 2.4 and earlier, this function gives `-1` for the same input. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.sizeOfNull` to `true`.
 
-- Since Spark 3.0, when the `array` function is called without any parameters, it returns an empty array of `NullType`. In Spark version 2.4 and earlier, it returns an empty array of string type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.arrayDefaultToStringType.enabled` to `true`.
+- Since Spark 3.0, when the `array`/`map` function is called without any parameters, it returns an empty collection with `NullType` as element type. In Spark version 2.4 and earlier, it returns an empty collection with `StringType` as element type. To restore the behavior before Spark 3.0, you can set `spark.sql.legacy.createEmptyCollectionUsingStringType` to `true`.
 
 - Since Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, `SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH'` throws parser exception.
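
As a concrete companion to the migration note, a small sketch of the `array()` half of the same behavior change (assumes a live `spark` session; the result comments are expectations under the rule described above, not verbatim output):

```scala
import org.apache.spark.sql.functions.array

// Spark 3.0 default: empty array() has NullType elements.
val newType = spark.range(1).select(array()).schema.head.dataType
// expect: ArrayType(NullType, containsNull = false)

// With the legacy flag, the Spark 2.4 element type returns:
spark.conf.set("spark.sql.legacy.createEmptyCollectionUsingStringType", "true")
val legacyType = spark.range(1).select(array()).schema.head.dataType
// expect: ArrayType(StringType, containsNull = false)
```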

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala

Lines changed: 11 additions & 3 deletions

@@ -46,7 +46,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression {
   }
 
   private val defaultElementType: DataType = {
-    if (SQLConf.get.getConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING)) {
+    if (SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) {
       StringType
     } else {
       NullType
@@ -145,6 +145,14 @@ case class CreateMap(children: Seq[Expression]) extends Expression {
   lazy val keys = children.indices.filter(_ % 2 == 0).map(children)
   lazy val values = children.indices.filter(_ % 2 != 0).map(children)
 
+  private val defaultElementType: DataType = {
+    if (SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE)) {
+      StringType
+    } else {
+      NullType
+    }
+  }
+
   override def foldable: Boolean = children.forall(_.foldable)
 
   override def checkInputDataTypes(): TypeCheckResult = {
@@ -167,9 +175,9 @@ case class CreateMap(children: Seq[Expression]) extends Expression {
   override lazy val dataType: MapType = {
     MapType(
       keyType = TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(keys.map(_.dataType))
-        .getOrElse(StringType),
+        .getOrElse(defaultElementType),
       valueType = TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(values.map(_.dataType))
-        .getOrElse(StringType),
+        .getOrElse(defaultElementType),
       valueContainsNull = values.exists(_.nullable))
   }
 
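
The heart of this file's change is the `.getOrElse(defaultElementType)` fallback in `CreateMap.dataType`. Below is a self-contained toy sketch of that resolution order; the stand-in types and the deliberately crude `findCommonType` are hypothetical substitutes for Catalyst's `TypeCoercion.findCommonTypeDifferentOnlyInNullFlags`.

```scala
object MapTypeFallbackSketch {
  sealed trait DataType
  case object StringType extends DataType
  case object NullType extends DataType
  case object IntegerType extends DataType

  // Crude stand-in: a common type exists only when every input agrees.
  def findCommonType(types: Seq[DataType]): Option[DataType] =
    if (types.nonEmpty && types.forall(_ == types.head)) Some(types.head) else None

  // Mirrors the patched fallback: explicit key types win; with no keys at all,
  // the default element type is NullType (or StringType under the legacy flag).
  def resolveKeyType(keyTypes: Seq[DataType], legacyStringDefault: Boolean): DataType =
    findCommonType(keyTypes).getOrElse(if (legacyStringDefault) StringType else NullType)

  def main(args: Array[String]): Unit = {
    assert(resolveKeyType(Nil, legacyStringDefault = false) == NullType)   // map()
    assert(resolveKeyType(Nil, legacyStringDefault = true) == StringType)  // legacy map()
    assert(resolveKeyType(Seq(IntegerType), legacyStringDefault = false) == IntegerType)
  }
}
```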

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapBuilder.scala

Lines changed: 2 additions & 3 deletions

@@ -29,12 +29,11 @@ import org.apache.spark.unsafe.array.ByteArrayMethods
  */
 class ArrayBasedMapBuilder(keyType: DataType, valueType: DataType) extends Serializable {
   assert(!keyType.existsRecursively(_.isInstanceOf[MapType]), "key of map cannot be/contain map")
-  assert(keyType != NullType, "map key cannot be null type.")
 
   private lazy val keyToIndex = keyType match {
     // Binary type data is `byte[]`, which can't use `==` to check equality.
-    case _: AtomicType | _: CalendarIntervalType if !keyType.isInstanceOf[BinaryType] =>
-      new java.util.HashMap[Any, Int]()
+    case _: AtomicType | _: CalendarIntervalType | _: NullType
+      if !keyType.isInstanceOf[BinaryType] => new java.util.HashMap[Any, Int]()
     case _ =>
       // for complex types, use interpreted ordering to be able to compare unsafe data with safe
       // data, e.g. UnsafeRow vs GenericInternalRow.
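
For orientation, here is a much-simplified, hypothetical sketch of the deduplication pattern `ArrayBasedMapBuilder` relies on (`SimpleMapBuilder` is not Spark code). It shows why a `java.util.HashMap`-backed index is enough for any key type that supports `==` and `hashCode`, which after this change includes null-typed keys:

```scala
import scala.collection.mutable.ArrayBuffer

// Hypothetical stand-in for ArrayBasedMapBuilder's HashMap-based dedup path.
class SimpleMapBuilder[K, V] {
  private val keyToIndex = new java.util.HashMap[K, Int]()
  private val keys = ArrayBuffer.empty[K]
  private val values = ArrayBuffer.empty[V]

  def put(key: K, value: V): Unit = {
    if (keyToIndex.containsKey(key)) {
      values(keyToIndex.get(key)) = value // duplicate key: last write wins
    } else {
      keyToIndex.put(key, keys.length)    // remember the key's insertion slot
      keys += key
      values += value
    }
  }

  def build(): Seq[(K, V)] = keys.toSeq.zip(values.toSeq)
}
```

For example, putting `(1, "a")`, `(2, "b")`, then `(1, "c")` builds `Seq((1, "c"), (2, "b"))`: duplicate keys are collapsed in place while first-insertion order is kept.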

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 5 additions & 5 deletions

@@ -2007,12 +2007,12 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
-  val LEGACY_ARRAY_DEFAULT_TO_STRING =
-    buildConf("spark.sql.legacy.arrayDefaultToStringType.enabled")
+  val LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE =
+    buildConf("spark.sql.legacy.createEmptyCollectionUsingStringType")
       .internal()
-      .doc("When set to true, it returns an empty array of string type when the `array` " +
-        "function is called without any parameters. Otherwise, it returns an empty " +
-        "array of `NullType`")
+      .doc("When set to true, Spark returns an empty collection with `StringType` as element " +
+        "type if the `array`/`map` function is called without any parameters. Otherwise, Spark " +
+        "returns an empty collection with `NullType` as element type.")
       .booleanConf
       .createWithDefault(false)
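
Because the entry is marked `.internal()` it stays out of the documented configuration list, but it can still be set like any other SQL conf. A hedged usage sketch (the key string comes straight from the diff above; `spark` is assumed to be a live `SparkSession`):

```scala
// At runtime, on an existing session:
spark.conf.set("spark.sql.legacy.createEmptyCollectionUsingStringType", "true")

// Or at session construction time:
// val spark = org.apache.spark.sql.SparkSession.builder()
//   .config("spark.sql.legacy.createEmptyCollectionUsingStringType", "true")
//   .getOrCreate()
```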

sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala

Lines changed: 17 additions & 8 deletions

@@ -3499,13 +3499,6 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
     ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_))
   }
 
-  test("SPARK-21281 use string types by default if map have no argument") {
-    val ds = spark.range(1)
-    var expectedSchema = new StructType()
-      .add("x", MapType(StringType, StringType, valueContainsNull = false), nullable = false)
-    assert(ds.select(map().as("x")).schema == expectedSchema)
-  }
-
   test("SPARK-21281 fails if functions have no argument") {
     val df = Seq(1).toDF("a")
 
@@ -3563,14 +3556,30 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
   test("SPARK-29462: Empty array of NullType for array function with no arguments") {
     Seq((true, StringType), (false, NullType)).foreach {
       case (arrayDefaultToString, expectedType) =>
-        withSQLConf(SQLConf.LEGACY_ARRAY_DEFAULT_TO_STRING.key -> arrayDefaultToString.toString) {
+        withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key ->
+          arrayDefaultToString.toString) {
           val schema = spark.range(1).select(array()).schema
           assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[ArrayType])
           val actualType = schema.head.dataType.asInstanceOf[ArrayType].elementType
           assert(actualType === expectedType)
         }
     }
   }
+
+  test("SPARK-30790: Empty map with NullType as key/value type for map function with no argument") {
+    Seq((true, StringType), (false, NullType)).foreach {
+      case (mapDefaultToString, expectedType) =>
+        withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key ->
+          mapDefaultToString.toString) {
+          val schema = spark.range(1).select(map()).schema
+          assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[MapType])
+          val actualKeyType = schema.head.dataType.asInstanceOf[MapType].keyType
+          val actualValueType = schema.head.dataType.asInstanceOf[MapType].valueType
+          assert(actualKeyType === expectedType)
+          assert(actualValueType === expectedType)
+        }
+    }
+  }
 }
 
 object DataFrameFunctionsSuite {
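
To exercise these suites locally, something like `build/sbt "sql/testOnly *DataFrameFunctionsSuite"` (the usual Spark development workflow; the exact command shape is an assumption, check the project's developer docs) should run the new SPARK-30790 test alongside the updated SPARK-29462 one.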
