Skip to content

Commit 1f1443e

Browse files
committed
[SPARK-29347][SQL] Add JSON serialization for external Rows
### What changes were proposed in this pull request? This PR adds JSON serialization for Spark external Rows. ### Why are the changes needed? This is to be used for observable metrics, where the `StreamingQueryProgress` contains a map of observed metrics rows that needs to be serialized in some cases. ### Does this PR introduce any user-facing change? Yes, a user can call `json` or `prettyJson` on rows returned when collecting a DataFrame to the driver. ### How was this patch tested? Added a new test suite, `RowJsonSuite`, that covers this. Closes #26013 from hvanhovell/SPARK-29347. Authored-by: herman <herman@databricks.com> Signed-off-by: herman <herman@databricks.com>
1 parent ff9fcd5 commit 1f1443e

File tree

2 files changed

+246
-2
lines changed

2 files changed

+246
-2
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala

Lines changed: 106 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,24 @@
1717

1818
package org.apache.spark.sql
1919

20+
import java.sql.{Date, Timestamp}
21+
import java.time.{Instant, LocalDate}
22+
import java.util.{Base64, TimeZone}
23+
2024
import scala.collection.JavaConverters._
25+
import scala.collection.mutable
2126
import scala.util.hashing.MurmurHash3
2227

23-
import org.apache.spark.annotation.Stable
28+
import org.json4s._
29+
import org.json4s.JsonAST.JValue
30+
import org.json4s.jackson.JsonMethods._
31+
32+
import org.apache.spark.annotation.{Private, Stable, Unstable}
33+
import org.apache.spark.sql.catalyst.CatalystTypeConverters
2434
import org.apache.spark.sql.catalyst.expressions.GenericRow
25-
import org.apache.spark.sql.types.StructType
35+
import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter}
36+
import org.apache.spark.sql.internal.SQLConf
37+
import org.apache.spark.sql.types.{ArrayType, BinaryType, DataType, Decimal, MapType, StringType, StructType, UserDefinedType}
2638

2739
/**
2840
* @since 1.3.0
@@ -501,4 +513,96 @@ trait Row extends Serializable {
501513
private def getAnyValAs[T <: AnyVal](i: Int): T = {
  // T is bounded by AnyVal, so a null slot is reported explicitly rather than
  // being handed to `getAs`.
  if (!isNullAt(i)) {
    getAs[T](i)
  } else {
    throw new NullPointerException(s"Value at index $i is null")
  }
}
516+
517+
/**
518+
* The compact JSON representation of this row.
519+
* @since 3.0
520+
*/
521+
@Unstable
522+
def json: String = {
  // Build the json4s tree first; `jsonValue` rejects rows that carry no schema.
  val tree = jsonValue
  compact(tree)
}
523+
524+
/**
525+
* The pretty (i.e. indented) JSON representation of this row.
526+
* @since 3.0
527+
*/
528+
@Unstable
529+
def prettyJson: String = {
  // Render the row's json4s tree, then hand the rendered tree to the pretty printer.
  val rendered = render(jsonValue)
  pretty(rendered)
}
530+
531+
/**
532+
* JSON representation of the row.
533+
*
534+
* Note that this only supports the data types that are also supported by
535+
* [[org.apache.spark.sql.catalyst.encoders.RowEncoder]].
536+
*
537+
* @return the JSON representation of the row.
538+
*/
539+
private[sql] def jsonValue: JValue = {
  // The schema supplies both the field names and the data types that drive the value
  // conversion below, so a schema-less row cannot be serialized.
  // `require` throws IllegalArgumentException when the schema is null.
  require(schema != null, "JSON serialization requires a non-null schema.")

  // Lazy on purpose: the session time zone and the formatters are only resolved when a
  // date or timestamp value is actually encountered.
  lazy val zoneId = DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)
  lazy val dateFormatter = DateFormatter.apply(zoneId)
  lazy val timestampFormatter = TimestampFormatter(zoneId)

  // Convert an iterator of values to a json array
  def iteratorToJsonArray(iterator: Iterator[_], elementType: DataType): JArray = {
    JArray(iterator.map(toJson(_, elementType)).toList)
  }

  // Convert a value to json.
  // Cases are tried in order: null first, then plain value types, then types that need the
  // declared DataType (binary, arrays, maps), nested rows, and finally user-defined types.
  // Anything left over raises IllegalArgumentException.
  def toJson(value: Any, dataType: DataType): JValue = (value, dataType) match {
    case (null, _) => JNull
    case (b: Boolean, _) => JBool(b)
    case (b: Byte, _) => JLong(b)
    case (s: Short, _) => JLong(s)
    case (i: Int, _) => JLong(i)
    case (l: Long, _) => JLong(l)
    case (f: Float, _) => JDouble(f)
    case (d: Double, _) => JDouble(d)
    case (d: BigDecimal, _) => JDecimal(d)
    case (d: java.math.BigDecimal, _) => JDecimal(d)
    case (d: Decimal, _) => JDecimal(d.toBigDecimal)
    case (s: String, _) => JString(s)
    // Binary data is emitted as a Base64 string.
    case (b: Array[Byte], BinaryType) =>
      JString(Base64.getEncoder.encodeToString(b))
    case (d: LocalDate, _) =>
      JString(dateFormatter.format(DateTimeUtils.localDateToDays(d)))
    case (d: Date, _) =>
      JString(dateFormatter.format(DateTimeUtils.fromJavaDate(d)))
    case (i: Instant, _) =>
      JString(timestampFormatter.format(DateTimeUtils.instantToMicros(i)))
    case (t: Timestamp, _) =>
      JString(timestampFormatter.format(DateTimeUtils.fromJavaTimestamp(t)))
    case (a: Array[_], ArrayType(elementType, _)) =>
      iteratorToJsonArray(a.iterator, elementType)
    case (s: Seq[_], ArrayType(elementType, _)) =>
      iteratorToJsonArray(s.iterator, elementType)
    // String-keyed maps become a JSON object whose fields are sorted by key.
    case (m: Map[String @unchecked, _], MapType(StringType, valueType, _)) =>
      new JObject(m.toList.sortBy(_._1).map {
        case (k, v) => k -> toJson(v, valueType)
      })
    // Maps with any other key type become an array of {"key": ..., "value": ...} objects,
    // in the map's own iteration order.
    case (m: Map[_, _], MapType(keyType, valueType, _)) =>
      new JArray(m.iterator.map {
        case (k, v) =>
          new JObject("key" -> toJson(k, keyType) :: "value" -> toJson(v, valueType) :: Nil)
      }.toList)
    case (r: Row, _) => r.jsonValue
    // A UDT value is serialized, converted back to its Scala form for the underlying SQL
    // type, and then converted like any other value of that type.
    case (v: Any, udt: UserDefinedType[Any @unchecked]) =>
      val dataType = udt.sqlType
      toJson(CatalystTypeConverters.convertToScala(udt.serialize(v), dataType), dataType)
    case _ =>
      // `value` cannot be null here: the first case already handled it.
      throw new IllegalArgumentException(s"Failed to convert value $value " +
        s"(class of ${value.getClass}) with the type of $dataType to JSON.")
  }

  // Convert the row fields to json, one field per position of this row, in order.
  val fields: List[JField] = List.tabulate(length) { n =>
    val field = schema(n)
    field.name -> toJson(apply(n), field.dataType)
  }
  new JObject(fields)
}
504608
}
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.spark.sql
18+
19+
import java.sql.{Date, Timestamp}
20+
import java.time.{Instant, LocalDate}
21+
22+
import org.json4s.JsonAST.{JArray, JBool, JDecimal, JDouble, JLong, JNull, JObject, JString, JValue}
23+
24+
import org.apache.spark.SparkFunSuite
25+
import org.apache.spark.sql.catalyst.encoders.{ExamplePoint, ExamplePointUDT}
26+
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
27+
import org.apache.spark.sql.internal.SQLConf
28+
import org.apache.spark.sql.types._
29+
30+
/**
31+
* Test suite for [[Row]] JSON serialization.
32+
*/
33+
class RowJsonSuite extends SparkFunSuite {
  // Two-column schema used by the nested-row test case below.
  private val schema = new StructType()
    .add("c1", "string")
    .add("c2", IntegerType)

  /**
   * Registers a test called `name` that wraps `value` in a single-column row (column "a" of
   * type `dt`) and checks that the row's JSON value is `{"a": expected}`.
   */
  private def testJson(name: String, value: Any, dt: DataType, expected: JValue): Unit = {
    test(name) {
      val row = new GenericRowWithSchema(Array(value), new StructType().add("a", dt))
      assert(row.jsonValue === JObject("a" -> expected))
    }
  }

  /** Same as the named variant, with the test name derived from the data type and the value. */
  private def testJson(value: Any, dt: DataType, expected: JValue): Unit = {
    testJson(s"$dt $value", value, dt, expected)
  }

  // Nulls
  /** Registers a test that serializes a null value of type `dt` and expects `expected`. */
  private def testJsonNull(dt: DataType, expected: JValue): Unit = {
    // Forward the caller's expectation instead of hard-coding JNull, so the parameter is
    // actually honoured.
    testJson(null, dt, expected)
  }
  testJsonNull(IntegerType, JNull)
  testJsonNull(FloatType, JNull)
  testJsonNull(ArrayType(DoubleType, containsNull = true), JNull)

  // Primitives
  testJson(true, BooleanType, JBool(true))
  testJson(false, BooleanType, JBool(false))
  testJson(23.toByte, ByteType, JLong(23))
  testJson(-126.toByte, ByteType, JLong(-126))
  testJson(20281.toShort, ShortType, JLong(20281))
  testJson(-8752.toShort, ShortType, JLong(-8752))
  testJson(1078231987, IntegerType, JLong(1078231987))
  testJson(-10, IntegerType, JLong(-10))
  testJson(139289832109874199L, LongType, JLong(139289832109874199L))
  testJson(-7873748239973488L, LongType, JLong(-7873748239973488L))
  testJson(10.232e10f, FloatType, JDouble(10.232e10f))
  testJson(9.7e-13f, FloatType, JDouble(9.7e-13f))
  testJson(3.891e98d, DoubleType, JDouble(3.891e98d))
  testJson(-7.8e5d, DoubleType, JDouble(-7.8e5d))
  testJson(BigDecimal("1092.88"), DecimalType(10, 2), JDecimal(BigDecimal("1092.88")))
  testJson(Decimal("782.0003"), DecimalType(7, 4), JDecimal(BigDecimal("782.0003")))
  testJson(new java.math.BigDecimal("-77.89"), DecimalType(4, 2), JDecimal(BigDecimal("-77.89")))
  testJson("hello world", StringType, JString("hello world"))
  // Explicit name: an array's default string form is not a stable test name.
  testJson("BinaryType", Array('a'.toByte, 'b'.toByte), BinaryType, JString("YWI="))
  testJson(Date.valueOf("2019-04-22"), DateType, JString("2019-04-22"))
  testJson(LocalDate.of(2018, 5, 14), DateType, JString("2018-05-14"))
  testJson(
    Timestamp.valueOf("2017-01-06 10:22:03.00"),
    TimestampType,
    JString("2017-01-06 10:22:03"))
  testJson(
    Timestamp.valueOf("2017-05-30 10:22:03.00").toInstant,
    TimestampType,
    JString("2017-05-30 10:22:03"))

  // Complex types
  testJson(
    "ArrayType(LongType,true)",
    Array(1L, null, 77L),
    ArrayType(LongType, containsNull = true),
    JArray(JLong(1L) :: JNull :: JLong(77L) :: Nil))

  testJson(
    Seq(1, -2, 3),
    ArrayType(IntegerType, containsNull = false),
    JArray(JLong(1) :: JLong(-2) :: JLong(3) :: Nil))

  // String-keyed maps are expected as a JSON object.
  testJson(
    Map("a" -> "b", "c" -> "d", "e" -> null),
    MapType(StringType, StringType, valueContainsNull = true),
    JObject("a" -> JString("b"), "c" -> JString("d"), "e" -> JNull))

  // Maps with non-string keys are expected as an array of key/value objects.
  testJson(
    Map(1 -> "b", 2 -> "d", 3 -> null),
    MapType(IntegerType, StringType, valueContainsNull = true),
    JArray(
      JObject("key" -> JLong(1), "value" -> JString("b")) ::
        JObject("key" -> JLong(2), "value" -> JString("d")) ::
        JObject("key" -> JLong(3), "value" -> JNull) :: Nil))

  testJson(
    new GenericRowWithSchema(Array("1", 2), schema),
    schema,
    JObject("c1" -> JString("1"), "c2" -> JLong(2)))

  testJson(
    "UDT",
    new ExamplePoint(3.4d, 8.98d),
    new ExamplePointUDT,
    JArray(JDouble(3.4d) :: JDouble(8.98d) :: Nil))

  test("no schema") {
    val e = intercept[IllegalArgumentException] {
      Row("a").jsonValue
    }
    assert(e.getMessage.contains("requires a non-null schema"))
  }

  test("unsupported type") {
    val e = intercept[IllegalArgumentException] {
      val row = new GenericRowWithSchema(
        Array((1, 2)),
        new StructType().add("a", ObjectType(classOf[(Int, Int)])))
      row.jsonValue
    }
    assert(e.getMessage.contains("Failed to convert value"))
  }
}

0 commit comments

Comments
 (0)