Skip to content

Commit e01919e

Browse files
brkyvzHyukjinKwon
authored andcommitted
[SPARK-23094] Fix invalid character handling in JsonDataSource
## What changes were proposed in this pull request? There were two related fixes regarding `from_json`, `get_json_object` and `json_tuple` ([Fix #1](apache@c8803c0), [Fix #2](apache@86174ea)), but they weren't comprehensive it seems. I wanted to extend those fixes to all the parsers, and add tests for each case. ## How was this patch tested? Regression tests Author: Burak Yavuz <brkyvz@gmail.com> Closes apache#20302 from brkyvz/json-invfix.
1 parent f568e9c commit e01919e

File tree

2 files changed

+37
-2
lines changed

2 files changed

+37
-2
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/CreateJacksonParser.scala

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,11 @@ private[sql] object CreateJacksonParser extends Serializable {
4040
}
4141

4242
def text(jsonFactory: JsonFactory, record: Text): JsonParser = {
43-
jsonFactory.createParser(record.getBytes, 0, record.getLength)
43+
val bain = new ByteArrayInputStream(record.getBytes, 0, record.getLength)
44+
jsonFactory.createParser(new InputStreamReader(bain, "UTF-8"))
4445
}
4546

4647
def inputStream(jsonFactory: JsonFactory, record: InputStream): JsonParser = {
47-
jsonFactory.createParser(record)
48+
jsonFactory.createParser(new InputStreamReader(record, "UTF-8"))
4849
}
4950
}

sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ import org.apache.spark.sql.types._
2828
class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
2929
override val dataSourceName: String = "json"
3030

31+
private val badJson = "\u0000\u0000\u0000A\u0001AAA"
32+
3133
// JSON does not write data of NullType and does not play well with BinaryType.
3234
override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
3335
case _: NullType => false
@@ -105,4 +107,36 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
105107
)
106108
}
107109
}
110+
111+
test("invalid json with leading nulls - from file (multiLine=true)") {
112+
import testImplicits._
113+
withTempDir { tempDir =>
114+
val path = tempDir.getAbsolutePath
115+
Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path)
116+
val expected = s"""$badJson\n{"a":1}\n"""
117+
val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType)
118+
val df =
119+
spark.read.format(dataSourceName).option("multiLine", true).schema(schema).load(path)
120+
checkAnswer(df, Row(null, expected))
121+
}
122+
}
123+
124+
test("invalid json with leading nulls - from file (multiLine=false)") {
125+
import testImplicits._
126+
withTempDir { tempDir =>
127+
val path = tempDir.getAbsolutePath
128+
Seq(badJson, """{"a":1}""").toDS().write.mode("overwrite").text(path)
129+
val schema = new StructType().add("a", IntegerType).add("_corrupt_record", StringType)
130+
val df =
131+
spark.read.format(dataSourceName).option("multiLine", false).schema(schema).load(path)
132+
checkAnswer(df, Seq(Row(1, null), Row(null, badJson)))
133+
}
134+
}
135+
136+
test("invalid json with leading nulls - from dataset") {
137+
import testImplicits._
138+
checkAnswer(
139+
spark.read.json(Seq(badJson).toDS()),
140+
Row(badJson))
141+
}
108142
}

0 commit comments

Comments
 (0)