Skip to content

Commit c1971a5

Browse files
committed
Output warning for blacklisted encodings in write
1 parent 6ddf503 commit c1971a5

File tree

3 files changed

+21
-10
lines changed

3 files changed

+21
-10
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -140,17 +140,10 @@ private[sql] class JSONOptionsInRead(
140140
}
141141

142142
protected override def checkedEncoding(enc: String): String = {
143-
// The following encodings are not supported in per-line mode (multiline is false)
144-
// because they cause some problems in reading files with BOM which is supposed to
145-
// be present in the files with such encodings. After splitting input files by lines,
146-
// only the first lines will have the BOM, which makes it impossible to read
147-
// the remaining lines. Besides that, the lineSep option must have the BOM in such
148-
// encodings, which can never be present between lines.
149-
val blacklist = Seq(Charset.forName("UTF-16"), Charset.forName("UTF-32"))
150-
val isBlacklisted = blacklist.contains(Charset.forName(enc))
143+
val isBlacklisted = JSONOptionsInRead.blacklist.contains(Charset.forName(enc))
151144
require(multiLine || !isBlacklisted,
152145
s"""The ${enc} encoding must not be included in the blacklist when multiLine is disabled:
153-
|Blacklist: ${blacklist.mkString(", ")}""".stripMargin)
146+
|Blacklist: ${JSONOptionsInRead.blacklist.mkString(", ")}""".stripMargin)
154147

155148
val isLineSepRequired =
156149
multiLine || Charset.forName(enc) == StandardCharsets.UTF_8 || lineSeparator.nonEmpty
@@ -159,3 +152,16 @@ private[sql] class JSONOptionsInRead(
159152
enc
160153
}
161154
}
155+
156+
private[sql] object JSONOptionsInRead {
157+
// The following encodings are not supported in per-line mode (multiline is false)
158+
// because they cause some problems in reading files with BOM which is supposed to
159+
// be present in the files with such encodings. After splitting input files by lines,
160+
// only the first lines will have the BOM, which makes it impossible to read
161+
// the remaining lines. Besides that, the lineSep option must have the BOM in such
162+
// encodings, which can never be present between lines.
163+
val blacklist = Seq(
164+
Charset.forName("UTF-16"),
165+
Charset.forName("UTF-32")
166+
)
167+
}

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,11 @@ private[json] class JsonOutputWriter(
159159
case None => StandardCharsets.UTF_8
160160
}
161161

162+
if (JSONOptionsInRead.blacklist.contains(encoding)) {
163+
logWarning(s"The JSON file ($path) was written in the encoding ${encoding.displayName()}" +
164+
" which can be read back by Spark only if multiLine is enabled.")
165+
}
166+
162167
private val writer = CodecStreams.createOutputStreamWriter(
163168
context, new Path(path), encoding)
164169

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ import com.fasterxml.jackson.core.JsonFactory
2727
import org.apache.hadoop.fs.{Path, PathFilter}
2828
import org.apache.hadoop.io.SequenceFile.CompressionType
2929
import org.apache.hadoop.io.compress.GzipCodec
30-
import org.apache.spark.{SparkException, TestUtils}
3130

31+
import org.apache.spark.{SparkException, TestUtils}
3232
import org.apache.spark.rdd.RDD
3333
import org.apache.spark.sql.{functions => F, _}
3434
import org.apache.spark.sql.catalyst.json.{CreateJacksonParser, JacksonParser, JSONOptions}

0 commit comments

Comments
 (0)