Skip to content

Commit c1971a5

Browse files
committed
Output warning for blacklisted encodings in write
1 parent 6ddf503 commit c1971a5

File tree

3 files changed

+21
-10
lines changed

3 files changed

+21
-10
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -140,17 +140,10 @@ private[sql] class JSONOptionsInRead(
140140
}
141141

142142
protected override def checkedEncoding(enc: String): String = {
143-
// The following encodings are not supported in per-line mode (multiline is false)
144-
// because they cause some problems in reading files with BOM which is supposed to
145-
// be present in the files with such encodings. After splitting input files by lines,
146-
// only the first lines will have the BOM, which makes it impossible to read
147-
// the remaining lines. Besides that, the lineSep option must have the BOM in such
148-
// encodings, which can never be present between lines.
149-
val blacklist = Seq(Charset.forName("UTF-16"), Charset.forName("UTF-32"))
150-
val isBlacklisted = blacklist.contains(Charset.forName(enc))
143+
val isBlacklisted = JSONOptionsInRead.blacklist.contains(Charset.forName(enc))
151144
require(multiLine || !isBlacklisted,
152145
s"""The ${enc} encoding must not be included in the blacklist when multiLine is disabled:
153-
|Blacklist: ${blacklist.mkString(", ")}""".stripMargin)
146+
|Blacklist: ${JSONOptionsInRead.blacklist.mkString(", ")}""".stripMargin)
154147

155148
val isLineSepRequired =
156149
multiLine || Charset.forName(enc) == StandardCharsets.UTF_8 || lineSeparator.nonEmpty
@@ -159,3 +152,16 @@ private[sql] class JSONOptionsInRead(
159152
enc
160153
}
161154
}
155+
156+
private[sql] object JSONOptionsInRead {
157+
// The following encodings are not supported in per-line mode (multiline is false)
158+
// because they cause some problems in reading files with BOM which is supposed to
159+
// be present in the files with such encodings. After splitting input files by lines,
160+
// only the first lines will have the BOM, which makes it impossible to read
161+
// the remaining lines. Besides that, the lineSep option must have the BOM in such
162+
// encodings, which can never be present between lines.
163+
val blacklist = Seq(
164+
Charset.forName("UTF-16"),
165+
Charset.forName("UTF-32")
166+
)
167+
}

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,11 @@ private[json] class JsonOutputWriter(
159159
case None => StandardCharsets.UTF_8
160160
}
161161

162+
if (JSONOptionsInRead.blacklist.contains(encoding)) {
163+
logWarning(s"The JSON file ($path) was written in the encoding ${encoding.displayName()}" +
164+
" which can be read back by Spark only if multiLine is enabled.")
165+
}
166+
162167
private val writer = CodecStreams.createOutputStreamWriter(
163168
context, new Path(path), encoding)
164169

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ import com.fasterxml.jackson.core.JsonFactory
2727
import org.apache.hadoop.fs.{Path, PathFilter}
2828
import org.apache.hadoop.io.SequenceFile.CompressionType
2929
import org.apache.hadoop.io.compress.GzipCodec
30-
import org.apache.spark.{SparkException, TestUtils}
3130

31+
import org.apache.spark.{SparkException, TestUtils}
3232
import org.apache.spark.rdd.RDD
3333
import org.apache.spark.sql.{functions => F, _}
3434
import org.apache.spark.sql.catalyst.json.{CreateJacksonParser, JacksonParser, JSONOptions}

0 commit comments

Comments
 (0)