Commit b48ced1

update
1 parent 34a9a25 commit b48ced1

5 files changed: +43 additions, −28 deletions

core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 7 additions & 0 deletions

@@ -1180,6 +1180,13 @@ package object config {
       .intConf
       .createWithDefault(1)
 
+  private[spark] val IO_FILE_UNSPLITTABLE_WARNING_THRESHOLD =
+    ConfigBuilder("spark.io.file.unsplittable.warning.threshold")
+      .doc("When Spark loads a single large unsplittable file, log a warning if the file " +
+        "size exceeds this threshold.")
+      .longConf
+      .createWithDefault(1024 * 1024 * 1024)
+
   private[spark] val EVENT_LOG_COMPRESSION_CODEC =
     ConfigBuilder("spark.eventLog.compression.codec")
       .doc("The codec used to compress event log. By default, Spark provides four codecs: " +

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 9 additions & 17 deletions

@@ -41,7 +41,7 @@ import org.apache.spark.internal.config._
 import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD
 import org.apache.spark.scheduler.{HDFSCacheTaskLocation, HostTaskLocation}
 import org.apache.spark.storage.StorageLevel
-import org.apache.spark.util.{NextIterator, SerializableConfiguration, ShutdownHookManager}
+import org.apache.spark.util.{NextIterator, SerializableConfiguration, ShutdownHookManager, Utils}
 
 /**
  * A Spark split class that wraps around a Hadoop InputSplit.
@@ -199,21 +199,7 @@ class HadoopRDD[K, V](
 
   private val UNSPLITTABLE_FILE_SIZE_LOG_THRESHOLD = 1024 * 1024 * 1024
 
-  @transient private lazy val compressionCodecs = new CompressionCodecFactory(getJobConf())
-
-  private def checkAndLogUnsplittableLargeFile(split: InputSplit): Unit = {
-    if (split.isInstanceOf[FileSplit]) {
-      val fileSplit = split.asInstanceOf[FileSplit]
-      val path = fileSplit.getPath
-      val codec = compressionCodecs.getCodec(path)
-      if (codec != null && !codec.isInstanceOf[SplittableCompressionCodec]) {
-        if (fileSplit.getLength > UNSPLITTABLE_FILE_SIZE_LOG_THRESHOLD) {
-          logWarning(s"File ${path.toString} is large and unsplittable so the corresponding " +
-            s"rdd partition have to deal with the whole file and consume large time.")
-        }
-      }
-    }
-  }
+  @transient private lazy val codecFactory = new CompressionCodecFactory(getJobConf())
 
   override def getPartitions: Array[Partition] = {
     val jobConf = getJobConf()
@@ -227,7 +213,13 @@ class HadoopRDD[K, V](
       allInputSplits
     }
     if (inputSplits.length == 1) {
-      checkAndLogUnsplittableLargeFile(inputSplits(0))
+      val fileSplit = inputSplits(0).asInstanceOf[FileSplit]
+      val path = fileSplit.getPath
+      if (!Utils.isFileSplittable(path, codecFactory)
+          && fileSplit.getLength > conf.get(IO_FILE_UNSPLITTABLE_WARNING_THRESHOLD)) {
+        logWarning(s"File ${path.toString} is large and unsplittable, so the corresponding " +
+          s"RDD partition has to process the whole file, which can be slow.")
+      }
     }
     val array = new Array[Partition](inputSplits.size)
     for (i <- 0 until inputSplits.size) {
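For illustration, a sketch of the situation this warning targets (the input path is hypothetical): a single gzip-compressed file read through the RDD API ends up in one partition because gzip is not a splittable codec, no matter what minPartitions asks for, so getPartitions now logs the warning once the file exceeds the threshold.

// Hypothetical input path; gzip is unsplittable, so the whole file becomes one task.
val lines = sc.textFile("hdfs:///data/events/huge-events.json.gz", minPartitions = 64)
println(lines.getNumPartitions)  // 1 -- and the driver logs the new warning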

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 7 additions & 0 deletions

@@ -51,6 +51,7 @@ import com.google.common.net.InetAddresses
 import org.apache.commons.lang3.SystemUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
+import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec}
 import org.apache.hadoop.security.UserGroupInformation
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.eclipse.jetty.util.MultiException
@@ -2895,6 +2896,12 @@ private[spark] object Utils extends Logging {
   def isLocalUri(uri: String): Boolean = {
     uri.startsWith(s"$LOCAL_SCHEME:")
   }
+
+  /** Check whether the file of the path is splittable. */
+  def isFileSplittable(path: Path, codecFactory: CompressionCodecFactory): Boolean = {
+    val codec = codecFactory.getCodec(path)
+    codec == null || codec.isInstanceOf[SplittableCompressionCodec]
+  }
 }
 
 private[util] object CallerContext extends Logging {
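A small sketch of what the new helper returns for a few common extensions. It is private[spark], so this only compiles from code inside an org.apache.spark package, and the paths are made up:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress.CompressionCodecFactory

val codecFactory = new CompressionCodecFactory(new Configuration())
// No codec is registered for plain text, so getCodec returns null => splittable.
Utils.isFileSplittable(new Path("/data/part-00000.txt"), codecFactory)  // true
// GzipCodec does not implement SplittableCompressionCodec => not splittable.
Utils.isFileSplittable(new Path("/data/part-00000.gz"), codecFactory)   // false
// BZip2Codec implements SplittableCompressionCodec => still splittable.
Utils.isFileSplittable(new Path("/data/part-00000.bz2"), codecFactory)  // true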

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala

Lines changed: 15 additions & 1 deletion

@@ -21,6 +21,8 @@ import java.util.{Locale, OptionalLong}
 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.fs.Path
 
+import org.apache.spark.internal.config.IO_FILE_UNSPLITTABLE_WARNING_THRESHOLD
+import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{AnalysisException, SparkSession}
 import org.apache.spark.sql.catalyst.expressions.AttributeReference
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
@@ -36,7 +38,9 @@ abstract class FileScan(
     sparkSession: SparkSession,
     fileIndex: PartitioningAwareFileIndex,
     readDataSchema: StructType,
-    readPartitionSchema: StructType) extends Scan with Batch with SupportsReportStatistics {
+    readPartitionSchema: StructType)
+  extends Scan
+    with Batch with SupportsReportStatistics with Logging {
   /**
    * Returns whether a file with `path` could be split or not.
    */
@@ -91,6 +95,16 @@ abstract class FileScan(
        )
      }.toArray.sortBy(_.length)(implicitly[Ordering[Long]].reverse)
    }
+
+    if (splitFiles.length == 1) {
+      val path = new Path(splitFiles(0).filePath)
+      if (!isSplitable(path) && splitFiles(0).length >
+          sparkSession.sparkContext.getConf.get(IO_FILE_UNSPLITTABLE_WARNING_THRESHOLD)) {
+        logWarning(s"File ${path.toString} is large and unsplittable, so the corresponding " +
+          s"RDD partition has to process the whole file, which can be slow.")
+      }
+    }
+
    FilePartition.getFilePartitions(sparkSession, splitFiles, maxSplitBytes)
  }
 
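The same situation seen from the DataFrame side of the v2 read path; whether FileScan handles a given source depends on how the v2 sources are configured in the running Spark build, and the bucket path here is invented:

// A single large .csv.gz input cannot be split, so planning produces one partition
// and FileScan now logs the warning while building its partitions.
val df = spark.read.option("header", "true").csv("s3a://my-bucket/exports/huge.csv.gz")
println(df.rdd.getNumPartitions)  // 1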

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TextBasedFileScan.scala

Lines changed: 5 additions & 10 deletions

@@ -19,12 +19,13 @@ package org.apache.spark.sql.execution.datasources.v2
 import scala.collection.JavaConverters._
 
 import org.apache.hadoop.fs.Path
-import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec}
+import org.apache.hadoop.io.compress.CompressionCodecFactory
 
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
+import org.apache.spark.util.Utils
 
 abstract class TextBasedFileScan(
     sparkSession: SparkSession,
@@ -33,14 +34,8 @@ abstract class TextBasedFileScan(
     readPartitionSchema: StructType,
     options: CaseInsensitiveStringMap)
   extends FileScan(sparkSession, fileIndex, readDataSchema, readPartitionSchema) {
-  private var codecFactory: CompressionCodecFactory = _
+  @transient private lazy val codecFactory: CompressionCodecFactory = new CompressionCodecFactory(
+    sparkSession.sessionState.newHadoopConfWithOptions(options.asScala.toMap))
 
-  override def isSplitable(path: Path): Boolean = {
-    if (codecFactory == null) {
-      codecFactory = new CompressionCodecFactory(
-        sparkSession.sessionState.newHadoopConfWithOptions(options.asScala.toMap))
-    }
-    val codec = codecFactory.getCodec(path)
-    codec == null || codec.isInstanceOf[SplittableCompressionCodec]
-  }
+  override def isSplitable(path: Path): Boolean = Utils.isFileSplittable(path, codecFactory)
 }
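A note on the refactor above: replacing the mutable codecFactory var with a @transient private lazy val keeps the initialize-on-first-use behaviour without the explicit null check, and the usual reason for @transient here is to keep the Hadoop CompressionCodecFactory, which is not meant to be serialized, out of any serialized closure that captures the scan.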
