Commit bc46dda

Author: Davies Liu
Commit message: thread safety
1 parent b4cd73e · commit bc46dda

7 files changed, 38 additions and 65 deletions

core/src/main/scala/org/apache/spark/SerializableWritable.scala

Lines changed: 7 additions & 5 deletions
@@ -19,9 +19,9 @@ package org.apache.spark
 
 import java.io._
 
-import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.io.ObjectWritable
 import org.apache.hadoop.io.Writable
+import org.apache.spark.deploy.SparkHadoopUtil
 
 import org.apache.spark.annotation.DeveloperApi
 
@@ -30,16 +30,18 @@ class SerializableWritable[T <: Writable](@transient var t: T) extends Serializa
   def value = t
   override def toString = t.toString
 
-  private def writeObject(out: ObjectOutputStream) {
+  protected def writeObject(out: ObjectOutputStream) {
     out.defaultWriteObject()
     new ObjectWritable(t).write(out)
   }
 
-  private def readObject(in: ObjectInputStream) {
+  protected def readObject(in: ObjectInputStream) {
     in.defaultReadObject()
     val ow = new ObjectWritable()
-    ow.setConf(new Configuration())
-    ow.readFields(in)
+    SparkHadoopUtil.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
+      ow.setConf(SparkHadoopUtil.newConfiguration())
+      ow.readFields(in) // not thread safe
+    }
     t = ow.get().asInstanceOf[T]
   }
 }
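
For readers skimming the change: before this commit, readObject built a bare new Configuration() and called ObjectWritable.readFields without any locking, which can race when many tasks deserialize Hadoop writables at once (SPARK-1097, HADOOP-10456). The sketch below is illustrative only and not part of the commit; it simply round-trips a SerializableWritable-wrapped Hadoop Text from several threads, which is the kind of concurrent deserialization the new synchronized block is meant to make safe.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.io.Text
import org.apache.spark.SerializableWritable

// Illustrative sketch only: concurrent Java-serialization round-trips of a
// SerializableWritable. After this commit, each readObject() builds its
// Configuration under SparkHadoopUtil.CONFIGURATION_INSTANTIATION_LOCK
// instead of racing on the Configuration constructor.
object SerializableWritableRoundTrip {
  private def serialize(w: SerializableWritable[Text]): Array[Byte] = {
    val bytes = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(bytes)
    out.writeObject(w)
    out.close()
    bytes.toByteArray
  }

  private def deserialize(data: Array[Byte]): SerializableWritable[Text] = {
    val in = new ObjectInputStream(new ByteArrayInputStream(data))
    val result = in.readObject().asInstanceOf[SerializableWritable[Text]]
    in.close()
    result
  }

  def main(args: Array[String]): Unit = {
    val payload = serialize(new SerializableWritable(new Text("hello")))
    // Many threads deserializing at once is where the old race lived.
    val threads = (1 to 8).map { _ =>
      new Thread(new Runnable {
        override def run(): Unit = println(deserialize(payload).value)
      })
    }
    threads.foreach(_.start())
    threads.foreach(_.join())
  }
}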

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 1 addition & 1 deletion
@@ -569,7 +569,7 @@ class SparkContext(config: SparkConf) extends Logging {
     val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, path)
     new HadoopRDD(
       this,
-      new SerializableWritable(hadoopConfiguration),
+      hadoopConfiguration,
       Some(setInputPathsFunc),
       inputFormatClass,
       keyClass,

core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala

Lines changed: 14 additions & 0 deletions
@@ -124,6 +124,20 @@ class SparkHadoopUtil extends Logging {
 }
 
 object SparkHadoopUtil {
+  /**
+   * Configuration's constructor is not threadsafe (see SPARK-1097 and HADOOP-10456).
+   * Therefore, we synchronize on this lock before calling new JobConf() or new Configuration().
+   */
+  val CONFIGURATION_INSTANTIATION_LOCK = new Object()
+
+  /**
+   * Create a new Configuration in thread-safe way
+   */
+  def newConfiguration(): Configuration = {
+    CONFIGURATION_INSTANTIATION_LOCK.synchronized {
+      new Configuration()
+    }
+  }
 
   private val hadoop = {
     val yarnMode = java.lang.Boolean.valueOf(
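
A hedged usage sketch (not part of the commit; the object and method names below are illustrative): callers that previously wrote new Configuration() directly can now go through SparkHadoopUtil.newConfiguration(), and code that must build a JobConf can take the same shared lock, which is the pattern HadoopRDD.getJobConf() adopts further down in this commit.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.deploy.SparkHadoopUtil

// Illustrative sketch only: two thread-safe ways to obtain Hadoop conf
// objects using the helpers added in this commit.
object ConfFactory {
  // The helper synchronizes internally on CONFIGURATION_INSTANTIATION_LOCK.
  def plainConf(): Configuration = SparkHadoopUtil.newConfiguration()

  // For a JobConf, take the same lock around the constructor call.
  def jobConf(base: Configuration): JobConf =
    SparkHadoopUtil.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
      new JobConf(base)
    }
}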

core/src/main/scala/org/apache/spark/rdd/CheckpointRDD.scala

Lines changed: 0 additions & 1 deletion
@@ -25,7 +25,6 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark._
-import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 
 private[spark] class CheckpointRDDPartition(val index: Int) extends Partition {}

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 11 additions & 51 deletions
@@ -39,7 +39,6 @@ import org.apache.hadoop.util.ReflectionUtils
 
 import org.apache.spark._
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.executor.{DataReadMethod, InputMetrics}
 import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD
@@ -86,7 +85,7 @@ private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSp
  * [[org.apache.spark.SparkContext.hadoopRDD()]]
  *
  * @param sc The SparkContext to associate the RDD with.
- * @param broadcastedConf A general Hadoop Configuration, or a subclass of it. If the enclosed
+ * @param conf A general Hadoop Configuration, or a subclass of it. If the enclosed
  * variabe references an instance of JobConf, then that JobConf will be used for the Hadoop job.
  * Otherwise, a new JobConf will be created on each slave using the enclosed Configuration.
  * @param initLocalJobConfFuncOpt Optional closure used to initialize any JobConf that HadoopRDD
@@ -99,14 +98,17 @@ private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSp
 @DeveloperApi
 class HadoopRDD[K, V](
     sc: SparkContext,
-    conf: SerializableWritable[Configuration],
+    @transient conf: Configuration,
     initLocalJobConfFuncOpt: Option[JobConf => Unit],
     inputFormatClass: Class[_ <: InputFormat[K, V]],
     keyClass: Class[K],
     valueClass: Class[V],
     minPartitions: Int)
   extends RDD[(K, V)](sc, Nil) with Logging {
 
+  // The serializable configuration
+  private val sConf = new SerializableWritable(conf)
+
   def this(
       sc: SparkContext,
       conf: JobConf,
@@ -116,63 +118,27 @@ class HadoopRDD[K, V](
       minPartitions: Int) = {
     this(
       sc,
-      new SerializableWritable(conf),
+      conf,
       None /* initLocalJobConfFuncOpt */,
       inputFormatClass,
       keyClass,
       valueClass,
       minPartitions)
   }
 
-  protected val jobConfCacheKey = "rdd_%d_job_conf".format(id)
-
   protected val inputFormatCacheKey = "rdd_%d_input_format".format(id)
 
   // used to build JobTracker ID
   private val createTime = new Date()
 
-  private val shouldCloneJobConf = sc.conf.get("spark.hadoop.cloneConf", "false").toBoolean
-
   // Returns a JobConf that will be used on slaves to obtain input splits for Hadoop reads.
-  protected def getJobConf(): JobConf = {
-    val conf: Configuration = this.conf.value
-    if (shouldCloneJobConf) {
-      // Hadoop Configuration objects are not thread-safe, which may lead to various problems if
-      // one job modifies a configuration while another reads it (SPARK-2546). This problem occurs
-      // somewhat rarely because most jobs treat the configuration as though it's immutable. One
-      // solution, implemented here, is to clone the Configuration object. Unfortunately, this
-      // clone can be very expensive. To avoid unexpected performance regressions for workloads and
-      // Hadoop versions that do not suffer from these thread-safety issues, this cloning is
-      // disabled by default.
-      HadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
-        logDebug("Cloning Hadoop Configuration")
-        val newJobConf = new JobConf(conf)
-        if (!conf.isInstanceOf[JobConf]) {
-          initLocalJobConfFuncOpt.map(f => f(newJobConf))
-        }
+  protected def getJobConf(): JobConf = sConf.value match {
+    case jobConf: JobConf => jobConf
+    case c => SparkHadoopUtil.CONFIGURATION_INSTANTIATION_LOCK synchronized {
+      val newJobConf = new JobConf(c)
+      initLocalJobConfFuncOpt.map(f => f(newJobConf))
       newJobConf
     }
-    } else {
-      if (conf.isInstanceOf[JobConf]) {
-        logDebug("Re-using user-broadcasted JobConf")
-        conf.asInstanceOf[JobConf]
-      } else if (HadoopRDD.containsCachedMetadata(jobConfCacheKey)) {
-        logDebug("Re-using cached JobConf")
-        HadoopRDD.getCachedMetadata(jobConfCacheKey).asInstanceOf[JobConf]
-      } else {
-        // Create a JobConf that will be cached and used across this RDD's getJobConf() calls in the
-        // local process. The local cache is accessed through HadoopRDD.putCachedMetadata().
-        // The caching helps minimize GC, since a JobConf can contain ~10KB of temporary objects.
-        // Synchronize to prevent ConcurrentModificationException (SPARK-1097, HADOOP-10456).
-        HadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
-          logDebug("Creating new JobConf and caching it for later re-use")
-          val newJobConf = new JobConf(conf)
-          initLocalJobConfFuncOpt.map(f => f(newJobConf))
-          HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf)
-          newJobConf
-        }
-      }
-    }
   }
 
   protected def getInputFormat(conf: JobConf): InputFormat[K, V] = {
@@ -295,12 +261,6 @@ class HadoopRDD[K, V](
 }
 
 private[spark] object HadoopRDD extends Logging {
-  /**
-   * Configuration's constructor is not threadsafe (see SPARK-1097 and HADOOP-10456).
-   * Therefore, we synchronize on this lock before calling new JobConf() or new Configuration().
-   */
-  val CONFIGURATION_INSTANTIATION_LOCK = new Object()
-
   /**
    * The three methods below are helpers for accessing the local map, a property of the SparkEnv of
    * the local process.
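
With the JobConf cache and the spark.hadoop.cloneConf path removed, getJobConf() reduces to a two-case pattern match: reuse the deserialized value if it is already a JobConf, otherwise build one under the shared lock and run the optional initializer. A standalone sketch of that shape (the object and method names below are hypothetical, not from the commit):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.deploy.SparkHadoopUtil

// Illustrative sketch only: the shape of the simplified getJobConf() logic,
// extracted into a free-standing helper with hypothetical names.
object JobConfResolver {
  def resolve(
      conf: Configuration,
      initLocalJobConfFuncOpt: Option[JobConf => Unit]): JobConf = conf match {
    // The configuration is already a JobConf: reuse it as-is.
    case jobConf: JobConf => jobConf
    // Plain Configuration: wrap it in a JobConf under the shared lock, then
    // let the optional initializer (e.g. setting input paths) touch it.
    case c => SparkHadoopUtil.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
      val newJobConf = new JobConf(c)
      initLocalJobConfFuncOpt.foreach(f => f(newJobConf))
      newJobConf
    }
  }
}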

core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala

Lines changed: 0 additions & 1 deletion
@@ -74,7 +74,6 @@ class NewHadoopRDD[K, V](
   with Logging {
 
   private val sConf = new SerializableWritable(conf)
-  // private val serializableConf = new SerializableWritable(conf)
 
   private val jobTrackerId: String = {
     val formatter = new SimpleDateFormat("yyyyMMddHHmm")

sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala

Lines changed: 5 additions & 6 deletions
@@ -31,7 +31,6 @@ import org.apache.hadoop.io.Writable
 import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, JobConf}
 
 import org.apache.spark.SerializableWritable
-import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.{EmptyRDD, HadoopRDD, RDD, UnionRDD}
 import org.apache.spark.sql.catalyst.expressions._
 
@@ -64,7 +63,7 @@ class HadoopTableReader(
 
   // TODO: set aws s3 credentials.
 
-  private val conf = new SerializableWritable(hiveExtraConf)
+  private val conf: SerializableWritable[Configuration] = new SerializableWritable(hiveExtraConf)
 
   override def makeRDDForTable(hiveTable: HiveTable): RDD[Row] =
     makeRDDForTable(
@@ -157,7 +156,7 @@ class HadoopTableReader(
 
     // Create local references so that the outer object isn't serialized.
     val tableDesc = relation.tableDesc
-    val broadcastedHiveConf = _broadcastedHiveConf
+    val _conf = conf
     val localDeserializer = partDeserializer
     val mutableRow = new SpecificMutableRow(attributes.map(_.dataType))
 
@@ -179,7 +178,7 @@ class HadoopTableReader(
       fillPartitionKeys(partValues, mutableRow)
 
       createHadoopRdd(tableDesc, inputPathStr, ifc).mapPartitions { iter =>
-        val hconf = broadcastedHiveConf.value.value
+        val hconf = _conf.value
         val deserializer = localDeserializer.newInstance()
         deserializer.initialize(hconf, partProps)
 
@@ -211,7 +210,7 @@ class HadoopTableReader(
   }
 
   /**
-   * Creates a HadoopRDD based on the broadcasted HiveConf and other job properties that will be
+   * Creates a HadoopRDD based on the HiveConf and other job properties that will be
    * applied locally on each slave.
    */
   private def createHadoopRdd(
@@ -223,7 +222,7 @@ class HadoopTableReader(
 
     val rdd = new HadoopRDD(
       sc.sparkContext,
-      _broadcastedHiveConf.asInstanceOf[Broadcast[SerializableWritable[Configuration]]],
+      conf.value,
      Some(initializeJobConfFunc),
      inputFormatClass,
      classOf[Writable],
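
One small point worth calling out in this hunk: `val _conf = conf` copies the field into a local before the mapPartitions closure, so only the SerializableWritable is serialized rather than the whole HadoopTableReader. A minimal sketch of the same pattern with hypothetical names:

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Illustrative sketch only: copy a field into a local val before using it in
// a closure, so Spark serializes just that value and not the enclosing object
// (which here holds a non-serializable SparkContext).
class Multiplier(sc: SparkContext, factor: Int) {
  def scale(rdd: RDD[Int]): RDD[Int] = {
    val localFactor = factor // local reference; `this` is not captured
    rdd.map(_ * localFactor)
  }
}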

0 commit comments

Comments
 (0)