@@ -39,7 +39,6 @@ import org.apache.hadoop.util.ReflectionUtils
 
 import org.apache.spark._
 import org.apache.spark.annotation.DeveloperApi
-import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.executor.{DataReadMethod, InputMetrics}
 import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD
@@ -86,7 +85,7 @@ private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSp
  * [[org.apache.spark.SparkContext.hadoopRDD()]]
  *
  * @param sc The SparkContext to associate the RDD with.
- * @param broadcastedConf A general Hadoop Configuration, or a subclass of it. If the enclosed
+ * @param conf A general Hadoop Configuration, or a subclass of it. If the enclosed
  * variable references an instance of JobConf, then that JobConf will be used for the Hadoop job.
  * Otherwise, a new JobConf will be created on each slave using the enclosed Configuration.
  * @param initLocalJobConfFuncOpt Optional closure used to initialize any JobConf that HadoopRDD
@@ -99,14 +98,17 @@ private[spark] class HadoopPartition(rddId: Int, idx: Int, @transient s: InputSp
 @DeveloperApi
 class HadoopRDD[K, V](
     sc: SparkContext,
-    conf: SerializableWritable[Configuration],
+    @transient conf: Configuration,
     initLocalJobConfFuncOpt: Option[JobConf => Unit],
     inputFormatClass: Class[_ <: InputFormat[K, V]],
     keyClass: Class[K],
     valueClass: Class[V],
     minPartitions: Int)
   extends RDD[(K, V)](sc, Nil) with Logging {
 
+  // The serializable configuration
+  private val sConf = new SerializableWritable(conf)
+
   def this(
       sc: SparkContext,
       conf: JobConf,
@@ -116,63 +118,27 @@ class HadoopRDD[K, V](
       minPartitions: Int) = {
     this(
       sc,
-      new SerializableWritable(conf),
+      conf,
       None /* initLocalJobConfFuncOpt */,
       inputFormatClass,
       keyClass,
       valueClass,
       minPartitions)
   }
 
-  protected val jobConfCacheKey = "rdd_%d_job_conf".format(id)
-
   protected val inputFormatCacheKey = "rdd_%d_input_format".format(id)
 
   // used to build JobTracker ID
   private val createTime = new Date()
 
-  private val shouldCloneJobConf = sc.conf.get("spark.hadoop.cloneConf", "false").toBoolean
-
   // Returns a JobConf that will be used on slaves to obtain input splits for Hadoop reads.
-  protected def getJobConf(): JobConf = {
-    val conf: Configuration = this.conf.value
-    if (shouldCloneJobConf) {
-      // Hadoop Configuration objects are not thread-safe, which may lead to various problems if
-      // one job modifies a configuration while another reads it (SPARK-2546). This problem occurs
-      // somewhat rarely because most jobs treat the configuration as though it's immutable. One
-      // solution, implemented here, is to clone the Configuration object. Unfortunately, this
-      // clone can be very expensive. To avoid unexpected performance regressions for workloads and
-      // Hadoop versions that do not suffer from these thread-safety issues, this cloning is
-      // disabled by default.
-      HadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
-        logDebug("Cloning Hadoop Configuration")
-        val newJobConf = new JobConf(conf)
-        if (!conf.isInstanceOf[JobConf]) {
-          initLocalJobConfFuncOpt.map(f => f(newJobConf))
-        }
+  protected def getJobConf(): JobConf = sConf.value match {
+    case jobConf: JobConf => jobConf
+    case c => SparkHadoopUtil.CONFIGURATION_INSTANTIATION_LOCK synchronized {
+      val newJobConf = new JobConf(c)
+      initLocalJobConfFuncOpt.map(f => f(newJobConf))
         newJobConf
       }
-    } else {
-      if (conf.isInstanceOf[JobConf]) {
-        logDebug("Re-using user-broadcasted JobConf")
-        conf.asInstanceOf[JobConf]
-      } else if (HadoopRDD.containsCachedMetadata(jobConfCacheKey)) {
-        logDebug("Re-using cached JobConf")
-        HadoopRDD.getCachedMetadata(jobConfCacheKey).asInstanceOf[JobConf]
-      } else {
-        // Create a JobConf that will be cached and used across this RDD's getJobConf() calls in the
-        // local process. The local cache is accessed through HadoopRDD.putCachedMetadata().
-        // The caching helps minimize GC, since a JobConf can contain ~10KB of temporary objects.
-        // Synchronize to prevent ConcurrentModificationException (SPARK-1097, HADOOP-10456).
-        HadoopRDD.CONFIGURATION_INSTANTIATION_LOCK.synchronized {
-          logDebug("Creating new JobConf and caching it for later re-use")
-          val newJobConf = new JobConf(conf)
-          initLocalJobConfFuncOpt.map(f => f(newJobConf))
-          HadoopRDD.putCachedMetadata(jobConfCacheKey, newJobConf)
-          newJobConf
-        }
-      }
-    }
   }
 
   protected def getInputFormat(conf: JobConf): InputFormat[K, V] = {
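
The hunk above replaces the cache-or-clone logic in getJobConf() with a two-way match on the wrapped Configuration. The sketch below replays that dispatch outside of HadoopRDD; it is a minimal illustration, not Spark code. JobConfSketch, toJobConf, and confLock are hypothetical names, and the local lock merely stands in for the SparkHadoopUtil.CONFIGURATION_INSTANTIATION_LOCK that the new code references (presumably added to SparkHadoopUtil in a part of the change not shown here).

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.JobConf

object JobConfSketch {
  // Stand-in for the shared lock: Configuration/JobConf construction is not
  // thread-safe (SPARK-1097, HADOOP-10456), so new instances are created while
  // holding a single lock object.
  private val confLock = new Object()

  // Mirrors the shape of the patched getJobConf(): reuse a JobConf if one was
  // supplied, otherwise build a fresh one under the lock and let an optional
  // init function touch it before use.
  def toJobConf(
      conf: Configuration,
      initLocalJobConfFuncOpt: Option[JobConf => Unit] = None): JobConf = conf match {
    case jobConf: JobConf => jobConf
    case c => confLock.synchronized {
      val newJobConf = new JobConf(c)
      initLocalJobConfFuncOpt.foreach(f => f(newJobConf))
      newJobConf
    }
  }
}

For example, JobConfSketch.toJobConf(new Configuration()) builds a new JobConf under the lock, while passing an existing JobConf returns that same instance unchanged.
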
@@ -295,12 +261,6 @@ class HadoopRDD[K, V](
 }
 
 private[spark] object HadoopRDD extends Logging {
-  /**
-   * Configuration's constructor is not threadsafe (see SPARK-1097 and HADOOP-10456).
-   * Therefore, we synchronize on this lock before calling new JobConf() or new Configuration().
-   */
-  val CONFIGURATION_INSTANTIATION_LOCK = new Object()
-
   /**
    * The three methods below are helpers for accessing the local map, a property of the SparkEnv of
    * the local process.
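
For readers skimming the constructor change near the top of the diff, here is the serialization pattern in isolation: the raw Hadoop Configuration is accepted as a @transient parameter and immediately wrapped in Spark's SerializableWritable, so only the Writable-backed wrapper travels with the serialized RDD. This is a minimal sketch under that assumption; ConfHolder is a hypothetical class, not part of the patch.

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SerializableWritable

// Hypothetical holder showing the pattern: @transient keeps the raw
// Configuration (which is not java.io.Serializable) out of Java serialization,
// while SerializableWritable persists it through Hadoop's Writable interface.
class ConfHolder(@transient conf: Configuration) extends Serializable {
  private val sConf = new SerializableWritable(conf)

  // Executor-side code recovers the Configuration via .value, just as the
  // patched getJobConf() does with sConf.value.
  def value: Configuration = sConf.value
}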