
Commit a3d05e0

revert previous change and fix bug in another way
1. Revert the previous change.
2. Add a batchTimesWithNoJob set to record batch times with no jobs.
3. Add an aggregation (merge) method for input info.
1 parent 83351b8 commit a3d05e0

3 files changed (+42, -13 lines)


streaming/src/main/scala/org/apache/spark/streaming/scheduler/InputInfoTracker.scala

Lines changed: 28 additions & 0 deletions
@@ -39,6 +39,12 @@ case class StreamInputInfo(
 
   def metadataDescription: Option[String] =
     metadata.get(StreamInputInfo.METADATA_KEY_DESCRIPTION).map(_.toString)
+
+  def merge(other: StreamInputInfo): StreamInputInfo = {
+    require(other.inputStreamId == inputStreamId,
+      "Can't merge two StreamInputInfo with different id")
+    StreamInputInfo(inputStreamId, numRecords + other.numRecords, metadata ++ other.metadata)
+  }
 }
 
 @DeveloperApi
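
For intuition, here is a minimal sketch of what merge does. The stream id and record counts below are invented, and it assumes StreamInputInfo's case-class constructor with a defaulted metadata map:

// Hypothetical values: merge sums record counts and unions metadata maps.
val first = StreamInputInfo(inputStreamId = 0, numRecords = 100)
val second = StreamInputInfo(inputStreamId = 0, numRecords = 250)

val merged = first.merge(second)
assert(merged.numRecords == 350)

// Merging infos from different streams fails fast: the require(...) above
// throws IllegalArgumentException when the inputStreamIds differ.

Since the metadata maps are combined with ++, a key present in both infos takes the value from other, the right-hand operand.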
@@ -79,6 +85,28 @@ private[streaming] class InputInfoTracker(ssc: StreamingContext) extends Logging
     inputInfos.map(_.toMap).getOrElse(Map[Int, StreamInputInfo]())
   }
 
+  /**
+   * Get all the input streams' information for the specified batch times and
+   * merge the results together.
+   */
+  def getInfo(batchTimes: Iterable[Time]): Map[Int, StreamInputInfo] = synchronized {
+    val inputInfosSet = batchTimes.map { batchTime =>
+      val inputInfos = batchTimeToInputInfos.get(batchTime)
+      inputInfos.getOrElse(mutable.Map[Int, StreamInputInfo]())
+    }
+
+    val aggregatedInputInfos = mutable.Map[Int, StreamInputInfo]()
+    inputInfosSet.foreach(inputInfos => inputInfos.foreach { case (id, info) =>
+      val currentInfo = aggregatedInputInfos.get(id)
+      if (currentInfo.isEmpty) {
+        aggregatedInputInfos(id) = info
+      } else {
+        aggregatedInputInfos(id) = currentInfo.get.merge(info)
+      }
+    })
+    aggregatedInputInfos.toMap
+  }
+
   /** Cleanup the tracked input information older than threshold batch time */
   def cleanup(batchThreshTime: Time): Unit = synchronized {
     val timesToCleanup = batchTimeToInputInfos.keys.filter(_ < batchThreshTime)
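
To see the new getInfo(batchTimes) aggregation in isolation, here is a stand-alone model with StreamInputInfo reduced to a plain streamId -> record-count map; the batch contents are invented:

// Per-batch input info, keyed by stream id (values are record counts).
val batch1 = Map(0 -> 100L, 1 -> 10L) // a batch that produced no jobs
val batch2 = Map(0 -> 250L)           // the current batch

// Same effect as the foreach/merge loop above: group by stream id, sum counts.
val aggregated = (batch1.toSeq ++ batch2.toSeq)
  .groupBy { case (id, _) => id }
  .map { case (id, infos) => id -> infos.map(_._2).sum }

assert(aggregated == Map(0 -> 350L, 1 -> 10L))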

streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala

Lines changed: 14 additions & 1 deletion
@@ -17,6 +17,7 @@
 
 package org.apache.spark.streaming.scheduler
 
+import scala.collection.mutable
 import scala.util.{Failure, Success, Try}
 
 import org.apache.spark.SparkEnv
@@ -77,6 +78,10 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging {
   // last batch whose completion, checkpointing and metadata cleanup has been completed
   private var lastProcessedBatch: Time = null
 
+  // For some batch times, a JobSet with no jobs will be submitted. We record such batch times
+  // here in order to correct the input info of a later JobSet that does have jobs.
+  private var batchTimesWithNoJob: mutable.HashSet[Time] = mutable.HashSet[Time]()
+
   /** Start generation of jobs */
   def start(): Unit = synchronized {
     if (eventLoop != null) return // generator has already been started
@@ -249,7 +254,15 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging {
       graph.generateJobs(time) // generate jobs using allocated block
     } match {
       case Success(jobs) =>
-        val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
+        val streamIdToInputInfos = if (jobs.isEmpty) {
+          batchTimesWithNoJob.add(time)
+          Map.empty[Int, StreamInputInfo]
+        } else {
+          batchTimesWithNoJob.add(time)
+          val inputInfo = jobScheduler.inputInfoTracker.getInfo(batchTimesWithNoJob)
+          batchTimesWithNoJob.clear()
+          inputInfo
+        }
         jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
       case Failure(e) =>
         jobScheduler.reportError("Error generating jobs for time " + time, e)
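
The generateJobs change follows an accumulate-then-flush pattern: batch times that produce no jobs are remembered, and the next batch that does produce jobs reports the merged input info for all of them. A simplified stand-alone model, with Time reduced to Long and the tracker stubbed as a function (names here are illustrative, not Spark API):

import scala.collection.mutable

// Batches that produced no jobs, waiting to be folded into a later batch.
val batchTimesWithNoJob = mutable.HashSet[Long]()

def inputInfoFor(
    time: Long,
    hasJobs: Boolean,
    getInfo: Iterable[Long] => Map[Int, Long]): Map[Int, Long] = {
  if (!hasJobs) {
    batchTimesWithNoJob.add(time) // defer: report this batch's input info later
    Map.empty                     // the empty JobSet carries no input info
  } else {
    batchTimesWithNoJob.add(time) // include the current batch itself
    val info = getInfo(batchTimesWithNoJob) // aggregate across all deferred batches
    batchTimesWithNoJob.clear()   // flushed; start accumulating afresh
    info
  }
}

So if the batches at times 1000 and 2000 generate no jobs, the batch at 3000 submits a JobSet whose input info covers all three batch times, which keeps the record counts on the Streaming UI accurate.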

streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala

Lines changed: 0 additions & 12 deletions
@@ -21,7 +21,6 @@ import java.util.Properties
 import java.util.concurrent.{ConcurrentHashMap, TimeUnit}
 
 import scala.collection.JavaConverters._
-import scala.collection.mutable.HashSet
 import scala.util.Failure
 
 import org.apache.commons.lang.SerializationUtils
@@ -65,8 +64,6 @@ class JobScheduler(val ssc: StreamingContext) extends Logging {
 
   private var eventLoop: EventLoop[JobSchedulerEvent] = null
 
-  private val inputInfoMissedTimes = HashSet[Time]()
-
   def start(): Unit = synchronized {
     if (eventLoop != null) return // scheduler has already been started
 
@@ -142,7 +139,6 @@ class JobScheduler(val ssc: StreamingContext) extends Logging {
   def submitJobSet(jobSet: JobSet) {
     if (jobSet.jobs.isEmpty) {
       logInfo("No jobs added for time " + jobSet.time)
-      inputInfoMissedTimes.add(jobSet.time)
     } else {
       listenerBus.post(StreamingListenerBatchSubmitted(jobSet.toBatchInfo))
       jobSets.put(jobSet.time, jobSet)
@@ -197,14 +193,6 @@ class JobScheduler(val ssc: StreamingContext) extends Logging {
       listenerBus.post(StreamingListenerOutputOperationCompleted(job.toOutputOperationInfo))
       logInfo("Finished job " + job.id + " from job set of time " + jobSet.time)
       if (jobSet.hasCompleted) {
-        // submit fake BatchCompleted event to show missing inputInfo on Streaming UI
-        inputInfoMissedTimes.foreach (time => {
-          val streamIdToInputInfos = inputInfoTracker.getInfo(time)
-          val fakeJobSet = JobSet(time, Seq(), streamIdToInputInfos)
-          listenerBus.post(StreamingListenerBatchCompleted(fakeJobSet.toBatchInfo))
-        })
-        inputInfoMissedTimes.clear()
-
         jobSets.remove(jobSet.time)
         jobGenerator.onBatchCompletion(jobSet.time)
         logInfo("Total delay: %.3f s for time %s (execution: %.3f s)".format(
