@@ -498,12 +498,16 @@ class DAGScheduler(
498
498
* the last fetch failure.
499
499
*/
500
500
private[scheduler] def resubmitFailedStages() {
  if (failedStages.nonEmpty) {
    // Failed stages may be removed by job cancellation, so failedStages might be empty even
    // if the ResubmitFailedStages event has been scheduled.
    logInfo("Resubmitting failed stages")
    clearCacheLocs()
    // Copy before clearing: submitStage() may re-add stages to failedStages, and we must not
    // mutate the set while iterating over it.
    val failedStagesCopy = failedStages.toArray
    failedStages.clear()
    // Resubmit in jobId order so stages of earlier jobs are retried first.
    for (stage <- failedStagesCopy.sortBy(_.jobId)) {
      submitStage(stage)
    }
  }
}
509
513
@@ -582,6 +586,91 @@ class DAGScheduler(
582
586
}
583
587
}
584
588
589
private[scheduler] def handleJobGroupCancelled(groupId: String) {
  // Cancel all jobs belonging to this job group.
  // First find all active jobs with this group id, then cancel each of them.
  // ActiveJob.properties may be null (handleJobSubmitted defaults properties to null), so
  // guard with Option to avoid an NPE for jobs submitted without any properties; such jobs
  // belong to no group and are simply filtered out.
  val activeInGroup = activeJobs.filter { activeJob =>
    Option(activeJob.properties).exists(_.get(SparkContext.SPARK_JOB_GROUP_ID) == groupId)
  }
  val jobIds = activeInGroup.map(_.jobId)
  jobIds.foreach(handleJobCancellation(_, "part of cancel job group"))
}
597
+
598
private[scheduler] def handleBeginEvent(task: Task[_], taskInfo: TaskInfo) {
  // Emit a one-time warning per stage when a task's serialized size exceeds the
  // recommended limit, then publish the task-start event.
  val sizeLimitBytes = DAGScheduler.TASK_SIZE_TO_WARN * 1024
  for {
    stage <- stageIdToStage.get(task.stageId)
    stageInfo <- stageToInfos.get(stage)
    if taskInfo.serializedSize > sizeLimitBytes && !stageInfo.emittedTaskSizeWarning
  } {
    stageInfo.emittedTaskSizeWarning = true
    logWarning(("Stage %d (%s) contains a task of very large " +
      "size (%d KB). The maximum recommended task size is %d KB.").format(
      task.stageId, stageInfo.name, taskInfo.serializedSize / 1024,
      DAGScheduler.TASK_SIZE_TO_WARN))
  }
  // The start event is posted whether or not the size warning fired.
  listenerBus.post(SparkListenerTaskStart(task.stageId, taskInfo))
}
611
+
612
private[scheduler] def handleTaskSetFailed(taskSet: TaskSet, reason: String) {
  // Abort the stage backing this task set, if it is still tracked.
  stageIdToStage.get(taskSet.stageId) match {
    case Some(stage) => abortStage(stage, reason)
    case None => // Stage no longer tracked (e.g. already cancelled); nothing to abort.
  }
}
615
+
616
private[scheduler] def cleanUpAfterSchedulerStop() {
  // Fail every remaining active job: the SparkContext is shutting down, so none of them
  // can complete.
  activeJobs.foreach { job =>
    val error = new SparkException("Job cancelled because SparkContext was shut down")
    job.listener.jobFailed(error)
    // Tell the listeners that all of the running stages have ended. Don't bother
    // cancelling the stages because if the DAG scheduler is stopped, the entire application
    // is in the process of getting stopped.
    val stageFailedMessage = "Stage cancelled because SparkContext was shut down"
    // NOTE(review): runningStages is walked once per active job, so with multiple active
    // jobs each running stage is marked failed and posted more than once — confirm this is
    // the intended listener contract.
    runningStages.foreach { stage =>
      val info = stageToInfos(stage)
      info.stageFailed(stageFailedMessage)
      listenerBus.post(SparkListenerStageCompleted(info))
    }
    listenerBus.post(SparkListenerJobEnd(job.jobId, JobFailed(error)))
  }
}
632
+
633
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    allowLocal: Boolean,
    callSite: String,
    listener: JobListener,
    properties: Properties = null) {
  // Build the final result stage for this job, then either run it locally (for cheap
  // single-partition, no-parent actions) or submit it to the cluster.
  // Option instead of a null-initialized var: stage creation may fail, and the null check
  // it previously required is an easy source of NPEs.
  val finalStageOpt: Option[Stage] =
    try {
      // New stage creation may throw an exception if, for example, jobs are run on a
      // HadoopRDD whose underlying HDFS files have been deleted.
      Some(newStage(finalRDD, partitions.size, None, jobId, Some(callSite)))
    } catch {
      case e: Exception =>
        // The job is failed through its listener; scheduling simply stops here.
        logWarning("Creating new stage failed due to exception - job: " + jobId, e)
        listener.jobFailed(e)
        None
    }
  finalStageOpt.foreach { finalStage =>
    val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
    clearCacheLocs()
    logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".
      format(job.jobId, callSite, partitions.length, allowLocal))
    logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
    logInfo("Parents of final stage: " + finalStage.parents)
    logInfo("Missing parents: " + getMissingParentStages(finalStage))
    if (allowLocal && finalStage.parents.size == 0 && partitions.length == 1) {
      // Compute very short actions like first() or take() with no parent stages locally.
      listenerBus.post(SparkListenerJobStart(job.jobId, Array[Int](), properties))
      runLocally(job)
    } else {
      // Normal path: register the job and submit the final stage (and, transitively, its
      // missing parents) to the cluster.
      jobIdToActiveJob(jobId) = job
      activeJobs += job
      resultStageToJob(finalStage) = job
      listenerBus.post(SparkListenerJobStart(job.jobId, jobIdToStageIds(jobId).toArray,
        properties))
      submitStage(finalStage)
    }
  }
}
673
+
585
674
/** Submits stage, but first recursively submits any missing parents. */
586
675
private [scheduler] def submitStage (stage : Stage ) {
587
676
val jobId = activeJobForStage(stage)
@@ -673,6 +762,10 @@ class DAGScheduler(
673
762
*/
674
763
private [scheduler] def handleTaskCompletion (event : CompletionEvent ) {
675
764
val task = event.task
765
+ val stageId = task.stageId
766
+ val taskType = Utils .getFormattedClassName(task)
767
+ listenerBus.post(SparkListenerTaskEnd (stageId, taskType, event.reason, event.taskInfo,
768
+ event.taskMetrics))
676
769
if (! stageIdToStage.contains(task.stageId)) {
677
770
// Skip all the actions if the stage has been cancelled.
678
771
return
@@ -1045,36 +1138,8 @@ private[scheduler] class DAGSchedulerEventProcessActor(dagScheduler: DAGSchedule
1045
1138
*/
1046
1139
def receive = {
1047
1140
case JobSubmitted (jobId, rdd, func, partitions, allowLocal, callSite, listener, properties) =>
1048
- var finalStage : Stage = null
1049
- try {
1050
- // New stage creation may throw an exception if, for example, jobs are run on a
1051
- // HadoopRDD whose underlying HDFS files have been deleted.
1052
- finalStage = dagScheduler.newStage(rdd, partitions.size, None , jobId, Some (callSite))
1053
- } catch {
1054
- case e : Exception =>
1055
- logWarning(" Creating new stage failed due to exception - job: " + jobId, e)
1056
- listener.jobFailed(e)
1057
- }
1058
- val job = new ActiveJob (jobId, finalStage, func, partitions, callSite, listener, properties)
1059
- dagScheduler.clearCacheLocs()
1060
- logInfo(" Got job %s (%s) with %d output partitions (allowLocal=%s)" .
1061
- format(job.jobId, callSite, partitions.length, allowLocal))
1062
- logInfo(" Final stage: " + finalStage + " (" + finalStage.name + " )" )
1063
- logInfo(" Parents of final stage: " + finalStage.parents)
1064
- logInfo(" Missing parents: " + dagScheduler.getMissingParentStages(finalStage))
1065
- if (allowLocal && finalStage.parents.size == 0 && partitions.length == 1 ) {
1066
- // Compute very short actions like first() or take() with no parent stages locally.
1067
- dagScheduler.listenerBus.post(SparkListenerJobStart (job.jobId, Array [Int ](), properties))
1068
- dagScheduler.runLocally(job)
1069
- } else {
1070
- dagScheduler.jobIdToActiveJob(jobId) = job
1071
- dagScheduler.activeJobs += job
1072
- dagScheduler.resultStageToJob(finalStage) = job
1073
- dagScheduler.listenerBus.post(
1074
- SparkListenerJobStart (job.jobId, dagScheduler.jobIdToStageIds(jobId).toArray,
1075
- properties))
1076
- dagScheduler.submitStage(finalStage)
1077
- }
1141
+ dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite,
1142
+ listener, properties)
1078
1143
1079
1144
case StageCancelled (stageId) =>
1080
1145
dagScheduler.handleStageCancellation(stageId)
@@ -1083,12 +1148,7 @@ private[scheduler] class DAGSchedulerEventProcessActor(dagScheduler: DAGSchedule
1083
1148
dagScheduler.handleJobCancellation(jobId)
1084
1149
1085
1150
case JobGroupCancelled (groupId) =>
1086
- // Cancel all jobs belonging to this job group.
1087
- // First finds all active jobs with this group id, and then kill stages for them.
1088
- val activeInGroup = dagScheduler.activeJobs.filter(activeJob =>
1089
- groupId == activeJob.properties.get(SparkContext .SPARK_JOB_GROUP_ID ))
1090
- val jobIds = activeInGroup.map(_.jobId)
1091
- jobIds.foreach(dagScheduler.handleJobCancellation(_, " part of cancel job group" ))
1151
+ dagScheduler.handleJobGroupCancelled(groupId)
1092
1152
1093
1153
case AllJobsCancelled =>
1094
1154
dagScheduler.doCancelAllJobs()
@@ -1100,60 +1160,24 @@ private[scheduler] class DAGSchedulerEventProcessActor(dagScheduler: DAGSchedule
1100
1160
dagScheduler.handleExecutorLost(execId)
1101
1161
1102
1162
case BeginEvent (task, taskInfo) =>
1103
- for (
1104
- job <- dagScheduler.jobIdToActiveJob.get(task.stageId);
1105
- stage <- dagScheduler.stageIdToStage.get(task.stageId);
1106
- stageInfo <- dagScheduler.stageToInfos.get(stage)
1107
- ) {
1108
- if (taskInfo.serializedSize > DAGScheduler .TASK_SIZE_TO_WARN * 1024 &&
1109
- ! stageInfo.emittedTaskSizeWarning) {
1110
- stageInfo.emittedTaskSizeWarning = true
1111
- logWarning((" Stage %d (%s) contains a task of very large " +
1112
- " size (%d KB). The maximum recommended task size is %d KB." ).format(
1113
- task.stageId, stageInfo.name, taskInfo.serializedSize / 1024 ,
1114
- DAGScheduler .TASK_SIZE_TO_WARN ))
1115
- }
1116
- }
1117
- dagScheduler.listenerBus.post(SparkListenerTaskStart (task.stageId, taskInfo))
1163
+ dagScheduler.handleBeginEvent(task, taskInfo)
1118
1164
1119
1165
case GettingResultEvent (task, taskInfo) =>
1120
1166
dagScheduler.listenerBus.post(SparkListenerTaskGettingResult (taskInfo))
1121
1167
1122
1168
case completion @ CompletionEvent (task, reason, _, _, taskInfo, taskMetrics) =>
1123
- val stageId = task.stageId
1124
- val taskType = Utils .getFormattedClassName(task)
1125
- dagScheduler.listenerBus.post(SparkListenerTaskEnd (stageId, taskType, reason, taskInfo,
1126
- taskMetrics))
1127
1169
dagScheduler.handleTaskCompletion(completion)
1128
1170
1129
1171
case TaskSetFailed (taskSet, reason) =>
1130
- dagScheduler.stageIdToStage.get(taskSet.stageId).foreach {
1131
- dagScheduler.abortStage(_, reason) }
1172
+ dagScheduler.handleTaskSetFailed(taskSet, reason)
1132
1173
1133
1174
case ResubmitFailedStages =>
1134
- if (dagScheduler.failedStages.size > 0 ) {
1135
- // Failed stages may be removed by job cancellation, so failed might be empty even if
1136
- // the ResubmitFailedStages event has been scheduled.
1137
- dagScheduler.resubmitFailedStages()
1138
- }
1175
+ dagScheduler.resubmitFailedStages()
1139
1176
}
1140
1177
1141
1178
override def postStop(): Unit = {
  // Cancel any active jobs in the postStop hook; all of the actual cleanup work is
  // delegated to the scheduler itself.
  dagScheduler.cleanUpAfterSchedulerStop()
}
1158
1182
}
1159
1183
0 commit comments