Commit b3325ad

colinmjj authored and GitHub Enterprise committed
[HADP-55679] Fix NPE problem caused by incorrect taskId (apache#631)
1 parent 62a7a28 commit b3325ad

File tree: 3 files changed (+18, -17 lines)
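The fix, as the diff below suggests, swaps the key used for TaskSetResultStore from a task's index within its TaskSet to the task's RDD partitionId. The two coincide on a stage's first attempt but can diverge on a retried TaskSet that re-runs only a subset of partitions, so keying by index could address the wrong store slot. A minimal, self-contained sketch of that divergence (TaskInfoSketch and the sample ids are hypothetical stand-ins, not Spark code):

// Hypothetical stand-in for the relevant fields of
// org.apache.spark.scheduler.TaskInfo.
final case class TaskInfoSketch(taskId: Long, index: Int, partitionId: Int)

object IndexVsPartitionId {
  def main(args: Array[String]): Unit = {
    // First attempt of a 4-partition stage: index == partitionId for every task.
    val attempt0 = (0 until 4).map(p => TaskInfoSketch(p.toLong, index = p, partitionId = p))

    // A retry that re-runs only partitions 2 and 3: indices restart at 0,
    // so index != partitionId for every task in the retried TaskSet.
    val attempt1 = Seq(
      TaskInfoSketch(taskId = 4L, index = 0, partitionId = 2),
      TaskInfoSketch(taskId = 5L, index = 1, partitionId = 3))

    (attempt0 ++ attempt1).foreach { t =>
      println(s"tid=${t.taskId} index=${t.index} partitionId=${t.partitionId}")
    }
  }
}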


core/src/main/scala/org/apache/spark/scheduler/SpillableTaskResultGetter.scala

Lines changed: 8 additions & 7 deletions
@@ -196,7 +196,7 @@ private[spark] class SpillableTaskResultGetter(sparkEnv: SparkEnv, scheduler: Ta
       taskSetManager: TaskSetManager,
       taskDataSeq: Seq[(Long, ByteBuffer)]): Unit = {
     val tids = taskDataSeq.map(_._1)
-    val taskId2TaskIdx = scheduler.removeMultiRunningTasks(taskSetManager, tids)
+    val taskId2TaskPartitionId = scheduler.removeMultiRunningTasks(taskSetManager, tids)
 
     // Killed tasks due to result size exceeds
     val killTaskIds = new ArrayBuffer[Long]
@@ -212,7 +212,7 @@ private[spark] class SpillableTaskResultGetter(sparkEnv: SparkEnv, scheduler: Ta
       val tid = t._1
       val serializedData = t._2
       try {
-        val taskIdx = taskId2TaskIdx.get(tid).get
+        val taskPartitionId = taskId2TaskPartitionId.get(tid).get
         serializer.get().deserialize[TaskResult[_]](serializedData) match {
           case directResult: DirectTaskResult[_] =>
             val start = System.currentTimeMillis()
@@ -244,7 +244,7 @@ private[spark] class SpillableTaskResultGetter(sparkEnv: SparkEnv, scheduler: Ta
            if (resultValue == null) {
              logWarning(s"TID ${tid} deserializeDirectResult is null")
              // There is possible lock contention to the TaskSetResultStore
-             store.save(taskIdx, null, 0, taskSetManager.taskSet.id)
+             store.save(taskPartitionId, null, 0, taskSetManager.taskSet.id)
              if (store.isFinished) {
                resultStoreMap.remove(taskSetManager.taskSet.stageId.toString)
              }
@@ -257,7 +257,7 @@ private[spark] class SpillableTaskResultGetter(sparkEnv: SparkEnv, scheduler: Ta
              getLargeResultExecutor.execute(
                new SpillDirectResultTask(store,
                  tid,
-                 taskIdx,
+                 taskPartitionId,
                  resultValue,
                  resultSize,
                  directResult,
@@ -266,7 +266,7 @@ private[spark] class SpillableTaskResultGetter(sparkEnv: SparkEnv, scheduler: Ta
            } else {
              // There is possible lock contention to the TaskSetResultStore
              val (returnResult, spilledSize) = store.save(
-               taskIdx, resultValue, resultSize, taskSetManager.taskSet.id)
+               taskPartitionId, resultValue, resultSize, taskSetManager.taskSet.id)
              if (spilledSize > 0) {
                taskSetManager.totalResultInMemorySize.addAndGet(-spilledSize)
              }
@@ -314,7 +314,8 @@ private[spark] class SpillableTaskResultGetter(sparkEnv: SparkEnv, scheduler: Ta
            // There is possible lock contention to the TaskSetResultStore
            if (store.maybeSpill(size)) {
              getLargeResultExecutor.execute(
-               new FetchLargeResultTask(tid, taskIdx, blockId, size, taskSetManager, store))
+               new FetchLargeResultTask(tid, taskPartitionId, blockId, size,
+                 taskSetManager, store))
            } else {
              successInDirectTaskIds += tid
              val result =
@@ -324,7 +325,7 @@ private[spark] class SpillableTaskResultGetter(sparkEnv: SparkEnv, scheduler: Ta
                failureTaskIds += tid
              } else {
                val (returnResult, spilledSize) = store.save(
-                 taskIdx, result.value(), size, taskSetManager.taskSet.id)
+                 taskPartitionId, result.value(), size, taskSetManager.taskSet.id)
                if (spilledSize > 0) {
                  taskSetManager.totalResultInMemorySize.addAndGet(-spilledSize)
                }
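To make the failure mode concrete, a runnable toy under the same assumption (all names hypothetical, not the patched code): a store with one slot per partition is fed the task's index on a retried TaskSet, the slot for the real partition stays null, and the read back by partitionId throws, matching the NPE the commit title describes.

object WrongKeyNpeSketch {
  def main(args: Array[String]): Unit = {
    // Toy store: one result slot per partition of a 4-partition stage.
    val results = new Array[AnyRef](4)

    // Retried TaskSet re-running partitions 2 and 3 (indices restart at 0).
    val retried = Seq((0, 2), (1, 3)) // (index, partitionId)

    // Buggy keying: save each result under index instead of partitionId.
    retried.foreach { case (index, _) => results(index) = "result" }

    // Reading back by partitionId finds null and throws NullPointerException.
    retried.foreach { case (_, partitionId) =>
      println(results(partitionId).toString)
    }
  }
}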

core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

Lines changed: 2 additions & 2 deletions
@@ -1036,9 +1036,9 @@ private[spark] class TaskSchedulerImpl(
       taskSetManager: TaskSetManager,
       tids: Seq[Long]): Map[Long, Int] = synchronized {
     tids.map { tid =>
-      val taskIdx = taskSetManager.taskInfos(tid).index
+      val taskPartitionId = taskSetManager.taskInfos(tid).partitionId
       taskSetManager.removeRunningTask(tid)
-      (tid, taskIdx)
+      (tid, taskPartitionId)
     }.toMap
   }
 
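A minimal sketch of the mapping the patched method now returns, taskId -> partitionId instead of taskId -> index (TaskInfoSketch is again a hypothetical stand-in; only the Map[Long, Int] shape mirrors the diff):

object RemoveMultiRunningTasksSketch {
  final case class TaskInfoSketch(index: Int, partitionId: Int)

  // Mirrors the patched keying: each tid maps to its partitionId.
  def taskIdToPartitionId(
      taskInfos: Map[Long, TaskInfoSketch],
      tids: Seq[Long]): Map[Long, Int] =
    tids.map { tid => (tid, taskInfos(tid).partitionId) }.toMap

  def main(args: Array[String]): Unit = {
    val infos = Map(
      4L -> TaskInfoSketch(index = 0, partitionId = 2),
      5L -> TaskInfoSketch(index = 1, partitionId = 3))
    // Prints Map(4 -> 2, 5 -> 3): downstream saves land in the right slots.
    println(taskIdToPartitionId(infos, Seq(4L, 5L)))
  }
}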

core/src/main/scala/org/apache/spark/scheduler/TaskSetResultStore.scala

Lines changed: 8 additions & 8 deletions
@@ -66,18 +66,18 @@ private[spark] case class TaskSetResultStore(
   }
 
   def save(
-      taskIdx: Int,
+      taskPartitionId: Int,
       resultValue: Any,
       size: Long,
       taskSetId: String): (Any, Long) = synchronized {
-    if (!closed.get && !finished(taskIdx)) {
-      finished(taskIdx) = true
+    if (!closed.get && !finished(taskPartitionId)) {
+      finished(taskPartitionId) = true
       numFinished += 1
       if (spillContext.nonEmpty) {
-        spillContext.get.resultHandler(taskIdx, resultValue)
+        spillContext.get.resultHandler(taskPartitionId, resultValue)
       }
-      bufferedResultMap.put(taskIdx, resultValue)
-      bufferedResultSize.put(taskIdx, size)
+      bufferedResultMap.put(taskPartitionId, resultValue)
+      bufferedResultSize.put(taskPartitionId, size)
       totalBufferedSize += size
       var spilledSize = 0L;
       if (totalBufferedSize > spillThreshold) {
@@ -87,10 +87,10 @@ private[spark] case class TaskSetResultStore(
         spilledSize += close(taskSetId)
       }
       if (isSpilled) (spilledPartitionResults, spilledSize) else (resultValue, 0L)
-    } else if (!finished(taskIdx)) {
+    } else if (!finished(taskPartitionId)) {
       throw new IllegalStateException("Cannot write to a closed TaskSetResultStore.")
     } else {
-      logInfo(s"Duplicated task result of speculative task $taskSetId:$taskIdx found")
+      logInfo(s"Duplicated task result of speculative task $taskSetId:$taskPartitionId found")
       if (isSpilled) (spilledPartitionResults, 0L) else (resultValue, 0L)
     }
   }
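Finally, a self-contained sketch of the store-side contract (a hypothetical class, not the real TaskSetResultStore): the finished flags have one slot per partition, so the first result and any speculative duplicate for the same partition must address the same slot, which only holds when callers pass partitionId.

import scala.collection.mutable

final class ResultStoreSketch(numPartitions: Int) {
  private val finished = Array.fill(numPartitions)(false)
  private val bufferedResultMap = mutable.Map.empty[Int, Any]

  def save(taskPartitionId: Int, resultValue: Any): Unit = synchronized {
    if (!finished(taskPartitionId)) {
      finished(taskPartitionId) = true
      bufferedResultMap.put(taskPartitionId, resultValue)
    } else {
      // Speculative duplicate for an already-finished partition: drop it,
      // mirroring the "Duplicated task result" branch in the diff.
      println(s"Duplicated task result of speculative task for partition $taskPartitionId")
    }
  }
}

object ResultStoreSketchDemo {
  def main(args: Array[String]): Unit = {
    val store = new ResultStoreSketch(4)
    store.save(2, "result")             // first copy: stored
    store.save(2, "speculative-result") // duplicate: hits the log branch
  }
}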
