Commit e988be7

wangyum authored and committed (GitHub Enterprise)
[CARMEL-7385][CARMEL-6381] Remove unnecessary sql metrics for UnionExec (apache#132)
1 parent 343d9d0 commit e988be7

6 files changed: 99 additions & 24 deletions


core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 12 additions & 0 deletions
@@ -296,6 +296,18 @@ package object config {
     .stringConf
     .createWithDefaultString("file,hdfs")

+  private[spark] val EXECUTOR_METRICS_SENT_UPDATED_EXCEPT_FIRST_PART_ENABLED =
+    ConfigBuilder("spark.executor.metrics.send.updated.exceptFirstPart")
+      .doc("Only send updated metrics back to the driver for all tasks except the first " +
+        "partition; the first partition sends back all metrics. Some metrics, such as " +
+        "SQL-related metrics, are needed on the driver side even when they are zero, but " +
+        "having a single partition send back the zero values is enough. This saves a lot " +
+        "of driver memory, especially for union RDDs, which carry many unused metrics " +
+        "for each task.")
+      .version("3.5.0")
+      .booleanConf
+      .createWithDefault(true)
+
   private[spark] val EXECUTOR_JAVA_OPTIONS =
     ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS)
       .withPrepended(SparkLauncher.EXECUTOR_DEFAULT_JAVA_OPTIONS)
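The new flag defaults to true. For comparison with the old behaviour it can be disabled per application; a minimal sketch (the SparkConf usage, app name, and master below are illustrative and not part of this commit):

    import org.apache.spark.{SparkConf, SparkContext}

    // Illustrative only: turning the flag off makes every task send back all external
    // accumulators again, including zero-valued SQL metrics from every partition.
    val conf = new SparkConf()
      .setAppName("union-metrics-demo")   // hypothetical app name
      .setMaster("local[*]")              // hypothetical master, for a quick local run
      .set("spark.executor.metrics.send.updated.exceptFirstPart", "false")
    val sc = new SparkContext(conf)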

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 25 additions & 19 deletions
@@ -455,8 +455,7 @@ private[spark] class DAGScheduler(
      prevShuffleSize.getAndAdd(currTaskShuffleSize)
    }

-    val event = CompletionEvent(task, reason, result,
-      lightAccumUpdates, lightTaskMetrics, metricPeaks, taskInfo)
+    val event = CompletionEvent(task, reason, result, lightTaskMetrics, metricPeaks, taskInfo)
    val stageOpt = stageIdToStage.get(task.stageId)
    if (stageOpt.isEmpty) {
      // The stage may have already finished when we get this event -- eg. maybe it was a
@@ -483,14 +482,15 @@
              case Some(job) =>
                // Only update the accumulator once for each result task.
                if (!job.finished(rt.outputId)) {
-                  updateAccumulators(event)
+                  updateAccumulators(event.task, lightAccumUpdates, event.taskInfo)
                }
              case None => // Ignore update if task's job has finished.
            }
          case _ =>
-            updateAccumulators(event)
+            updateAccumulators(event.task, lightAccumUpdates, event.taskInfo)
        }
-      case _: ExceptionFailure | _: TaskKilled => updateAccumulators(event)
+      case _: ExceptionFailure | _: TaskKilled =>
+        updateAccumulators(event.task, lightAccumUpdates, event.taskInfo)
      case _ =>
    }

@@ -504,7 +504,7 @@

    val taskMetricsForDAG: TaskMetrics = taskMetricsFromAccumulators(accumUpdatesForDAG)
    val eventForDAGScheduler = CompletionEvent(task, reason, result,
-      accumUpdatesForDAG, taskMetricsForDAG, metricPeaks, taskInfo)
+      taskMetricsForDAG, metricPeaks, taskInfo)
    eventProcessLoop.post(eventForDAGScheduler)
  }

@@ -1844,14 +1844,7 @@
      // this synchronization in case another concurrent job is checkpointing this RDD, so we get a
      // consistent view of both variables.
      RDDCheckpointData.synchronized {
-        taskBinaryBytes = stage match {
-          case stage: ShuffleMapStage =>
-            JavaUtils.bufferToArray(
-              closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
-          case stage: ResultStage =>
-            JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
-        }
-
+        taskBinaryBytes = serializeTaskBinaries(stage)
        partitions = stage.rdd.partitions
      }
    } catch {
@@ -1912,6 +1905,17 @@
      }
    }

+  private[scheduler] def serializeTaskBinaries(stage: Stage): Array[Byte] = {
+    val taskBinaries = stage match {
+      case stage: ShuffleMapStage =>
+        JavaUtils.bufferToArray(
+          closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef))
+      case stage: ResultStage =>
+        JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, stage.func): AnyRef))
+    }
+    taskBinaries
+  }
+
  private[scheduler] def handleSubmitMissingTask(missingTask: SubmitMissingTask): Unit = {
    logDebug("submitMissingTasks(" + missingTask.stage + ")")
    if (missingTask.taskBinary == null) {
@@ -2013,11 +2017,13 @@
   * This still doesn't stop the caller from updating the accumulator outside the scheduler,
   * but that's not our problem since there's nothing we can do about that.
   */
-  private def updateAccumulators(event: CompletionEvent): Unit = {
-    val task = event.task
+  private def updateAccumulators(
+      task: Task[_],
+      accumUpdates: Seq[AccumulatorV2[_, _]],
+      taskInfo: TaskInfo): Unit = {
    val stage = stageIdToStage(task.stageId)

-    event.accumUpdates.foreach { updates =>
+    accumUpdates.foreach { updates =>
      val id = updates.id
      try {
        // Find the corresponding accumulator on the driver and update it
@@ -2032,8 +2038,8 @@
        // To avoid UI cruft, ignore cases where value wasn't updated
        if (acc.name.isDefined && !updates.isZero) {
          stage.latestInfo.accumulables(id) = acc.toInfo(None, Some(acc.value))
-          event.taskInfo.setAccumulables(
-            acc.toInfo(Some(updates.value), Some(acc.value)) +: event.taskInfo.accumulables)
+          taskInfo.setAccumulables(
+            acc.toInfo(Some(updates.value), Some(acc.value)) +: taskInfo.accumulables)
        }
      } catch {
        case NonFatal(e) =>

core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala

Lines changed: 1 addition & 2 deletions
@@ -23,7 +23,7 @@ import org.apache.spark._
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.rdd.RDD
-import org.apache.spark.util.{AccumulatorV2, CallSite}
+import org.apache.spark.util.CallSite

 /**
  * Types of events that can be handled by the DAGScheduler. The DAGScheduler uses an event queue
@@ -84,7 +84,6 @@ private[scheduler] case class CompletionEvent(
    task: Task[_],
    reason: TaskEndReason,
    result: Any,
-    accumUpdates: Seq[AccumulatorV2[_, _]],
    taskMetrics: TaskMetrics,
    metricPeaks: Array[Long],
    taskInfo: TaskInfo)

core/src/main/scala/org/apache/spark/scheduler/Task.scala

Lines changed: 18 additions & 2 deletions
@@ -22,7 +22,7 @@ import java.util.Properties

 import org.apache.spark._
 import org.apache.spark.executor.TaskMetrics
-import org.apache.spark.internal.config.APP_CALLER_CONTEXT
+import org.apache.spark.internal.config.{APP_CALLER_CONTEXT, EXECUTOR_METRICS_SENT_UPDATED_EXCEPT_FIRST_PART_ENABLED}
 import org.apache.spark.internal.plugin.PluginContainer
 import org.apache.spark.memory.{MemoryMode, TaskMemoryManager}
 import org.apache.spark.metrics.MetricsSystem
@@ -210,12 +210,28 @@ private[spark] abstract class Task[T](
      context.taskMetrics.nonZeroInternalAccums() ++
        // zero value external accumulators may still be useful, e.g. SQLMetrics, we should not
        // filter them out.
-        context.taskMetrics.externalAccums.filter(a => !taskFailed || a.countFailedValues)
+        collectExternalAccumUpdates(context.taskMetrics.externalAccums, taskFailed)
    } else {
      Seq.empty
    }
  }

+  private def collectExternalAccumUpdates(
+      extAccumUpdates: Seq[AccumulatorV2[_, _]], taskFailed: Boolean): Seq[AccumulatorV2[_, _]] = {
+    // Use Option to fix NPE in the test of SPARK-32160
+    val sentOnlyUpdatedMetricsExceptFirstPart = Option(SparkEnv.get)
+      .exists(_.conf.get(EXECUTOR_METRICS_SENT_UPDATED_EXCEPT_FIRST_PART_ENABLED))
+    extAccumUpdates.filter { a =>
+      var filter = !taskFailed || a.countFailedValues
+      // only send all metrics for the first part
+      // and send only updated metrics for other partitions
+      if (sentOnlyUpdatedMetricsExceptFirstPart && partitionId != 0) {
+        filter = filter && !a.isZero
+      }
+      filter
+    }
+  }
+
  /**
   * Kills a task by setting the interrupted flag to true. This relies on the upper level Spark
   * code and user code to properly handle the flag. This function should be idempotent so it can
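To spell out the rule collectExternalAccumUpdates applies, here is a minimal, dependency-free sketch of the per-accumulator decision (shouldSendAccumulator is a hypothetical name, and isZero/countFailedValues are passed in as plain booleans purely for illustration):

    // Partition 0 always ships the accumulator, so zero-valued SQL metrics still reach
    // the driver exactly once; other partitions ship it only when it was actually updated.
    def shouldSendAccumulator(
        partitionId: Int,
        taskFailed: Boolean,
        countFailedValues: Boolean,
        isZero: Boolean,
        onlyUpdatedExceptFirstPart: Boolean = true): Boolean = {
      val base = !taskFailed || countFailedValues
      if (onlyUpdatedExceptFirstPart && partitionId != 0) base && !isZero else base
    }

    // A zero-valued metric on partition 3 is filtered out, while the same metric on
    // partition 0 is still sent back to the driver.
    assert(!shouldSendAccumulator(partitionId = 3, taskFailed = false,
      countFailedValues = false, isZero = true))
    assert(shouldSendAccumulator(partitionId = 0, taskFailed = false,
      countFailedValues = false, isZero = true))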

core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -4974,7 +4974,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti
      } else {
        null
      }
-      CompletionEvent(task, reason, result, allAccumUpdates, taskMetrics, metricPeaks, taskInfo)
+      CompletionEvent(task, reason, result, taskMetrics, metricPeaks, taskInfo)
    }
  }

core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala

Lines changed: 42 additions & 0 deletions
@@ -669,6 +669,48 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark
    assert(invocationOrder === Seq("C", "B", "A", "D"))
  }

+  test("Only first partition updated external accumulators will be sent back to driver") {
+    sc = new SparkContext("local", "test")
+    // Create a dummy task. We won't end up running this; we just want to collect
+    // accumulator updates from it.
+    val taskMetrics1 = TaskMetrics.registered
+    val ext1 = new LongAccumulator
+    ext1.register(sc, Some("extAccum1"))
+    taskMetrics1.registerAccumulator(ext1)
+    val task1 = new Task[Int](0, 0, 0, 1, JobArtifactSet.getActiveOrDefault(sc)) {
+      context = new TaskContextImpl(0, 0, 0, 0L, 0, 1,
+        new TaskMemoryManager(SparkEnv.get.memoryManager, 0L),
+        new Properties,
+        SparkEnv.get.metricsSystem,
+        taskMetrics1)
+
+      override def runTask(tc: TaskContext): Int = 0
+    }
+    val updatedAccums = task1.collectAccumulatorUpdates()
+    assert(updatedAccums.length == 2)
+    assert(updatedAccums(0).name == Some(InternalAccumulator.RESULT_SIZE))
+    assert(updatedAccums(0).value == 0)
+    assert(updatedAccums(1).name == Some("extAccum1"))
+    assert(updatedAccums(1).value == 0)
+
+    val taskMetrics2 = TaskMetrics.registered
+    val ext2 = new LongAccumulator
+    ext2.register(sc, Some("extAccum2"))
+    taskMetrics2.registerAccumulator(ext2)
+    val task2 = new Task[Int](0, 0, 1, 1, JobArtifactSet.getActiveOrDefault(sc)) {
+      context = new TaskContextImpl(0, 0, 1, 0L, 0, 1,
+        new TaskMemoryManager(SparkEnv.get.memoryManager, 0L),
+        new Properties,
+        SparkEnv.get.metricsSystem,
+        taskMetrics2)
+
+      override def runTask(tc: TaskContext): Int = 0
+    }
+    val updatedAccums2 = task2.collectAccumulatorUpdates()
+    // External accumulators won't be sent back for the second partition
+    // when they are not updated.
+    assert(updatedAccums2.length == 1)
+  }
 }

 private object TaskContextSuite {
