
Commit c64df90

optimize skewed partition based on data size
1 parent 7759f71 commit c64df90

15 files changed, +769 -45 lines changed

core/src/main/scala/org/apache/spark/MapOutputTracker.scala

Lines changed: 106 additions & 6 deletions
@@ -355,6 +355,21 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
       startPartition: Int,
       endPartition: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])]

+  /**
+   * Called from executors to get the server URIs and output sizes for each shuffle block that
+   * needs to be read from a specific map output partition (partitionIndex) and is
+   * produced by a range of mappers (startMapId, endMapId).
+   *
+   * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId,
+   *         and the second item is a sequence of (shuffle block id, shuffle block size, map index)
+   *         tuples describing the shuffle blocks that are stored at that block manager.
+   */
+  def getMapSizesByRangeMapIndex(
+      shuffleId: Int,
+      partitionIndex: Int,
+      startMapId: Int,
+      endMapId: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])]
+
   /**
    * Deletes map output status information for the specified shuffle stage.
    */
@@ -688,21 +703,28 @@ private[spark] class MapOutputTrackerMaster(
     }
   }

   /**
-   * Return the location where the Mapper ran. The locations each includes both a host and an
+   * Return the locations where the Mappers ran. The locations each include both a host and an
    * executor id on that host.
    *
    * @param dep shuffle dependency object
-   * @param mapId the map id
+   * @param startMapId the start map id
+   * @param endMapId the end map id
    * @return a sequence of locations where task runs.
    */
-  def getMapLocation(dep: ShuffleDependency[_, _, _], mapId: Int): Seq[String] =
+  def getMapLocation(
+      dep: ShuffleDependency[_, _, _],
+      startMapId: Int,
+      endMapId: Int): Seq[String] =
   {
     val shuffleStatus = shuffleStatuses.get(dep.shuffleId).orNull
     if (shuffleStatus != null) {
       shuffleStatus.withMapStatuses { statuses =>
-        if (mapId >= 0 && mapId < statuses.length) {
-          Seq( ExecutorCacheTaskLocation(statuses(mapId).location.host,
-            statuses(mapId).location.executorId).toString)
+        if (startMapId < endMapId && (startMapId >= 0 && endMapId < statuses.length)) {
+          val statusesPicked = statuses.slice(startMapId, endMapId).filter(_ != null)
+          statusesPicked.map { status =>
+            ExecutorCacheTaskLocation(status.location.host,
+              status.location.executorId).toString
+          }.toSeq
         } else {
           Nil
         }
@@ -767,6 +789,22 @@ private[spark] class MapOutputTrackerMaster(
     }
   }

+  override def getMapSizesByRangeMapIndex(
+      shuffleId: Int,
+      partitionIndex: Int,
+      startMapId: Int,
+      endMapId: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = {
+    shuffleStatuses.get(shuffleId) match {
+      case Some(shuffleStatus) =>
+        shuffleStatus.withMapStatuses { statuses =>
+          MapOutputTracker.convertMapStatuses(
+            shuffleId, partitionIndex, statuses, startMapId, endMapId)
+        }
+      case None =>
+        Iterator.empty
+    }
+  }
+
   override def stop(): Unit = {
     mapOutputRequests.offer(PoisonPill)
     threadpool.shutdown()
@@ -831,6 +869,22 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr
     }
   }

+  override def getMapSizesByRangeMapIndex(
+      shuffleId: Int,
+      partitionIndex: Int,
+      startMapId: Int,
+      endMapId: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = {
+    val statuses = getStatuses(shuffleId, conf)
+    try {
+      MapOutputTracker.convertMapStatuses(shuffleId, partitionIndex, statuses, startMapId, endMapId)
+    } catch {
+      case e: MetadataFetchFailedException =>
+        // We experienced a fetch failure so our mapStatuses cache is outdated; clear it:
+        mapStatuses.clear()
+        throw e
+    }
+  }
+
   /**
    * Get or fetch the array of MapStatuses for a given shuffle ID. NOTE: clients MUST synchronize
    * on this array when reading it, because on the driver, we may be changing it in place.
@@ -1013,4 +1067,50 @@ private[spark] object MapOutputTracker extends Logging {

     splitsByAddress.iterator
   }
+
+  /**
+   * Given an array of map statuses, a specific map output partition (partitionIndex) and a
+   * range of mappers (startMapId, endMapId), returns a sequence that, for each block manager
+   * ID, lists the shuffle block IDs and corresponding shuffle block sizes stored at that
+   * block manager.
+   * Note that empty blocks are filtered out of the result.
+   *
+   * If any of the statuses is null (indicating a missing location due to a failed mapper),
+   * throws a FetchFailedException.
+   *
+   * @param shuffleId Identifier for the shuffle
+   * @param partitionIndex The specific map output partition ID
+   * @param statuses List of map statuses, indexed by map partition index.
+   * @param startMapId Start map ID
+   * @param endMapId End map ID
+   * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId,
+   *         and the second item is a sequence of (shuffle block id, shuffle block size, map index)
+   *         tuples describing the shuffle blocks that are stored at that block manager.
+   */
+  def convertMapStatuses(
+      shuffleId: Int,
+      partitionIndex: Int,
+      statuses: Array[MapStatus],
+      startMapId: Int,
+      endMapId: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = {
+    assert (statuses != null)
+    val splitsByAddress = new HashMap[BlockManagerId, ListBuffer[(BlockId, Long, Int)]]
+    val iter = statuses.iterator.zipWithIndex
+    for ((status, mapIndex) <- iter.slice(startMapId, endMapId)) {
+      if (status == null) {
+        val errorMessage = s"Missing an output location for shuffle $shuffleId"
+        logError(errorMessage)
+        throw new MetadataFetchFailedException(shuffleId, partitionIndex, errorMessage)
+      } else {
+        val size = status.getSizeForBlock(partitionIndex)
+        if (size != 0) {
+          splitsByAddress.getOrElseUpdate(status.location, ListBuffer()) +=
+            ((ShuffleBlockId(shuffleId, status.mapId, partitionIndex), size, mapIndex))
+        }
+      }
+    }
+
+    splitsByAddress.iterator
+  }
+
 }
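
For intuition about the new `convertMapStatuses` overload above: it walks only the map-status slice `[startMapId, endMapId)`, skips empty blocks, and buckets the remaining blocks of the requested reduce partition by the block manager that holds them. Below is a minimal standalone sketch of that grouping, using simplified placeholder types (`Loc`, `Block`) instead of Spark's `BlockManagerId`/`MapStatus`; it illustrates the idea and is not Spark code.

```scala
import scala.collection.mutable.{HashMap, ListBuffer}

object ConvertMapStatusesSketch {
  // Simplified stand-ins for BlockManagerId and the (block id, size, map index) entries.
  case class Loc(host: String)
  case class Block(partition: Int, size: Long, mapIndex: Int)

  // blockSizes(mapIndex) = (location of that map output, size of this reduce partition's block)
  def groupByLocation(
      blockSizes: Array[(Loc, Long)],
      partitionIndex: Int,
      startMapId: Int,
      endMapId: Int): Iterator[(Loc, Seq[Block])] = {
    val splitsByAddress = new HashMap[Loc, ListBuffer[Block]]
    for ((entry, mapIndex) <- blockSizes.iterator.zipWithIndex.slice(startMapId, endMapId)) {
      val (loc, size) = entry
      if (size != 0) { // empty blocks are filtered out, as in convertMapStatuses
        splitsByAddress.getOrElseUpdate(loc, ListBuffer()) +=
          Block(partition = partitionIndex, size = size, mapIndex = mapIndex)
      }
    }
    splitsByAddress.iterator.map { case (loc, blocks) => (loc, blocks.toSeq) }
  }

  def main(args: Array[String]): Unit = {
    val sizes = Array((Loc("host-a"), 10L), (Loc("host-b"), 0L), (Loc("host-a"), 7L))
    // Read only map outputs 1 and 2 of reduce partition 3; the empty block on host-b is skipped.
    groupByLocation(sizes, partitionIndex = 3, startMapId = 1, endMapId = 3).foreach(println)
  }
}
```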

core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala

Lines changed: 12 additions & 0 deletions
@@ -66,6 +66,18 @@ private[spark] trait ShuffleManager
       context: TaskContext,
       metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C]

+  /**
+   * Get a reader for the specific partitionIndex of the map outputs that are
+   * produced by a range of mappers. Called on executors by reduce tasks.
+   */
+  def getReaderForRangeMapper[K, C](
+      handle: ShuffleHandle,
+      partitionIndex: Int,
+      startMapId: Int,
+      endMapId: Int,
+      context: TaskContext,
+      metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C]
+
   /**
    * Remove a shuffle's metadata from the ShuffleManager.
    * @return true if the metadata removed successfully, otherwise false.
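
To show how the `(startMapId, endMapId)` arguments of `getReaderForRangeMapper` would be chosen, here is a self-contained sketch that divides the map outputs feeding one skewed reduce partition into at most `maxSplits` contiguous half-open ranges, each of which would correspond to one reader call. The equal-width splitting shown is an illustrative assumption, not necessarily the exact policy of the `OptimizeSkewedPartitions` rule in this commit.

```scala
object MapRangeSplitSketch {
  /**
   * Splits the mapper indices [0, numMappers) into at most maxSplits contiguous
   * half-open ranges (startMapId, endMapId). Each range is a candidate unit of
   * work for one reduce task reading a slice of a skewed partition.
   */
  def splitMapperRange(numMappers: Int, maxSplits: Int): Seq[(Int, Int)] = {
    require(numMappers > 0 && maxSplits > 0)
    val numSplits = math.min(maxSplits, numMappers)
    val step = math.ceil(numMappers.toDouble / numSplits).toInt
    (0 until numMappers by step).map { start =>
      (start, math.min(start + step, numMappers))
    }
  }

  def main(args: Array[String]): Unit = {
    // 11 map outputs with maxSplits = 5 => ranges (0,3), (3,6), (6,9), (9,11)
    println(splitMapperRange(numMappers = 11, maxSplits = 5))
  }
}
```

With the default `spark.sql.adaptive.skewedPartitionMaxSplits = 5` introduced in SQLConf.scala below, a skewed partition written by 11 mappers would, under this splitting assumption, be read by four tasks.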

core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala

Lines changed: 14 additions & 0 deletions
@@ -145,6 +145,20 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager
       shouldBatchFetch = canUseBatchFetch(startPartition, endPartition, context))
   }

+  override def getReaderForRangeMapper[K, C](
+      handle: ShuffleHandle,
+      partitionIndex: Int,
+      startMapId: Int,
+      endMapId: Int,
+      context: TaskContext,
+      metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = {
+    val blocksByAddress = SparkEnv.get.mapOutputTracker.getMapSizesByRangeMapIndex(
+      handle.shuffleId, partitionIndex, startMapId, endMapId)
+    new BlockStoreShuffleReader(
+      handle.asInstanceOf[BaseShuffleHandle[K, _, C]], blocksByAddress, context, metrics,
+      shouldBatchFetch = canUseBatchFetch(partitionIndex, partitionIndex + 1, context))
+  }
+
   /** Get a writer for a given partition. Called on executors by map tasks. */
   override def getWriter[K, V](
       handle: ShuffleHandle,

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 37 additions & 0 deletions
@@ -402,6 +402,34 @@ object SQLConf {
     .booleanConf
     .createWithDefault(true)

+  val ADAPTIVE_EXECUTION_SKEWED_JOIN_ENABLED = buildConf("spark.sql.adaptive.skewedJoin.enabled")
+    .doc("When true and adaptive execution is enabled, a skewed join is automatically handled at " +
+      "runtime.")
+    .booleanConf
+    .createWithDefault(true)
+
+  val ADAPTIVE_EXECUTION_SKEWED_PARTITION_FACTOR =
+    buildConf("spark.sql.adaptive.skewedPartitionFactor")
+      .doc("A partition is considered as a skewed partition if its size is larger than " +
+        "this factor multiplying the median partition size and also larger than " +
+        "spark.sql.adaptive.skewedPartitionSizeThreshold.")
+      .intConf
+      .createWithDefault(10)
+
+  val ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD =
+    buildConf("spark.sql.adaptive.skewedPartitionSizeThreshold")
+      .doc("Configures the minimum size in bytes for a partition that is considered as a skewed " +
+        "partition in adaptive skewed join.")
+      .longConf
+      .createWithDefault(64 * 1024 * 1024L)
+
+  val ADAPTIVE_EXECUTION_SKEWED_PARTITION_MAX_SPLITS =
+    buildConf("spark.sql.adaptive.skewedPartitionMaxSplits")
+      .doc("Configures the maximum number of tasks to handle a skewed partition in adaptive " +
+        "skewed join.")
+      .intConf
+      .createWithDefault(5)
+
   val NON_EMPTY_PARTITION_RATIO_FOR_BROADCAST_JOIN =
     buildConf("spark.sql.adaptive.nonEmptyPartitionRatioForBroadcastJoin")
       .doc("The relation with a non-empty partition ratio lower than this config will not be " +
@@ -2178,6 +2206,15 @@ class SQLConf extends Serializable with Logging {
   def maxNumPostShufflePartitions: Int =
     getConf(SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS).getOrElse(numShufflePartitions)

+  def adaptiveSkewedJoinEnabled: Boolean = getConf(ADAPTIVE_EXECUTION_SKEWED_JOIN_ENABLED)
+
+  def adaptiveSkewedFactor: Int = getConf(ADAPTIVE_EXECUTION_SKEWED_PARTITION_FACTOR)
+
+  def adaptiveSkewedSizeThreshold: Long =
+    getConf(ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD)
+
+  def adaptiveSkewedMaxSplits: Int = getConf(ADAPTIVE_EXECUTION_SKEWED_PARTITION_MAX_SPLITS)
+
   def minBatchesToRetain: Int = getConf(MIN_BATCHES_TO_RETAIN)

   def maxBatchesToRetainInMemory: Int = getConf(MAX_BATCHES_TO_RETAIN_IN_MEMORY)
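
Read together, the docs of these three new configs give the skew criterion: a partition counts as skewed when its size exceeds both `skewedPartitionFactor` times the median partition size and `skewedPartitionSizeThreshold`, and such a partition is handled by at most `skewedPartitionMaxSplits` tasks. The sketch below simply encodes that reading of the documentation with the defaults from this commit; the helper names are made up, and this is not the optimizer rule's actual implementation.

```scala
object SkewDetectionSketch {
  /** Median of the per-partition shuffle sizes (simple exact median for the sketch). */
  def medianSize(sizes: Array[Long]): Long = {
    val sorted = sizes.sorted
    sorted(sorted.length / 2)
  }

  /** Skewed = larger than factor x median AND larger than the absolute threshold. */
  def isSkewed(size: Long, median: Long, factor: Int, threshold: Long): Boolean =
    size > median * factor && size > threshold

  def main(args: Array[String]): Unit = {
    // Defaults from this commit: factor = 10, threshold = 64MB, maxSplits = 5.
    val factor = 10
    val threshold = 64L * 1024 * 1024
    val maxSplits = 5

    val sizes = Array[Long](20, 25, 30, 28, 2048).map(_ * 1024 * 1024)
    val median = medianSize(sizes)

    sizes.zipWithIndex.foreach { case (size, i) =>
      if (isSkewed(size, median, factor, threshold)) {
        // A skewed partition would be read by up to maxSplits tasks, each covering
        // a contiguous range of map outputs (see getReaderForRangeMapper above).
        println(s"partition $i is skewed (${size / 1024 / 1024} MB), split into <= $maxSplits tasks")
      }
    }
  }
}
```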

sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala

Lines changed: 10 additions & 8 deletions
@@ -116,7 +116,8 @@ class CoalescedPartitioner(val parent: Partitioner, val partitionStartIndices: A
 class ShuffledRowRDD(
     var dependency: ShuffleDependency[Int, InternalRow, InternalRow],
     metrics: Map[String, SQLMetric],
-    specifiedPartitionStartIndices: Option[Array[Int]] = None)
+    specifiedPartitionStartIndices: Option[Array[Int]] = None,
+    specifiedPartitionEndIndices: Option[Array[Int]] = None)
   extends RDD[InternalRow](dependency.rdd.context, Nil) {

   if (SQLConf.get.fetchShuffleBlocksInBatchEnabled) {
@@ -134,23 +135,24 @@
     (0 until numPreShufflePartitions).toArray
   }

-  private[this] val part: Partitioner =
-    new CoalescedPartitioner(dependency.partitioner, partitionStartIndices)
-
   override def getDependencies: Seq[Dependency[_]] = List(dependency)

-  override val partitioner: Option[Partitioner] = Some(part)
+  override val partitioner: Option[Partitioner] = specifiedPartitionEndIndices match {
+    case Some(indices) => None
+    case None => Some(new CoalescedPartitioner(dependency.partitioner, partitionStartIndices))
+  }

   override def getPartitions: Array[Partition] = {
-    assert(partitionStartIndices.length == part.numPartitions)
     Array.tabulate[Partition](partitionStartIndices.length) { i =>
       val startIndex = partitionStartIndices(i)
-      val endIndex =
-        if (i < partitionStartIndices.length - 1) {
+      val endIndex = specifiedPartitionEndIndices match {
+        case Some(indices) => indices(i)
+        case None => if (i < partitionStartIndices.length - 1) {
           partitionStartIndices(i + 1)
         } else {
           numPreShufflePartitions
         }
+      }
       new ShuffledRowRDDPartition(i, startIndex, endIndex)
     }
   }
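
With the new optional `specifiedPartitionEndIndices`, a post-shuffle partition's range can end before the next start index, so the ranges no longer need to tile the whole pre-shuffle partition space (which is also why the coalesced partitioner is dropped in that case). Below is a small standalone sketch of the index arithmetic from `getPartitions`, using plain arrays outside of Spark; the object and method names are illustrative only.

```scala
object PartitionRangeSketch {
  /** Mirrors the (startIndex, endIndex) computation in ShuffledRowRDD.getPartitions. */
  def partitionRanges(
      numPreShufflePartitions: Int,
      startIndices: Array[Int],
      endIndices: Option[Array[Int]]): Array[(Int, Int)] = {
    Array.tabulate(startIndices.length) { i =>
      val start = startIndices(i)
      val end = endIndices match {
        case Some(ends) => ends(i) // explicit end: a range may stop short of the next start
        case None =>
          if (i < startIndices.length - 1) startIndices(i + 1)
          else numPreShufflePartitions // last range runs to the end
      }
      (start, end)
    }
  }

  def main(args: Array[String]): Unit = {
    // Without end indices, ranges tile [0, 10): (0,4), (4,7), (7,10)
    println(partitionRanges(10, Array(0, 4, 7), None).toSeq)
    // With end indices, e.g. a skewed partition 4 can be left out: (0,4), (5,7), (7,10)
    println(partitionRanges(10, Array(0, 5, 7), Some(Array(4, 7, 10))).toSeq)
  }
}
```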

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala

Lines changed: 4 additions & 0 deletions
@@ -99,6 +99,10 @@ case class AdaptiveSparkPlanExec(
     // This rule must be executed before `ReduceNumShufflePartitions`, as local shuffle readers
     // can't change number of partitions.
     OptimizeLocalShuffleReader(conf),
+    // The 'OptimizeSkewedPartitions' rule must be executed before 'ReduceNumShufflePartitions',
+    // as the skewed partitions handled by 'OptimizeSkewedPartitions' should be omitted in
+    // 'ReduceNumShufflePartitions'.
+    OptimizeSkewedPartitions(conf),
     ReduceNumShufflePartitions(conf),
     ApplyColumnarRulesAndInsertTransitions(session.sessionState.conf,
       session.sessionState.columnarRules),

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LocalShuffledRowRDD.scala

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ class LocalShuffledRowRDD(

   override def getPreferredLocations(partition: Partition): Seq[String] = {
     val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
-    tracker.getMapLocation(dependency, partition.index)
+    tracker.getMapLocation(dependency, partition.index, partition.index + 1)
   }

   override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = {
