Commit 2c4925b

Address comments
1 parent 7481e36 commit 2c4925b

File tree

3 files changed: +21 -23 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 13 additions & 8 deletions
@@ -2660,13 +2660,18 @@ object SQLConf {
   }
 
   val BUCKET_READ_STRATEGY_IN_JOIN =
-    buildConf("spark.sql.bucketing.bucketReadStrategyInJoin")
-      .doc("When set to COALESCE, if two bucketed tables with the different number of buckets " +
-        "are joined, the side with a bigger number of buckets will be coalesced to have the same " +
-        "number of buckets as the other side. When set to REPARTITION, the side with a bigger " +
-        "number of buckets will be repartitioned to have the same number of buckets as the other " +
-        "side. The bigger number of buckets must be divisible by the smaller number of buckets, " +
-        "and the strategy is applied to sort-merge joins and shuffled hash joins. " +
+    buildConf("spark.sql.sources.bucketing.readStrategyInJoin")
+      .doc("The bucket read strategy can be set to one of " +
+        BucketReadStrategyInJoin.values.mkString(", ") +
+        s". When set to ${BucketReadStrategyInJoin.COALESCE}, if two bucketed tables with " +
+        "different number of buckets are joined, the side with a bigger number of buckets will " +
+        "be coalesced to have the same number of buckets as the smaller side. When set to " +
+        s"${BucketReadStrategyInJoin.REPARTITION}, the side with a smaller number of buckets " +
+        "will be repartitioned to have the same number of buckets as the bigger side. For either " +
+        "coalescing or repartitioning to be applied, the bigger number of buckets must be " +
+        "divisible by the smaller number of buckets, and the strategy is applied to sort-merge " +
+        s"joins and shuffled hash joins. By default, the read strategy is set to " +
+        s"${BucketReadStrategyInJoin.OFF}, and neither coalescing nor repartitioning is applied. " +
         "Note: Coalescing bucketed table can avoid unnecessary shuffle in join, but it also " +
         "reduces parallelism and could possibly cause OOM for shuffled hash join. Repartitioning " +
         "bucketed table avoids unnecessary shuffle in join while maintaining the parallelism " +
@@ -2678,7 +2683,7 @@ object SQLConf {
       .createWithDefault(BucketReadStrategyInJoin.OFF.toString)
 
   val BUCKET_READ_STRATEGY_IN_JOIN_MAX_BUCKET_RATIO =
-    buildConf("spark.sql.bucketing.bucketReadStrategyInJoin.maxBucketRatio")
+    buildConf("spark.sql.sources.bucketing.readStrategyInJoin.maxBucketRatio")
       .doc("The ratio of the number of two buckets being coalesced/repartitioned should be " +
         "less than or equal to this value for bucket coalescing/repartitioning to be applied. " +
         s"This configuration only has an effect when '${BUCKET_READ_STRATEGY_IN_JOIN.key}' " +

sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala

Lines changed: 6 additions & 3 deletions
@@ -180,8 +180,7 @@ case class FileSourceScanExec(
   }
 
   @transient private lazy val isRepartitioningBuckets: Boolean = {
-    relation.bucketSpec.isDefined &&
-      optionalNewNumBuckets.isDefined &&
+    bucketedScan && optionalNewNumBuckets.isDefined &&
       optionalNewNumBuckets.get > relation.bucketSpec.get.numBuckets
   }
 
@@ -593,7 +592,11 @@ case class FileSourceScanExec(
       driverMetrics("numFiles") = filesNum
       driverMetrics("filesSize") = filesSize
       new BucketRepartitioningRDD(
-        fsRelation.sparkSession, readFile, filePartitions, bucketSpec, newNumBuckets, output)
+        fsRelation.sparkSession,
+        readFile,
+        filePartitions,
+        outputPartitioning.asInstanceOf[HashPartitioning].partitionIdExpression,
+        output)
     }
   }
 }
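
The scan now hands BucketRepartitioningRDD a ready-made bucket-id expression taken from its HashPartitioning output partitioning, instead of a BucketSpec plus a target bucket count. A small sketch of what that expression is; the helper name is ours, but HashPartitioning.partitionIdExpression is the same expression the removed code in the next file built by hand:

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning

// partitionIdExpression evaluates to pmod(murmur3hash(bucketColumns), numBuckets),
// i.e. the same mapping Spark uses to assign rows to buckets when writing.
def bucketIdExpressionFor(bucketColumns: Seq[Attribute], numBuckets: Int): Expression =
  HashPartitioning(bucketColumns, numBuckets).partitionIdExpression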

sql/core/src/main/scala/org/apache/spark/sql/execution/bucketing/BucketRepartitioningRDD.scala

Lines changed: 2 additions & 12 deletions
@@ -22,9 +22,7 @@ import scala.collection.JavaConverters._
 import org.apache.spark.{Partition, TaskContext}
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.catalog.BucketSpec
-import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
-import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, UnsafeProjection}
 import org.apache.spark.sql.execution.datasources.{FilePartition, FileScanRDD, PartitionedFile}
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
@@ -35,12 +33,9 @@ private[spark] class BucketRepartitioningRDD(
     @transient private val sparkSession: SparkSession,
     readFunction: PartitionedFile => Iterator[InternalRow],
     @transient override val filePartitions: Seq[FilePartition],
-    bucketSpec: BucketSpec,
-    numRepartitionedBuckets: Int,
+    bucketIdExpression: Expression,
     output: Seq[Attribute])
   extends FileScanRDD(sparkSession, readFunction, filePartitions) {
-  assert(numRepartitionedBuckets > bucketSpec.numBuckets)
-  assert(numRepartitionedBuckets % bucketSpec.numBuckets == 0)
 
   override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = {
     val iter: Iterator[_] = super.compute(split, context)
@@ -51,11 +46,6 @@ private[spark] class BucketRepartitioningRDD(
   }
 
   private lazy val getBucketId: InternalRow => Int = {
-    val bucketIdExpression = {
-      val bucketColumns = bucketSpec.bucketColumnNames.map(c => output.find(_.name == c).get)
-      HashPartitioning(bucketColumns, numRepartitionedBuckets).partitionIdExpression
-    }
-
     val projection = UnsafeProjection.create(Seq(bucketIdExpression), output)
     row => projection(row).getInt(0)
   }
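
With the BucketSpec plumbing removed, getBucketId reduces to projecting the caller-supplied expression over each row. A self-contained sketch of that pattern; the standalone helper name is ours, not part of the commit:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, UnsafeProjection}

def makeGetBucketId(bucketIdExpression: Expression, output: Seq[Attribute]): InternalRow => Int = {
  // Evaluate the single bucket-id expression against rows with the given output schema.
  val projection = UnsafeProjection.create(Seq(bucketIdExpression), output)
  row => projection(row).getInt(0)
}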
