
Commit 1c3b020

Hybrid scan operator for leveraging index alongside newly appended data - BucketUnion (#151)
1 parent c71d956 commit 1c3b020

4 files changed: +343 −0 lines changed
Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
/*
 * Copyright (2020) The Hyperspace Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.hyperspace.index.execution

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.{PartitionerAwareUnionRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning}
import org.apache.spark.sql.execution.SparkPlan

import com.microsoft.hyperspace.index.plans.logical.BucketUnion

/**
 * [[BucketUnionRDD]] is required for the hybrid scan operation, which merges index data and
 * appended data without re-shuffling the index data. Spark does not support a Union that
 * retains the output partition specification (i.e., using PartitionSpecification). The default
 * operation, [[PartitionerAwareUnionRDD]], does not retain the outputPartitioning of its
 * result, i.e., even if both sides are bucketed in a compatible way, it will cause a shuffle.
 *
 * To avoid these issues, we define a new BucketUnion operation that avoids a shuffle when
 * the following conditions are satisfied:
 *   - input RDDs must have the same number of partitions.
 *   - input RDDs must have the same partitioning keys.
 *   - input RDDs must have the same column schema.
 *
 * Unfortunately, since there is no explicit API to check the partitioning keys of an RDD, we
 * have to assert the partitioning keys on the caller side. Therefore, [[BucketUnionRDD]] is
 * for Hyperspace internal use only.
 *
 * You can find more detailed information about bucketing optimization in:
 * ''Bucketing 2.0: Improve Spark SQL Performance by Removing Shuffle''
 * Video: [[https://youtu.be/7cvaH33S7uc ]]
 */
private[hyperspace] class BucketUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]],
    bucketSpec: BucketSpec)
  extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.nonEmpty)
  require(rdds.forall(_.getNumPartitions == bucketSpec.numBuckets))

  // copy from org.apache.spark.rdd.PartitionerAwareUnionRDD
  override def getPartitions: Array[Partition] = {
    val numBuckets = bucketSpec.numBuckets
    (0 until numBuckets).map { index =>
      new BucketUnionRDDPartition(rdds, index)
    }.toArray
  }

  // copy from org.apache.spark.rdd.PartitionerAwareUnionRDD
  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[BucketUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  // copy from org.apache.spark.rdd.PartitionerAwareUnionRDD
  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
}

/**
 * [[BucketUnionRDDPartition]] keeps the parent partitions for each partition index.
 * @param rdds Input RDDs.
 * @param index Partition index.
 */
private[hyperspace] class BucketUnionRDDPartition(
    @transient val rdds: Seq[RDD[_]],
    override val index: Int)
  extends Partition {
  val parents: Array[Partition] = rdds.map(_.partitions(index)).toArray

  override def hashCode(): Int = index
  override def equals(other: Any): Boolean = super.equals(other)
}

/**
 * [[BucketUnionExec]] is the Spark plan for [[BucketUnion]].
 *
 * @param children Child plans.
 * @param bucketSpec Bucket specification.
 */
private[hyperspace] case class BucketUnionExec(children: Seq[SparkPlan], bucketSpec: BucketSpec)
  extends SparkPlan {
  override protected def doExecute(): RDD[InternalRow] = {
    new BucketUnionRDD[InternalRow](sparkContext, children.map(_.execute()), bucketSpec)
  }

  override def output: Seq[Attribute] = children.head.output

  override def outputPartitioning: Partitioning = {
    assert(children.map(_.outputPartitioning).toSet.size == 1)
    assert(children.head.outputPartitioning.isInstanceOf[HashPartitioning])
    children.head.outputPartitioning
  }
}
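
For context, a minimal sketch (not part of this commit) of driving `BucketUnionRDD` directly: both inputs are hash-partitioned on the same key into the same number of partitions, which is the pre-condition the `require` checks enforce. Since the class is `private[hyperspace]`, code like this only compiles inside the `com.microsoft.hyperspace` package (as the test suite later in this commit does); the DataFrame names are illustrative only.

```scala
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.catalog.BucketSpec

import com.microsoft.hyperspace.index.execution.BucketUnionRDD

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// "Index" side and "appended" side, both hash-partitioned on "id" into 8 partitions.
val indexSide = Seq((1, "a"), (2, "b")).toDF("id", "name").repartition(8, $"id")
val appendedSide = Seq((1, "c"), (3, "d")).toDF("id", "name").repartition(8, $"id")

// The caller guarantees matching partition counts and partitioning keys;
// BucketUnionRDD itself only re-checks the partition count against numBuckets.
val bucketSpec = BucketSpec(8, Seq("id"), Seq())
val unioned =
  new BucketUnionRDD[Row](spark.sparkContext, Seq(indexSide.rdd, appendedSide.rdd), bucketSpec)

// Rows with the same "id" from both sides land in the same output partition.
assert(unioned.getNumPartitions == 8)
```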
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
/*
 * Copyright (2020) The Hyperspace Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.hyperspace.index.execution

import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.{SparkPlan, SparkStrategy}

import com.microsoft.hyperspace.index.plans.logical.BucketUnion

/**
 * [[BucketUnionStrategy]] is SparkStrategy for converting [[BucketUnion]] (Logical Plan)
 * to [[BucketUnionExec]] (Spark Plan)
 */
private[hyperspace] object BucketUnionStrategy extends SparkStrategy {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case p: BucketUnion =>
      BucketUnionExec(p.children.map(planLater), p.bucketSpec) :: Nil
    case _ => Nil
  }
}
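
As a usage note: this commit does not show where the strategy is registered. A custom SparkStrategy like this is typically made visible to the planner through Spark's `experimental.extraStrategies` hook, roughly as sketched below; this is an assumption for illustration, since Hyperspace may wire it up elsewhere, and the object is `private[hyperspace]` so the snippet only compiles inside the Hyperspace package.

```scala
import org.apache.spark.sql.SparkSession

import com.microsoft.hyperspace.index.execution.BucketUnionStrategy

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Once registered, any BucketUnion node in a logical plan is planned as BucketUnionExec.
spark.experimental.extraStrategies =
  spark.experimental.extraStrategies :+ BucketUnionStrategy
```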
Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
/*
 * Copyright (2020) The Hyperspace Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.hyperspace.index.plans.logical

import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

/**
 * [[BucketUnion]] is the logical plan for a bucket-aware Union operation, which retains the
 * outputPartitioning of the result RDDs so as to avoid an unnecessary shuffle after the
 * per-bucket Union operation.
 *
 * @param children Child plans.
 * @param bucketSpec Bucket specification.
 */
private[hyperspace] case class BucketUnion(children: Seq[LogicalPlan], bucketSpec: BucketSpec)
  extends LogicalPlan {
  require(resolved)
  override def output: Seq[Attribute] = children.head.output

  // copy from org.apache.spark.sql.catalyst.plans.logical.Union
  override def maxRows: Option[Long] = {
    if (children.exists(_.maxRows.isEmpty)) {
      None
    } else {
      Some(children.flatMap(_.maxRows).sum)
    }
  }

  // copy from org.apache.spark.sql.catalyst.plans.logical.Union
  override def maxRowsPerPartition: Option[Long] = {
    if (children.exists(_.maxRowsPerPartition.isEmpty)) {
      None
    } else {
      Some(children.flatMap(_.maxRowsPerPartition).sum)
    }
  }

  // copy from org.apache.spark.sql.catalyst.plans.logical.Union
  override lazy val resolved: Boolean = {
    // allChildrenCompatible needs to be evaluated after childrenResolved
    def allChildrenCompatible: Boolean =
      children.tail.forall(
        child =>
          // compare the attribute number with the first child
          child.output.length == children.head.output.length &&
            // compare the data types with the first child
            child.output.zip(children.head.output).forall {
              case (l, r) => l.dataType.equals(r.dataType)
            })
    children.length > 1 && childrenResolved && allChildrenCompatible
  }
}
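
Tying the three pieces together, a minimal end-to-end sketch (hypothetical, and simplified relative to how Hyperspace's hybrid scan rule actually builds the plan) wraps two identically bucketed, schema-compatible children in a BucketUnion node and lets the registered strategy plan it as BucketUnionExec:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.catalog.BucketSpec

import com.microsoft.hyperspace.index.execution.BucketUnionStrategy
import com.microsoft.hyperspace.index.plans.logical.BucketUnion

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

spark.experimental.extraStrategies =
  spark.experimental.extraStrategies :+ BucketUnionStrategy

// Both children share HashPartitioning(4, id) and the same schema,
// so BucketUnion.resolved holds and the require() succeeds.
val indexData = Seq((1, "a"), (2, "b")).toDF("id", "name").repartition(4, $"id")
val appended = Seq((3, "c")).toDF("id", "name").repartition(4, $"id")

val union = BucketUnion(
  Seq(indexData.queryExecution.optimizedPlan, appended.queryExecution.optimizedPlan),
  BucketSpec(4, Seq("id"), Seq()))

// BucketUnionStrategy turns the logical BucketUnion into BucketUnionExec, which reports
// the children's HashPartitioning as its own instead of forcing a new shuffle.
val physical = spark.sessionState.executePlan(union).sparkPlan
```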
Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
/*
 * Copyright (2020) The Hyperspace Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.hyperspace.index

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.catalog.BucketSpec

import com.microsoft.hyperspace.SparkInvolvedSuite
import com.microsoft.hyperspace.index.execution.{BucketUnionExec, BucketUnionRDD, BucketUnionRDDPartition, BucketUnionStrategy}
import com.microsoft.hyperspace.index.plans.logical.BucketUnion

class BucketUnionTest extends SparkFunSuite with SparkInvolvedSuite {

  test("BucketUnion test for operator pre-requisites") {
    import spark.implicits._
    val df1 = Seq((1, "name1"), (2, "name2")).toDF("id", "name")
    val df2 = Seq((1, "name1"), (2, "name2")).toDF("id", "name")
    val df3 = Seq(("name1", 1), ("name2", 2)).toDF("name", "id")
    val df4 = Seq((1, "name1", 20), (2, "name2", 10)).toDF("id", "name", "age")

    // different column schema
    intercept[IllegalArgumentException] {
      BucketUnion(
        Seq(df1.queryExecution.optimizedPlan, df4.queryExecution.optimizedPlan),
        BucketSpec(1, Seq(), Seq()))
    }

    // different order of columns
    intercept[IllegalArgumentException] {
      BucketUnion(
        Seq(df1.queryExecution.optimizedPlan, df3.queryExecution.optimizedPlan),
        BucketSpec(1, Seq(), Seq()))
    }

    BucketUnion(
      Seq(df1.queryExecution.optimizedPlan, df2.queryExecution.optimizedPlan),
      BucketSpec(1, Seq(), Seq()))
  }

  test("BucketUnionStrategy test if strategy introduces BucketUnionExec in the Spark Plan") {
    import spark.implicits._
    val df1 = Seq((1, "name1"), (2, "name2")).toDF("id", "name")
    val df2 = Seq((1, "name1"), (2, "name2")).toDF("id", "name")
    val bucket = BucketUnion(
      Seq(df1.queryExecution.optimizedPlan, df2.queryExecution.optimizedPlan),
      BucketSpec(1, Seq(), Seq()))

    assert(BucketUnionStrategy(bucket).collect {
      case BucketUnionExec(_, _) => true
    }.length == 1)

    assert(BucketUnionStrategy(df1.queryExecution.optimizedPlan).collect {
      case BucketUnionExec(_, _) => true
    }.isEmpty)
  }

  test("BucketUnionExec test that partition count matches on both sides") {
    import spark.implicits._
    val df1 = Seq((1, "name1"), (2, "name2")).toDF("id", "name")
    val p1 = df1.repartition(10)
    val df2 = Seq((1, "name1"), (2, "name2")).toDF("id", "name")
    val p2_1 = df2.repartition(9)
    val p2_2 = df2.repartition(10)

    // different number of partitions
    intercept[AssertionError] {
      val bucket = BucketUnion(
        Seq(p1.queryExecution.optimizedPlan, p2_1.queryExecution.optimizedPlan),
        BucketSpec(10, Seq(), Seq()))
      spark.sessionState.executePlan(bucket).sparkPlan
    }

    val bucket = BucketUnion(
      Seq(p1.queryExecution.optimizedPlan, p2_2.queryExecution.optimizedPlan),
      BucketSpec(10, Seq(), Seq()))

    assert(BucketUnionStrategy(bucket).collect {
      case p: BucketUnionExec =>
        assert(p.bucketSpec.numBuckets == 10)
        assert(p.children.length == 2)
        assert(p.output.length == p1.schema.fields.length)
        true
    }.length == 1)
  }

  test("BucketUnionRDD test that partition columns with same value fall in the same partition") {
    import spark.implicits._
    val df1 = Seq((2, "name1"), (3, "name2")).toDF("id", "name")
    val p1 = df1.repartition(10, $"id")
    val df2 = Seq((2, "name3"), (3, "name4")).toDF("id", "name")
    val p2 = df2.repartition(10, $"id")
    val bucketSpec = BucketSpec(10, Seq("id"), Seq())

    val rdd = new BucketUnionRDD[Row](spark.sparkContext, Seq(p1.rdd, p2.rdd), bucketSpec)
    assert(
      rdd.collect.sortBy(r => (r.getInt(0), r.getString(1))).map(r => r.toSeq.toList).toList
        == Seq(Seq(2, "name1"), Seq(2, "name3"), Seq(3, "name2"), Seq(3, "name4")))
    assert(rdd.getPartitions.length == 10)
    assert(rdd.partitions.head.isInstanceOf[BucketUnionRDDPartition])

    val partitionSum: Seq[Int] = rdd
      .mapPartitions(it => Iterator.single(it.map(r => r.getInt(0)).sum))
      .collect()
      .toSeq

    // Check that all partitioned keys with the same value fall in the same partition.
    assert(partitionSum.equals(Seq(0, 6, 0, 0, 4, 0, 0, 0, 0, 0)))
  }
}

0 commit comments
