
Commit b7e70af

Modify getCandidateIndex for hybrid scan (#153)

1 parent 1c3b020 commit b7e70af

File tree

4 files changed: +154 -16 lines


src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala

Lines changed: 30 additions & 8 deletions
@@ -41,14 +41,25 @@ case class Content(root: Directory, fingerprint: NoOpFingerprint = NoOpFingerprint())
   @JsonIgnore
   lazy val files: Seq[Path] = {
     // Recursively find files from directory tree.
-    def rec(prefixPath: Path, directory: Directory): Seq[Path] = {
-      val files = directory.files.map(f => new Path(prefixPath, f.name))
-      files ++ directory.subDirs.flatMap { dir =>
-        rec(new Path(prefixPath, dir.name), dir)
-      }
-    }
+    rec(new Path(root.name), root, (f, prefix) => new Path(prefix, f.name))
+  }
 
-    rec(new Path(root.name), root)
+  @JsonIgnore
+  lazy val fileInfos: Set[FileInfo] = {
+    rec(
+      new Path(root.name),
+      root,
+      (f, prefix) => FileInfo(new Path(prefix, f.name).toString, f.size, f.modifiedTime)).toSet
+  }
+
+  private def rec[T](
+      prefixPath: Path,
+      directory: Directory,
+      func: (FileInfo, Path) => T): Seq[T] = {
+    val files = directory.files.map(f => func(f, prefixPath))
+    files ++ directory.subDirs.flatMap { dir =>
+      rec(new Path(prefixPath, dir.name), dir, func)
+    }
   }
 }

@@ -295,7 +306,18 @@ case class IndexLogEntry(
 
   def created: Boolean = state.equals(Constants.States.ACTIVE)
 
-  def relations: Seq[Relation] = source.plan.properties.relations
+  def relations: Seq[Relation] = {
+    // Only one relation is currently supported.
+    assert(source.plan.properties.relations.size == 1)
+    source.plan.properties.relations
+  }
+
+  @JsonIgnore
+  lazy val allSourceFileInfos: Set[FileInfo] = {
+    relations
+      .flatMap(_.data.properties.content.fileInfos)
+      .toSet
+  }
 
   override def equals(o: Any): Boolean = o match {
     case that: IndexLogEntry =>
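Aside from the diff above: the Content change collapses two nearly identical recursive traversals into a single generic rec[T] that takes a mapper function. A minimal, self-contained sketch of the same pattern follows; Leaf and Dir are simplified stand-ins for Hyperspace's FileInfo and Directory, and plain strings stand in for Hadoop Paths.

// Sketch only: simplified stand-ins, not Hyperspace's actual types.
case class Leaf(name: String, size: Long, modifiedTime: Long)
case class Dir(name: String, files: Seq[Leaf], subDirs: Seq[Dir])

// One generic traversal; the caller decides what to build per file.
def rec[T](prefix: String, dir: Dir, func: (Leaf, String) => T): Seq[T] = {
  val files = dir.files.map(f => func(f, prefix))
  files ++ dir.subDirs.flatMap(d => rec(s"$prefix/${d.name}", d, func))
}

val root = Dir("data",
  Seq(Leaf("f1.parquet", 10, 100)),
  Seq(Dir("sub", Seq(Leaf("f2.parquet", 20, 200)), Nil)))

// Mirrors `files`: build a path per file.
val paths = rec(root.name, root, (f, p) => s"$p/${f.name}")
// Mirrors `fileInfos`: build (path, size, modifiedTime) and deduplicate into a Set.
val infos = rec(root.name, root, (f, p) => (s"$p/${f.name}", f.size, f.modifiedTime)).toSet

With this shape, files and fileInfos differ only in the mapper they pass, so the traversal logic lives in one place.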

src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala

Lines changed: 38 additions & 4 deletions
@@ -19,10 +19,10 @@ package com.microsoft.hyperspace.index.rules
 import scala.collection.mutable
 
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
-import org.apache.spark.sql.execution.datasources.LogicalRelation
+import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitioningAwareFileIndex}
 
 import com.microsoft.hyperspace.actions.Constants
-import com.microsoft.hyperspace.index.{IndexLogEntry, IndexManager, LogicalPlanSignatureProvider}
+import com.microsoft.hyperspace.index.{FileInfo, IndexLogEntry, IndexManager, LogicalPlanSignatureProvider, PlanSignatureProvider}
 
 object RuleUtils {
 
@@ -31,9 +31,13 @@ object RuleUtils {
    *
    * @param indexManager indexManager
    * @param plan logical plan
+   * @param hybridScanEnabled Flag that indicates whether hybrid scan is enabled.
    * @return indexes built for this plan
    */
-  def getCandidateIndexes(indexManager: IndexManager, plan: LogicalPlan): Seq[IndexLogEntry] = {
+  def getCandidateIndexes(
+      indexManager: IndexManager,
+      plan: LogicalPlan,
+      hybridScanEnabled: Boolean = false): Seq[IndexLogEntry] = {
     // Map of a signature provider to a signature generated for the given plan.
     val signatureMap = mutable.Map[String, Option[String]]()
 
@@ -51,11 +55,41 @@ object RuleUtils {
       }
     }
 
+    def isHybridScanCandidate(entry: IndexLogEntry, filesByRelations: Seq[FileInfo]): Boolean = {
+      // TODO: Some threshold about the similarity of source data files - number of common files
+      //  or total size of common files.
+      //  See https://github.com/microsoft/hyperspace/issues/159
+      // TODO: As in [[PlanSignatureProvider]], source plan signature comparison is required to
+      //  support arbitrary source plans at index creation.
+      //  See https://github.com/microsoft/hyperspace/issues/158
+
+      // Find a common file between the input relation & index source files.
+      // Without the threshold described above, the exists & contains functions suffice here.
+      filesByRelations.exists(entry.allSourceFileInfos.contains)
+    }
+
     // TODO: the following check only considers indexes in ACTIVE state for usage. Update
     //  the code to support indexes in transitioning states as well.
+    //  See https://github.com/microsoft/hyperspace/issues/65
    val allIndexes = indexManager.getIndexes(Seq(Constants.States.ACTIVE))
 
-    allIndexes.filter(index => index.created && signatureValid(index))
+    if (hybridScanEnabled) {
+      val filesByRelations = plan
+        .collect {
+          case LogicalRelation(
+              HadoopFsRelation(location: PartitioningAwareFileIndex, _, _, _, _, _),
+              _,
+              _,
+              _) =>
+            location.allFiles.map(f =>
+              FileInfo(f.getPath.toString, f.getLen, f.getModificationTime))
+        }
+      assert(filesByRelations.length == 1)
+      allIndexes.filter(index =>
+        index.created && isHybridScanCandidate(index, filesByRelations.flatten))
+    } else {
+      allIndexes.filter(index => index.created && signatureValid(index))
+    }
   }
 
   /**
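With hybridScanEnabled set, candidate selection no longer requires an exact plan-signature match: an index remains a candidate as long as the scanned relation still shares at least one file with the index's source data. A small self-contained sketch of that membership test follows; this FileInfo is a simplified stand-in for com.microsoft.hyperspace.index.FileInfo, and the paths, sizes, and timestamps are made up.

// Stand-in for Hyperspace's FileInfo; case-class equality covers path, size, and modifiedTime.
case class FileInfo(path: String, size: Long, modifiedTime: Long)

// Files recorded for the index's source data at creation time.
val indexSourceFiles = Set(
  FileInfo("/data/f1.parquet", 10, 100),
  FileInfo("/data/f2.parquet", 20, 100))

// Files the relation reports now: f2 was deleted and f3 appended.
val relationFiles = Seq(
  FileInfo("/data/f1.parquet", 10, 100),
  FileInfo("/data/f3.parquet", 30, 200))

// As in isHybridScanCandidate: one common file keeps the index as a candidate.
val stillCandidate = relationFiles.exists(indexSourceFiles.contains)  // true

// After an overwrite nothing is shared, so the index is filtered out.
val afterOverwrite = Seq(FileInfo("/data/f9.parquet", 40, 300))
val candidateAfterOverwrite = afterOverwrite.exists(indexSourceFiles.contains)  // false

Because FileInfo equality includes size and modifiedTime, a file modified in place no longer matches its recorded entry; the TODO above tracks replacing the bare exists check with a similarity threshold (issue #159).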

src/test/scala/com/microsoft/hyperspace/index/rules/HyperspaceRuleTestSuite.scala

Lines changed: 8 additions & 1 deletion
@@ -26,6 +26,7 @@ import org.apache.spark.sql.types.{StructField, StructType}
 import com.microsoft.hyperspace.HyperspaceException
 import com.microsoft.hyperspace.actions.Constants
 import com.microsoft.hyperspace.index._
+import com.microsoft.hyperspace.index.Hdfs.Properties
 
 trait HyperspaceRuleTestSuite extends HyperspaceSuite {
   private val filenames = Seq("f1.parquet", "f2.parquet")

@@ -39,7 +40,13 @@ trait HyperspaceRuleTestSuite extends HyperspaceSuite {
     LogicalPlanSignatureProvider.create(signClass).signature(plan) match {
       case Some(s) =>
         val sourcePlanProperties = SparkPlan.Properties(
-          Seq(),
+          Seq(
+            Relation(
+              Seq("dummy"),
+              Hdfs(Properties(Content(Directory("/")))),
+              "schema",
+              "format",
+              Map())),
           null,
           null,
           LogicalPlanFingerprint(LogicalPlanFingerprint.Properties(Seq(Signature(signClass, s)))))
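One note on the dummy Relation above: IndexLogEntry.relations now asserts exactly one source relation, so test fixtures built with an empty Seq() would trip that assertion before reaching the code under test. A toy illustration of the invariant follows; Entry is a made-up stand-in, not a Hyperspace type.

// Made-up stand-in mirroring the new assert in IndexLogEntry.relations.
case class Entry(relations: Seq[String]) {
  def onlyRelation: String = {
    assert(relations.size == 1, "Only one relation is currently supported.")
    relations.head
  }
}

Entry(Seq("dummy")).onlyRelation  // ok: returns "dummy"
// Entry(Seq.empty).onlyRelation  // would fail the assertion, as the old empty fixture would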

src/test/scala/com/microsoft/hyperspace/index/rules/RuleUtilsTest.scala

Lines changed: 78 additions & 3 deletions
@@ -16,14 +16,15 @@
 
 package com.microsoft.hyperspace.index.rules
 
-import org.apache.hadoop.fs.Path
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileUtil, Path}
 import org.apache.spark.sql.catalyst.expressions.{AttributeReference, IsNotNull}
 import org.apache.spark.sql.catalyst.plans.JoinType
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, Join, LogicalPlan, Project}
-import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, InMemoryFileIndex, LogicalRelation, NoopCache}
+import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, InMemoryFileIndex, LogicalRelation, NoopCache, PartitioningAwareFileIndex}
 import org.apache.spark.sql.types.{IntegerType, StringType}
 
-import com.microsoft.hyperspace.index.IndexCollectionManager
+import com.microsoft.hyperspace.index.{IndexCollectionManager, IndexConfig}
 import com.microsoft.hyperspace.util.PathUtils
 
 class RuleUtilsTest extends HyperspaceRuleTestSuite {

@@ -110,6 +111,80 @@ class RuleUtilsTest extends HyperspaceRuleTestSuite {
     assert(r.isEmpty)
   }
 
+  test("Verify getCandidateIndex for hybrid scan") {
+    val indexManager = IndexCollectionManager(spark)
+    val df = spark.range(1, 5).toDF("id")
+    val dataPath = systemPath.toString + "/hbtable"
+    df.write.parquet(dataPath)
+
+    withIndex("index1") {
+      val readDf = spark.read.parquet(dataPath)
+      indexManager.create(readDf, IndexConfig("index1", Seq("id")))
+
+      def verify(
+          plan: LogicalPlan,
+          hybridScanEnabled: Boolean,
+          expectCandidateIndex: Boolean): Unit = {
+        val indexes = RuleUtils
+          .getCandidateIndexes(indexManager, plan, hybridScanEnabled)
+        if (expectCandidateIndex) {
+          assert(indexes.length == 1)
+          assert(indexes.head.name == "index1")
+        } else {
+          assert(indexes.isEmpty)
+        }
+      }
+
+      // Verify that a candidate index is returned with the unmodified data files whether
+      // hybrid scan is enabled or not.
+      {
+        val optimizedPlan = spark.read.parquet(dataPath).queryExecution.optimizedPlan
+        verify(optimizedPlan, hybridScanEnabled = false, expectCandidateIndex = true)
+        verify(optimizedPlan, hybridScanEnabled = true, expectCandidateIndex = true)
+      }
+
+      // Scenario #1: Append new files.
+      df.write.mode("append").parquet(dataPath)
+
+      {
+        val optimizedPlan = spark.read.parquet(dataPath).queryExecution.optimizedPlan
+        verify(optimizedPlan, hybridScanEnabled = false, expectCandidateIndex = false)
+        verify(optimizedPlan, hybridScanEnabled = true, expectCandidateIndex = true)
+      }
+
+      // Scenario #2: Delete 1 file.
+      {
+        val readDf = spark.read.parquet(dataPath)
+        readDf.queryExecution.optimizedPlan foreach {
+          case LogicalRelation(
+              HadoopFsRelation(location: PartitioningAwareFileIndex, _, _, _, _, _),
+              _,
+              _,
+              _) =>
+            systemPath
+              .getFileSystem(new Configuration)
+              .delete(location.allFiles.head.getPath, false)
+          case _ =>
+        }
+      }
+
+      {
+        val optimizedPlan = spark.read.parquet(dataPath).queryExecution.optimizedPlan
+        verify(optimizedPlan, hybridScanEnabled = false, expectCandidateIndex = false)
+        verify(optimizedPlan, hybridScanEnabled = true, expectCandidateIndex = true)
+      }
+
+      // Scenario #3: Replace all files.
+      df.write.mode("overwrite").parquet(dataPath)
+
+      {
+        val optimizedPlan = spark.read.parquet(dataPath).queryExecution.optimizedPlan
+        verify(optimizedPlan, hybridScanEnabled = false, expectCandidateIndex = false)
+        verify(optimizedPlan, hybridScanEnabled = true, expectCandidateIndex = false)
+      }
+    }
+  }
+
   private def validateLogicalRelation(plan: LogicalPlan, expected: LogicalRelation): Unit = {
     val r = RuleUtils.getLogicalRelation(plan)
     assert(r.isDefined)
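Taken together, calling the new overload directly might look like the following; the session setup and /tmp/hbtable path are illustrative only, while IndexCollectionManager and getCandidateIndexes come from the diffs above.

import org.apache.spark.sql.SparkSession
import com.microsoft.hyperspace.index.IndexCollectionManager
import com.microsoft.hyperspace.index.rules.RuleUtils

val spark = SparkSession.builder.master("local[*]").getOrCreate()
val indexManager = IndexCollectionManager(spark)
val plan = spark.read.parquet("/tmp/hbtable").queryExecution.optimizedPlan  // illustrative path

// Default: exact signature match only (previous behavior).
val strict = RuleUtils.getCandidateIndexes(indexManager, plan)

// Hybrid scan: also keep indexes whose source files still overlap the scanned files.
val hybrid = RuleUtils.getCandidateIndexes(indexManager, plan, hybridScanEnabled = true)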
