This repository was archived by the owner on Jun 14, 2024. It is now read-only.

Commit ba8a684

Support hybrid scans when filtering on nested fields with index

1 parent 15b7377 commit ba8a684

File tree

4 files changed (+500, -21 lines)

src/main/scala/com/microsoft/hyperspace/index/rules/PlanUtils.scala

Lines changed: 3 additions & 3 deletions
@@ -74,7 +74,7 @@ object PlanUtils {
    * contains the given name.
    */
  def extractSearchQuery(exp: Expression, name: String): (Expression, Expression) = {
-    val splits = name.split(".")
+    val splits = name.split("\\.")
    val expFound = exp.find {
      case a: AttributeReference if splits.forall(s => a.name.contains(s)) => true
      case f: GetStructField if splits.forall(s => f.toString().contains(s)) => true
@@ -119,7 +119,7 @@ object PlanUtils {
    * @return A Spark Catalyst [[AttributeReference]] pointing to the field name.
    */
  def extractAttributeRef(exp: Expression, name: String): AttributeReference = {
-    val splits = name.split(".")
+    val splits = name.split("\\.")
    val elem = exp.find {
      case a: AttributeReference if splits.contains(a.name) => true
      case _ => false
@@ -136,7 +136,7 @@ object PlanUtils {
    * @return A Spark SQL [[DataType]] of the given field name.
    */
  def extractTypeFromExpression(exp: Expression, name: String): DataType = {
-    val splits = name.split(".")
+    val splits = name.split("\\.")
    val elem = exp.flatMap {
      case a: AttributeReference =>
        if (splits.forall(s => a.name == s)) {
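The PlanUtils change fixes a subtle regex bug: String.split takes a regular expression, so splitting a nested field name on an unescaped "." matches every character and yields an empty array, while "\\." splits on literal dots. A minimal standalone sketch (the field name is made up for illustration):

// String.split interprets its argument as a regex, so "." matches every
// character and nothing survives once trailing empty strings are dropped.
object SplitSketch extends App {
  val name = "nested.leaf.id"               // hypothetical nested field name
  println(name.split(".").length)           // 0  -- the unescaped dot matches everything
  println(name.split("\\.").mkString(", ")) // nested, leaf, id -- literal dot
}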

src/main/scala/com/microsoft/hyperspace/index/rules/RuleUtils.scala

Lines changed: 46 additions & 14 deletions
@@ -280,7 +280,10 @@ object RuleUtils {
        Map(IndexConstants.INDEX_RELATION_IDENTIFIER))(spark, index)

    val flatSchema =
-      ResolverUtils.resolve(spark, index.indexedColumns ++ index.includedColumns, relation.plan)
+      ResolverUtils.resolve(
+        spark,
+        index.indexedColumns ++ index.includedColumns,
+        relation.plan)
    // SchemaUtils.escapeFieldNames(SchemaUtils.flatten(relation.plan.schema))
    val updatedOutput =
      if (flatSchema.isDefined && SchemaUtils.containsNestedFieldNames(flatSchema.get)) {
@@ -294,8 +297,8 @@
        }
      } else {
        relation.plan.output
-        .filter(attr => indexFsRelation.schema.fieldNames.contains(attr.name))
-        .map(_.asInstanceOf[AttributeReference])
+          .filter(attr => indexFsRelation.schema.fieldNames.contains(attr.name))
+          .map(_.asInstanceOf[AttributeReference])
      }
    relation.createLogicalRelation(indexFsRelation, updatedOutput)

@@ -328,7 +331,7 @@
      useBucketSpec: Boolean,
      useBucketUnionForAppended: Boolean): LogicalPlan = {
    val provider = Hyperspace.getContext(spark).sourceProviderManager
-    var unhandledAppendedFiles: Seq[Path] = Nil
+    var unhandledAppendedFiles = Seq.empty[Path]
    // Get transformed plan with index data and appended files if applicable.
    val indexPlan = plan transformUp {
      // Use transformUp here as currently one relation is allowed (pre-requisite).
@@ -367,7 +370,7 @@

        val filesToRead = {
          if (useBucketSpec || !index.hasParquetAsSourceFormat || filesDeleted.nonEmpty ||
-            relation.partitionSchema.nonEmpty) {
+            relation.partitionSchema.nonEmpty || index.usesNestedFields) {
            // Since the index data is in "parquet" format, we cannot read source files
            // in formats other than "parquet" using one FileScan node as the operator requires
            // files in one homogenous format. To address this, we need to read the appended
@@ -391,10 +394,17 @@
        // In order to handle deleted files, read index data with the lineage column so that
        // we could inject Filter-Not-In conditions on the lineage column to exclude the indexed
        // rows from the deleted files.
+        val flatSchema = ResolverUtils.resolve(
+          spark,
+          SchemaUtils.removePrefixNestedFieldNames(index.indexedColumns ++ index.includedColumns),
+          relation.plan)
        val newSchema = StructType(
-          index.schema.filter(s =>
-            relation.plan.schema.contains(s) || (filesDeleted.nonEmpty && s.name.equals(
-              IndexConstants.DATA_FILE_NAME_ID))))
+          index.schema.filter(
+            s =>
+              (flatSchema.isDefined && SchemaUtils
+                .prefixNestedFieldNames(flatSchema.get)
+                .contains(s.name)) ||
+                (filesDeleted.nonEmpty && s.name.equals(IndexConstants.DATA_FILE_NAME_ID))))

        def fileIndex: InMemoryFileIndex = {
          new InMemoryFileIndex(spark, filesToRead, Map(), None)
@@ -414,9 +424,22 @@
          new ParquetFileFormat,
          Map(IndexConstants.INDEX_RELATION_IDENTIFIER))(spark, index)

-        val updatedOutput = relation.plan.output
-          .filter(attr => indexFsRelation.schema.fieldNames.contains(attr.name))
-          .map(_.asInstanceOf[AttributeReference])
+        val updatedOutput =
+          if (flatSchema.isDefined && SchemaUtils.containsNestedFieldNames(
+                SchemaUtils.prefixNestedFieldNames(flatSchema.get))) {
+            indexFsRelation.schema.flatMap { s =>
+              val exprId = getFieldPosition(index, s.name)
+              relation.plan.output.find(a => s.name.contains(a.name)).map { a =>
+                AttributeReference(s.name, s.dataType, a.nullable, a.metadata)(
+                  ExprId(exprId),
+                  a.qualifier)
+              }
+            }
+          } else {
+            relation.plan.output
+              .filter(attr => indexFsRelation.schema.fieldNames.contains(attr.name))
+              .map(_.asInstanceOf[AttributeReference])
+          }

        if (filesDeleted.isEmpty) {
          relation.createLogicalRelation(indexFsRelation, updatedOutput)
@@ -428,6 +451,12 @@
          val filterForDeleted = Filter(Not(In(lineageAttr, deletedFileIds)), rel)
          Project(updatedOutput, OptimizeIn(filterForDeleted))
        }
+      case p: Project if provider.isSupportedProject(p) =>
+        transformProject(p, index)
+
+      case f: Filter if provider.isSupportedFilter(f) =>
+        transformFilter(f, index)
+
    }

    if (unhandledAppendedFiles.nonEmpty) {
@@ -501,11 +530,14 @@
    // Set the same output schema with the index plan to merge them using BucketUnion.
    // Include partition columns for data loading.
    val partitionColumns = relation.partitionSchema.map(_.name)
-    val updatedSchema = StructType(relation.plan.schema.filter(col =>
-      index.schema.contains(col) || relation.partitionSchema.contains(col)))
+    val updatedSchema = StructType(
+      relation.plan.schema.filter(col =>
+        index.schema.fieldNames.exists(n => n.contains(col.name)) ||
+          relation.partitionSchema.contains(col)))
    val updatedOutput = relation.plan.output
      .filter(attr =>
-        index.schema.fieldNames.contains(attr.name) || partitionColumns.contains(attr.name))
+        index.schema.fieldNames.exists(n => n.contains(attr.name)) ||
+          partitionColumns.contains(attr.name))
      .map(_.asInstanceOf[AttributeReference])
    val newRelation = relation.createHadoopFsRelation(
      newLocation,
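In the hybrid-scan branch above, when the index covers nested fields the rewritten relation rebuilds its output by mapping each flattened index column back to an attribute of the original plan, reusing the source attribute's nullability, metadata and qualifier and taking the ExprId from the field's position in the index. A rough, self-contained sketch of that mapping with Spark Catalyst types (runs in spark-shell); the prefixed column name nested__leaf__id and the position-based ExprId are illustrative assumptions, not the index's actual encoding:

import org.apache.spark.sql.catalyst.expressions.{AttributeReference, ExprId}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Hypothetical flattened index schema: the nested field nested.leaf.id is
// assumed to be stored under a prefixed top-level name.
val indexSchema = StructType(Seq(
  StructField("name", StringType),
  StructField("nested__leaf__id", IntegerType)))

// Source plan output as it would appear before the rewrite.
val planOutput = Seq(
  AttributeReference("name", StringType)(),
  AttributeReference("id", IntegerType)())

// Mirror the diff's idea: for every index column, find a source attribute whose
// name is contained in the flattened column name and rebuild the reference with
// the index column's name and type but the source attribute's properties.
val updatedOutput = indexSchema.zipWithIndex.flatMap { case (s, pos) =>
  planOutput.find(a => s.name.contains(a.name)).map { a =>
    AttributeReference(s.name, s.dataType, a.nullable, a.metadata)(
      ExprId(pos), // stand-in for getFieldPosition(index, s.name)
      a.qualifier)
  }
}

The new index.usesNestedFields term in filesToRead forces appended source files to be read in a separate plan, presumably because the index stores nested fields as flattened columns and the appended files' original schema can no longer share one FileScan node with the index data.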

src/main/scala/com/microsoft/hyperspace/index/sources/FileBasedSourceProviderManager.scala

Lines changed: 4 additions & 4 deletions
@@ -131,13 +131,13 @@ class FileBasedSourceProviderManager(spark: SparkSession) {
    */
  def isSupportedProject(project: Project): Boolean = {
    val containsNestedFields = SchemaUtils.containsNestedFieldNames(
-      project.projectList.flatMap(extractNamesFromExpression))
+      SchemaUtils.prefixNestedFieldNames(project.projectList.flatMap(extractNamesFromExpression)))
    var containsNestedChildren = false
    project.child.foreach {
      case f: Filter =>
        containsNestedChildren = containsNestedChildren || {
-          SchemaUtils.containsNestedFieldNames(SchemaUtils.removePrefixNestedFieldNames(
-            extractNamesFromExpression(f.condition).toSeq))
+          SchemaUtils.containsNestedFieldNames(
+            SchemaUtils.prefixNestedFieldNames(extractNamesFromExpression(f.condition).toSeq))
        }
      case _ =>
    }
@@ -153,7 +153,7 @@ class FileBasedSourceProviderManager(spark: SparkSession) {
    */
  def isSupportedFilter(filter: Filter): Boolean = {
    val containsNestedFields = SchemaUtils.containsNestedFieldNames(
-      extractNamesFromExpression(filter.condition).toSeq)
+      SchemaUtils.prefixNestedFieldNames(extractNamesFromExpression(filter.condition).toSeq))
    containsNestedFields
  }
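Both isSupportedProject and isSupportedFilter now normalize the extracted names with SchemaUtils.prefixNestedFieldNames before testing for nested fields, so the two checks agree on the same encoding. Conceptually the test asks whether any referenced column points inside a struct; a rough sketch of that predicate, where the dot-based check is only an assumption about how nested names are detected:

// Sketch only: treat a name as "nested" if it drills into a struct.
// The real SchemaUtils may mark nested names with a dedicated prefix instead.
def referencesNestedField(extractedNames: Seq[String]): Boolean =
  extractedNames.exists(_.contains("."))

referencesNestedField(Seq("id", "nested.leaf.cnt")) // true  -- touches a struct member
referencesNestedField(Seq("id", "name"))            // false -- only top-level columns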
