This repository was archived by the owner on Jun 14, 2024. It is now read-only.

Commit 7900f1b

Resolve to return the nested state along with field name
1 parent 4f8e392 commit 7900f1b

10 files changed, +140 -104 lines changed
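
The central change: ResolverUtils.resolve now reports, for every resolved column, whether it is a nested field, and the callers below are updated to carry that flag around. A minimal sketch of the new shape (the column names here are illustrative, not taken from the commit):

    // resolve(spark, Seq("id", "nested.leaf"), df.queryExecution.analyzed)
    //   => Some(Seq(("id", false), ("nested.leaf", true)))   // (resolved name, isNested)
    // Any unresolvable column makes the whole result None.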

src/main/scala/com/microsoft/hyperspace/actions/CreateActionBase.scala

Lines changed: 40 additions & 32 deletions

@@ -17,7 +17,7 @@
 package com.microsoft.hyperspace.actions

 import org.apache.hadoop.fs.Path
-import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
+import org.apache.spark.sql.{Column, DataFrame, SaveMode, SparkSession}
 import org.apache.spark.sql.catalyst.plans.logical.LeafNode
 import org.apache.spark.sql.functions.{col, input_file_name}

@@ -104,23 +104,6 @@ private[actions] abstract class CreateActionBase(dataManager: IndexDataManager)
     }
   }

-  private def hasParquetAsSourceFormatProperty(
-      relation: FileBasedRelation): Option[(String, String)] = {
-    if (relation.hasParquetAsSourceFormat) {
-      Some(IndexConstants.HAS_PARQUET_AS_SOURCE_FORMAT_PROPERTY -> "true")
-    } else {
-      None
-    }
-  }
-
-  private def hasLineageProperty(spark: SparkSession): Option[(String, String)] = {
-    if (hasLineage(spark)) {
-      Some(IndexConstants.LINEAGE_PROPERTY -> "true")
-    } else {
-      None
-    }
-  }
-
   protected def write(spark: SparkSession, df: DataFrame, indexConfig: IndexConfig): Unit = {
     val numBuckets = numBucketsForIndex(spark)

@@ -156,9 +139,26 @@ private[actions] abstract class CreateActionBase(dataManager: IndexDataManager)
     relations.head
   }

+  private def hasParquetAsSourceFormatProperty(
+      relation: FileBasedRelation): Option[(String, String)] = {
+    if (relation.hasParquetAsSourceFormat) {
+      Some(IndexConstants.HAS_PARQUET_AS_SOURCE_FORMAT_PROPERTY -> "true")
+    } else {
+      None
+    }
+  }
+
+  private def hasLineageProperty(spark: SparkSession): Option[(String, String)] = {
+    if (hasLineage(spark)) {
+      Some(IndexConstants.LINEAGE_PROPERTY -> "true")
+    } else {
+      None
+    }
+  }
+
   private def resolveConfig(
       df: DataFrame,
-      indexConfig: IndexConfig): (Seq[String], Seq[String]) = {
+      indexConfig: IndexConfig): (Seq[(String, Boolean)], Seq[(String, Boolean)]) = {
     val spark = df.sparkSession
     val plan = df.queryExecution.analyzed
     val indexedColumns = indexConfig.indexedColumns

@@ -170,8 +170,8 @@ private[actions] abstract class CreateActionBase(dataManager: IndexDataManager)
       case (Some(indexed), Some(included)) => (indexed, included)
       case _ =>
         val unresolvedColumns = (indexedColumns ++ includedColumns)
-          .map(c => (c, ResolverUtils.resolve(spark, Seq(c), plan)))
-          .collect { case c if c._2.isEmpty => c._1 }
+          .map(c => (c, ResolverUtils.resolve(spark, Seq(c), plan).map(_.map(_._1))))
+          .collect { case (c, r) if r.isEmpty => c }
         throw HyperspaceException(
           s"Columns '${unresolvedColumns.mkString(",")}' could not be resolved " +
             s"from available source columns:\n${df.schema.treeString}")

@@ -183,10 +183,12 @@ private[actions] abstract class CreateActionBase(dataManager: IndexDataManager)
       df: DataFrame,
       indexConfig: IndexConfig): (DataFrame, Seq[String], Seq[String]) = {
     val (resolvedIndexedColumns, resolvedIncludedColumns) = resolveConfig(df, indexConfig)
-    val columnsFromIndexConfig = resolvedIndexedColumns ++ resolvedIncludedColumns
+    val columnsFromIndexConfig =
+      resolvedIndexedColumns.map(_._1) ++ resolvedIncludedColumns.map(_._1)

-    val escapedIndexedColumns = SchemaUtils.prefixNestedFieldNames(resolvedIndexedColumns)
-    val escapedIncludedColumns = SchemaUtils.prefixNestedFieldNames(resolvedIncludedColumns)
+    val prefixedIndexedColumns = SchemaUtils.prefixNestedFieldNames(resolvedIndexedColumns)
+    val prefixedIncludedColumns = SchemaUtils.prefixNestedFieldNames(resolvedIncludedColumns)
+    val prefixedColumnsFromIndexConfig = prefixedIndexedColumns ++ prefixedIncludedColumns

     val indexDF = if (hasLineage(spark)) {
       val relation = getRelation(spark, df)

@@ -215,19 +217,25 @@ private[actions] abstract class CreateActionBase(dataManager: IndexDataManager)
       val dataPathColumn = "_data_path"
       val lineagePairs = relation.lineagePairs(fileIdTracker)
       val lineageDF = lineagePairs.toDF(dataPathColumn, IndexConstants.DATA_FILE_NAME_ID)
+      val prefixedAllIndexColumns = prefixedColumnsFromIndexConfig ++ missingPartitionColumns

       df.withColumn(dataPathColumn, input_file_name())
         .join(lineageDF.hint("broadcast"), dataPathColumn)
-        .select(
-          allIndexColumns.head,
-          allIndexColumns.tail :+ IndexConstants.DATA_FILE_NAME_ID: _*)
-        .toDF(escapedIndexedColumns ++ escapedIncludedColumns ++ missingPartitionColumns :+
-          IndexConstants.DATA_FILE_NAME_ID: _*)
+        .select(prepareColumns(allIndexColumns, prefixedAllIndexColumns) :+
+          col(IndexConstants.DATA_FILE_NAME_ID): _*)
     } else {
-      df.select(columnsFromIndexConfig.head, columnsFromIndexConfig.tail: _*)
-        .toDF(escapedIndexedColumns ++ escapedIncludedColumns : _*)
+      df.select(prepareColumns(columnsFromIndexConfig, prefixedColumnsFromIndexConfig): _*)
     }

-    (indexDF, escapedIndexedColumns, escapedIncludedColumns)
+    (indexDF, prefixedIndexedColumns, prefixedIncludedColumns)
+  }
+
+  private def prepareColumns(
+      originalColumns: Seq[String],
+      prefixedColumns: Seq[String]): Seq[Column] = {
+    originalColumns.zip(prefixedColumns).map {
+      case (original, prefixed) =>
+        col(original).as(prefixed)
+    }
   }
 }
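
The new prepareColumns helper replaces the earlier head/tail select followed by toDF renaming: each source column is selected and aliased to its (possibly prefixed) index column name in one pass. A rough illustration with hypothetical column names:

    // Hypothetical inputs, for illustration only.
    val originalColumns = Seq("id", "nested.leaf")
    val prefixedColumns = Seq("id", "__hs_nested.nested.leaf")
    // prepareColumns zips them into aliased Columns:
    //   Seq(col("id").as("id"), col("nested.leaf").as("__hs_nested.nested.leaf"))
    // so df.select(prepareColumns(originalColumns, prefixedColumns): _*)
    // projects and renames in a single select.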

src/main/scala/com/microsoft/hyperspace/actions/RefreshActionBase.scala

Lines changed: 4 additions & 2 deletions

@@ -93,8 +93,10 @@ private[actions] abstract class RefreshActionBase(
     val ddColumns = previousIndexLogEntry.derivedDataset.properties.columns
     IndexConfig(
       previousIndexLogEntry.name,
-      SchemaUtils.removePrefixNestedFieldNames(ddColumns.indexed),
-      SchemaUtils.removePrefixNestedFieldNames(ddColumns.included))
+      // As indexed & included columns in previousLogEntry are resolved & prefixed names,
+      // need to remove the prefix to resolve with the dataframe for refresh.
+      SchemaUtils.removePrefixNestedFieldNames(ddColumns.indexed).map(_._1),
+      SchemaUtils.removePrefixNestedFieldNames(ddColumns.included).map(_._1))
   }

   final override val transientState: String = REFRESHING
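
Because the stored index columns are already resolved and prefixed, the refresh path strips the prefix and drops the nested flag before rebuilding the IndexConfig. Roughly, with a hypothetical prefixed name:

    // SchemaUtils.removePrefixNestedFieldNames(Seq("__hs_nested.nested.leaf", "id"))
    //   => Seq(("nested.leaf", true), ("id", false))
    // .map(_._1) then keeps only the plain names for the refreshed IndexConfig:
    //   Seq("nested.leaf", "id")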

src/main/scala/com/microsoft/hyperspace/actions/RefreshIncrementalAction.scala

Lines changed: 6 additions & 6 deletions

@@ -23,7 +23,6 @@ import com.microsoft.hyperspace.{Hyperspace, HyperspaceException}
 import com.microsoft.hyperspace.index._
 import com.microsoft.hyperspace.index.DataFrameWriterExtensions.Bucketizer
 import com.microsoft.hyperspace.telemetry.{AppInfo, HyperspaceEvent, RefreshIncrementalActionEvent}
-import com.microsoft.hyperspace.util.SchemaUtils

 /**
  * Action to refresh indexes with newly appended files and deleted files in an incremental way.

@@ -92,7 +91,8 @@ class RefreshIncrementalAction(
       refreshDF,
       indexDataPath.toString,
       previousIndexLogEntry.numBuckets,
-      SchemaUtils.prefixNestedFieldNames(indexConfig.indexedColumns),
+      // previousIndexLogEntry should contain the resolved and prefixed field names.
+      previousIndexLogEntry.derivedDataset.properties.columns.indexed,
       writeMode)
   }
 }

@@ -116,10 +116,6 @@ class RefreshIncrementalAction(
     }
   }

-  override protected def event(appInfo: AppInfo, message: String): HyperspaceEvent = {
-    RefreshIncrementalActionEvent(appInfo, logEntry.asInstanceOf[IndexLogEntry], message)
-  }
-
   /**
    * Create a log entry with all source data files, and all required index content. This contains
    * ALL source data files (files which were indexed previously, and files which are being indexed

@@ -144,4 +140,8 @@ class RefreshIncrementalAction(
      entry
    }
  }
+
+  override protected def event(appInfo: AppInfo, message: String): HyperspaceEvent = {
+    RefreshIncrementalActionEvent(appInfo, logEntry.asInstanceOf[IndexLogEntry], message)
+  }
 }

src/main/scala/com/microsoft/hyperspace/index/IndexLogEntry.scala

Lines changed: 0 additions & 4 deletions

@@ -557,10 +557,6 @@ case class IndexLogEntry(
     config.hashCode + signature.hashCode + numBuckets.hashCode + content.hashCode
   }

-  def usesNestedFields: Boolean = {
-    SchemaUtils.containsNestedFieldNames(indexedColumns ++ includedColumns)
-  }
-
   /**
    * A mutable map for holding auxiliary information of this index log entry while applying rules.
    */

src/main/scala/com/microsoft/hyperspace/index/rules/JoinIndexRule.scala

Lines changed: 2 additions & 2 deletions

@@ -305,8 +305,8 @@ object JoinIndexRule
     val rRequiredIndexedCols = lRMap.values.toSeq

     // All required columns resolved with base relation.
-    val lRequiredAllCols = resolve(spark, allRequiredCols(left), leftRelation.plan).get
-    val rRequiredAllCols = resolve(spark, allRequiredCols(right), rightRelation.plan).get
+    val lRequiredAllCols = resolve(spark, allRequiredCols(left), leftRelation.plan).get.map(_._1)
+    val rRequiredAllCols = resolve(spark, allRequiredCols(right), rightRelation.plan).get.map(_._1)

     // Make sure required indexed columns are subset of all required columns for a subplan
     require(resolve(spark, lRequiredIndexedCols, lRequiredAllCols).isDefined)

src/main/scala/com/microsoft/hyperspace/util/ResolverUtils.scala

Lines changed: 14 additions & 8 deletions

@@ -84,12 +84,13 @@ object ResolverUtils {
    * @param spark Spark session.
    * @param requiredStrings List of strings to resolve.
    * @param plan Logical plan to resolve against.
-   * @return Optional Seq of resolved strings if all required strings are resolved. Else, None.
+   * @return Optional sequence of tuples of resolved name string and nested state boolean
+   *         if all required strings are resolved. Else, None.
    */
   def resolve(
       spark: SparkSession,
       requiredStrings: Seq[String],
-      plan: LogicalPlan): Option[Seq[String]] = {
+      plan: LogicalPlan): Option[Seq[(String, Boolean)]] = {
     val schema = plan.schema
     val resolver = spark.sessionState.conf.resolver
     val resolved = requiredStrings.map { requiredField =>

@@ -98,7 +99,12 @@ object ResolverUtils {
         .map { expr =>
           val resolvedColNameParts = extractColumnName(expr)
           validateResolvedColumnName(requiredField, resolvedColNameParts)
-          getColumnNameFromSchema(schema, resolvedColNameParts, resolver).mkString(".")
+          getColumnNameFromSchema(schema, resolvedColNameParts, resolver)
+            .foldLeft(("", false)) { (acc, i) =>
+              val name = Seq(acc._1, i._1).filter(_.nonEmpty).mkString(".")
+              val isNested = acc._2 || i._2
+              (name, isNested)
+            }
         }
         .getOrElse { return None }
     }

@@ -110,8 +116,8 @@ object ResolverUtils {
     expr match {
       case a: Attribute =>
         Seq(a.name)
-      case g @ GetStructField(_, _, Some(name)) =>
-        extractColumnName(g.child) :+ name
+      case _ @ GetStructField(child, _, Some(name)) =>
+        extractColumnName(child) :+ name
       case _: GetArrayStructFields =>
         // TODO: Nested arrays will be supported later
         throw HyperspaceException("Array types are not supported.")

@@ -138,19 +144,19 @@ object ResolverUtils {
   private def getColumnNameFromSchema(
       schema: StructType,
       resolvedColNameParts: Seq[String],
-      resolver: Resolver): Seq[String] = resolvedColNameParts match {
+      resolver: Resolver): Seq[(String, Boolean)] = resolvedColNameParts match {
     case h :: tail =>
       val field = schema.find(f => resolver(f.name, h)).get
       field match {
         case StructField(name, s: StructType, _, _) =>
-          name +: getColumnNameFromSchema(s, tail, resolver)
+          (name, true) +: getColumnNameFromSchema(s, tail, resolver)
         case StructField(_, _: ArrayType, _, _) =>
           // TODO: Nested arrays will be supported later
           throw HyperspaceException("Array types are not supported.")
         case StructField(_, _: MapType, _, _) =>
           // TODO: Nested maps will be supported later
           throw HyperspaceException("Map types are not supported")
-        case f => Seq(f.name)
+        case f => Seq((f.name, false))
       }
   }
 }
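
The foldLeft re-assembles the dotted name from the parts returned by getColumnNameFromSchema and ORs their nested flags, so a column counts as nested if any ancestor is a struct. A small self-contained sketch of the fold (the parts shown are hypothetical):

    // Hypothetical parts for a struct column "nested" with leaf field "leaf".
    val parts = Seq(("nested", true), ("leaf", false))
    val resolved = parts.foldLeft(("", false)) { (acc, i) =>
      (Seq(acc._1, i._1).filter(_.nonEmpty).mkString("."), acc._2 || i._2)
    }
    // resolved == ("nested.leaf", true); a top-level column folds to (name, false).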

src/main/scala/com/microsoft/hyperspace/util/SchemaUtils.scala

Lines changed: 26 additions & 11 deletions

@@ -22,8 +22,9 @@ object SchemaUtils {
   val NESTED_FIELD_PREFIX_REGEX = "^__hs_nested\\."

   /**
-   * The method prefixes a nested field name. The field name must be
-   * nested (it should contain a `.`).
+   * The method prefixes a nested field name that hasn't already been prefixed.
+   * The field name must be nested (it should contain a `.` and its type
+   * should be of [[org.apache.spark.sql.types.StructType]]).
    *
    * The inverse operation is [[removePrefixNestedFieldName]].
    *

@@ -39,16 +40,23 @@ object SchemaUtils {
   }

   /**
-   * The method prefixes the nested field names from a collection. The field names
-   * that are not nested will not be changed.
+   * The method prefixes the nested field names from a map where the keys are
+   * the field names and the values are the nested state of that field
+   * which should be the result of [[ResolverUtils.resolve]].
+   * The field names that are not marked as nested will not be changed.
    *
-   * See [[prefixNestedFieldName]] method.
+   * See [[prefixNestedFieldName]] and [[ResolverUtils.resolve]] methods.
    *
-   * @param fieldNames The collection of field names to prefix.
+   * @param fieldNames A sequence of tuples of field names and nested status.
    * @return A collection with prefixed nested fields.
    */
-  def prefixNestedFieldNames(fieldNames: Seq[String]): Seq[String] = {
-    fieldNames.map(prefixNestedFieldName)
+  def prefixNestedFieldNames(fieldNames: Seq[(String, Boolean)]): Seq[String] = {
+    fieldNames.map {
+      case (fieldName, true) =>
+        prefixNestedFieldName(fieldName)
+      case (fieldName, false) =>
+        fieldName
+    }
   }

   /**

@@ -66,14 +74,21 @@ object SchemaUtils {

   /**
    * The method removes the prefix from a collection of prefixed nested field names.
+   * It returns the original sequence of tuples of field names and nested state.
    *
    * The inverse operation is [[prefixNestedFieldNames]].
    *
    * @param fieldNames The collection of prefixed field names.
-   * @return A collection with original nested field names.
+   * @return A sequence of tuples of field names and nested status.
    */
-  def removePrefixNestedFieldNames(fieldNames: Seq[String]): Seq[String] = {
-    fieldNames.map(removePrefixNestedFieldName)
+  def removePrefixNestedFieldNames(fieldNames: Seq[String]): Seq[(String, Boolean)] = {
+    fieldNames.map { fieldName =>
+      if (SchemaUtils.isFieldNamePrefixed(fieldName)) {
+        removePrefixNestedFieldName(fieldName) -> true
+      } else {
+        fieldName -> false
+      }
+    }
   }

   /**
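
With these changes, prefixNestedFieldNames and removePrefixNestedFieldNames act as inverses over (name, isNested) tuples; only fields flagged as nested receive the __hs_nested. prefix. For example (illustrative names):

    val resolved = Seq(("nested.leaf", true), ("id", false))
    SchemaUtils.prefixNestedFieldNames(resolved)
    //   => Seq("__hs_nested.nested.leaf", "id")
    SchemaUtils.removePrefixNestedFieldNames(Seq("__hs_nested.nested.leaf", "id"))
    //   => Seq(("nested.leaf", true), ("id", false))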

src/test/scala/com/microsoft/hyperspace/index/CreateIndexNestedTest.scala

Lines changed: 7 additions & 4 deletions

@@ -16,6 +16,7 @@

 package com.microsoft.hyperspace.index

+import scala.collection.immutable.ListMap
 import scala.collection.mutable.WrappedArray

 import org.apache.hadoop.conf.Configuration

@@ -159,8 +160,9 @@ class CreateIndexNestedTest extends HyperspaceSuite with SQLHelper {
       // should be added to index schema if they are not already among index config columns.
       assert(
         indexRecordsDF.schema.fieldNames.sorted ===
-          (SchemaUtils.prefixNestedFieldNames(indexConfig2.indexedColumns ++
-            indexConfig2.includedColumns) ++
+          (SchemaUtils.prefixNestedFieldNames(
+            indexConfig2.indexedColumns.zip(Seq(true)) ++
+              indexConfig2.includedColumns.zip(Seq(true))) ++
             Seq(IndexConstants.DATA_FILE_NAME_ID) ++ partitionKeys).sorted)
     }
   }

@@ -174,8 +176,9 @@ class CreateIndexNestedTest extends HyperspaceSuite with SQLHelper {
       // For non-partitioned data, only file name lineage column should be added to index schema.
       assert(
         indexRecordsDF.schema.fieldNames.sorted ===
-          (SchemaUtils.prefixNestedFieldNames(indexConfig1.indexedColumns ++
-            indexConfig1.includedColumns) ++
+          (SchemaUtils.prefixNestedFieldNames(
+            indexConfig1.indexedColumns.zip(Seq(true)) ++
+              indexConfig1.includedColumns.zip(Seq(false, true))) ++
             Seq(IndexConstants.DATA_FILE_NAME_ID)).sorted)
     }
   }
