
Commit 1c34877

Get the nested fields without modifying the column names
1 parent 46c2474 commit 1c34877

4 files changed (+46, -82 lines)

sql/catalyst/src/main/scala/org/apache/spark/sql/types/StructType.scala

Lines changed: 0 additions & 23 deletions
@@ -279,29 +279,6 @@ case class StructType(fields: Array[StructField]) extends DataType with Seq[Stru
     StructType(fields.filter(f => names.contains(f.name)))
   }
 
-  /**
-   * Extracts the [[StructField]] with the given name recursively.
-   *
-   * @throws IllegalArgumentException if the parent field's type is not StructType
-   */
-  def getFieldRecursively(name: String): StructField = {
-    if (name.contains(',')) {
-      val curFieldStr = name.split(",", 2)(0)
-      val nextFieldStr = name.split(",", 2)(1)
-      val curField = this.apply(curFieldStr)
-      curField.dataType match {
-        case st: StructType =>
-          val newField = StructType(st.fields).getFieldRecursively(nextFieldStr)
-          StructField(curField.name, StructType(Seq(newField)),
-            curField.nullable, curField.metadata)
-        case _ =>
-          throw new IllegalArgumentException(s"""Field "$curFieldStr" is not struct field.""")
-      }
-    } else {
-      this.apply(name)
-    }
-  }
-
   /**
    * Returns the index of a given field.
    *
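
For readers following the change: the deleted helper encoded a nested path as a single comma-separated string ("col,s1,s1_1"), which is exactly what collided with column names that themselves contain commas. Below is a minimal standalone sketch of the same recursion over a List[String] path, the representation the new FileSourceStrategy code adopts; the schema and field names are illustrative, not taken from this commit.

    import org.apache.spark.sql.types._

    // Illustrative schema: col: struct<s1: struct<s1_1: long, s1_2: long>, str: string>
    val schema = StructType(Seq(
      StructField("col", StructType(Seq(
        StructField("s1", StructType(Seq(
          StructField("s1_1", LongType),
          StructField("s1_2", LongType)))),
        StructField("str", StringType)))),
      StructField("num", LongType)))

    // Same shape as the deleted getFieldRecursively: walk the path, wrap each
    // level in a one-field StructType, and fail if an intermediate field is
    // not a struct.
    def fieldByPath(schema: StructType, path: List[String]): StructField = path match {
      case head :: Nil => schema(head)
      case head :: rest =>
        val cur = schema(head)
        cur.dataType match {
          case st: StructType =>
            StructField(cur.name, StructType(Seq(fieldByPath(st, rest))),
              cur.nullable, cur.metadata)
          case _ =>
            throw new IllegalArgumentException(s"""Field "$head" is not struct field.""")
        }
      case Nil =>
        throw new IllegalArgumentException("empty field path")
    }

    // fieldByPath(schema, List("col", "s1", "s1_1")) returns the single-branch
    // field col: struct<s1: struct<s1_1: long>>, everything else pruned away.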

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala

Lines changed: 33 additions & 39 deletions
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.FileSourceScanExec
 import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.types.{StructField, StructType}
 
 /**
  * A strategy for planning scans over collections of files that might be partitioned or bucketed
@@ -99,11 +99,9 @@ object FileSourceStrategy extends Strategy with Logging {
           .filter(requiredAttributes.contains)
           .filterNot(partitionColumns.contains)
       val outputSchema = if (fsRelation.sqlContext.conf.isParquetNestColumnPruning) {
-        val requiredColumnsWithNesting = generateRequiredColumnsContainsNesting(
-          projects, readDataColumns.attrs.map(_.name).toArray)
         val totalSchema = readDataColumns.toStructType
-        val prunedSchema = StructType(requiredColumnsWithNesting
-          .map(totalSchema.getFieldRecursively))
+        val prunedSchema = StructType(
+          generateStructFieldsContainsNesting(projects, totalSchema))
         // Merge schema in same StructType and merge with filterAttributes
         prunedSchema.fields.map(f => StructType(Array(f))).reduceLeft(_ merge _)
           .merge(filterAttributes.toSeq.toStructType)
@@ -137,55 +135,51 @@
     case _ => Nil
   }
 
-  private def generateRequiredColumnsContainsNesting(projects: Seq[Expression],
-                                                     columns: Array[String]) : Array[String] = {
-    def generateAttributeMap(nestFieldMap: scala.collection.mutable.Map[String, Seq[String]],
-                             isNestField: Boolean, curString: Option[String],
-                             node: Expression) {
+  private def generateStructFieldsContainsNesting(projects: Seq[Expression],
+                                                  totalSchema: StructType) : Seq[StructField] = {
+    def generateStructField(curField: List[String],
+                            node: Expression) : Seq[StructField] = {
       node match {
         case ai: GetArrayItem =>
-          // Here we drop the curString for simplify array and map support.
+          // Here we drop the previous for simplify array and map support.
           // Same strategy in GetArrayStructFields and GetMapValue
-          generateAttributeMap(nestFieldMap, isNestField = true, None, ai.child)
-
+          generateStructField(List.empty[String], ai.child)
         case asf: GetArrayStructFields =>
-          generateAttributeMap(nestFieldMap, isNestField = true, None, asf.child)
-
+          generateStructField(List.empty[String], asf.child)
         case mv: GetMapValue =>
-          generateAttributeMap(nestFieldMap, isNestField = true, None, mv.child)
-
+          generateStructField(List.empty[String], mv.child)
         case attr: AttributeReference =>
-          if (isNestField && curString.isDefined) {
-            val attrStr = attr.name
-            if (nestFieldMap.contains(attrStr)) {
-              nestFieldMap(attrStr) = nestFieldMap(attrStr) ++ Seq(attrStr + "," + curString.get)
-            } else {
-              nestFieldMap += (attrStr -> Seq(attrStr + "," + curString.get))
-            }
-          }
+          Seq(getFieldRecursively(totalSchema, attr.name :: curField))
        case sf: GetStructField =>
-          val str = if (curString.isDefined) {
-            sf.name.get + "," + curString.get
-          } else sf.name.get
-          generateAttributeMap(nestFieldMap, isNestField = true, Option(str), sf.child)
+          generateStructField(sf.name.get :: curField, sf.child)
         case _ =>
           if (node.children.nonEmpty) {
-            node.children.foreach(child => generateAttributeMap(nestFieldMap,
-              isNestField, curString, child))
+            node.children.flatMap(child => generateStructField(curField, child))
+          } else {
+            Seq.empty[StructField]
           }
       }
     }
 
-    val nestFieldMap = scala.collection.mutable.Map.empty[String, Seq[String]]
-    projects.foreach(p => generateAttributeMap(nestFieldMap, isNestField = false, None, p))
-    val col_list = columns.toList.flatMap(col => {
-      if (nestFieldMap.contains(col)) {
-        nestFieldMap.get(col).get.toList
+    def getFieldRecursively(totalSchema: StructType,
+                            name: List[String]): StructField = {
+      if (name.length > 1) {
+        val curField = name.head
+        val curFieldType = totalSchema(curField)
+        curFieldType.dataType match {
+          case st: StructType =>
+            val newField = getFieldRecursively(StructType(st.fields), name.drop(1))
+            StructField(curFieldType.name, StructType(Seq(newField)),
+              curFieldType.nullable, curFieldType.metadata)
+          case _ =>
+            throw new IllegalArgumentException(s"""Field "$curField" is not struct field.""")
        }
      } else {
-        List(col)
+        totalSchema(name.head)
      }
-    })
-    col_list.toArray
+    }
+
+    projects.flatMap(p => generateStructField(List.empty[String], p))
  }
 
 }
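
The rewrite walks the project expressions directly instead of round-tripping through strings: each GetStructField pushes its field name onto curField, the AttributeReference at the leaf prepends the top-level column name, and getFieldRecursively turns the resulting List("col", "s1", "s1_1") into a single-branch schema. Column names are never concatenated and re-split, so names containing commas survive intact. The per-projection branches are then unioned by the reduceLeft(_ merge _) step above. StructType.merge is package-private in Spark, so what follows is a hand-rolled sketch of the recursive union it performs, reusing the illustrative field names from the sketch above:

    import org.apache.spark.sql.types._

    // Stand-in for the (private[sql]) StructType.merge used in FileSourceStrategy:
    // union fields by name, recursing into nested structs.
    def mergeStructs(left: StructType, right: StructType): StructType = {
      val leftNames = left.fieldNames.toSet
      val merged = left.fields.map { lf =>
        right.fields.find(_.name == lf.name) match {
          case Some(rf) =>
            (lf.dataType, rf.dataType) match {
              case (ls: StructType, rs: StructType) => lf.copy(dataType = mergeStructs(ls, rs))
              case _ => lf
            }
          case None => lf
        }
      }
      StructType(merged ++ right.fields.filterNot(f => leftNames.contains(f.name)))
    }

    // The two single-branch schemas produced for "select col.s1.s1_1, col.str ...":
    val branch1 = StructType(Seq(StructField("col", StructType(Seq(
      StructField("s1", StructType(Seq(StructField("s1_1", LongType)))))))))
    val branch2 = StructType(Seq(StructField("col", StructType(Seq(
      StructField("str", StringType))))))

    // mergeStructs(branch1, branch2) yields col: struct<s1: struct<s1_1: long>, str: string>,
    // i.e. the "col.[s1.s1_1, str]" merge the updated test below describes.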
Binary file not shown (the new Parquet test fixture read by the updated test below).

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala

Lines changed: 13 additions & 20 deletions
@@ -574,36 +574,29 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext
   test("SPARK-4502 parquet nested fields pruning") {
     // Schema of "test-data/nested-array-struct.parquet":
     // root
-    // |-- primitive: integer (nullable = true)
-    // |-- myComplex: array (nullable = true)
-    // |    |-- element: struct (containsNull = true)
-    // |    |    |-- id: integer (nullable = true)
-    // |    |    |-- repeatedMessage: array (nullable = true)
-    // |    |    |    |-- element: struct (containsNull = true)
-    // |    |    |    |    |-- someId: integer (nullable = true)
-    val df = readResourceParquetFile("test-data/nested-array-struct.parquet")
+    // |-- col: struct (nullable = true)
+    // |    |-- s1: struct (nullable = true)
+    // |    |    |-- s1_1: long (nullable = true)
+    // |    |    |-- s1_2: long (nullable = true)
+    // |    |-- str: string (nullable = true)
+    // |-- num: long (nullable = true)
+    // |-- str: string (nullable = true)
+    val df = readResourceParquetFile("test-data/nested-struct.snappy.parquet")
     df.createOrReplaceTempView("tmp_table")
     // normal test
-    val query1 = "select primitive,myComplex[0].id from tmp_table"
+    val query1 = "select num,col.s1.s1_1 from tmp_table"
     val result1 = sql(query1)
     withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") {
       checkAnswer(sql(query1), result1)
     }
-    // test for array in struct
-    val query2 = "select primitive,myComplex[0].repeatedMessage[0].someId from tmp_table"
+    // test for same struct meta merge
+    // col.s1.s1_1 and col.str should merge
+    // like col.[s1.s1_1, str] before pass to parquet
+    val query2 = "select col.s1.s1_1,col.str from tmp_table"
     val result2 = sql(query2)
     withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") {
       checkAnswer(sql(query2), result2)
     }
-    // test for same struct meta merge
-    // myComplex.id and myComplex.repeatedMessage.someId should merge
-    // like myComplex.[id, repeatedMessage.someId] before pass to parquet
-    val query3 = "select myComplex[0].id, myComplex[0].repeatedMessage[0].someId" +
-      " from tmp_table"
-    val result3 = sql(query3)
-    withSQLConf(SQLConf.PARQUET_NEST_COLUMN_PRUNING.key -> "true") {
-      checkAnswer(sql(query3), result3)
-    }
 
     spark.sessionState.catalog.dropTable(
       TableIdentifier("tmp_table"), ignoreIfNotExists = true, purge = false)
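
For completeness, a hedged end-to-end sketch of what the updated test exercises. SQLConf.PARQUET_NEST_COLUMN_PRUNING exists only in this patch, and the conf key string below is an assumption about what it resolves to, not something taken from the diff:

    // Assumed key for this patch's SQLConf.PARQUET_NEST_COLUMN_PRUNING;
    // not a setting in stock Spark.
    spark.conf.set("spark.sql.parquet.nestColumnPruning", "true")

    // Any Parquet file with the nested schema shown in the test comment works here.
    val df = spark.read.parquet("/path/to/nested-struct.snappy.parquet")
    df.createOrReplaceTempView("tmp_table")

    // With pruning enabled, the file scan should request only num and col.s1.s1_1
    // from Parquet rather than the whole col struct.
    spark.sql("select num, col.s1.s1_1 from tmp_table").show()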
