Commit d11ce09

[SPARK-18355][SQL] Use Spark schema to read ORC table instead of ORC file schema
1 parent 76fb173 commit d11ce09

2 files changed: 77 additions & 11 deletions


sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala

Lines changed: 20 additions & 10 deletions
@@ -138,8 +138,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
       if (maybePhysicalSchema.isEmpty) {
         Iterator.empty
       } else {
-        val physicalSchema = maybePhysicalSchema.get
-        OrcRelation.setRequiredColumns(conf, physicalSchema, requiredSchema)
+        OrcRelation.setRequiredColumns(conf, dataSchema, requiredSchema)

         val orcRecordReader = {
           val job = Job.getInstance(conf)
@@ -163,6 +162,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
         // Unwraps `OrcStruct`s to `UnsafeRow`s
         OrcRelation.unwrapOrcStructs(
           conf,
+          dataSchema,
           requiredSchema,
           Some(orcRecordReader.getObjectInspector.asInstanceOf[StructObjectInspector]),
           recordsIterator)
@@ -272,25 +272,35 @@ private[orc] object OrcRelation extends HiveInspectors {
   def unwrapOrcStructs(
       conf: Configuration,
       dataSchema: StructType,
+      requiredSchema: StructType,
       maybeStructOI: Option[StructObjectInspector],
       iterator: Iterator[Writable]): Iterator[InternalRow] = {
     val deserializer = new OrcSerde
-    val mutableRow = new SpecificInternalRow(dataSchema.map(_.dataType))
-    val unsafeProjection = UnsafeProjection.create(dataSchema)
+    val mutableRow = new SpecificInternalRow(requiredSchema.map(_.dataType))
+    val unsafeProjection = UnsafeProjection.create(requiredSchema)

     def unwrap(oi: StructObjectInspector): Iterator[InternalRow] = {
-      val (fieldRefs, fieldOrdinals) = dataSchema.zipWithIndex.map {
-        case (field, ordinal) => oi.getStructFieldRef(field.name) -> ordinal
+      val (fieldRefs, fieldOrdinals) = requiredSchema.zipWithIndex.map {
+        case (field, ordinal) =>
+          var ref = oi.getStructFieldRef(field.name)
+          if (ref == null) {
+            val maybeIndex = dataSchema.getFieldIndex(field.name)
+            if (maybeIndex.isDefined) {
+              ref = oi.getStructFieldRef("_col" + maybeIndex.get)
+            }
+          }
+          ref -> ordinal
       }.unzip

-      val unwrappers = fieldRefs.map(unwrapperFor)
+      val unwrappers = fieldRefs.map(r => if (r == null) null else unwrapperFor(r))

       iterator.map { value =>
         val raw = deserializer.deserialize(value)
         var i = 0
         val length = fieldRefs.length
         while (i < length) {
-          val fieldValue = oi.getStructFieldData(raw, fieldRefs(i))
+          val fieldRef = fieldRefs(i)
+          val fieldValue = if (fieldRef == null) null else oi.getStructFieldData(raw, fieldRefs(i))
           if (fieldValue == null) {
             mutableRow.setNullAt(fieldOrdinals(i))
           } else {
@@ -306,8 +316,8 @@ private[orc] object OrcRelation extends HiveInspectors {
   }

   def setRequiredColumns(
-      conf: Configuration, physicalSchema: StructType, requestedSchema: StructType): Unit = {
-    val ids = requestedSchema.map(a => physicalSchema.fieldIndex(a.name): Integer)
+      conf: Configuration, dataSchema: StructType, requestedSchema: StructType): Unit = {
+    val ids = requestedSchema.map(a => dataSchema.fieldIndex(a.name): Integer)
     val (sortedIDs, sortedNames) = ids.zip(requestedSchema.fieldNames).sorted.unzip
     HiveShim.appendReadColumns(conf, sortedIDs, sortedNames)
   }
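
Note on the fix above: Hive-written ORC files often expose positional physical column names (_col0, _col1, ...) rather than the logical names in the metastore schema, so looking up a requested field by its Spark name against the file's ObjectInspector can return null. The patched unwrapOrcStructs therefore falls back to "_col" plus the field's index in dataSchema. The following standalone Scala sketch illustrates just that resolution rule, with a plain Map standing in for the ObjectInspector; the object and method names (FieldResolutionSketch, resolve) are illustrative and not part of the patch.

object FieldResolutionSketch {
  // Physical field names as a Hive-written ORC file may expose them (positional).
  val physicalFields: Map[String, Int] = Map("_col0" -> 0, "_col1" -> 1, "_col2" -> 2)

  // Logical (metastore) column order, i.e. what the patch calls dataSchema.
  val dataSchema: Seq[String] = Seq("click_id", "search_id", "uid")

  // Resolve a requested column: try its logical name first, then fall back to
  // the positional name derived from its index in dataSchema.
  def resolve(name: String): Option[Int] =
    physicalFields.get(name).orElse {
      val idx = dataSchema.indexOf(name)
      if (idx >= 0) physicalFields.get("_col" + idx) else None
    }

  def main(args: Array[String]): Unit = {
    Seq("search_id", "click_id", "dummy").foreach { name =>
      // search_id -> Some(1), click_id -> Some(0), dummy -> None (read back as null)
      println(s"$name -> ${resolve(name)}")
    }
  }
}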

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 57 additions & 1 deletion
@@ -34,7 +34,7 @@ import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.hive.HiveUtils
+import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils}
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
@@ -2050,4 +2050,60 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       }
     }
   }
+
+  test("SPARK-18355 Use Spark schema to read ORC table instead of ORC file schema") {
+    val client = spark.sharedState.externalCatalog.asInstanceOf[HiveExternalCatalog].client
+
+    Seq("true", "false").foreach { value =>
+      withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> value) {
+        withTempDatabase { db =>
+          client.runSqlHive(
+            s"""
+               |CREATE TABLE $db.t(
+               |  click_id string,
+               |  search_id string,
+               |  uid bigint)
+               |PARTITIONED BY (
+               |  ts string,
+               |  hour string)
+               |STORED AS ORC
+             """.stripMargin)
+
+          client.runSqlHive(
+            s"""
+               |INSERT INTO TABLE $db.t
+               |PARTITION (ts = '98765', hour = '01')
+               |VALUES (12, 2, 12345)
+             """.stripMargin
+          )
+
+          checkAnswer(
+            sql(s"SELECT * FROM $db.t"),
+            Row("12", "2", 12345, "98765", "01"))
+
+          client.runSqlHive(s"ALTER TABLE $db.t ADD COLUMNS (dummy string)")
+
+          checkAnswer(
+            sql(s"SELECT click_id, search_id FROM $db.t"),
+            Row("12", "2"))
+
+          checkAnswer(
+            sql(s"SELECT search_id, click_id FROM $db.t"),
+            Row("2", "12"))
+
+          checkAnswer(
+            sql(s"SELECT search_id FROM $db.t"),
+            Row("2"))
+
+          checkAnswer(
+            sql(s"SELECT dummy, click_id FROM $db.t"),
+            Row(null, "12"))
+
+          checkAnswer(
+            sql(s"SELECT * FROM $db.t"),
+            Row("12", "2", 12345, null, "98765", "01"))
+        }
+      }
+    }
+  }
 }
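
The test above exercises the case where ALTER TABLE ... ADD COLUMNS leaves existing partition files without the new column: with the fix, reads are driven by the Spark (metastore) schema, so the missing dummy column comes back as null instead of the query failing. The column-pruning side of the change, setRequiredColumns, now derives the pushed-down column ids from that same Spark schema. A minimal standalone sketch of the id computation follows; the helper name requiredColumnIds and the plain Seq[String] schemas are simplifications for illustration, not the patched API.

object RequiredColumnsSketch {
  // Compute (sorted ids, matching names) from the table's Spark schema, mirroring
  // what the patched setRequiredColumns does before HiveShim.appendReadColumns.
  def requiredColumnIds(
      dataSchema: Seq[String],
      requestedSchema: Seq[String]): (Seq[Int], Seq[String]) = {
    val ids = requestedSchema.map(name => dataSchema.indexOf(name))
    ids.zip(requestedSchema).sorted.unzip
  }

  def main(args: Array[String]): Unit = {
    val dataSchema = Seq("click_id", "search_id", "uid", "dummy")
    // Requesting (dummy, click_id) yields ids sorted positionally, names reordered to match.
    println(requiredColumnIds(dataSchema, Seq("dummy", "click_id")))
    // -> (List(0, 3),List(click_id, dummy))
  }
}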
