
Commit 568f133

fix1
1 parent 55b1a8c commit 568f133


7 files changed: +86 −59 lines changed


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala

Lines changed: 10 additions & 8 deletions

@@ -137,14 +137,16 @@ case class CatalogTable(
     unsupportedFeatures: Seq[String] = Seq.empty) {
 
   // Verify that the provided columns are part of the schema
-  private val colNames = schema.map(_.name).toSet
-  private def requireSubsetOfSchema(cols: Seq[String], colType: String): Unit = {
-    require(cols.toSet.subsetOf(colNames), s"$colType columns (${cols.mkString(", ")}) " +
-      s"must be a subset of schema (${colNames.mkString(", ")}) in table '$identifier'")
-  }
-  requireSubsetOfSchema(partitionColumnNames, "partition")
-  requireSubsetOfSchema(sortColumnNames, "sort")
-  requireSubsetOfSchema(bucketColumnNames, "bucket")
+  // TODO: this restriction should be checked at the end of Analyzer. When building CatalogTable,
+  // the initial version might violate it.
+  // private val colNames = schema.map(_.name).toSet
+  // private def requireSubsetOfSchema(cols: Seq[String], colType: String): Unit = {
+  //   require(cols.toSet.subsetOf(colNames), s"$colType columns (${cols.mkString(", ")}) " +
+  //     s"must be a subset of schema (${colNames.mkString(", ")}) in table '$identifier'")
+  // }
+  // requireSubsetOfSchema(partitionColumnNames, "partition")
+  // requireSubsetOfSchema(sortColumnNames, "sort")
+  // requireSubsetOfSchema(bucketColumnNames, "bucket")
 
   /** Columns this table is partitioned by. */
   def partitionColumns: Seq[CatalogColumn] =
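Note: per the TODO above, the commit only disables the constructor-time column check; the rule itself is unchanged. The following is a minimal standalone sketch of that same subset check, written with plain Scala collections rather than Spark's CatalogTable, to illustrate where such a validation could run once the schema is known. All names here are illustrative stand-ins, not Spark APIs.

// Standalone sketch only; ColumnSubsetCheck and its parameters are hypothetical stand-ins.
object ColumnSubsetCheck {
  // The same rule the commented-out require enforced: every partition/sort/bucket
  // column must appear in the table schema.
  def requireSubsetOfSchema(
      schemaCols: Set[String],
      cols: Seq[String],
      colType: String,
      table: String): Unit = {
    require(cols.toSet.subsetOf(schemaCols),
      s"$colType columns (${cols.mkString(", ")}) must be a subset of schema " +
        s"(${schemaCols.mkString(", ")}) in table '$table'")
  }

  def main(args: Array[String]): Unit = {
    val schemaCols = Set("id", "name", "ds")
    requireSubsetOfSchema(schemaCols, Seq("ds"), "partition", "t")    // passes
    // requireSubsetOfSchema(schemaCols, Seq("oops"), "bucket", "t")  // would throw IllegalArgumentException
  }
}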

sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala

Lines changed: 19 additions & 5 deletions

@@ -23,7 +23,8 @@ import scala.collection.JavaConverters._
 
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
-import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, Project}
+import org.apache.spark.sql.catalyst.catalog.{CatalogColumn, CatalogStorageFormat, CatalogTable, CatalogTableType}
+import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
 import org.apache.spark.sql.execution.datasources.{BucketSpec, CreateTableUsingAsSelect, DataSource, HadoopFsRelation}
 import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils
 
@@ -366,14 +367,27 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
         throw new AnalysisException(s"Table $tableIdent already exists.")
 
       case _ =>
+        val bucketSpec = getBucketSpec
+        val sortColumnNames = bucketSpec.map(_.sortColumnNames).getOrElse(Seq.empty)
+        val bucketColumnNames = bucketSpec.map(_.bucketColumnNames).getOrElse(Seq.empty)
+        val numBuckets = bucketSpec.map(_.numBuckets).getOrElse(-1)
+
+        val tableDesc = CatalogTable(
+          identifier = tableIdent,
+          tableType = CatalogTableType.MANAGED,
+          storage = CatalogStorageFormat.empty,
+          schema = Seq.empty[CatalogColumn],
+          partitionColumnNames = partitioningColumns.getOrElse(Seq.empty[String]),
+          sortColumnNames = sortColumnNames,
+          bucketColumnNames = bucketColumnNames,
+          numBuckets = numBuckets,
+          properties = extraOptions.toMap)
+
         val cmd =
           CreateTableUsingAsSelect(
-            tableIdent,
+            tableDesc,
             source,
-            partitioningColumns.map(_.toArray).getOrElse(Array.empty[String]),
-            getBucketSpec,
             mode,
-            extraOptions.toMap,
             df.logicalPlan)
         df.sparkSession.sessionState.executePlan(cmd).toRdd
     }
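Both this call site and the parser change below flatten the optional bucket spec into three scalar CatalogTable fields, with numBuckets = -1 standing in for "no bucketing". Below is a small self-contained sketch of that mapping; the BucketSpec case class and FlattenBucketSpec object are stand-ins for illustration, not Spark's real classes.

// Illustrative stand-ins only; Spark's real BucketSpec/CatalogTable live in other packages.
case class BucketSpec(numBuckets: Int, bucketColumnNames: Seq[String], sortColumnNames: Seq[String])

object FlattenBucketSpec {
  // Mirrors the pattern used in DataFrameWriter and SparkSqlAstBuilder:
  // absent spec => empty column lists and numBuckets = -1.
  def flatten(bucketSpec: Option[BucketSpec]): (Seq[String], Seq[String], Int) = {
    val sortColumnNames = bucketSpec.map(_.sortColumnNames).getOrElse(Seq.empty)
    val bucketColumnNames = bucketSpec.map(_.bucketColumnNames).getOrElse(Seq.empty)
    val numBuckets = bucketSpec.map(_.numBuckets).getOrElse(-1)
    (bucketColumnNames, sortColumnNames, numBuckets)
  }

  def main(args: Array[String]): Unit = {
    println(flatten(Some(BucketSpec(8, Seq("id"), Seq("ts")))))  // (List(id),List(ts),8)
    println(flatten(None))                                       // (List(),List(),-1)
  }
}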

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala

Lines changed: 20 additions & 8 deletions

@@ -340,8 +340,23 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
         SaveMode.ErrorIfExists
       }
 
+      val sortColumnNames = bucketSpec.map(_.sortColumnNames).getOrElse(Seq.empty)
+      val bucketColumnNames = bucketSpec.map(_.bucketColumnNames).getOrElse(Seq.empty)
+      val numBuckets = bucketSpec.map(_.numBuckets).getOrElse(-1)
+
+      val tableDesc = CatalogTable(
+        identifier = table,
+        tableType = CatalogTableType.MANAGED,
+        storage = CatalogStorageFormat.empty,
+        schema = Seq.empty[CatalogColumn],
+        partitionColumnNames = partitionColumnNames,
+        sortColumnNames = sortColumnNames,
+        bucketColumnNames = bucketColumnNames,
+        numBuckets = numBuckets,
+        properties = options)
+
       CreateTableUsingAsSelect(
-        table, provider, partitionColumnNames, bucketSpec, mode, options, query)
+        tableDesc = tableDesc, provider = provider, mode = mode, child = query)
     } else {
       val struct = Option(ctx.colTypeList()).map(createStructType)
       CreateTableUsing(
@@ -1025,20 +1040,17 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
     val hasStorageProperties = (ctx.createFileFormat != null) || (ctx.rowFormat != null)
     if (conf.convertCTAS && !hasStorageProperties) {
       val mode = if (ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists
-      // At here, both rowStorage.serdeProperties and fileStorage.serdeProperties
-      // are empty Maps.
-      val optionsWithPath = if (location.isDefined) {
+      // At here, both rowStorage.serdeProperties and fileStorage.serdeProperties are empty.
+      // When converting Hive Table to Data Source Table, ignore user-specified table properties
+      val tableProperties = if (location.isDefined) {
         Map("path" -> location.get)
       } else {
         Map.empty[String, String]
       }
       CreateTableUsingAsSelect(
-        tableIdent = tableDesc.identifier,
+        tableDesc = tableDesc.copy(properties = tableProperties),
         provider = conf.defaultDataSourceName,
-        partitionColumns = Array.empty[String],
-        bucketSpec = None,
         mode = mode,
-        options = optionsWithPath,
         q
       )
     } else {

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 1 addition & 4 deletions

@@ -447,12 +447,9 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] {
       case c: CreateTableUsingAsSelect =>
         val cmd =
           CreateDataSourceTableAsSelectCommand(
-            c.tableIdent,
+            c.tableDesc,
             c.provider,
-            c.partitionColumns,
-            c.bucketSpec,
             c.mode,
-            c.options,
             c.child)
         ExecutedCommandExec(cmd) :: Nil
 

sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala

Lines changed: 29 additions & 24 deletions

@@ -130,12 +130,9 @@ case class CreateDataSourceTableCommand(
   * }}}
   */
 case class CreateDataSourceTableAsSelectCommand(
-    tableIdent: TableIdentifier,
+    tableDesc: CatalogTable,
     provider: String,
-    partitionColumns: Array[String],
-    bucketSpec: Option[BucketSpec],
     mode: SaveMode,
-    options: Map[String, String],
     query: LogicalPlan)
   extends RunnableCommand {
 
@@ -146,31 +143,39 @@ case class CreateDataSourceTableAsSelectCommand(
     // the table name and database name we have for this query. MetaStoreUtils.validateName
     // is the method used by Hive to check if a table name or a database name is valid for
     // the metastore.
-    if (!CreateDataSourceTableUtils.validateName(tableIdent.table)) {
-      throw new AnalysisException(s"Table name ${tableIdent.table} is not a valid name for " +
-        s"metastore. Metastore only accepts table name containing characters, numbers and _.")
+    if (!CreateDataSourceTableUtils.validateName(tableDesc.identifier.table)) {
+      throw new AnalysisException(s"Table name ${tableDesc.identifier.table} is not a valid name " +
+        s"for metastore. Metastore only accepts table name containing characters, numbers and _.")
     }
-    if (tableIdent.database.isDefined &&
-      !CreateDataSourceTableUtils.validateName(tableIdent.database.get)) {
-      throw new AnalysisException(s"Database name ${tableIdent.database.get} is not a valid name " +
-        s"for metastore. Metastore only accepts database name containing " +
+    if (tableDesc.identifier.database.isDefined &&
+      !CreateDataSourceTableUtils.validateName(tableDesc.identifier.database.get)) {
+      throw new AnalysisException(s"Database name ${tableDesc.identifier.database.get} is not " +
+        s"a valid name for metastore. Metastore only accepts database name containing " +
         s"characters, numbers and _.")
     }
 
-    val tableName = tableIdent.unquotedString
+    val tableName = tableDesc.identifier.unquotedString
     val sessionState = sparkSession.sessionState
     var createMetastoreTable = false
    var isExternal = true
     val optionsWithPath =
-      if (!new CaseInsensitiveMap(options).contains("path")) {
+      if (!new CaseInsensitiveMap(tableDesc.properties).contains("path")) {
         isExternal = false
-        options + ("path" -> sessionState.catalog.defaultTablePath(tableIdent))
+        tableDesc.properties +
+          ("path" -> sessionState.catalog.defaultTablePath(tableDesc.identifier))
       } else {
-        options
+        tableDesc.properties
       }
 
+    val bucketSpec: Option[BucketSpec] = if (tableDesc.numBuckets > 0) {
+      Option(BucketSpec(
+        tableDesc.numBuckets, tableDesc.bucketColumnNames, tableDesc.sortColumnNames))
+    } else {
+      None
+    }
+
     var existingSchema = Option.empty[StructType]
-    if (sparkSession.sessionState.catalog.tableExists(tableIdent)) {
+    if (sparkSession.sessionState.catalog.tableExists(tableDesc.identifier)) {
       // Check if we need to throw an exception or just return.
       mode match {
         case SaveMode.ErrorIfExists =>
@@ -187,21 +192,21 @@ case class CreateDataSourceTableAsSelectCommand(
     val dataSource = DataSource(
       sparkSession = sparkSession,
       userSpecifiedSchema = Some(query.schema.asNullable),
-      partitionColumns = partitionColumns,
+      partitionColumns = tableDesc.partitionColumnNames,
       bucketSpec = bucketSpec,
       className = provider,
       options = optionsWithPath)
     // TODO: Check that options from the resolved relation match the relation that we are
     // inserting into (i.e. using the same compression).
 
     EliminateSubqueryAliases(
-      sessionState.catalog.lookupRelation(tableIdent)) match {
+      sessionState.catalog.lookupRelation(tableDesc.identifier)) match {
       case l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation, _, _) =>
         // check if the file formats match
         l.relation match {
           case r: HadoopFsRelation if r.fileFormat.getClass != dataSource.providingClass =>
             throw new AnalysisException(
-              s"The file format of the existing table $tableIdent is " +
+              s"The file format of the existing table ${tableDesc.identifier} is " +
                 s"`${r.fileFormat.getClass.getName}`. It doesn't match the specified " +
                 s"format `$provider`")
           case _ =>
@@ -238,15 +243,15 @@ case class CreateDataSourceTableAsSelectCommand(
     val dataSource = DataSource(
       sparkSession,
       className = provider,
-      partitionColumns = partitionColumns,
+      partitionColumns = tableDesc.partitionColumnNames,
       bucketSpec = bucketSpec,
       options = optionsWithPath)
 
     val result = try {
       dataSource.write(mode, df)
     } catch {
       case ex: AnalysisException =>
-        logError(s"Failed to write to table ${tableIdent.identifier} in $mode mode", ex)
+        logError(s"Failed to write to table ${tableDesc.identifier} in $mode mode", ex)
         throw ex
     }
     if (createMetastoreTable) {
@@ -255,17 +260,17 @@ case class CreateDataSourceTableAsSelectCommand(
       // provider (for example, see org.apache.spark.sql.parquet.DefaultSource).
       CreateDataSourceTableUtils.createDataSourceTable(
         sparkSession = sparkSession,
-        tableIdent = tableIdent,
+        tableIdent = tableDesc.identifier,
         userSpecifiedSchema = Some(result.schema),
-        partitionColumns = partitionColumns,
+        partitionColumns = tableDesc.partitionColumnNames.toArray,
         bucketSpec = bucketSpec,
         provider = provider,
         options = optionsWithPath,
         isExternal = isExternal)
     }
 
     // Refresh the cache of the table in the catalog.
-    sessionState.catalog.refreshTable(tableIdent)
+    sessionState.catalog.refreshTable(tableDesc.identifier)
     Seq.empty[Row]
   }
 }
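This command reverses the flattening done at the call sites: it rebuilds an Option[BucketSpec] from the three CatalogTable fields, treating a non-positive numBuckets as "no bucketing". A minimal sketch of that inverse mapping follows, again with a stand-in BucketSpec case class rather than Spark's.

// Stand-in type for illustration; not Spark's BucketSpec.
case class BucketSpec(numBuckets: Int, bucketColumnNames: Seq[String], sortColumnNames: Seq[String])

object RebuildBucketSpec {
  // Mirrors the logic added to CreateDataSourceTableAsSelectCommand above.
  def rebuild(numBuckets: Int, bucketCols: Seq[String], sortCols: Seq[String]): Option[BucketSpec] =
    if (numBuckets > 0) Some(BucketSpec(numBuckets, bucketCols, sortCols)) else None

  def main(args: Array[String]): Unit = {
    println(rebuild(8, Seq("id"), Seq("ts")))  // Some(BucketSpec(8,List(id),List(ts)))
    println(rebuild(-1, Nil, Nil))             // None: the -1 default round-trips to "no bucketing"
  }
}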

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala

Lines changed: 2 additions & 4 deletions

@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources
 
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.TableIdentifier
+import org.apache.spark.sql.catalyst.catalog.CatalogTable
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
@@ -54,12 +55,9 @@ case class CreateTableUsing(
  * So, [[PreWriteCheck]] can detect cases that are not allowed.
  */
 case class CreateTableUsingAsSelect(
-    tableIdent: TableIdentifier,
+    tableDesc: CatalogTable,
     provider: String,
-    partitionColumns: Array[String],
-    bucketSpec: Option[BucketSpec],
     mode: SaveMode,
-    options: Map[String, String],
     child: LogicalPlan) extends logical.UnaryNode {
   override def output: Seq[Attribute] = Seq.empty[Attribute]
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala

Lines changed: 5 additions & 6 deletions

@@ -209,9 +209,9 @@ private[sql] case class PreWriteCheck(conf: SQLConf, catalog: SessionCatalog)
       case c: CreateTableUsingAsSelect =>
         // When the SaveMode is Overwrite, we need to check if the table is an input table of
         // the query. If so, we will throw an AnalysisException to let users know it is not allowed.
-        if (c.mode == SaveMode.Overwrite && catalog.tableExists(c.tableIdent)) {
+        if (c.mode == SaveMode.Overwrite && catalog.tableExists(c.tableDesc.identifier)) {
           // Need to remove SubQuery operator.
-          EliminateSubqueryAliases(catalog.lookupRelation(c.tableIdent)) match {
+          EliminateSubqueryAliases(catalog.lookupRelation(c.tableDesc.identifier)) match {
             // Only do the check if the table is a data source table
             // (the relation is a BaseRelation).
             case l @ LogicalRelation(dest: BaseRelation, _, _) =>
@@ -221,7 +221,7 @@ private[sql] case class PreWriteCheck(conf: SQLConf, catalog: SessionCatalog)
               }
               if (srcRelations.contains(dest)) {
                 failAnalysis(
-                  s"Cannot overwrite table ${c.tableIdent} that is also being read from.")
+                  s"Cannot overwrite table ${c.tableDesc.identifier} that is also being read from.")
               } else {
                 // OK
               }
@@ -233,11 +233,10 @@ private[sql] case class PreWriteCheck(conf: SQLConf, catalog: SessionCatalog)
         }
 
         PartitioningUtils.validatePartitionColumn(
-          c.child.schema, c.partitionColumns, conf.caseSensitiveAnalysis)
+          c.child.schema, c.tableDesc.partitionColumnNames, conf.caseSensitiveAnalysis)
 
         for {
-          spec <- c.bucketSpec
-          sortColumnName <- spec.sortColumnNames
+          sortColumnName <- c.tableDesc.sortColumnNames
          sortColumn <- c.child.schema.find(_.name == sortColumnName)
         } {
           if (!RowOrdering.isOrderable(sortColumn.dataType)) {
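With sort columns now stored directly on the CatalogTable, the pre-write check no longer unwraps an Option[BucketSpec] first; it simply iterates the (possibly empty) sortColumnNames and validates each resolved column's data type. Below is a standalone sketch of that simplified iteration over a toy schema; Field, isOrderable, and SortColumnCheck are illustrative stand-ins, not Spark's StructField or RowOrdering.

// Toy stand-ins for the schema types and the orderability predicate; illustrative only.
case class Field(name: String, dataType: String)

object SortColumnCheck {
  // Pretend map and binary types are not orderable, standing in for RowOrdering.isOrderable.
  def isOrderable(dataType: String): Boolean = dataType != "map" && dataType != "binary"

  def check(schema: Seq[Field], sortColumnNames: Seq[String]): Unit = {
    for {
      sortColumnName <- sortColumnNames                    // empty when the table is not bucketed
      sortColumn <- schema.find(_.name == sortColumnName)  // skip names not (yet) in the schema
    } {
      if (!isOrderable(sortColumn.dataType)) {
        throw new IllegalArgumentException(
          s"Cannot use ${sortColumn.dataType} column ${sortColumn.name} as a sort column")
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val schema = Seq(Field("id", "int"), Field("props", "map"))
    check(schema, Seq("id"))       // passes
    // check(schema, Seq("props")) // would throw
  }
}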
