Commit 55ee864

address comments.
1 parent 31b8724 commit 55ee864

9 files changed: +29 -32 lines changed

Taken together, the changes rename DataSource's inputSchema parameter to userSpecifiedSchema, drop the separate isSchemaFromUsers flag, and adjust resolveRelation() so that a user-supplied schema for a RelationProvider source is rejected only when it differs from the schema the relation itself reports.

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 1 addition & 2 deletions
@@ -144,8 +144,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
       DataSource.apply(
         sparkSession,
         paths = paths,
-        inputSchema = userSpecifiedSchema,
-        isSchemaFromUsers = true,
+        userSpecifiedSchema = userSpecifiedSchema,
         className = source,
         options = extraOptions.toMap).resolveRelation())
   }
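For orientation, a minimal sketch (hypothetical user code, not part of this commit) of how a schema given to DataFrameReader reaches the renamed parameter: the value passed to .schema(...) is forwarded as userSpecifiedSchema = Some(...), so DataSource skips schema inference entirely.

  import org.apache.spark.sql.types._

  // Hypothetical example: schema and path are invented for illustration.
  val schema = new StructType().add("id", LongType).add("name", StringType)
  val df = spark.read
    .schema(schema)            // stored by the reader...
    .format("json")
    .load("/tmp/people.json")  // ...and forwarded here as userSpecifiedSchema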

sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala

Lines changed: 2 additions & 3 deletions
@@ -64,8 +64,7 @@ case class CreateDataSourceTableCommand(table: CatalogTable, ignoreIfExists: Boo
     val dataSource: BaseRelation =
       DataSource(
         sparkSession = sparkSession,
-        inputSchema = if (table.schema.isEmpty) None else Some(table.schema),
-        isSchemaFromUsers = true,
+        userSpecifiedSchema = if (table.schema.isEmpty) None else Some(table.schema),
         className = table.provider.get,
         bucketSpec = table.bucketSpec,
         options = table.storage.properties).resolveRelation()
@@ -165,7 +164,7 @@ case class CreateDataSourceTableAsSelectCommand(
     // Check if the specified data source match the data source of the existing table.
     val dataSource = DataSource(
       sparkSession = sparkSession,
-      inputSchema = Some(query.schema.asNullable),
+      userSpecifiedSchema = Some(query.schema.asNullable),
       partitionColumns = table.partitionColumnNames,
       bucketSpec = table.bucketSpec,
       className = provider,

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 18 additions & 18 deletions
@@ -60,9 +60,8 @@ import org.apache.spark.util.Utils
  *
  * @param paths A list of file system paths that hold data. These will be globbed before and
  *              qualified. This option only works when reading from a [[FileFormat]].
- * @param inputSchema An optional specification of the schema of the data. When present we skip
- *                    attempting to infer the schema.
- * @param isSchemaFromUsers A flag to indicate whether the schema is specified by users.
+ * @param userSpecifiedSchema An optional specification of the schema of the data. When present
+ *                            we skip attempting to infer the schema.
  * @param partitionColumns A list of column names that the relation is partitioned by. When this
  *                         list is empty, the relation is unpartitioned.
  * @param bucketSpec An optional specification for bucketing (hash-partitioning) of the data.
@@ -71,8 +70,7 @@ case class DataSource(
     sparkSession: SparkSession,
     className: String,
     paths: Seq[String] = Nil,
-    inputSchema: Option[StructType] = None,
-    isSchemaFromUsers: Boolean = false,
+    userSpecifiedSchema: Option[StructType] = None,
     partitionColumns: Seq[String] = Seq.empty,
     bucketSpec: Option[BucketSpec] = None,
     options: Map[String, String] = Map.empty) extends Logging {
@@ -189,7 +187,7 @@ case class DataSource(
   }

   private def inferFileFormatSchema(format: FileFormat): StructType = {
-    inputSchema.orElse {
+    userSpecifiedSchema.orElse {
       val caseInsensitiveOptions = new CaseInsensitiveMap(options)
       val allPaths = caseInsensitiveOptions.get("path")
       val globbedPaths = allPaths.toSeq.flatMap { path =>
@@ -213,7 +211,7 @@
     providingClass.newInstance() match {
       case s: StreamSourceProvider =>
         val (name, schema) = s.sourceSchema(
-          sparkSession.sqlContext, inputSchema, className, options)
+          sparkSession.sqlContext, userSpecifiedSchema, className, options)
         SourceInfo(name, schema)

       case format: FileFormat =>
@@ -236,7 +234,7 @@
         val isSchemaInferenceEnabled = sparkSession.sessionState.conf.streamingSchemaInference
         val isTextSource = providingClass == classOf[text.TextFileFormat]
         // If the schema inference is disabled, only text sources require schema to be specified
-        if (!isSchemaInferenceEnabled && !isTextSource && inputSchema.isEmpty) {
+        if (!isSchemaInferenceEnabled && !isTextSource && userSpecifiedSchema.isEmpty) {
           throw new IllegalArgumentException(
             "Schema must be specified when creating a streaming source DataFrame. " +
             "If some files already exist in the directory, then depending on the file format " +
@@ -255,7 +253,8 @@
   def createSource(metadataPath: String): Source = {
     providingClass.newInstance() match {
       case s: StreamSourceProvider =>
-        s.createSource(sparkSession.sqlContext, metadataPath, inputSchema, className, options)
+        s.createSource(
+          sparkSession.sqlContext, metadataPath, userSpecifiedSchema, className, options)

       case format: FileFormat =>
         val path = new CaseInsensitiveMap(options).getOrElse("path", {
@@ -320,28 +319,29 @@
    */
   def resolveRelation(): BaseRelation = {
     val caseInsensitiveOptions = new CaseInsensitiveMap(options)
-    val relation = (providingClass.newInstance(), inputSchema) match {
+    val relation = (providingClass.newInstance(), userSpecifiedSchema) match {
       // TODO: Throw when too much is given.
       case (dataSource: SchemaRelationProvider, Some(schema)) =>
         dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions, schema)
       case (dataSource: RelationProvider, None) =>
         dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions)
       case (_: SchemaRelationProvider, None) =>
         throw new AnalysisException(s"A schema needs to be specified when using $className.")
-      case (dataSource: RelationProvider, Some(_)) =>
-        if (isSchemaFromUsers) {
-          throw new AnalysisException(s"$className does not allow user-specified schemas.")
-        } else {
+      case (dataSource: RelationProvider, Some(schema)) =>
+        val baseRelation =
           dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions)
+        if (baseRelation.schema != schema) {
+          throw new AnalysisException(s"$className does not allow user-specified schemas.")
         }
+        baseRelation

       // We are reading from the results of a streaming query. Load files from the metadata log
       // instead of listing them using HDFS APIs.
       case (format: FileFormat, _)
           if hasMetadata(caseInsensitiveOptions.get("path").toSeq ++ paths) =>
         val basePath = new Path((caseInsensitiveOptions.get("path").toSeq ++ paths).head)
         val fileCatalog = new MetadataLogFileCatalog(sparkSession, basePath)
-        val dataSchema = inputSchema.orElse {
+        val dataSchema = userSpecifiedSchema.orElse {
           format.inferSchema(
             sparkSession,
             caseInsensitiveOptions,
@@ -381,7 +381,7 @@

         // If they gave a schema, then we try and figure out the types of the partition columns
         // from that schema.
-        val partitionSchema = inputSchema.map { schema =>
+        val partitionSchema = userSpecifiedSchema.map { schema =>
           StructType(
             partitionColumns.map { c =>
               // TODO: Case sensitivity.
@@ -395,7 +395,7 @@
           new ListingFileCatalog(
             sparkSession, globbedPaths, options, partitionSchema)

-        val dataSchema = inputSchema.map { schema =>
+        val dataSchema = userSpecifiedSchema.map { schema =>
           val equality = sparkSession.sessionState.conf.resolver
           StructType(schema.filterNot(f => partitionColumns.exists(equality(_, f.name))))
         }.orElse {
@@ -505,7 +505,7 @@
             mode)
         sparkSession.sessionState.executePlan(plan).toRdd
         // Replace the schema with that of the DataFrame we just wrote out to avoid re-inferring it.
-        copy(inputSchema = Some(data.schema.asNullable)).resolveRelation()
+        copy(userSpecifiedSchema = Some(data.schema.asNullable)).resolveRelation()

       case _ =>
         sys.error(s"${providingClass.getCanonicalName} does not allow create table as select.")
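The only behavioral change in this file is the (RelationProvider, Some(schema)) branch of resolveRelation(): rather than consulting the removed isSchemaFromUsers flag, it now builds the relation first and rejects the user-supplied schema only if it disagrees with the schema the provider reports. A standalone sketch of the new rule, assuming provider, sqlContext, options, and className are in scope (hypothetical names, not this file's code):

  // Hedged sketch: mirrors only the comparison logic of the committed change.
  def resolveWithUserSchema(userSpecifiedSchema: Option[StructType]): BaseRelation = {
    val baseRelation = provider.createRelation(sqlContext, options)
    userSpecifiedSchema.foreach { schema =>
      // Same error message as before, but now raised only on a genuine mismatch.
      if (baseRelation.schema != schema) {
        throw new AnalysisException(s"$className does not allow user-specified schemas.")
      }
    }
    baseRelation
  }

In practice a caller can now pass the exact schema a RelationProvider-only source derives on its own without hitting the error, which the old flag-based check made impossible from the public read APIs.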

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala

Lines changed: 1 addition & 1 deletion
@@ -201,7 +201,7 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan]
         val dataSource =
           DataSource(
             sparkSession,
-            inputSchema = Some(table.schema),
+            userSpecifiedSchema = Some(table.schema),
             partitionColumns = table.partitionColumnNames,
             bucketSpec = table.bucketSpec,
             className = table.provider.get,

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala

Lines changed: 1 addition & 2 deletions
@@ -55,8 +55,7 @@ case class CreateTempViewUsing(
   def run(sparkSession: SparkSession): Seq[Row] = {
     val dataSource = DataSource(
       sparkSession,
-      inputSchema = userSpecifiedSchema,
-      isSchemaFromUsers = true,
+      userSpecifiedSchema = userSpecifiedSchema,
       className = provider,
       options = options)
     sparkSession.sessionState.catalog.createTempView(

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala

Lines changed: 1 addition & 1 deletion
@@ -133,7 +133,7 @@ class FileStreamSource(
       DataSource(
         sparkSession,
         paths = files.map(_.path),
-        inputSchema = Some(schema),
+        userSpecifiedSchema = Some(schema),
         className = fileFormatClassName,
         options = sourceOptions.optionMapWithoutPath)
     Dataset.ofRows(sparkSession, LogicalRelation(newDataSource.resolveRelation()))

sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala

Lines changed: 1 addition & 1 deletion
@@ -136,7 +136,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
     val dataSource =
       DataSource(
         sparkSession,
-        inputSchema = userSpecifiedSchema,
+        userSpecifiedSchema = userSpecifiedSchema,
         className = source,
         options = extraOptions.toMap)
     Dataset.ofRows(sparkSession, StreamingRelation(dataSource))
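The streaming reader mirrors the batch path: whatever was set via DataStreamReader.schema(...) is handed to DataSource as userSpecifiedSchema. A hypothetical streaming read (schema and path invented for illustration); per the DataSource.scala change above, a non-text file source must supply a schema when streaming schema inference is disabled:

  import org.apache.spark.sql.types._

  // Hypothetical example: the schema below reaches DataSource as
  // userSpecifiedSchema, so the JSON source starts without inference.
  val eventSchema = new StructType().add("ts", TimestampType).add("msg", StringType)
  val events = spark.readStream
    .schema(eventSchema)
    .json("/tmp/incoming")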

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -1346,15 +1346,15 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {

     val d1 = DataSource(
       spark,
-      inputSchema = None,
+      userSpecifiedSchema = None,
       partitionColumns = Array.empty[String],
       bucketSpec = None,
       className = classOf[JsonFileFormat].getCanonicalName,
       options = Map("path" -> path)).resolveRelation()

     val d2 = DataSource(
       spark,
-      inputSchema = None,
+      userSpecifiedSchema = None,
       partitionColumns = Array.empty[String],
       bucketSpec = None,
       className = classOf[JsonFileFormat].getCanonicalName,

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     val dataSource =
       DataSource(
         sparkSession,
-        inputSchema = Some(table.schema),
+        userSpecifiedSchema = Some(table.schema),
         partitionColumns = table.partitionColumnNames,
         bucketSpec = table.bucketSpec,
         className = table.provider.get,
@@ -278,7 +278,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
       DataSource(
         sparkSession = sparkSession,
         paths = paths,
-        inputSchema = Some(metastoreRelation.schema),
+        userSpecifiedSchema = Some(metastoreRelation.schema),
         bucketSpec = bucketSpec,
         options = options,
         className = fileType).resolveRelation(),
