
Commit 5d38f09

fix.
1 parent: 30345c4

3 files changed: 23 additions, 27 deletions

sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala

Lines changed: 1 addition & 1 deletion
@@ -213,7 +213,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
       bucketSpec = getBucketSpec,
       options = extraOptions.toMap)

-    dataSource.write(mode, df)
+    dataSource.write(mode, df, isForWriteOnly = true)
   }

   /**
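
Judging from the surrounding context, this call sits inside DataFrameWriter.save(), which returns Unit and never exposes the relation that DataSource.write() used to resolve, so requesting the write-only path here changes no caller-visible behavior. A minimal, self-contained sketch (not part of this commit; the output path and app name are hypothetical) of the user-facing call that now takes this path:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("write-only-demo").getOrCreate()

// An ordinary save: goes through DataFrameWriter.save(), which after this change
// asks DataSource.write() to skip resolving a relation over the files it just wrote.
spark.range(100).toDF("id")
  .write
  .mode("overwrite")
  .parquet("/tmp/write_only_demo")

spark.stop()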

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 15 additions & 4 deletions
@@ -413,10 +413,16 @@ case class DataSource(
     relation
   }

-  /** Writes the given [[DataFrame]] out to this [[DataSource]]. */
+  /**
+   * Writes the given [[DataFrame]] out to this [[DataSource]].
+   *
+   * @param isForWriteOnly Whether to exit early and just write the data without returning
+   *                       a [[BaseRelation]].
+   */
   def write(
       mode: SaveMode,
-      data: DataFrame): BaseRelation = {
+      data: DataFrame,
+      isForWriteOnly: Boolean = false): BaseRelation = {
     if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
       throw new AnalysisException("Cannot save interval data type into external storage.")
     }
@@ -494,8 +500,13 @@ case class DataSource(
           catalogTable = catalogTable,
           fileIndex = fileIndex)
         sparkSession.sessionState.executePlan(plan).toRdd
-        // Replace the schema with that of the DataFrame we just wrote out to avoid re-inferring it.
-        copy(userSpecifiedSchema = Some(data.schema.asNullable)).resolveRelation()
+        if (isForWriteOnly) {
+          // Exit earlier and return null
+          null
+        } else {
+          // Replace the schema with that of the DataFrame we just wrote out to avoid re-inferring
+          copy(userSpecifiedSchema = Some(data.schema.asNullable)).resolveRelation()
+        }

       case _ =>
         sys.error(s"${providingClass.getCanonicalName} does not allow create table as select.")
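
A rough sketch of what the new parameter means for callers of DataSource.write(), assuming a spark-shell-style session named spark and a hypothetical scratch path; the DataSource construction below is illustrative and not taken from this commit. Note that the write-only call intentionally returns null, so its result must be discarded:

import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.sources.BaseRelation

val df = spark.range(10).toDF("id")
val dataSource = DataSource(
  sparkSession = spark,
  className = "parquet",
  options = Map("path" -> "/tmp/datasource_write_demo"))

// Write-only path (what DataFrameWriter.save() now requests): the data is written,
// no relation is resolved afterwards, and the return value is null by design.
dataSource.write(SaveMode.Overwrite, df, isForWriteOnly = true)

// Default path: the written data is resolved back into a BaseRelation, reusing the
// DataFrame's schema (made nullable) instead of re-inferring it from the output files.
val relation: BaseRelation = dataSource.write(SaveMode.Overwrite, df)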

sql/hive/src/test/scala/org/apache/spark/sql/hive/PartitionedTablePerfStatsSuite.scala

Lines changed: 7 additions & 22 deletions
@@ -62,17 +62,12 @@ class PartitionedTablePerfStatsSuite
   }

   private def setupPartitionedHiveTable(
-      tableName: String, dir: File, scale: Int,
-      clearMetricsBeforeCreate: Boolean = false, repair: Boolean = true): Unit = {
+      tableName: String, dir: File, scale: Int, repair: Boolean = true): Unit = {
     spark.range(scale).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
       .partitionBy("partCol1", "partCol2")
       .mode("overwrite")
       .parquet(dir.getAbsolutePath)

-    if (clearMetricsBeforeCreate) {
-      HiveCatalogMetrics.reset()
-    }
-
     spark.sql(s"""
       |create external table $tableName (fieldOne long)
       |partitioned by (partCol1 int, partCol2 int)
@@ -88,17 +83,12 @@ class PartitionedTablePerfStatsSuite
   }

   private def setupPartitionedDatasourceTable(
-      tableName: String, dir: File, scale: Int,
-      clearMetricsBeforeCreate: Boolean = false, repair: Boolean = true): Unit = {
+      tableName: String, dir: File, scale: Int, repair: Boolean = true): Unit = {
     spark.range(scale).selectExpr("id as fieldOne", "id as partCol1", "id as partCol2").write
       .partitionBy("partCol1", "partCol2")
       .mode("overwrite")
       .parquet(dir.getAbsolutePath)

-    if (clearMetricsBeforeCreate) {
-      HiveCatalogMetrics.reset()
-    }
-
     spark.sql(s"""
       |create table $tableName (fieldOne long, partCol1 int, partCol2 int)
       |using parquet
@@ -271,8 +261,8 @@ class PartitionedTablePerfStatsSuite
     withSQLConf(SQLConf.HIVE_MANAGE_FILESOURCE_PARTITIONS.key -> "true") {
       withTable("test") {
         withTempDir { dir =>
-          setupPartitionedDatasourceTable(
-            "test", dir, scale = 10, clearMetricsBeforeCreate = true, repair = false)
+          HiveCatalogMetrics.reset()
+          setupPartitionedDatasourceTable("test", dir, scale = 10, repair = false)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
           assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
         }
@@ -285,8 +275,7 @@ class PartitionedTablePerfStatsSuite
       withTable("test") {
         withTempDir { dir =>
           HiveCatalogMetrics.reset()
-          setupPartitionedHiveTable(
-            "test", dir, scale = 10, clearMetricsBeforeCreate = true, repair = false)
+          setupPartitionedHiveTable("test", dir, scale = 10, repair = false)
           assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 0)
           assert(HiveCatalogMetrics.METRIC_FILE_CACHE_HITS.getCount() == 0)
         }
@@ -416,12 +405,8 @@ class PartitionedTablePerfStatsSuite
       })
       executorPool.shutdown()
       executorPool.awaitTermination(30, TimeUnit.SECONDS)
-      // check the cache hit, we use the metric of METRIC_FILES_DISCOVERED and
-      // METRIC_PARALLEL_LISTING_JOB_COUNT to check this, while the lock take effect,
-      // only one thread can really do the build, so the listing job count is 2, the other
-      // one is cache.load func. Also METRIC_FILES_DISCOVERED is $partition_num * 2
-      assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 100)
-      assert(HiveCatalogMetrics.METRIC_PARALLEL_LISTING_JOB_COUNT.getCount() == 2)
+      assert(HiveCatalogMetrics.METRIC_FILES_DISCOVERED.getCount() == 50)
+      assert(HiveCatalogMetrics.METRIC_PARALLEL_LISTING_JOB_COUNT.getCount() == 1)
     }
   }
 }
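
Why the expected counts drop, as far as can be inferred from this diff alone (the full test body is not shown): the table appears to span 50 partition directories, and before this change the write that sets it up was followed by relation resolution over the freshly written files, so the directories were effectively listed twice (the removed comment's partition_num * 2 = 100 files discovered and 2 parallel listing jobs). With the write-only early exit, only a single listing remains: 50 files discovered and 1 listing job.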
