
Commit e7beb02

Address comments.
1 parent 37e240c commit e7beb02

11 files changed: +26 −21 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 6 additions & 3 deletions
@@ -363,10 +363,13 @@ object SQLConf {
     .checkValues(Set("none", "uncompressed", "snappy", "zlib", "lzo"))
     .createWithDefault("snappy")

-  val ORC_ENABLED = buildConf("spark.sql.orc.enabled")
-    .doc("When true, use OrcFileFormat in sql/core module instead of the one in sql/hive module.")
+  val ORC_USE_NEW_VERSION = buildConf("spark.sql.orc.useNewVersion")
+    .doc("When true, use new OrcFileFormat in sql/core module instead of the one in sql/hive. " +
+      "Since new OrcFileFormat uses Apache ORC library instead of ORC library Hive 1.2.1, it is " +
+      "more stable and faster.")
+    .internal()
     .booleanConf
-    .createWithDefault(false)
+    .createWithDefault(true)

   val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown")
     .doc("When true, enable filter pushdown for ORC files.")

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
         "read files of Hive data source directly.")
     }

-    val cls = DataSource.lookupDataSource(sparkSession, source)
+    val cls = DataSource.lookupDataSource(sparkSession.sessionState.conf, source)
     if (classOf[DataSourceV2].isAssignableFrom(cls)) {
       val options = new DataSourceV2Options(extraOptions.asJava)

sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala

Lines changed: 1 addition & 1 deletion
@@ -234,7 +234,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {

     assertNotBucketed("save")

-    val cls = DataSource.lookupDataSource(df.sparkSession, source)
+    val cls = DataSource.lookupDataSource(df.sparkSession.sessionState.conf, source)
     if (classOf[DataSourceV2].isAssignableFrom(cls)) {
       cls.newInstance() match {
         case ds: WriteSupport =>
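
Both the reader and writer changes above route provider resolution through the session's SQLConf. A minimal end-to-end sketch (assumptions: a local session without Hive support, a hypothetical scratch path) that exercises this path; with spark.sql.orc.useNewVersion=false such a session would instead hit the "must be used with Hive support" error checked in the test suites below.

  import org.apache.spark.sql.{SaveMode, SparkSession}

  val spark = SparkSession.builder().master("local[*]").getOrCreate()
  val path = "/tmp/orc_lookup_sketch"  // hypothetical scratch location

  // Write and read back ORC; both calls resolve "orc" via
  // DataSource.lookupDataSource(sessionState.conf, "orc").
  spark.range(10).toDF("id").write.mode(SaveMode.Overwrite).orc(path)
  spark.read.orc(path).show()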

sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala

Lines changed: 4 additions & 3 deletions
@@ -40,6 +40,7 @@ import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils
 import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
 import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.util.SchemaUtils
 import org.apache.spark.util.Utils

@@ -190,7 +191,7 @@ case class AlterTableAddColumnsCommand(
     colsToAdd: Seq[StructField]) extends RunnableCommand {
   override def run(sparkSession: SparkSession): Seq[Row] = {
     val catalog = sparkSession.sessionState.catalog
-    val catalogTable = verifyAlterTableAddColumn(sparkSession, catalog, table)
+    val catalogTable = verifyAlterTableAddColumn(sparkSession.sessionState.conf, catalog, table)

     try {
       sparkSession.catalog.uncacheTable(table.quotedString)

@@ -216,7 +217,7 @@
    * For datasource table, it currently only supports parquet, json, csv.
    */
   private def verifyAlterTableAddColumn(
-      sparkSession: SparkSession,
+      conf: SQLConf,
       catalog: SessionCatalog,
       table: TableIdentifier): CatalogTable = {
     val catalogTable = catalog.getTempViewOrPermanentTableMetadata(table)

@@ -230,7 +231,7 @@
     }

     if (DDLUtils.isDatasourceTable(catalogTable)) {
-      DataSource.lookupDataSource(sparkSession, catalogTable.provider.get).newInstance() match {
+      DataSource.lookupDataSource(conf, catalogTable.provider.get).newInstance() match {
         // For datasource table, this command can only support the following File format.
         // TextFileFormat only default to one column "value"
         // Hive type is already considered as hive serde table, so the logic will not
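
For context, verifyAlterTableAddColumn gates which datasource tables may receive ALTER TABLE ... ADD COLUMNS; per the doc comment above it currently supports parquet, json, and csv. A minimal SQL-level sketch (illustrative table name, local session assumed):

  // Parquet is one of the formats the command's doc comment lists as supported.
  spark.sql("CREATE TABLE t_add_cols(a INT) USING parquet")
  spark.sql("ALTER TABLE t_add_cols ADD COLUMNS (b STRING)")
  spark.sql("DESCRIBE TABLE t_add_cols").show()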

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 4 additions & 4 deletions
@@ -87,7 +87,8 @@ case class DataSource(

   case class SourceInfo(name: String, schema: StructType, partitionColumns: Seq[String])

-  lazy val providingClass: Class[_] = DataSource.lookupDataSource(sparkSession, className)
+  lazy val providingClass: Class[_] =
+    DataSource.lookupDataSource(sparkSession.sessionState.conf, className)
   lazy val sourceInfo: SourceInfo = sourceSchema()
   private val caseInsensitiveOptions = CaseInsensitiveMap(options)
   private val equality = sparkSession.sessionState.conf.resolver

@@ -570,10 +571,9 @@ object DataSource extends Logging {
       "org.apache.spark.Logging")

   /** Given a provider name, look up the data source class definition. */
-  def lookupDataSource(sparkSession: SparkSession, provider: String): Class[_] = {
+  def lookupDataSource(conf: SQLConf, provider: String): Class[_] = {
     var provider1 = backwardCompatibilityMap.getOrElse(provider, provider)
-    if (Seq("orc", "org.apache.spark.sql.hive.orc.OrcFileFormat").contains(provider1.toLowerCase) &&
-      sparkSession.conf.get(SQLConf.ORC_ENABLED)) {
+    if (Seq("orc").contains(provider1.toLowerCase) && conf.getConf(SQLConf.ORC_USE_NEW_VERSION)) {
       logInfo(s"$provider1 is replaced with ${classOf[OrcFileFormat].getCanonicalName}")
       provider1 = classOf[OrcFileFormat].getCanonicalName
     }
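
The signature change narrows lookupDataSource's dependency from a whole SparkSession to the SQLConf it actually reads, so callers that only hold a conf (like the reworked verifyAlterTableAddColumn) can use it directly. A toy sketch of the same pattern, with made-up types rather than Spark's:

  // Illustrative only; none of these types exist in Spark.
  final case class Conf(useNewOrc: Boolean)
  final case class Session(conf: Conf /* plus catalog, runtime env, ... */)

  // Before: def lookup(session: Session, provider: String): String
  // After (as in this commit): only the conf is required.
  def lookup(conf: Conf, provider: String): String =
    if (provider.equalsIgnoreCase("orc") && conf.useNewOrc) "new OrcFileFormat"
    else provider

  lookup(Conf(useNewOrc = true), "orc")  // resolves to "new OrcFileFormat"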

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala

Lines changed: 3 additions & 2 deletions
@@ -108,8 +108,9 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi
     }

     // Check if the specified data source match the data source of the existing table.
-    val existingProvider = DataSource.lookupDataSource(sparkSession, existingTable.provider.get)
-    val specifiedProvider = DataSource.lookupDataSource(sparkSession, tableDesc.provider.get)
+    val conf = sparkSession.sessionState.conf
+    val existingProvider = DataSource.lookupDataSource(conf, existingTable.provider.get)
+    val specifiedProvider = DataSource.lookupDataSource(conf, tableDesc.provider.get)
     // TODO: Check that options from the resolved relation match the relation that we are
     // inserting into (i.e. using the same compression).
     if (existingProvider != specifiedProvider) {

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala

Lines changed: 2 additions & 2 deletions
@@ -2786,14 +2786,14 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
   }

   test("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") {
-    withSQLConf(SQLConf.ORC_ENABLED.key -> "false") {
+    withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> "false") {
       val e = intercept[AnalysisException] {
         sql("CREATE TABLE spark_20728(a INT) USING ORC")
       }
       assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
     }

-    withSQLConf(SQLConf.ORC_ENABLED.key -> "true") {
+    withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> "true") {
       withTable("spark_20728") {
         sql("CREATE TABLE spark_20728(a INT) USING ORC")
         val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst {

sql/core/src/test/scala/org/apache/spark/sql/sources/DDLSourceLoadSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {
     Seq(
       (true, "Unable to infer schema for ORC. It must be specified manually"),
       (false, "The ORC data source must be used with Hive support")).foreach { case (value, m) =>
-      withSQLConf(SQLConf.ORC_ENABLED.key -> s"$value") {
+      withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> s"$value") {
         val e = intercept[AnalysisException] {
           spark.read.format("orc").load()
         }

sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -478,7 +478,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
   }

   test("orc - API and behavior regarding schema") {
-    withSQLConf(SQLConf.ORC_ENABLED.key -> "true") {
+    withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> "true") {
       // Writer
       spark.createDataset(data).toDF("str").write.mode(SaveMode.Overwrite).orc(dir)
       val df = spark.read.orc(dir)

@@ -507,7 +507,7 @@
   }

   test("column nullability and comment - write and then read") {
-    withSQLConf(SQLConf.ORC_ENABLED.key -> "true") {
+    withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> "true") {
       Seq("json", "orc", "parquet", "csv").foreach { format =>
         val schema = StructType(
           StructField("cl1", IntegerType, nullable = false).withComment("test") ::

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala

Lines changed: 1 addition & 1 deletion
@@ -195,7 +195,7 @@ case class RelationConversions(
         .convertToLogicalRelation(relation, options, classOf[ParquetFileFormat], "parquet")
     } else {
       val options = relation.tableMeta.storage.properties
-      if (conf.getConf(SQLConf.ORC_ENABLED)) {
+      if (conf.getConf(SQLConf.ORC_USE_NEW_VERSION)) {
         sessionCatalog.metastoreCatalog.convertToLogicalRelation(
           relation,
           options,

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 1 addition & 1 deletion
@@ -2159,7 +2159,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       (true, classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat]),
       (false, classOf[org.apache.spark.sql.hive.orc.OrcFileFormat])).foreach { case (v, format) =>

-      withSQLConf(SQLConf.ORC_ENABLED.key -> s"$v") {
+      withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> s"$v") {
         withTable("spark_20728") {
           sql("CREATE TABLE spark_20728(a INT) USING ORC")
           val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst {
