
Commit 8045814

rxin authored and yhuai committed
[SPARK-14782][SPARK-14778][SQL] Remove HiveConf dependency from HiveSqlAstBuilder
## What changes were proposed in this pull request?

The patch removes the HiveConf dependency from HiveSqlAstBuilder. This is required in order to merge HiveSqlParser and SparkSqlAstBuilder, which would require getting rid of the Hive-specific dependencies in HiveSqlParser. This patch also accomplishes [SPARK-14778], "Remove HiveSessionState.substitutor".

## How was this patch tested?

This should be covered by existing tests.

Author: Reynold Xin <rxin@databricks.com>

Closes #12550 from rxin/SPARK-14782.
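The mechanical core of the patch is replacing `hiveConf.getVar(ConfVars.X)` lookups with string-keyed `SQLConf.getConfString` calls that carry an explicit default. A minimal sketch of the pattern; `MiniConf` is a hypothetical stand-in that models only the two `getConfString` overloads the patch relies on:

```scala
// Hypothetical stand-in for SQLConf, modeling just the two
// getConfString overloads used by this patch.
class MiniConf(settings: Map[String, String]) {
  // Mirrors SQLConf.getConfString(key): throws when the key is unset.
  def getConfString(key: String): String =
    settings.getOrElse(key, throw new NoSuchElementException(key))

  // Mirrors the two-argument overload: falls back to an explicit default.
  def getConfString(key: String, default: String): String =
    settings.getOrElse(key, default)
}

val conf = new MiniConf(Map.empty)

// Before: hiveConf.getVar(HiveConf.ConfVars.HIVESCRIPTSERDE)
// After: a plain string key plus the default Hive would otherwise supply.
val serde = conf.getConfString(
  "hive.script.serde", "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")
```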
1 parent 90933e2 commit 8045814


4 files changed: +27 -40 lines changed


sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 1 addition & 1 deletion
@@ -289,7 +289,7 @@ object SQLConf {
   val DEFAULT_DATA_SOURCE_NAME = SQLConfigBuilder("spark.sql.sources.default")
     .doc("The default data source to use in input/output.")
     .stringConf
-    .createWithDefault("org.apache.spark.sql.parquet")
+    .createWithDefault("parquet")
 
   // This is used to control the when we will split a schema's JSON string to multiple pieces
   // in order to fit the JSON string in metastore's table property (by default, the value has
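The shortened default works because data source resolution accepts the short alias as well as the legacy fully qualified name, which is kept for backward compatibility. A hedged usage sketch against the 2.0-era API; the in-scope `sqlContext` value and the `/tmp/events` path are assumptions:

```scala
// Both spellings should resolve to the same Parquet data source; the
// old fully qualified name survives via the backward-compatibility map.
val byAlias = sqlContext.read.format("parquet").load("/tmp/events")
val byLegacyName =
  sqlContext.read.format("org.apache.spark.sql.parquet").load("/tmp/events")

// Or lean on the (new) default entirely.
sqlContext.setConf("spark.sql.sources.default", "parquet")
val byDefault = sqlContext.read.load("/tmp/events")
```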

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

Lines changed: 6 additions & 5 deletions
@@ -24,7 +24,6 @@ import com.google.common.base.Objects
 import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache}
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.hive.common.StatsSetupConst
-import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.metastore.{TableType => HiveTableType}
 import org.apache.hadoop.hive.metastore.api.FieldSchema
 import org.apache.hadoop.hive.ql.metadata.{Table => HiveTable, _}
@@ -46,6 +45,7 @@ import org.apache.spark.sql.execution.datasources.parquet.{DefaultSource => Parq
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.hive.execution.HiveNativeCommand
 import org.apache.spark.sql.hive.orc.{DefaultSource => OrcDefaultSource}
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 
 private[hive] case class HiveSerDe(
@@ -59,10 +59,10 @@ private[hive] object HiveSerDe {
    *
    * @param source Currently the source abbreviation can be one of the following:
    *               SequenceFile, RCFile, ORC, PARQUET, and case insensitive.
-   * @param hiveConf Hive Conf
+   * @param conf SQLConf
    * @return HiveSerDe associated with the specified source
    */
-  def sourceToSerDe(source: String, hiveConf: HiveConf): Option[HiveSerDe] = {
+  def sourceToSerDe(source: String, conf: SQLConf): Option[HiveSerDe] = {
     val serdeMap = Map(
       "sequencefile" ->
         HiveSerDe(
@@ -73,7 +73,8 @@ private[hive] object HiveSerDe {
         HiveSerDe(
           inputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileInputFormat"),
           outputFormat = Option("org.apache.hadoop.hive.ql.io.RCFileOutputFormat"),
-          serde = Option(hiveConf.getVar(HiveConf.ConfVars.HIVEDEFAULTRCFILESERDE))),
+          serde = Option(conf.getConfString("hive.default.rcfile.serde",
+            "org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe"))),
 
       "orc" ->
         HiveSerDe(
@@ -297,7 +298,7 @@ private[hive] class HiveMetastoreCatalog(hive: SQLContext) extends Logging {
         CatalogTableType.MANAGED_TABLE
       }
 
-      val maybeSerDe = HiveSerDe.sourceToSerDe(provider, hiveconf)
+      val maybeSerDe = HiveSerDe.sourceToSerDe(provider, conf)
       val dataSource =
         DataSource(
           hive,
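With the new signature, a serde is resolved from a plain `SQLConf`, and an unset Hive key falls back to the hard-coded default visible in the hunk above. A sketch of the new call shape; since `HiveSerDe` and `SQLConf` are private to Spark's own packages, this would only compile inside the sql/hive module:

```scala
import org.apache.spark.sql.hive.HiveSerDe
import org.apache.spark.sql.internal.SQLConf

val conf = new SQLConf
// "rcfile" now reads hive.default.rcfile.serde via getConfString and
// falls back to LazyBinaryColumnarSerDe when the key is unset.
val rcfile = HiveSerDe.sourceToSerDe("rcfile", conf)
rcfile.flatMap(_.serde).foreach(println)

// Unrecognized abbreviations still yield None rather than throwing.
assert(HiveSerDe.sourceToSerDe("csv", conf).isEmpty)
```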

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionState.scala

Lines changed: 1 addition & 6 deletions
@@ -53,11 +53,6 @@ private[hive] class HiveSessionState(ctx: SQLContext) extends SessionState(ctx)
    */
   lazy val metadataHive: HiveClient = sharedState.metadataHive.newSession()
 
-  /**
-   * A Hive helper class for substituting variables in a SQL statement.
-   */
-  lazy val substitutor = new VariableSubstitution
-
   override lazy val conf: SQLConf = new SQLConf {
     override def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE, false)
   }
@@ -114,7 +109,7 @@ private[hive] class HiveSessionState(ctx: SQLContext) extends SessionState(ctx)
   /**
    * Parser for HiveQl query texts.
    */
-  override lazy val sqlParser: ParserInterface = new HiveSqlParser(substitutor, hiveconf)
+  override lazy val sqlParser: ParserInterface = new HiveSqlParser(conf, hiveconf)
 
   /**
    * Planner that takes into account Hive-specific strategies.
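With `substitutor` gone from the session state, constructing the parser needs only the two confs; the parser now owns its `VariableSubstitution` (see the next file). A sketch assuming Hive's `${hiveconf:...}` substitution works outside a full Hive session; the config key and table name are hypothetical:

```scala
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.spark.sql.hive.execution.HiveSqlParser
import org.apache.spark.sql.internal.SQLConf

val hiveconf = new HiveConf()
hiveconf.set("mydb.table", "logs") // hypothetical key and value

val parser = new HiveSqlParser(new SQLConf, hiveconf)

// The parser's own substitutor rewrites ${hiveconf:mydb.table} to "logs"
// before the text ever reaches ANTLR; the result is an unresolved plan.
val plan = parser.parsePlan("SELECT * FROM ${hiveconf:mydb.table}")
```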

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveSqlParser.scala

Lines changed: 19 additions & 28 deletions
@@ -17,13 +17,12 @@
 package org.apache.spark.sql.hive.execution
 
 import scala.collection.JavaConverters._
+import scala.util.Try
 
 import org.antlr.v4.runtime.{ParserRuleContext, Token}
 import org.apache.hadoop.hive.conf.HiveConf
-import org.apache.hadoop.hive.conf.HiveConf.ConfVars
-import org.apache.hadoop.hive.ql.parse.{EximUtil, VariableSubstitution}
+import org.apache.hadoop.hive.ql.parse.VariableSubstitution
 import org.apache.hadoop.hive.serde.serdeConstants
-import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
 
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.parser._
@@ -32,18 +31,16 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.SparkSqlAstBuilder
 import org.apache.spark.sql.execution.command.{CreateTable, CreateTableLike}
 import org.apache.spark.sql.hive.{CreateTableAsSelect => CTAS, CreateViewAsSelect => CreateView, HiveSerDe}
-import org.apache.spark.sql.hive.{HiveGenericUDTF, HiveMetastoreTypes, HiveSerDe}
-import org.apache.spark.sql.hive.HiveShim.HiveFunctionWrapper
+import org.apache.spark.sql.internal.SQLConf
 
 /**
  * Concrete parser for HiveQl statements.
  */
-class HiveSqlParser(
-    substitutor: VariableSubstitution,
-    hiveconf: HiveConf)
-  extends AbstractSqlParser {
+class HiveSqlParser(conf: SQLConf, hiveconf: HiveConf) extends AbstractSqlParser {
 
-  val astBuilder = new HiveSqlAstBuilder(hiveconf)
+  val astBuilder = new HiveSqlAstBuilder(conf)
+
+  lazy val substitutor = new VariableSubstitution
 
   protected override def parse[T](command: String)(toResult: SqlBaseParser => T): T = {
     super.parse(substitutor.substitute(hiveconf, command))(toResult)
@@ -57,7 +54,7 @@ class HiveSqlParser(
 /**
  * Builder that converts an ANTLR ParseTree into a LogicalPlan/Expression/TableIdentifier.
  */
-class HiveSqlAstBuilder(hiveConf: HiveConf) extends SparkSqlAstBuilder {
+class HiveSqlAstBuilder(conf: SQLConf) extends SparkSqlAstBuilder {
   import ParserUtils._
 
   /**
@@ -184,8 +181,8 @@ class HiveSqlAstBuilder(hiveConf: HiveConf) extends SparkSqlAstBuilder {
 
     // Storage format
     val defaultStorage: CatalogStorageFormat = {
-      val defaultStorageType = hiveConf.getVar(HiveConf.ConfVars.HIVEDEFAULTFILEFORMAT)
-      val defaultHiveSerde = HiveSerDe.sourceToSerDe(defaultStorageType, hiveConf)
+      val defaultStorageType = conf.getConfString("hive.default.fileformat", "textfile")
+      val defaultHiveSerde = HiveSerDe.sourceToSerDe(defaultStorageType, conf)
       CatalogStorageFormat(
         locationUri = None,
         inputFormat = defaultHiveSerde.flatMap(_.inputFormat)
@@ -323,7 +320,7 @@ class HiveSqlAstBuilder(hiveConf: HiveConf) extends SparkSqlAstBuilder {
 
     // Decode and input/output format.
     type Format = (Seq[(String, String)], Option[String], Seq[(String, String)], Option[String])
-    def format(fmt: RowFormatContext, confVar: ConfVars): Format = fmt match {
+    def format(fmt: RowFormatContext, configKey: String): Format = fmt match {
       case c: RowFormatDelimitedContext =>
         // TODO we should use the visitRowFormatDelimited function here. However HiveScriptIOSchema
         // expects a seq of pairs in which the old parsers' token names are used as keys.
@@ -345,26 +342,27 @@ class HiveSqlAstBuilder(hiveConf: HiveConf) extends SparkSqlAstBuilder {
         val CatalogStorageFormat(None, None, None, Some(name), props) = visitRowFormatSerde(c)
 
         // SPARK-10310: Special cases LazySimpleSerDe
-        val recordHandler = if (name == classOf[LazySimpleSerDe].getCanonicalName) {
-          Option(hiveConf.getVar(confVar))
+        val recordHandler = if (name == "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe") {
+          Try(conf.getConfString(configKey)).toOption
         } else {
           None
         }
         (Seq.empty, Option(name), props.toSeq, recordHandler)
 
       case null =>
         // Use default (serde) format.
-        val name = hiveConf.getVar(ConfVars.HIVESCRIPTSERDE)
+        val name = conf.getConfString("hive.script.serde",
+          "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")
         val props = Seq(serdeConstants.FIELD_DELIM -> "\t")
-        val recordHandler = Option(hiveConf.getVar(confVar))
+        val recordHandler = Try(conf.getConfString(configKey)).toOption
         (Nil, Option(name), props, recordHandler)
     }
 
     val (inFormat, inSerdeClass, inSerdeProps, reader) =
-      format(inRowFormat, ConfVars.HIVESCRIPTRECORDREADER)
+      format(inRowFormat, "hive.script.recordreader")
 
     val (outFormat, outSerdeClass, outSerdeProps, writer) =
-      format(inRowFormat, ConfVars.HIVESCRIPTRECORDWRITER)
+      format(outRowFormat, "hive.script.recordwriter")
 
     HiveScriptIOSchema(
       inFormat, outFormat,
@@ -374,13 +372,6 @@ class HiveSqlAstBuilder(hiveConf: HiveConf) extends SparkSqlAstBuilder {
       schemaLess)
   }
 
-  /**
-   * Create location string.
-   */
-  override def visitLocationSpec(ctx: LocationSpecContext): String = {
-    EximUtil.relativeToAbsolutePath(hiveConf, super.visitLocationSpec(ctx))
-  }
-
   /** Empty storage format for default values and copies. */
   private val EmptyStorageFormat = CatalogStorageFormat(None, None, None, None, Map.empty)
 
@@ -402,7 +393,7 @@ class HiveSqlAstBuilder(hiveConf: HiveConf) extends SparkSqlAstBuilder {
   override def visitGenericFileFormat(
       ctx: GenericFileFormatContext): CatalogStorageFormat = withOrigin(ctx) {
     val source = ctx.identifier.getText
-    HiveSerDe.sourceToSerDe(source, hiveConf) match {
+    HiveSerDe.sourceToSerDe(source, conf) match {
       case Some(s) =>
         EmptyStorageFormat.copy(
           inputFormat = s.inputFormat,
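Two details in this file deserve a note. First, the script-transform hunk also fixes a latent bug: the writer's format is now derived from `outRowFormat` where it previously reused `inRowFormat`. Second, `Option(hiveConf.getVar(...))` becomes `Try(conf.getConfString(key)).toOption`, so a key absent from the session conf surfaces as `None` rather than a compiled-in Hive default. A minimal standalone sketch of that idiom; the lookup function below is a stand-in for `SQLConf.getConfString`:

```scala
import scala.util.Try

// Stand-in for SQLConf.getConfString(key): the single-argument form
// throws when the key is unset.
def getConfString(key: String): String =
  sys.error(s"key not found: $key") // pretend every key is unset

// The patch's idiom: convert the throwing lookup into an Option.
val recordReader: Option[String] =
  Try(getConfString("hive.script.recordreader")).toOption

assert(recordReader.isEmpty) // unset key => None, not a baked-in default
```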
