Commit a9269b5

[WIP] Use SPARK_HIVE to determine if we include Hive in packaging

Previously, we decided whether to include the datanucleus jars based on the existence of a spark-hive-assembly jar, which was incidentally built whenever "sbt assembly" was run. This meant that a typical, previously supported build pathway would unintentionally start using Hive jars. This patch has the following features/bug fixes:
- Use of SPARK_HIVE (default false) to determine whether to include Hive in the assembly jar.
- An analogous -Phive profile in Maven.
- assemble-deps fixed, since we no longer use a separate ASSEMBLY_DIR.

Still TODO before this is mergeable:
- We need to download the datanucleus jars outside of sbt. Perhaps spark-class could download them when SPARK_HIVE is set, similar to how sbt downloads itself.
- Spark SQL documentation updates.
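A quick sketch of how the two new switches are meant to be driven (the sbt/sbt launcher path at the repo root is an assumption about the layout; SPARK_HIVE and the -Phive profile come straight from the diffs below):

    # sbt: opt in via the environment variable (unset defaults to false)
    SPARK_HIVE=true sbt/sbt assembly

    # Maven: opt in via the new hive profile
    mvn -Phive -DskipTests package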
1 parent 6e88583 commit a9269b5

File tree

7 files changed: +38 −19 lines changed

assembly/pom.xml

Lines changed: 10 additions & 0 deletions
@@ -163,6 +163,16 @@
       </dependency>
     </dependencies>
   </profile>
+  <profile>
+    <id>hive</id>
+    <dependencies>
+      <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-hive_${scala.binary.version}</artifactId>
+        <version>${project.version}</version>
+      </dependency>
+    </dependencies>
+  </profile>
   <profile>
     <id>spark-ganglia-lgpl</id>
     <dependencies>
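A hypothetical spot check that the profile took effect (the jar path follows the ASSEMBLY_DIR convention in bin/compute-classpath.sh below):

    mvn -Phive -DskipTests package
    unzip -l assembly/target/scala-*/spark-assembly-*.jar | grep -i hive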

bin/compute-classpath.sh

Lines changed: 5 additions & 6 deletions
@@ -35,17 +35,16 @@ CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
 # include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
 # Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
 # the future.
-if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
+if [ "$SPARK_HIVE" = "true" ]; then
+  echo 1>&2 "SPARK_HIVE is set, including Hive support."

   # Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
   DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
   CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
-
-  ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
-else
-  ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
 fi

+ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
+
 # First check if we have a dependencies jar. If so, include binary classes with the deps jar
 if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
@@ -59,7 +58,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"

-  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
+  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
   CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory
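The DATANUCLEUSJARS line above uses a compact bash idiom: glob the jars into an array, then join the array on ':' by setting IFS inside the command substitution. A minimal standalone sketch (directory name illustrative):

    JARS=(/some/lib/datanucleus-*.jar)    # glob the jar paths into a bash array
    JOINED=$(IFS=:; echo "${JARS[*]}")    # "${JARS[*]}" joins on the first char of IFS
    echo "$JOINED"                        # prints a colon-separated classpath fragment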

bin/spark-class

Lines changed: 0 additions & 2 deletions
@@ -154,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
 fi

 exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-
-
dev/create-release/create-release.sh

Lines changed: 2 additions & 2 deletions
@@ -49,14 +49,14 @@ mvn -DskipTests \
   -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \
   -Dusername=$GIT_USERNAME -Dpassword=$GIT_PASSWORD \
   -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
-  -Pyarn -Pspark-ganglia-lgpl \
+  -Pyarn -Phive -Pspark-ganglia-lgpl\
   -Dtag=$GIT_TAG -DautoVersionSubmodules=true \
   --batch-mode release:prepare

 mvn -DskipTests \
   -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \
   -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
-  -Pyarn -Pspark-ganglia-lgpl\
+  -Pyarn -Phive -Pspark-ganglia-lgpl\
   release:perform

 rm -rf spark

pom.xml

Lines changed: 0 additions & 1 deletion
@@ -373,7 +373,6 @@
       <groupId>org.apache.derby</groupId>
       <artifactId>derby</artifactId>
       <version>10.4.2.0</version>
-      <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>net.liftweb</groupId>

project/SparkBuild.scala

Lines changed: 15 additions & 8 deletions
@@ -43,6 +43,8 @@ object SparkBuild extends Build {

   val DEFAULT_YARN = false

+  val DEFAULT_HIVE = false
+
   // HBase version; set as appropriate.
   val HBASE_VERSION = "0.94.6"

@@ -67,15 +69,17 @@ object SparkBuild extends Build {

   lazy val sql = Project("sql", file("sql/core"), settings = sqlCoreSettings) dependsOn(core, catalyst)

-  // Since hive is its own assembly, it depends on all of the modules.
-  lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql, graphx, bagel, mllib, streaming, repl)
+  lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql)
+
+  lazy val maybeHive: Seq[ClasspathDependency] = if (isHiveEnabled) Seq(hive) else Seq()
+  lazy val maybeHiveRef: Seq[ProjectReference] = if (isHiveEnabled) Seq(hive) else Seq()

   lazy val streaming = Project("streaming", file("streaming"), settings = streamingSettings) dependsOn(core)

   lazy val mllib = Project("mllib", file("mllib"), settings = mllibSettings) dependsOn(core)

   lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings)
-    .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeGanglia: _*)
+    .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*)

   lazy val assembleDeps = TaskKey[Unit]("assemble-deps", "Build assembly of dependencies and packages Spark projects")

@@ -101,6 +105,11 @@ object SparkBuild extends Build {
   lazy val hadoopClient = if (hadoopVersion.startsWith("0.20.") || hadoopVersion == "1.0.0") "hadoop-core" else "hadoop-client"
   val maybeAvro = if (hadoopVersion.startsWith("0.23.") && isYarnEnabled) Seq("org.apache.avro" % "avro" % "1.7.4") else Seq()

+  lazy val isHiveEnabled = Properties.envOrNone("SPARK_HIVE") match {
+    case None => DEFAULT_HIVE
+    case Some(v) => v.toBoolean
+  }
+
   // Include Ganglia integration if the user has enabled Ganglia
   // This is isolated from the normal build due to LGPL-licensed code in the library
   lazy val isGangliaEnabled = Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined

@@ -141,13 +150,13 @@ object SparkBuild extends Build {
   lazy val allExternalRefs = Seq[ProjectReference](externalTwitter, externalKafka, externalFlume, externalZeromq, externalMqtt)

   lazy val examples = Project("examples", file("examples"), settings = examplesSettings)
-    .dependsOn(core, mllib, graphx, bagel, streaming, externalTwitter, hive) dependsOn(allExternal: _*)
+    .dependsOn(core, mllib, graphx, bagel, streaming, hive) dependsOn(allExternal: _*)

   // Everything except assembly, hive, tools, java8Tests and examples belong to packageProjects
-  lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeGangliaRef
+  lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeHiveRef ++ maybeGangliaRef

   lazy val allProjects = packageProjects ++ allExternalRefs ++
-    Seq[ProjectReference](examples, tools, assemblyProj, hive) ++ maybeJava8Tests
+    Seq[ProjectReference](examples, tools, assemblyProj) ++ maybeJava8Tests

   def sharedSettings = Defaults.defaultSettings ++ MimaBuild.mimaSettings(file(sparkHome)) ++ Seq(
     organization := "org.apache.spark",

@@ -412,8 +421,6 @@ object SparkBuild extends Build {
   // assembly jar.
   def hiveSettings = sharedSettings ++ assemblyProjSettings ++ Seq(
     name := "spark-hive",
-    jarName in assembly <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" },
-    jarName in packageDependency <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" },
     javaOptions += "-XX:MaxPermSize=1g",
     libraryDependencies ++= Seq(
       "org.apache.hive" % "hive-metastore" % "0.12.0",

sql/hive/pom.xml

Lines changed: 6 additions & 0 deletions
@@ -63,6 +63,12 @@
       <artifactId>hive-exec</artifactId>
       <version>${hive.version}</version>
     </dependency>
+    <dependency>
+      <!-- Matches the version of jackson-core-asl pulled in by avro -->
+      <groupId>org.codehaus.jackson</groupId>
+      <artifactId>jackson-mapper-asl</artifactId>
+      <version>1.8.8</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.hive</groupId>
       <artifactId>hive-serde</artifactId>
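A hypothetical check that the pin keeps the two jackson artifacts aligned, using the standard Maven dependency plugin:

    mvn -pl sql/hive dependency:tree -Dincludes=org.codehaus.jackson
    # jackson-core-asl and jackson-mapper-asl should both resolve to 1.8.8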
