Commit a9269b5

[WIP] Use SPARK_HIVE to determine if we include Hive in packaging

Previously, we decided whether to include the datanucleus jars based on the existence of a spark-hive-assembly jar, which was incidentally built whenever "sbt assembly" was run. This meant that a typical, previously supported build pathway would unintentionally start using Hive jars. This patch has the following features/bug fixes:
- Use of SPARK_HIVE (default false) to determine whether to include Hive in the assembly jar.
- An analogous -Phive profile in Maven.
- assemble-deps fixed, since we no longer use a separate ASSEMBLY_DIR.

Still TODO before this is mergeable:
- We need to download the datanucleus jars outside of sbt. Perhaps spark-class could download them when SPARK_HIVE is set, similar to how sbt downloads itself.
- Spark SQL documentation updates.
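A quick sketch of how the two new switches are meant to be driven (the sbt/sbt launcher path at the repo root is an assumption about the layout; SPARK_HIVE and the -Phive profile come straight from the diffs below):

    # sbt: opt in via the environment variable (unset defaults to false)
    SPARK_HIVE=true sbt/sbt assembly

    # Maven: opt in via the new hive profile
    mvn -Phive -DskipTests package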
1 parent 6e88583 commit a9269b5

File tree

7 files changed: +38 −19 lines changed

assembly/pom.xml

Lines changed: 10 additions & 0 deletions
@@ -163,6 +163,16 @@
       </dependency>
     </dependencies>
   </profile>
+  <profile>
+    <id>hive</id>
+    <dependencies>
+      <dependency>
+        <groupId>org.apache.spark</groupId>
+        <artifactId>spark-hive_${scala.binary.version}</artifactId>
+        <version>${project.version}</version>
+      </dependency>
+    </dependencies>
+  </profile>
   <profile>
     <id>spark-ganglia-lgpl</id>
     <dependencies>
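A hypothetical spot check that the profile took effect (the jar path follows the ASSEMBLY_DIR convention in bin/compute-classpath.sh below):

    mvn -Phive -DskipTests package
    unzip -l assembly/target/scala-*/spark-assembly-*.jar | grep -i hive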

bin/compute-classpath.sh

Lines changed: 5 additions & 6 deletions
@@ -35,17 +35,16 @@ CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
 # include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
 # Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
 # the future.
-if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
+if [ "$SPARK_HIVE" = "true" ]; then
+  echo 1>&2 "SPARK_HIVE is set, including Hive support."

   # Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
   DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
   CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
-
-  ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
-else
-  ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
 fi

+ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
+
 # First check if we have a dependencies jar. If so, include binary classes with the deps jar
 if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
@@ -59,7 +58,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"

-  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
+  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
   CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory
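The DATANUCLEUSJARS line above uses a compact bash idiom: glob the jars into an array, then join the array on ':' by setting IFS inside the command substitution. A minimal standalone sketch (directory name illustrative):

    JARS=(/some/lib/datanucleus-*.jar)    # glob the jar paths into a bash array
    JOINED=$(IFS=:; echo "${JARS[*]}")    # "${JARS[*]}" joins on the first char of IFS
    echo "$JOINED"                        # prints a colon-separated classpath fragment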

bin/spark-class

Lines changed: 0 additions & 2 deletions
@@ -154,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
 fi

 exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-
-
dev/create-release/create-release.sh

Lines changed: 2 additions & 2 deletions
@@ -49,14 +49,14 @@ mvn -DskipTests \
   -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \
   -Dusername=$GIT_USERNAME -Dpassword=$GIT_PASSWORD \
   -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
-  -Pyarn -Pspark-ganglia-lgpl \
+  -Pyarn -Phive -Pspark-ganglia-lgpl\
   -Dtag=$GIT_TAG -DautoVersionSubmodules=true \
   --batch-mode release:prepare

 mvn -DskipTests \
   -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \
   -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
-  -Pyarn -Pspark-ganglia-lgpl\
+  -Pyarn -Phive -Pspark-ganglia-lgpl\
   release:perform

 rm -rf spark

pom.xml

Lines changed: 0 additions & 1 deletion
@@ -373,7 +373,6 @@
       <groupId>org.apache.derby</groupId>
       <artifactId>derby</artifactId>
       <version>10.4.2.0</version>
-      <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>net.liftweb</groupId>

project/SparkBuild.scala

Lines changed: 15 additions & 8 deletions
@@ -43,6 +43,8 @@ object SparkBuild extends Build {

   val DEFAULT_YARN = false

+  val DEFAULT_HIVE = false
+
   // HBase version; set as appropriate.
   val HBASE_VERSION = "0.94.6"

@@ -67,15 +69,17 @@ object SparkBuild extends Build {

   lazy val sql = Project("sql", file("sql/core"), settings = sqlCoreSettings) dependsOn(core, catalyst)

-  // Since hive is its own assembly, it depends on all of the modules.
-  lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql, graphx, bagel, mllib, streaming, repl)
+  lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql)
+
+  lazy val maybeHive: Seq[ClasspathDependency] = if (isHiveEnabled) Seq(hive) else Seq()
+  lazy val maybeHiveRef: Seq[ProjectReference] = if (isHiveEnabled) Seq(hive) else Seq()

   lazy val streaming = Project("streaming", file("streaming"), settings = streamingSettings) dependsOn(core)

   lazy val mllib = Project("mllib", file("mllib"), settings = mllibSettings) dependsOn(core)

   lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings)
-    .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeGanglia: _*)
+    .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*)

   lazy val assembleDeps = TaskKey[Unit]("assemble-deps", "Build assembly of dependencies and packages Spark projects")

@@ -101,6 +105,11 @@ object SparkBuild extends Build {
   lazy val hadoopClient = if (hadoopVersion.startsWith("0.20.") || hadoopVersion == "1.0.0") "hadoop-core" else "hadoop-client"
   val maybeAvro = if (hadoopVersion.startsWith("0.23.") && isYarnEnabled) Seq("org.apache.avro" % "avro" % "1.7.4") else Seq()

+  lazy val isHiveEnabled = Properties.envOrNone("SPARK_HIVE") match {
+    case None => DEFAULT_HIVE
+    case Some(v) => v.toBoolean
+  }
+
   // Include Ganglia integration if the user has enabled Ganglia
   // This is isolated from the normal build due to LGPL-licensed code in the library
   lazy val isGangliaEnabled = Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined

@@ -141,13 +150,13 @@ object SparkBuild extends Build {
   lazy val allExternalRefs = Seq[ProjectReference](externalTwitter, externalKafka, externalFlume, externalZeromq, externalMqtt)

   lazy val examples = Project("examples", file("examples"), settings = examplesSettings)
-    .dependsOn(core, mllib, graphx, bagel, streaming, externalTwitter, hive) dependsOn(allExternal: _*)
+    .dependsOn(core, mllib, graphx, bagel, streaming, hive) dependsOn(allExternal: _*)

   // Everything except assembly, hive, tools, java8Tests and examples belong to packageProjects
-  lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeGangliaRef
+  lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeHiveRef ++ maybeGangliaRef

   lazy val allProjects = packageProjects ++ allExternalRefs ++
-    Seq[ProjectReference](examples, tools, assemblyProj, hive) ++ maybeJava8Tests
+    Seq[ProjectReference](examples, tools, assemblyProj) ++ maybeJava8Tests

   def sharedSettings = Defaults.defaultSettings ++ MimaBuild.mimaSettings(file(sparkHome)) ++ Seq(
     organization := "org.apache.spark",

@@ -412,8 +421,6 @@ object SparkBuild extends Build {
   // assembly jar.
   def hiveSettings = sharedSettings ++ assemblyProjSettings ++ Seq(
     name := "spark-hive",
-    jarName in assembly <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" },
-    jarName in packageDependency <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" },
     javaOptions += "-XX:MaxPermSize=1g",
     libraryDependencies ++= Seq(
       "org.apache.hive" % "hive-metastore" % "0.12.0",

sql/hive/pom.xml

Lines changed: 6 additions & 0 deletions
@@ -63,6 +63,12 @@
       <artifactId>hive-exec</artifactId>
       <version>${hive.version}</version>
     </dependency>
+    <dependency>
+      <!-- Matches the version of jackson-core-asl pulled in by avro -->
+      <groupId>org.codehaus.jackson</groupId>
+      <artifactId>jackson-mapper-asl</artifactId>
+      <version>1.8.8</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.hive</groupId>
       <artifactId>hive-serde</artifactId>
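A hypothetical check that the pin keeps the two jackson artifacts aligned, using the standard Maven dependency plugin:

    mvn -pl sql/hive dependency:tree -Dincludes=org.codehaus.jackson
    # jackson-core-asl and jackson-mapper-asl should both resolve to 1.8.8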
