Commit 968ca9d

merged master

2 parents: 7fc9545 + 5200872

333 files changed (+5575 / -1513 lines)


.gitignore

+6
@@ -49,3 +49,9 @@ unit-tests.log
 /lib/
 rat-results.txt
 scalastyle.txt
+
+# For Hive
+metastore_db/
+metastore/
+warehouse/
+TempStatsStore/

assembly/pom.xml

-13
@@ -40,14 +40,6 @@
     <deb.user>root</deb.user>
   </properties>
 
-  <repositories>
-    <!-- A repository in the local filesystem for the Py4J JAR, which is not in Maven central -->
-    <repository>
-      <id>lib</id>
-      <url>file://${project.basedir}/lib</url>
-    </repository>
-  </repositories>
-
   <dependencies>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -84,11 +76,6 @@
       <artifactId>spark-sql_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
-    <dependency>
-      <groupId>net.sf.py4j</groupId>
-      <artifactId>py4j</artifactId>
-      <version>0.8.1</version>
-    </dependency>
   </dependencies>
 
   <build>

bin/compute-classpath.sh

+38-15
@@ -32,6 +32,12 @@ CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:$FWDIR/conf"
 
 ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
 
+if [ -n "$JAVA_HOME" ]; then
+  JAR_CMD="$JAVA_HOME/bin/jar"
+else
+  JAR_CMD="jar"
+fi
+
 # First check if we have a dependencies jar. If so, include binary classes with the deps jar
 if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
@@ -44,33 +50,50 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
+  CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"
 
-  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
-  CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
+  ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory
   if [ -f "$FWDIR/RELEASE" ]; then
-    ASSEMBLY_JAR=`ls "$FWDIR"/lib/spark-assembly*hadoop*.jar`
+    ASSEMBLY_JAR=$(ls "$FWDIR"/lib/spark-assembly*hadoop*.jar 2>/dev/null)
   else
-    ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar`
+    ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar 2>/dev/null)
   fi
-  CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
 fi
 
+# Verify that versions of java used to build the jars and run Spark are compatible
+jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
+if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
+  echo "Loading Spark jar with '$JAR_CMD' failed. "
+  echo "This is likely because Spark was compiled with Java 7 and run "
+  echo "with Java 6. (see SPARK-1703). Please use Java 7 to run Spark "
+  echo "or build Spark with Java 6."
+  exit 1
+fi
+
+CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
+
 # When Hive support is needed, Datanucleus jars must be included on the classpath.
-# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
+# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
 # Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
 # built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
 # assembly is built for Hive, before actually populating the CLASSPATH with the jars.
 # Note that this check order is faster (by up to half a second) in the case where Hive is not used.
-num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ 2>/dev/null | grep "datanucleus-.*\\.jar" | wc -l)
-if [ $num_datanucleus_jars -gt 0 ]; then
-  AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
-  num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
-  if [ $num_hive_files -gt 0 ]; then
+if [ -f "$FWDIR/RELEASE" ]; then
+  datanucleus_dir="$FWDIR"/lib
+else
+  datanucleus_dir="$FWDIR"/lib_managed/jars
+fi
+
+datanucleus_jars=$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar")
+datanucleus_jars=$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)
+
+if [ -n "$datanucleus_jars" ]; then
+  hive_files=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null)
+  if [ -n "$hive_files" ]; then
     echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
-    DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
-    CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
+    CLASSPATH="$CLASSPATH:$datanucleus_jars"
   fi
 fi
 
@@ -90,10 +113,10 @@ fi
 # Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail !
 # Note, this assumes that there is either a HADOOP_CONF_DIR or YARN_CONF_DIR which hosts
 # the configurtion files.
-if [ "x" != "x$HADOOP_CONF_DIR" ]; then
+if [ -n "$HADOOP_CONF_DIR" ]; then
   CLASSPATH="$CLASSPATH:$HADOOP_CONF_DIR"
 fi
-if [ "x" != "x$YARN_CONF_DIR" ]; then
+if [ -n "$YARN_CONF_DIR" ]; then
   CLASSPATH="$CLASSPATH:$YARN_CONF_DIR"
 fi

bin/pyspark

+1
@@ -46,6 +46,7 @@ export PYSPARK_PYTHON
 
 # Add the PySpark classes to the Python path:
 export PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH
+export PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.8.1-src.zip:$PYTHONPATH
 
 # Load the PySpark shell.py script when ./pyspark is used interactively:
 export OLD_PYTHONSTARTUP=$PYTHONSTARTUP

bin/pyspark2.cmd

+1
@@ -45,6 +45,7 @@ rem Figure out which Python to use.
 if "x%PYSPARK_PYTHON%"=="x" set PYSPARK_PYTHON=python
 
 set PYTHONPATH=%FWDIR%python;%PYTHONPATH%
+set PYTHONPATH=%FWDIR%python\lib\py4j-0.8.1-src.zip;%PYTHONPATH%
 
 set OLD_PYTHONSTARTUP=%PYTHONSTARTUP%
 set PYTHONSTARTUP=%FWDIR%python\pyspark\shell.py

bin/spark-class

+8-1
@@ -138,7 +138,14 @@ if [ -e "$TOOLS_DIR"/target/spark-tools*[0-9Tg].jar ]; then
 fi
 
 # Compute classpath using external script
-CLASSPATH=`$FWDIR/bin/compute-classpath.sh`
+classpath_output=$($FWDIR/bin/compute-classpath.sh)
+if [[ "$?" != "0" ]]; then
+  echo "$classpath_output"
+  exit 1
+else
+  CLASSPATH=$classpath_output
+fi
+
 if [[ "$1" =~ org.apache.spark.tools.* ]]; then
   CLASSPATH="$CLASSPATH:$SPARK_TOOLS_JAR"
 fi

bin/spark-submit

+2-2
@@ -18,7 +18,7 @@
 #
 
 export SPARK_HOME="$(cd `dirname $0`/..; pwd)"
-ORIG_ARGS=$@
+ORIG_ARGS=("$@")
 
 while (($#)); do
   if [ "$1" = "--deploy-mode" ]; then
@@ -39,5 +39,5 @@ if [ ! -z $DRIVER_MEMORY ] && [ ! -z $DEPLOY_MODE ] && [ $DEPLOY_MODE = "client"
   export SPARK_MEM=$DRIVER_MEMORY
 fi
 
-$SPARK_HOME/bin/spark-class org.apache.spark.deploy.SparkSubmit $ORIG_ARGS
+$SPARK_HOME/bin/spark-class org.apache.spark.deploy.SparkSubmit "${ORIG_ARGS[@]}"
core/pom.xml

+51-6
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,6 @@
3838
<dependency>
3939
<groupId>net.java.dev.jets3t</groupId>
4040
<artifactId>jets3t</artifactId>
41-
<exclusions>
42-
<exclusion>
43-
<groupId>commons-logging</groupId>
44-
<artifactId>commons-logging</artifactId>
45-
</exclusion>
46-
</exclusions>
4741
</dependency>
4842
<dependency>
4943
<groupId>org.apache.curator</groupId>
@@ -69,6 +63,10 @@
6963
<groupId>com.google.guava</groupId>
7064
<artifactId>guava</artifactId>
7165
</dependency>
66+
<dependency>
67+
<groupId>org.apache.commons</groupId>
68+
<artifactId>commons-lang3</artifactId>
69+
</dependency>
7270
<dependency>
7371
<groupId>com.google.code.findbugs</groupId>
7472
<artifactId>jsr305</artifactId>
@@ -249,6 +247,11 @@
249247
<artifactId>pyrolite</artifactId>
250248
<version>2.0.1</version>
251249
</dependency>
250+
<dependency>
251+
<groupId>net.sf.py4j</groupId>
252+
<artifactId>py4j</artifactId>
253+
<version>0.8.1</version>
254+
</dependency>
252255
</dependencies>
253256
<build>
254257
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
@@ -294,6 +297,48 @@
294297
</environmentVariables>
295298
</configuration>
296299
</plugin>
300+
<!-- Unzip py4j so we can include its files in the jar -->
301+
<plugin>
302+
<groupId>org.codehaus.mojo</groupId>
303+
<artifactId>exec-maven-plugin</artifactId>
304+
<version>1.2.1</version>
305+
<executions>
306+
<execution>
307+
<phase>generate-resources</phase>
308+
<goals>
309+
<goal>exec</goal>
310+
</goals>
311+
</execution>
312+
</executions>
313+
<configuration>
314+
<executable>unzip</executable>
315+
<workingDirectory>../python</workingDirectory>
316+
<arguments>
317+
<argument>-o</argument>
318+
<argument>lib/py4j*.zip</argument>
319+
<argument>-d</argument>
320+
<argument>build</argument>
321+
</arguments>
322+
</configuration>
323+
</plugin>
297324
</plugins>
325+
326+
<resources>
327+
<resource>
328+
<directory>src/main/resources</directory>
329+
</resource>
330+
<resource>
331+
<directory>../python</directory>
332+
<includes>
333+
<include>pyspark/*.py</include>
334+
</includes>
335+
</resource>
336+
<resource>
337+
<directory>../python/build</directory>
338+
<includes>
339+
<include>py4j/*.py</include>
340+
</includes>
341+
</resource>
342+
</resources>
298343
</build>
299344
</project>

core/src/main/scala/org/apache/spark/SecurityManager.scala

+3-2
@@ -146,8 +146,9 @@ private[spark] class SecurityManager(sparkConf: SparkConf) extends Logging {
   setViewAcls(defaultAclUsers, sparkConf.get("spark.ui.view.acls", ""))
 
   private val secretKey = generateSecretKey()
-  logInfo("SecurityManager, is authentication enabled: " + authOn +
-    " are ui acls enabled: " + uiAclsOn + " users with view permissions: " + viewAcls.toString())
+  logInfo("SecurityManager: authentication " + (if (authOn) "enabled" else "disabled") +
+    "; ui acls " + (if (uiAclsOn) "enabled" else "disabled") +
+    "; users with view permissions: " + viewAcls.toString())
 
   // Set our own authenticator to properly negotiate user/password for HTTP connections.
   // This is needed by the HTTP client fetching from the HttpServer. Put here so its

core/src/main/scala/org/apache/spark/SparkEnv.scala

+4-4
@@ -17,6 +17,8 @@
 
 package org.apache.spark
 
+import java.io.File
+
 import scala.collection.JavaConversions._
 import scala.collection.mutable
 import scala.concurrent.Await
@@ -156,13 +158,11 @@
       conf.set("spark.driver.port", boundPort.toString)
     }
 
-    val classLoader = Thread.currentThread.getContextClassLoader
-
     // Create an instance of the class named by the given Java system property, or by
     // defaultClassName if the property is not set, and return it as a T
     def instantiateClass[T](propertyName: String, defaultClassName: String): T = {
       val name = conf.get(propertyName, defaultClassName)
-      val cls = Class.forName(name, true, classLoader)
+      val cls = Class.forName(name, true, Utils.getContextOrSparkClassLoader)
       // First try with the constructor that takes SparkConf. If we can't find one,
       // use a no-arg constructor instead.
       try {
@@ -306,7 +306,7 @@
       k == "java.class.path"
     }.getOrElse(("", ""))
     val classPathEntries = classPathProperty._2
-      .split(conf.get("path.separator", ":"))
+      .split(File.pathSeparator)
      .filterNot(e => e.isEmpty)
      .map(e => (e, "System Classpath"))
    val addedJarsAndFiles = (addedJars ++ addedFiles).map((_, "Added By User"))
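For reference, the instantiateClass helper touched in this hunk resolves a class by name on the context (or Spark) class loader and prefers a constructor that takes the conf, falling back to a no-arg constructor. A minimal standalone sketch of that pattern; the object name and the generic conf parameter are illustrative, not the exact Spark internals:

object ReflectiveInstantiation {
  // Sketch: load a class by name and instantiate it, trying a one-argument
  // constructor (e.g. one taking a SparkConf) before the no-arg constructor.
  def instantiate[T](className: String, conf: AnyRef): T = {
    val loader = Option(Thread.currentThread.getContextClassLoader)
      .getOrElse(getClass.getClassLoader)  // rough stand-in for Utils.getContextOrSparkClassLoader
    val cls = Class.forName(className, true, loader)
    try {
      cls.getConstructor(conf.getClass).newInstance(conf).asInstanceOf[T]
    } catch {
      case _: NoSuchMethodException =>
        cls.getConstructor().newInstance().asInstanceOf[T]  // no-arg fallback
    }
  }
}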

core/src/main/scala/org/apache/spark/TaskContext.scala

+5
@@ -42,16 +42,21 @@ class TaskContext(
   // List of callback functions to execute when the task completes.
   @transient private val onCompleteCallbacks = new ArrayBuffer[() => Unit]
 
+  // Set to true when the task is completed, before the onCompleteCallbacks are executed.
+  @volatile var completed: Boolean = false
+
   /**
    * Add a callback function to be executed on task completion. An example use
    * is for HadoopRDD to register a callback to close the input stream.
+   * Will be called in any situation - success, failure, or cancellation.
    * @param f Callback function.
    */
   def addOnCompleteCallback(f: () => Unit) {
     onCompleteCallbacks += f
   }
 
   def executeOnCompleteCallbacks() {
+    completed = true
     // Process complete callbacks in the reverse order of registration
     onCompleteCallbacks.reverse.foreach{_()}
   }
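To show how the new completed flag and addOnCompleteCallback are meant to be used together, here is a minimal sketch of a reader that cleans up through the task's completion callbacks; the class name and stream plumbing are illustrative, not an actual Spark call site:

import java.io.InputStream

import org.apache.spark.TaskContext

// Sketch: register cleanup with the task's completion callbacks and consult
// the completed flag before touching the underlying resource again.
class StreamBackedReader(context: TaskContext, in: InputStream) {
  // Runs on success, failure, or cancellation, per the updated doc comment.
  context.addOnCompleteCallback(() => in.close())

  def readByte(): Int = {
    // After executeOnCompleteCallbacks() has run, the stream is already closed.
    if (context.completed) -1 else in.read()
  }
}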

core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala

+23-3
@@ -18,7 +18,7 @@
 package org.apache.spark.api.java
 
 import java.util.{Comparator, List => JList, Iterator => JIterator}
-import java.lang.{Iterable => JIterable}
+import java.lang.{Iterable => JIterable, Long => JLong}
 
 import scala.collection.JavaConversions._
 import scala.reflect.ClassTag
@@ -75,11 +75,11 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
   * Return a new RDD by applying a function to each partition of this RDD, while tracking the index
   * of the original partition.
   */
-  def mapPartitionsWithIndex[R: ClassTag](
+  def mapPartitionsWithIndex[R](
      f: JFunction2[java.lang.Integer, java.util.Iterator[T], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] =
    new JavaRDD(rdd.mapPartitionsWithIndex(((a,b) => f(a,asJavaIterator(b))),
-      preservesPartitioning))
+      preservesPartitioning)(fakeClassTag))(fakeClassTag)
 
  /**
   * Return a new RDD by applying a function to all elements of this RDD.
@@ -264,6 +264,26 @@
      rdd.zipPartitions(other.rdd)(fn)(other.classTag, fakeClassTag[V]))(fakeClassTag[V])
  }
 
+  /**
+   * Zips this RDD with generated unique Long ids. Items in the kth partition will get ids k, n+k,
+   * 2*n+k, ..., where n is the number of partitions. So there may exist gaps, but this method
+   * won't trigger a spark job, which is different from [[org.apache.spark.rdd.RDD#zipWithIndex]].
+   */
+  def zipWithUniqueId(): JavaPairRDD[T, JLong] = {
+    JavaPairRDD.fromRDD(rdd.zipWithUniqueId()).asInstanceOf[JavaPairRDD[T, JLong]]
+  }
+
+  /**
+   * Zips this RDD with its element indices. The ordering is first based on the partition index
+   * and then the ordering of items within each partition. So the first item in the first
+   * partition gets index 0, and the last item in the last partition receives the largest index.
+   * This is similar to Scala's zipWithIndex but it uses Long instead of Int as the index type.
+   * This method needs to trigger a spark job when this RDD contains more than one partitions.
+   */
+  def zipWithIndex(): JavaPairRDD[T, JLong] = {
+    JavaPairRDD.fromRDD(rdd.zipWithIndex()).asInstanceOf[JavaPairRDD[T, JLong]]
+  }
+
  // Actions (launch a job to return a value to the user program)
 
  /**
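A short sketch of what the two new Java API methods return, expressed against the underlying Scala RDD methods they delegate to; the local master and sample data are assumptions for illustration only:

import org.apache.spark.{SparkConf, SparkContext}

object ZipWithIdsExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("zip-ids").setMaster("local[2]"))
    val rdd = sc.parallelize(Seq("a", "b", "c", "d"), numSlices = 2)

    // zipWithIndex assigns contiguous 0-based indices; with more than one
    // partition it first runs a job to count elements per partition.
    rdd.zipWithIndex().collect().foreach(println)    // (a,0), (b,1), (c,2), (d,3)

    // zipWithUniqueId assigns ids k, n+k, 2*n+k, ... within partition k of n:
    // unique, possibly with gaps, and no extra job is triggered.
    rdd.zipWithUniqueId().collect().foreach(println)

    sc.stop()
  }
}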
