
Commit 192411d

Merge pull request apache#17 from YanTangZhai/master
Update
2 parents: 1e1ebb4 + e4c2c0a


560 files changed: +13303 -5256 lines


.gitignore (+4)

@@ -5,6 +5,7 @@
 *.ipr
 *.iml
 *.iws
+*.pyc
 .idea/
 .idea_modules/
 sbt/*.jar
@@ -49,9 +50,12 @@ dependency-reduced-pom.xml
 checkpoint
 derby.log
 dist/
+dev/create-release/*txt
+dev/create-release/*final
 spark-*-bin-*.tgz
 unit-tests.log
 /lib/
+ec2/lib/
 rat-results.txt
 scalastyle.txt
 scalastyle-output.xml

.rat-excludes (+1)

@@ -64,3 +64,4 @@ dist/*
 logs
 .*scalastyle-output.xml
 .*dependency-reduced-pom.xml
+known_translations

LICENSE (+2 -1)

@@ -646,7 +646,8 @@ THE SOFTWARE.

 ========================================================================
 For Scala Interpreter classes (all .scala files in repl/src/main/scala
-except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala):
+except for Main.Scala, SparkHelper.scala and ExecutorClassLoader.scala),
+and for SerializableMapWrapper in JavaUtils.scala:
 ========================================================================

 Copyright (c) 2002-2013 EPFL

assembly/pom.xml (-10)

@@ -169,16 +169,6 @@
   </build>

   <profiles>
-    <profile>
-      <id>yarn-alpha</id>
-      <dependencies>
-        <dependency>
-          <groupId>org.apache.spark</groupId>
-          <artifactId>spark-yarn-alpha_${scala.binary.version}</artifactId>
-          <version>${project.version}</version>
-        </dependency>
-      </dependencies>
-    </profile>
     <profile>
       <id>yarn</id>
       <dependencies>

bin/beeline.cmd (new file, +21)

@@ -0,0 +1,21 @@
+@echo off
+
+rem
+rem Licensed to the Apache Software Foundation (ASF) under one or more
+rem contributor license agreements. See the NOTICE file distributed with
+rem this work for additional information regarding copyright ownership.
+rem The ASF licenses this file to You under the Apache License, Version 2.0
+rem (the "License"); you may not use this file except in compliance with
+rem the License. You may obtain a copy of the License at
+rem
+rem    http://www.apache.org/licenses/LICENSE-2.0
+rem
+rem Unless required by applicable law or agreed to in writing, software
+rem distributed under the License is distributed on an "AS IS" BASIS,
+rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+rem See the License for the specific language governing permissions and
+rem limitations under the License.
+rem
+
+set SPARK_HOME=%~dp0..
+cmd /V /E /C %SPARK_HOME%\bin\spark-class.cmd org.apache.hive.beeline.BeeLine %*

bin/compute-classpath.sh (+8 -4)

@@ -25,7 +25,11 @@ FWDIR="$(cd "`dirname "$0"`"/..; pwd)"

 . "$FWDIR"/bin/load-spark-env.sh

-CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH"
+if [ -n "$SPARK_CLASSPATH" ]; then
+  CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH"
+else
+  CLASSPATH="$SPARK_SUBMIT_CLASSPATH"
+fi

 # Build up classpath
 if [ -n "$SPARK_CONF_DIR" ]; then
@@ -68,14 +72,14 @@ else
   assembly_folder="$ASSEMBLY_DIR"
 fi

-num_jars="$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*\.jar" | wc -l)"
+num_jars="$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*\.jar$" | wc -l)"
 if [ "$num_jars" -eq "0" ]; then
   echo "Failed to find Spark assembly in $assembly_folder"
   echo "You need to build Spark before running this program."
   exit 1
 fi
 if [ "$num_jars" -gt "1" ]; then
-  jars_list=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*.jar")
+  jars_list=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*.jar$")
   echo "Found multiple Spark assembly jars in $assembly_folder:"
   echo "$jars_list"
   echo "Please remove all but one jar."
@@ -108,7 +112,7 @@ else
   datanucleus_dir="$FWDIR"/lib_managed/jars
 fi

-datanucleus_jars="$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar")"
+datanucleus_jars="$(find "$datanucleus_dir" 2>/dev/null | grep "datanucleus-.*\\.jar$")"
 datanucleus_jars="$(echo "$datanucleus_jars" | tr "\n" : | sed s/:$//g)"

 if [ -n "$datanucleus_jars" ]; then
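
The guard above prevents CLASSPATH from starting with an empty entry (a bare leading ":" is treated as the current directory) when SPARK_CLASSPATH is unset, and the added "$" anchors keep grep from matching names that merely contain ".jar" in the middle. A minimal Scala sketch of the same "skip empty fragments" rule; the names are illustrative, not taken from the script:

    // Join classpath fragments, skipping empty ones so no stray ":" (current
    // directory) sneaks into the result.
    def joinClasspath(entries: Seq[String]): String =
      entries.filter(_.nonEmpty).mkString(":")

    val classpath = joinClasspath(Seq(
      sys.env.getOrElse("SPARK_CLASSPATH", ""),
      sys.env.getOrElse("SPARK_SUBMIT_CLASSPATH", "")))
    println(classpath)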

bin/spark-shell (+7)

@@ -45,6 +45,13 @@ source "$FWDIR"/bin/utils.sh
 SUBMIT_USAGE_FUNCTION=usage
 gatherSparkSubmitOpts "$@"

+# SPARK-4161: scala does not assume use of the java classpath,
+# so we need to add the "-Dscala.usejavacp=true" flag manually. We
+# do this specifically for the Spark shell because the scala REPL
+# has its own class loader, and any additional classpath specified
+# through spark.driver.extraClassPath is not automatically propagated.
+SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Dscala.usejavacp=true"
+
 function main() {
   if $cygwin; then
     # Workaround for issue involving JLine and Cygwin

bin/spark-shell2.cmd (+20 -1)

@@ -19,4 +19,23 @@ rem

 set SPARK_HOME=%~dp0..

-cmd /V /E /C %SPARK_HOME%\bin\spark-submit.cmd --class org.apache.spark.repl.Main %* spark-shell
+echo "%*" | findstr " --help -h" >nul
+if %ERRORLEVEL% equ 0 (
+  call :usage
+  exit /b 0
+)
+
+call %SPARK_HOME%\bin\windows-utils.cmd %*
+if %ERRORLEVEL% equ 1 (
+  call :usage
+  exit /b 1
+)
+
+cmd /V /E /C %SPARK_HOME%\bin\spark-submit.cmd --class org.apache.spark.repl.Main %SUBMISSION_OPTS% spark-shell %APPLICATION_OPTS%
+
+exit /b 0
+
+:usage
+echo "Usage: .\bin\spark-shell.cmd [options]" >&2
+%SPARK_HOME%\bin\spark-submit --help 2>&1 | findstr /V "Usage" 1>&2
+exit /b 0

bin/windows-utils.cmd (new file, +59)

@@ -0,0 +1,59 @@
+rem
+rem Licensed to the Apache Software Foundation (ASF) under one or more
+rem contributor license agreements. See the NOTICE file distributed with
+rem this work for additional information regarding copyright ownership.
+rem The ASF licenses this file to You under the Apache License, Version 2.0
+rem (the "License"); you may not use this file except in compliance with
+rem the License. You may obtain a copy of the License at
+rem
+rem    http://www.apache.org/licenses/LICENSE-2.0
+rem
+rem Unless required by applicable law or agreed to in writing, software
+rem distributed under the License is distributed on an "AS IS" BASIS,
+rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+rem See the License for the specific language governing permissions and
+rem limitations under the License.
+rem
+
+rem Gather all spark-submit options into SUBMISSION_OPTS
+
+set SUBMISSION_OPTS=
+set APPLICATION_OPTS=
+
+rem NOTE: If you add or remove spark-submit options,
+rem modify NOT ONLY this script but also SparkSubmitArgument.scala
+
+:OptsLoop
+if "x%1"=="x" (
+  goto :OptsLoopEnd
+)
+
+SET opts="\<--master\> \<--deploy-mode\> \<--class\> \<--name\> \<--jars\> \<--py-files\> \<--files\>"
+SET opts="%opts:~1,-1% \<--conf\> \<--properties-file\> \<--driver-memory\> \<--driver-java-options\>"
+SET opts="%opts:~1,-1% \<--driver-library-path\> \<--driver-class-path\> \<--executor-memory\>"
+SET opts="%opts:~1,-1% \<--driver-cores\> \<--total-executor-cores\> \<--executor-cores\> \<--queue\>"
+SET opts="%opts:~1,-1% \<--num-executors\> \<--archives\>"
+
+echo %1 | findstr %opts% >nul
+if %ERRORLEVEL% equ 0 (
+  if "x%2"=="x" (
+    echo "%1" requires an argument. >&2
+    exit /b 1
+  )
+  set SUBMISSION_OPTS=%SUBMISSION_OPTS% %1 %2
+  shift
+  shift
+  goto :OptsLoop
+)
+echo %1 | findstr "\<--verbose\> \<-v\> \<--supervise\>" >nul
+if %ERRORLEVEL% equ 0 (
+  set SUBMISSION_OPTS=%SUBMISSION_OPTS% %1
+  shift
+  goto :OptsLoop
+)
+set APPLICATION_OPTS=%APPLICATION_OPTS% %1
+shift
+goto :OptsLoop
+
+:OptsLoopEnd
+exit /b 0
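
The batch loop above splits the command line into SUBMISSION_OPTS (options spark-submit understands, together with their values) and APPLICATION_OPTS (everything else, forwarded to the application). For readers less fluent in cmd syntax, here is a rough Scala rendering of the same splitting rule; it is an illustration only, and the batch script's "missing value" error handling is omitted:

    // Value-taking options and bare flags go to submission opts; the rest are
    // treated as application arguments, preserving order.
    val optsWithValue = Set("--master", "--deploy-mode", "--class", "--name", "--jars",
      "--py-files", "--files", "--conf", "--properties-file", "--driver-memory",
      "--driver-java-options", "--driver-library-path", "--driver-class-path",
      "--executor-memory", "--driver-cores", "--total-executor-cores",
      "--executor-cores", "--queue", "--num-executors", "--archives")
    val flags = Set("--verbose", "-v", "--supervise")

    def split(args: List[String]): (List[String], List[String]) = args match {
      case opt :: value :: rest if optsWithValue(opt) =>
        val (s, a) = split(rest); (opt :: value :: s, a)
      case flag :: rest if flags(flag) =>
        val (s, a) = split(rest); (flag :: s, a)
      case other :: rest =>
        val (s, a) = split(rest); (s, other :: a)
      case Nil => (Nil, Nil)
    }

    val (submissionOpts, applicationOpts) =
      split(List("--master", "local[2]", "--verbose", "-i", "init.scala"))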

conf/metrics.properties.template (+2 -2)

@@ -77,8 +77,8 @@
 #   sample  false  Whether to show entire set of samples for histograms ('false' or 'true')
 #
 # * Default path is /metrics/json for all instances except the master. The master has two paths:
-#     /metrics/aplications/json  # App information
-#     /metrics/master/json       # Master information
+#     /metrics/applications/json # App information
+#     /metrics/master/json       # Master information

 # org.apache.spark.metrics.sink.GraphiteSink
 #   Name:     Default:   Description:
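
The corrected comment documents the master's two JSON metrics endpoints. As a quick sanity check, something like the following could fetch them; the host and port (a standalone master web UI on localhost:8080) are assumptions, not part of the template:

    import scala.io.Source

    // Fetch the two master metrics endpoints documented above.
    val base = "http://localhost:8080"
    val appMetrics    = Source.fromURL(s"$base/metrics/applications/json").mkString
    val masterMetrics = Source.fromURL(s"$base/metrics/master/json").mkString
    println(appMetrics.take(200))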

core/src/main/java/org/apache/spark/SparkJobInfo.java (+3 -1)

@@ -17,13 +17,15 @@

 package org.apache.spark;

+import java.io.Serializable;
+
 /**
  * Exposes information about Spark Jobs.
  *
  * This interface is not designed to be implemented outside of Spark. We may add additional methods
  * which may break binary compatibility with outside implementations.
  */
-public interface SparkJobInfo {
+public interface SparkJobInfo extends Serializable {
   int jobId();
   int[] stageIds();
   JobExecutionStatus status();

core/src/main/java/org/apache/spark/SparkStageInfo.java (+3 -1)

@@ -17,13 +17,15 @@

 package org.apache.spark;

+import java.io.Serializable;
+
 /**
  * Exposes information about Spark Stages.
  *
  * This interface is not designed to be implemented outside of Spark. We may add additional methods
  * which may break binary compatibility with outside implementations.
  */
-public interface SparkStageInfo {
+public interface SparkStageInfo extends Serializable {
   int stageId();
   int currentAttemptId();
   long submissionTime();
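
Both SparkJobInfo and SparkStageInfo now extend java.io.Serializable, so status objects can be shipped between JVMs or written out with plain Java serialization. A minimal sketch of what that enables; the helper below is illustrative and not part of the Spark API:

    import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

    // Round-trip any Serializable value through Java serialization; after this
    // change a SparkJobInfo or SparkStageInfo returned by the status APIs qualifies.
    def roundTrip[T <: java.io.Serializable](value: T): T = {
      val buffer = new ByteArrayOutputStream()
      val out = new ObjectOutputStream(buffer)
      out.writeObject(value)
      out.close()
      val in = new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray))
      in.readObject().asInstanceOf[T]
    }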

core/src/main/resources/org/apache/spark/ui/static/webui.css (+11 -1)

@@ -169,8 +169,18 @@ span.additional-metric-title {
   display: inline-block;
 }

+.version {
+  line-height: 30px;
+  vertical-align: bottom;
+  font-size: 12px;
+  padding: 0;
+  margin: 0;
+  font-weight: bold;
+  color: #777;
+}
+
 /* Hide all additional metrics by default. This is done here rather than using JavaScript to
  * avoid slow page loads for stage pages with large numbers (e.g., thousands) of tasks. */
-.scheduler_delay, .gc_time, .deserialization_time, .serialization_time, .getting_result_time {
+.scheduler_delay, .deserialization_time, .serialization_time, .getting_result_time {
   display: none;
 }

core/src/main/scala/org/apache/spark/Accumulators.scala (+8 -6)

@@ -19,6 +19,7 @@ package org.apache.spark

 import java.io.{ObjectInputStream, Serializable}
 import java.util.concurrent.atomic.AtomicLong
+import java.lang.ThreadLocal

 import scala.collection.generic.Growable
 import scala.collection.mutable.Map
@@ -278,10 +279,12 @@ object AccumulatorParam {

 // TODO: The multi-thread support in accumulators is kind of lame; check
 // if there's a more intuitive way of doing it right
-private object Accumulators {
+private[spark] object Accumulators {
   // TODO: Use soft references? => need to make readObject work properly then
   val originals = Map[Long, Accumulable[_, _]]()
-  val localAccums = Map[Thread, Map[Long, Accumulable[_, _]]]()
+  val localAccums = new ThreadLocal[Map[Long, Accumulable[_, _]]]() {
+    override protected def initialValue() = Map[Long, Accumulable[_, _]]()
+  }
   var lastId: Long = 0

   def newId(): Long = synchronized {
@@ -293,22 +296,21 @@ private object Accumulators {
     if (original) {
       originals(a.id) = a
     } else {
-      val accums = localAccums.getOrElseUpdate(Thread.currentThread, Map())
-      accums(a.id) = a
+      localAccums.get()(a.id) = a
     }
   }

   // Clear the local (non-original) accumulators for the current thread
   def clear() {
     synchronized {
-      localAccums.remove(Thread.currentThread)
+      localAccums.get.clear
     }
   }

   // Get the values of the local accumulators for the current thread (by ID)
   def values: Map[Long, Any] = synchronized {
     val ret = Map[Long, Any]()
-    for ((id, accum) <- localAccums.getOrElse(Thread.currentThread, Map())) {
+    for ((id, accum) <- localAccums.get) {
       ret(id) = accum.localValue
     }
     return ret
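
The registry switches from a Thread-keyed map to a ThreadLocal whose initialValue is overridden, so each thread lazily gets its own map and no Thread objects are retained as keys. A standalone sketch of the pattern with simplified types (not the actual Accumulators object):

    import scala.collection.mutable.Map

    // Each thread lazily gets its own mutable map: entries written on one
    // thread are invisible to others, and no Thread references are kept.
    val perThread = new ThreadLocal[Map[Long, String]]() {
      override protected def initialValue() = Map[Long, String]()
    }

    perThread.get()(1L) = "only visible to the current thread"
    println(perThread.get().get(1L))  // Some(...) here; None on another thread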

core/src/main/scala/org/apache/spark/Aggregator.scala (+6 -4)

@@ -34,15 +34,17 @@ case class Aggregator[K, V, C] (
     mergeValue: (C, V) => C,
     mergeCombiners: (C, C) => C) {

-  private val externalSorting = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true)
+  // When spilling is enabled sorting will happen externally, but not necessarily with an
+  // ExternalSorter.
+  private val isSpillEnabled = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true)

   @deprecated("use combineValuesByKey with TaskContext argument", "0.9.0")
   def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] =
     combineValuesByKey(iter, null)

   def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]],
                          context: TaskContext): Iterator[(K, C)] = {
-    if (!externalSorting) {
+    if (!isSpillEnabled) {
       val combiners = new AppendOnlyMap[K,C]
       var kv: Product2[K, V] = null
       val update = (hadValue: Boolean, oldValue: C) => {
@@ -71,9 +73,9 @@ case class Aggregator[K, V, C] (
     combineCombinersByKey(iter, null)

   def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext)
-    : Iterator[(K, C)] =
+      : Iterator[(K, C)] =
   {
-    if (!externalSorting) {
+    if (!isSpillEnabled) {
       val combiners = new AppendOnlyMap[K,C]
       var kc: Product2[K, C] = null
       val update = (hadValue: Boolean, oldValue: C) => {
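
The rename makes clear that the flag only records whether spilling is allowed, and it is read once from spark.shuffle.spill. A small sketch of reading the same setting (standalone, not the Aggregator code itself):

    import org.apache.spark.SparkConf

    // Mirrors SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true): default is on.
    val conf = new SparkConf()
    val isSpillEnabled = conf.getBoolean("spark.shuffle.spill", true)
    if (!isSpillEnabled) {
      println("combiners stay in an in-memory AppendOnlyMap; large shuffles may run out of memory")
    }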

core/src/main/scala/org/apache/spark/Dependency.scala (+3)

@@ -60,6 +60,9 @@ abstract class NarrowDependency[T](_rdd: RDD[T]) extends Dependency[T] {
 * @param serializer [[org.apache.spark.serializer.Serializer Serializer]] to use. If set to None,
 *                   the default serializer, as specified by `spark.serializer` config option, will
 *                   be used.
+ * @param keyOrdering key ordering for RDD's shuffles
+ * @param aggregator map/reduce-side aggregator for RDD's shuffle
+ * @param mapSideCombine whether to perform partial aggregation (also known as map-side combine)
 */
 @DeveloperApi
 class ShuffleDependency[K, V, C](
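
The new @param tags document the three knobs a shuffle carries: an optional key ordering, an optional aggregator, and whether to combine on the map side. At the RDD API level the aggregator corresponds to the functions passed to combineByKey, which builds a ShuffleDependency internally; a small illustrative sketch (the local master and app name are assumptions):

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.SparkContext._

    // Per-key aggregation with a user-supplied aggregator; the partial
    // aggregation done before the shuffle is what mapSideCombine refers to.
    val sc = new SparkContext(new SparkConf().setAppName("combine-sketch").setMaster("local[2]"))
    val pairs = sc.parallelize(Seq(("a", 1), ("a", 2), ("b", 3)))
    val combined = pairs.combineByKey(
      (v: Int) => List(v),                          // createCombiner
      (c: List[Int], v: Int) => v :: c,             // mergeValue
      (c1: List[Int], c2: List[Int]) => c1 ::: c2)  // mergeCombiners
    println(combined.collect().toList)
    sc.stop()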
