
Commit fd92bc7 (2 parents: 5afe1ff + 8aa3cff)

Merge branch 'master' into col-computability

Conflicts:
    sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala

136 files changed: 7,701 additions, 446 deletions


README.md

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ To build Spark and its example programs, run:
 
 (You do not need to do this if you downloaded a pre-built package.)
 More detailed documentation is available from the project site, at
-["Building Spark with Maven"](http://spark.apache.org/docs/latest/building-spark.html).
+["Building Spark"](http://spark.apache.org/docs/latest/building-spark.html).
 
 ## Interactive Scala Shell
 

bin/compute-classpath.sh

Lines changed: 3 additions & 1 deletion

@@ -50,8 +50,8 @@ fi
 if [ -n "$SPARK_PREPEND_CLASSES" ]; then
   echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\
     "classes ahead of assembly." >&2
+  # Spark classes
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SPARK_SCALA_VERSION/classes"
-  CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*"
   CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SPARK_SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SPARK_SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SPARK_SCALA_VERSION/classes"
@@ -63,6 +63,8 @@ if [ -n "$SPARK_PREPEND_CLASSES" ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SPARK_SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive-thriftserver/target/scala-$SPARK_SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SPARK_SCALA_VERSION/classes"
+  # Jars for shaded deps in their original form (copied here during build)
+  CLASSPATH="$CLASSPATH:$FWDIR/core/target/jars/*"
 fi
 
 # Use spark-assembly jar from either RELEASE or assembly directory

core/pom.xml

Lines changed: 20 additions & 2 deletions

@@ -94,22 +94,35 @@
       <groupId>org.apache.curator</groupId>
       <artifactId>curator-recipes</artifactId>
     </dependency>
+
+    <!-- Jetty dependencies promoted to compile here so they are shaded
+         and inlined into spark-core jar -->
     <dependency>
       <groupId>org.eclipse.jetty</groupId>
       <artifactId>jetty-plus</artifactId>
+      <scope>compile</scope>
     </dependency>
     <dependency>
       <groupId>org.eclipse.jetty</groupId>
       <artifactId>jetty-security</artifactId>
+      <scope>compile</scope>
     </dependency>
     <dependency>
       <groupId>org.eclipse.jetty</groupId>
       <artifactId>jetty-util</artifactId>
+      <scope>compile</scope>
     </dependency>
     <dependency>
       <groupId>org.eclipse.jetty</groupId>
       <artifactId>jetty-server</artifactId>
+      <scope>compile</scope>
     </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-http</artifactId>
+      <scope>compile</scope>
+    </dependency>
+
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-lang3</artifactId>
@@ -348,19 +361,24 @@
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-dependency-plugin</artifactId>
         <executions>
+          <!-- When using SPARK_PREPEND_CLASSES Spark classes compiled locally don't use
+               shaded deps. So here we store jars in their original form which are added
+               when the classpath is computed. -->
           <execution>
             <id>copy-dependencies</id>
            <phase>package</phase>
            <goals>
              <goal>copy-dependencies</goal>
            </goals>
-           <configuration>
+            <configuration>
              <outputDirectory>${project.build.directory}</outputDirectory>
              <overWriteReleases>false</overWriteReleases>
              <overWriteSnapshots>false</overWriteSnapshots>
              <overWriteIfNewer>true</overWriteIfNewer>
              <useSubDirectoryPerType>true</useSubDirectoryPerType>
-             <includeArtifactIds>guava</includeArtifactIds>
+             <includeArtifactIds>
+               guava,jetty-io,jetty-http,jetty-plus,jetty-util,jetty-server
+             </includeArtifactIds>
              <silent>true</silent>
            </configuration>
          </execution>

core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala

Lines changed: 8 additions & 6 deletions

@@ -49,6 +49,7 @@ import org.apache.spark.scheduler._
  * spark.dynamicAllocation.enabled - Whether this feature is enabled
  * spark.dynamicAllocation.minExecutors - Lower bound on the number of executors
  * spark.dynamicAllocation.maxExecutors - Upper bound on the number of executors
+ * spark.dynamicAllocation.initialExecutors - Number of executors to start with
  *
  * spark.dynamicAllocation.schedulerBacklogTimeout (M) -
  *   If there are backlogged tasks for this duration, add new executors
@@ -70,9 +71,10 @@ private[spark] class ExecutorAllocationManager(
 
   import ExecutorAllocationManager._
 
-  // Lower and upper bounds on the number of executors. These are required.
-  private val minNumExecutors = conf.getInt("spark.dynamicAllocation.minExecutors", -1)
-  private val maxNumExecutors = conf.getInt("spark.dynamicAllocation.maxExecutors", -1)
+  // Lower and upper bounds on the number of executors.
+  private val minNumExecutors = conf.getInt("spark.dynamicAllocation.minExecutors", 0)
+  private val maxNumExecutors = conf.getInt("spark.dynamicAllocation.maxExecutors",
+    Integer.MAX_VALUE)
 
   // How long there must be backlogged tasks for before an addition is triggered
   private val schedulerBacklogTimeout = conf.getLong(
@@ -132,10 +134,10 @@ private[spark] class ExecutorAllocationManager(
    */
   private def validateSettings(): Unit = {
     if (minNumExecutors < 0 || maxNumExecutors < 0) {
-      throw new SparkException("spark.dynamicAllocation.{min/max}Executors must be set!")
+      throw new SparkException("spark.dynamicAllocation.{min/max}Executors must be positive!")
     }
-    if (minNumExecutors == 0 || maxNumExecutors == 0) {
-      throw new SparkException("spark.dynamicAllocation.{min/max}Executors cannot be 0!")
+    if (maxNumExecutors == 0) {
+      throw new SparkException("spark.dynamicAllocation.maxExecutors cannot be 0!")
     }
     if (minNumExecutors > maxNumExecutors) {
       throw new SparkException(s"spark.dynamicAllocation.minExecutors ($minNumExecutors) must " +
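For readers following the dynamic-allocation change: with the new defaults, an application only has to turn the feature on, since minExecutors now falls back to 0 and maxExecutors to Integer.MAX_VALUE. A minimal sketch of how the relaxed settings read back (the configuration values here are illustrative, not from this commit):

import org.apache.spark.SparkConf

// Only the upper bound is set explicitly; the lower bound is left to its new default of 0.
val conf = new SparkConf()
  .set("spark.dynamicAllocation.enabled", "true")
  .set("spark.dynamicAllocation.maxExecutors", "50")

// Mirrors the lookups ExecutorAllocationManager performs after this change.
val minNumExecutors = conf.getInt("spark.dynamicAllocation.minExecutors", 0)                  // 0
val maxNumExecutors = conf.getInt("spark.dynamicAllocation.maxExecutors", Integer.MAX_VALUE)  // 50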

core/src/main/scala/org/apache/spark/HttpFileServer.scala

Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ private[spark] class HttpFileServer(
   var serverUri : String = null
 
   def initialize() {
-    baseDir = Utils.createTempDir()
+    baseDir = Utils.createTempDir(Utils.getLocalDir(conf), "httpd")
     fileDir = new File(baseDir, "files")
     jarDir = new File(baseDir, "jars")
     fileDir.mkdir()
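The same Utils.createTempDir(root, namePrefix) pattern appears again below in SparkEnv.scala and HttpBroadcast.scala: the temp directory is rooted in the configured local directory and given a recognizable prefix. As a rough sketch of what such a helper does (a generic stand-in, not Spark's actual implementation):

import java.io.File
import java.nio.file.Files

// Generic stand-in: create a uniquely named directory under `root` whose name
// starts with `namePrefix`, e.g. /tmp/httpd-1234567890.
def createTempDir(root: String, namePrefix: String): File =
  Files.createTempDirectory(new File(root).toPath, namePrefix + "-").toFile

// Usage corresponding to the diff above, with the JVM temp dir standing in for getLocalDir(conf).
val baseDir = createTempDir(System.getProperty("java.io.tmpdir"), "httpd")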

core/src/main/scala/org/apache/spark/SparkConf.scala

Lines changed: 3 additions & 2 deletions

@@ -23,6 +23,7 @@ import scala.collection.JavaConverters._
 import scala.collection.mutable.LinkedHashSet
 
 import org.apache.spark.serializer.KryoSerializer
+import org.apache.spark.util.Utils
 
 /**
  * Configuration for a Spark application. Used to set various Spark parameters as key-value pairs.
@@ -53,8 +54,8 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
 
   if (loadDefaults) {
     // Load any spark.* system properties
-    for ((k, v) <- System.getProperties.asScala if k.startsWith("spark.")) {
-      set(k, v)
+    for ((key, value) <- Utils.getSystemProperties if key.startsWith("spark.")) {
+      set(key, value)
     }
   }
 
core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 39 additions & 30 deletions

@@ -687,9 +687,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * @param minPartitions Minimum number of Hadoop Splits to generate.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def hadoopRDD[K, V](
       conf: JobConf,
@@ -705,12 +706,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   }
 
   /** Get an RDD for a Hadoop file with an arbitrary InputFormat
-   *
-   * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
-   * */
+    *
+    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
+    * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+    * operation will create many references to the same object.
+    * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+    * copy them using a `map` function.
+    */
   def hadoopFile[K, V](
       path: String,
       inputFormatClass: Class[_ <: InputFormat[K, V]],
@@ -741,9 +743,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * }}}
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def hadoopFile[K, V, F <: InputFormat[K, V]]
       (path: String, minPartitions: Int)
@@ -764,9 +767,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * }}}
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def hadoopFile[K, V, F <: InputFormat[K, V]](path: String)
       (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] =
@@ -788,9 +792,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * and extra configuration options to pass to the input format.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]](
       path: String,
@@ -810,9 +815,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * and extra configuration options to pass to the input format.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def newAPIHadoopRDD[K, V, F <: NewInputFormat[K, V]](
       conf: Configuration = hadoopConfiguration,
@@ -826,9 +832,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   /** Get an RDD for a Hadoop SequenceFile with given key and value types.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def sequenceFile[K, V](path: String,
       keyClass: Class[K],
@@ -843,9 +850,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   /** Get an RDD for a Hadoop SequenceFile with given key and value types.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    * */
   def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V]): RDD[(K, V)] = {
     assertNotStopped()
@@ -869,9 +877,10 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
    * allow it to figure out the Writable class to use in the subclass case.
    *
    * '''Note:''' Because Hadoop's RecordReader class re-uses the same Writable object for each
-   * record, directly caching the returned RDD will create many references to the same object.
-   * If you plan to directly cache Hadoop writable objects, you should first copy them using
-   * a `map` function.
+   * record, directly caching the returned RDD or directly passing it to an aggregation or shuffle
+   * operation will create many references to the same object.
+   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
+   * copy them using a `map` function.
    */
   def sequenceFile[K, V]
       (path: String, minPartitions: Int = defaultMinPartitions)
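The doc changes above all carry the same warning, so a concrete instance may help: because Hadoop's RecordReader re-uses one Writable object per partition, copy records into plain objects with a `map` before caching, sorting, or aggregating them. A hedged sketch (assumes an existing SparkContext `sc` and an input path that are not part of this commit):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat

// Read a text file through the Hadoop API; keys and values are re-used Writables.
val raw = sc.hadoopFile[LongWritable, Text, TextInputFormat]("hdfs:///data/input", 4)

// Copy each record into immutable Scala values before caching or shuffling.
val safe = raw.map { case (offset, line) => (offset.get, line.toString) }
safe.cache()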

core/src/main/scala/org/apache/spark/SparkEnv.scala

Lines changed: 1 addition & 1 deletion

@@ -339,7 +339,7 @@ object SparkEnv extends Logging {
     // this is a temporary directory; in distributed mode, this is the executor's current working
     // directory.
     val sparkFilesDir: String = if (isDriver) {
-      Utils.createTempDir().getAbsolutePath
+      Utils.createTempDir(Utils.getLocalDir(conf), "userFiles").getAbsolutePath
     } else {
       "."
     }

core/src/main/scala/org/apache/spark/broadcast/HttpBroadcast.scala

Lines changed: 1 addition & 1 deletion

@@ -151,7 +151,7 @@ private[broadcast] object HttpBroadcast extends Logging {
   }
 
   private def createServer(conf: SparkConf) {
-    broadcastDir = Utils.createTempDir(Utils.getLocalDir(conf))
+    broadcastDir = Utils.createTempDir(Utils.getLocalDir(conf), "broadcast")
     val broadcastPort = conf.getInt("spark.broadcast.port", 0)
     server =
       new HttpServer(conf, broadcastDir, securityManager, broadcastPort, "HTTP broadcast server")

core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@ import org.apache.spark.api.python.PythonUtils
 import org.apache.spark.util.{RedirectThread, Utils}
 
 /**
- * A main class used by spark-submit to launch Python applications. It executes python as a
+ * A main class used to launch Python applications. It executes python as a
  * subprocess and then has it connect back to the JVM to access system properties, etc.
  */
 object PythonRunner {
