Commit cf0007d

Merge remote-tracking branch 'upstream/master' into ldaonline
2 parents: 54cf8da + 473552f

298 files changed: +14654 additions, -2084 deletions

LICENSE

Lines changed: 1 addition & 0 deletions
@@ -814,6 +814,7 @@ BSD-style licenses
 The following components are provided under a BSD-style license. See project link for details.
 
      (BSD 3 Clause) core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core)
+     (BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.1.15 - https://github.com/jpmml/jpmml-model)
      (BSD 3-clause style license) jblas (org.jblas:jblas:1.2.3 - http://jblas.org/)
      (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/)
      (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org)

assembly/pom.xml

Lines changed: 0 additions & 1 deletion
@@ -194,7 +194,6 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-assembly-plugin</artifactId>
-        <version>2.4</version>
         <executions>
           <execution>
             <id>dist</id>

bin/spark-class2.cmd

Lines changed: 4 additions & 1 deletion
@@ -61,7 +61,10 @@ if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java
 
 rem The launcher library prints the command to be executed in a single line suitable for being
 rem executed by the batch interpreter. So read all the output of the launcher into a variable.
-for /f "tokens=*" %%i in ('cmd /C ""%RUNNER%" -cp %LAUNCH_CLASSPATH% org.apache.spark.launcher.Main %*"') do (
+set LAUNCHER_OUTPUT=%temp%\spark-class-launcher-output-%RANDOM%.txt
+"%RUNNER%" -cp %LAUNCH_CLASSPATH% org.apache.spark.launcher.Main %* > %LAUNCHER_OUTPUT%
+for /f "tokens=*" %%i in (%LAUNCHER_OUTPUT%) do (
   set SPARK_CMD=%%i
 )
+del %LAUNCHER_OUTPUT%
 %SPARK_CMD%

conf/spark-env.sh.template

Lines changed: 2 additions & 1 deletion
@@ -3,7 +3,7 @@
 # This file is sourced when running various Spark programs.
 # Copy it as spark-env.sh and edit that to configure Spark for your site.
 
-# Options read when launching programs locally with
+# Options read when launching programs locally with
 # ./bin/run-example or ./bin/spark-submit
 # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
 # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
@@ -39,6 +39,7 @@
 # - SPARK_WORKER_DIR, to set the working directory of worker processes
 # - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
 # - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
+# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
 # - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
 # - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers

core/pom.xml

Lines changed: 5 additions & 1 deletion
@@ -95,6 +95,11 @@
       <artifactId>spark-network-shuffle_${scala.binary.version}</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-unsafe_${scala.binary.version}</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>net.java.dev.jets3t</groupId>
       <artifactId>jets3t</artifactId>
@@ -478,7 +483,6 @@
       <plugin>
         <groupId>org.codehaus.mojo</groupId>
         <artifactId>exec-maven-plugin</artifactId>
-        <version>1.3.2</version>
         <executions>
           <execution>
             <id>sparkr-pkg</id>
Function0.java (new file in package org.apache.spark.api.java.function)

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.java.function;
+
+import java.io.Serializable;
+
+/**
+ * A zero-argument function that returns an R.
+ */
+public interface Function0<R> extends Serializable {
+  public R call() throws Exception;
+}
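
Since Function0 is a plain Java interface with a single call() method, it can be implemented directly from Scala as well as Java. A minimal sketch, assuming nothing beyond the interface shown above (the supplier name and returned string are illustrative):

import org.apache.spark.api.java.function.Function0

// Hypothetical zero-argument supplier; call() is declared to throw Exception,
// so implementations may raise checked exceptions.
val supplier: Function0[String] = new Function0[String] {
  override def call(): String = "value computed only when call() is invoked"
}

// Nothing runs until call() is invoked explicitly.
println(supplier.call())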

core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala

Lines changed: 17 additions & 9 deletions
@@ -76,13 +76,15 @@ private[spark] class HeartbeatReceiver(sc: SparkContext)
 
   private var timeoutCheckingTask: ScheduledFuture[_] = null
 
-  private val timeoutCheckingThread =
-    ThreadUtils.newDaemonSingleThreadScheduledExecutor("heartbeat-timeout-checking-thread")
+  // "eventLoopThread" is used to run some pretty fast actions. The actions running in it should not
+  // block the thread for a long time.
+  private val eventLoopThread =
+    ThreadUtils.newDaemonSingleThreadScheduledExecutor("heartbeat-receiver-event-loop-thread")
 
   private val killExecutorThread = ThreadUtils.newDaemonSingleThreadExecutor("kill-executor-thread")
 
   override def onStart(): Unit = {
-    timeoutCheckingTask = timeoutCheckingThread.scheduleAtFixedRate(new Runnable {
+    timeoutCheckingTask = eventLoopThread.scheduleAtFixedRate(new Runnable {
       override def run(): Unit = Utils.tryLogNonFatalError {
         Option(self).foreach(_.send(ExpireDeadHosts))
       }
@@ -99,11 +101,15 @@ private[spark] class HeartbeatReceiver(sc: SparkContext)
   override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
     case heartbeat @ Heartbeat(executorId, taskMetrics, blockManagerId) =>
       if (scheduler != null) {
-        val unknownExecutor = !scheduler.executorHeartbeatReceived(
-          executorId, taskMetrics, blockManagerId)
-        val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor)
         executorLastSeen(executorId) = System.currentTimeMillis()
-        context.reply(response)
+        eventLoopThread.submit(new Runnable {
+          override def run(): Unit = Utils.tryLogNonFatalError {
+            val unknownExecutor = !scheduler.executorHeartbeatReceived(
+              executorId, taskMetrics, blockManagerId)
+            val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor)
+            context.reply(response)
+          }
+        })
       } else {
         // Because Executor will sleep several seconds before sending the first "Heartbeat", this
         // case rarely happens. However, if it really happens, log it and ask the executor to
@@ -125,7 +131,9 @@ private[spark] class HeartbeatReceiver(sc: SparkContext)
       if (sc.supportDynamicAllocation) {
         // Asynchronously kill the executor to avoid blocking the current thread
         killExecutorThread.submit(new Runnable {
-          override def run(): Unit = sc.killExecutor(executorId)
+          override def run(): Unit = Utils.tryLogNonFatalError {
+            sc.killExecutor(executorId)
+          }
         })
       }
       executorLastSeen.remove(executorId)
@@ -137,7 +145,7 @@ private[spark] class HeartbeatReceiver(sc: SparkContext)
     if (timeoutCheckingTask != null) {
       timeoutCheckingTask.cancel(true)
     }
-    timeoutCheckingThread.shutdownNow()
+    eventLoopThread.shutdownNow()
    killExecutorThread.shutdownNow()
  }
}
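
The change above moves the potentially slow executorHeartbeatReceived call and the RPC reply off the calling thread and onto a dedicated single-thread event loop, so the heartbeat handler returns quickly. A minimal sketch of that pattern, using plain java.util.concurrent rather than Spark's ThreadUtils (all names below are illustrative, not the commit's classes):

import java.util.concurrent.Executors

object HeartbeatOffloadSketch {
  // Single-thread executor standing in for eventLoopThread above.
  private val eventLoop = Executors.newSingleThreadScheduledExecutor()

  // The caller returns immediately; the expensive work and the reply run on eventLoop.
  def handleHeartbeat(executorId: String, reply: String => Unit): Unit = {
    eventLoop.submit(new Runnable {
      override def run(): Unit = reply(s"HeartbeatResponse for $executorId")
    })
  }

  def stop(): Unit = eventLoop.shutdownNow()
}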

core/src/main/scala/org/apache/spark/MapOutputTracker.scala

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
    */
   protected def askTracker[T: ClassTag](message: Any): T = {
     try {
-      trackerEndpoint.askWithReply[T](message)
+      trackerEndpoint.askWithRetry[T](message)
     } catch {
       case e: Exception =>
         logError("Error communicating with MapOutputTracker", e)

core/src/main/scala/org/apache/spark/SparkConf.scala

Lines changed: 89 additions & 1 deletion
@@ -211,7 +211,74 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging {
     Utils.timeStringAsMs(get(key, defaultValue))
   }
 
+  /**
+   * Get a size parameter as bytes; throws a NoSuchElementException if it's not set. If no
+   * suffix is provided then bytes are assumed.
+   * @throws NoSuchElementException
+   */
+  def getSizeAsBytes(key: String): Long = {
+    Utils.byteStringAsBytes(get(key))
+  }
+
+  /**
+   * Get a size parameter as bytes, falling back to a default if not set. If no
+   * suffix is provided then bytes are assumed.
+   */
+  def getSizeAsBytes(key: String, defaultValue: String): Long = {
+    Utils.byteStringAsBytes(get(key, defaultValue))
+  }
+
+  /**
+   * Get a size parameter as Kibibytes; throws a NoSuchElementException if it's not set. If no
+   * suffix is provided then Kibibytes are assumed.
+   * @throws NoSuchElementException
+   */
+  def getSizeAsKb(key: String): Long = {
+    Utils.byteStringAsKb(get(key))
+  }
+
+  /**
+   * Get a size parameter as Kibibytes, falling back to a default if not set. If no
+   * suffix is provided then Kibibytes are assumed.
+   */
+  def getSizeAsKb(key: String, defaultValue: String): Long = {
+    Utils.byteStringAsKb(get(key, defaultValue))
+  }
+
+  /**
+   * Get a size parameter as Mebibytes; throws a NoSuchElementException if it's not set. If no
+   * suffix is provided then Mebibytes are assumed.
+   * @throws NoSuchElementException
+   */
+  def getSizeAsMb(key: String): Long = {
+    Utils.byteStringAsMb(get(key))
+  }
+
+  /**
+   * Get a size parameter as Mebibytes, falling back to a default if not set. If no
+   * suffix is provided then Mebibytes are assumed.
+   */
+  def getSizeAsMb(key: String, defaultValue: String): Long = {
+    Utils.byteStringAsMb(get(key, defaultValue))
+  }
+
+  /**
+   * Get a size parameter as Gibibytes; throws a NoSuchElementException if it's not set. If no
+   * suffix is provided then Gibibytes are assumed.
+   * @throws NoSuchElementException
+   */
+  def getSizeAsGb(key: String): Long = {
+    Utils.byteStringAsGb(get(key))
+  }
 
+  /**
+   * Get a size parameter as Gibibytes, falling back to a default if not set. If no
+   * suffix is provided then Gibibytes are assumed.
+   */
+  def getSizeAsGb(key: String, defaultValue: String): Long = {
+    Utils.byteStringAsGb(get(key, defaultValue))
+  }
+
   /** Get a parameter as an Option */
   def getOption(key: String): Option[String] = {
     Option(settings.get(key)).orElse(getDeprecatedConfig(key, this))
@@ -407,7 +474,13 @@ private[spark] object SparkConf extends Logging {
       "The spark.cache.class property is no longer being used! Specify storage levels using " +
       "the RDD.persist() method instead."),
     DeprecatedConfig("spark.yarn.user.classpath.first", "1.3",
-      "Please use spark.{driver,executor}.userClassPathFirst instead."))
+      "Please use spark.{driver,executor}.userClassPathFirst instead."),
+    DeprecatedConfig("spark.kryoserializer.buffer.mb", "1.4",
+      "Please use spark.kryoserializer.buffer instead. The default value for " +
+        "spark.kryoserializer.buffer.mb was previously specified as '0.064'. Fractional values " +
+        "are no longer accepted. To specify the equivalent now, one may use '64k'.")
+  )
+
   Map(configs.map { cfg => (cfg.key -> cfg) }:_*)
 }
 
@@ -432,6 +505,21 @@ private[spark] object SparkConf extends Logging {
     AlternateConfig("spark.yarn.applicationMaster.waitTries", "1.3",
       // Translate old value to a duration, with 10s wait time per try.
       translation = s => s"${s.toLong * 10}s")),
+    "spark.reducer.maxSizeInFlight" -> Seq(
+      AlternateConfig("spark.reducer.maxMbInFlight", "1.4")),
+    "spark.kryoserializer.buffer" ->
+      Seq(AlternateConfig("spark.kryoserializer.buffer.mb", "1.4",
+        translation = s => s"${s.toDouble * 1000}k")),
+    "spark.kryoserializer.buffer.max" -> Seq(
+      AlternateConfig("spark.kryoserializer.buffer.max.mb", "1.4")),
+    "spark.shuffle.file.buffer" -> Seq(
+      AlternateConfig("spark.shuffle.file.buffer.kb", "1.4")),
+    "spark.executor.logs.rolling.maxSize" -> Seq(
+      AlternateConfig("spark.executor.logs.rolling.size.maxBytes", "1.4")),
+    "spark.io.compression.snappy.blockSize" -> Seq(
+      AlternateConfig("spark.io.compression.snappy.block.size", "1.4")),
+    "spark.io.compression.lz4.blockSize" -> Seq(
+      AlternateConfig("spark.io.compression.lz4.block.size", "1.4")),
     "spark.rpc.numRetries" -> Seq(
      AlternateConfig("spark.akka.num.retries", "1.4")),
    "spark.rpc.retry.wait" -> Seq(

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 15 additions & 4 deletions
@@ -555,7 +555,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
           SparkEnv.executorActorSystemName,
           RpcAddress(host, port),
           ExecutorEndpoint.EXECUTOR_ENDPOINT_NAME)
-        Some(endpointRef.askWithReply[Array[ThreadStackTrace]](TriggerThreadDump))
+        Some(endpointRef.askWithRetry[Array[ThreadStackTrace]](TriggerThreadDump))
       }
     } catch {
       case e: Exception =>
@@ -713,7 +713,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     RDD[(String, String)] = {
     assertNotStopped()
     val job = new NewHadoopJob(hadoopConfiguration)
-    NewFileInputFormat.addInputPath(job, new Path(path))
+    // Use setInputPaths so that wholeTextFiles aligns with hadoopFile/textFile in taking
+    // comma separated files as input. (see SPARK-7155)
+    NewFileInputFormat.setInputPaths(job, path)
     val updateConf = job.getConfiguration
     new WholeTextFileRDD(
       this,
@@ -759,7 +761,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     RDD[(String, PortableDataStream)] = {
     assertNotStopped()
     val job = new NewHadoopJob(hadoopConfiguration)
-    NewFileInputFormat.addInputPath(job, new Path(path))
+    // Use setInputPaths so that binaryFiles aligns with hadoopFile/textFile in taking
+    // comma separated files as input. (see SPARK-7155)
+    NewFileInputFormat.setInputPaths(job, path)
     val updateConf = job.getConfiguration
     new BinaryFileRDD(
       this,
@@ -935,7 +939,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
     // The call to new NewHadoopJob automatically adds security credentials to conf,
     // so we don't need to explicitly add them ourselves
     val job = new NewHadoopJob(conf)
-    NewFileInputFormat.addInputPath(job, new Path(path))
+    // Use setInputPaths so that newAPIHadoopFile aligns with hadoopFile/textFile in taking
+    // comma separated files as input. (see SPARK-7155)
+    NewFileInputFormat.setInputPaths(job, path)
     val updatedConf = job.getConfiguration
     new NewHadoopRDD(this, fClass, kClass, vClass, updatedConf).setName(path)
   }
@@ -1396,6 +1402,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * Register an RDD to be persisted in memory and/or disk storage
   */
  private[spark] def persistRDD(rdd: RDD[_]) {
+    _executorAllocationManager.foreach { _ =>
+      logWarning(
+        s"Dynamic allocation currently does not support cached RDDs. Cached data for RDD " +
+        s"${rdd.id} will be lost when executors are removed.")
+    }
    persistentRdds(rdd.id) = rdd
  }
 
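With setInputPaths, wholeTextFiles, binaryFiles, and newAPIHadoopFile treat a comma-separated list of paths the same way textFile and hadoopFile do (SPARK-7155). A minimal sketch under that assumption (the paths and local master are illustrative):

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("spark-7155-sketch"))

// Both directories contribute records to the same RDD; nothing is read until an action runs.
val docs  = sc.wholeTextFiles("/data/dir1,/data/dir2")
val blobs = sc.binaryFiles("/raw/part-a,/raw/part-b")

sc.stop()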

core/src/main/scala/org/apache/spark/SparkEnv.scala

Lines changed: 12 additions & 0 deletions
@@ -40,6 +40,7 @@ import org.apache.spark.scheduler.OutputCommitCoordinator.OutputCommitCoordinato
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.shuffle.{ShuffleMemoryManager, ShuffleManager}
 import org.apache.spark.storage._
+import org.apache.spark.unsafe.memory.{ExecutorMemoryManager, MemoryAllocator}
 import org.apache.spark.util.{RpcUtils, Utils}
 
 /**
@@ -69,6 +70,7 @@ class SparkEnv (
     val sparkFilesDir: String,
     val metricsSystem: MetricsSystem,
     val shuffleMemoryManager: ShuffleMemoryManager,
+    val executorMemoryManager: ExecutorMemoryManager,
     val outputCommitCoordinator: OutputCommitCoordinator,
     val conf: SparkConf) extends Logging {
 
@@ -382,6 +384,15 @@ object SparkEnv extends Logging {
       new OutputCommitCoordinatorEndpoint(rpcEnv, outputCommitCoordinator))
     outputCommitCoordinator.coordinatorRef = Some(outputCommitCoordinatorRef)
 
+    val executorMemoryManager: ExecutorMemoryManager = {
+      val allocator = if (conf.getBoolean("spark.unsafe.offHeap", false)) {
+        MemoryAllocator.UNSAFE
+      } else {
+        MemoryAllocator.HEAP
+      }
+      new ExecutorMemoryManager(allocator)
+    }
+
     val envInstance = new SparkEnv(
       executorId,
       rpcEnv,
@@ -398,6 +409,7 @@ object SparkEnv extends Logging {
       sparkFilesDir,
       metricsSystem,
       shuffleMemoryManager,
+      executorMemoryManager,
       outputCommitCoordinator,
       conf)
 
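The allocator choice above is driven entirely by one boolean setting, so opting into the off-heap allocator is a single configuration change. A minimal sketch (the key comes straight from the diff; whether off-heap allocation is appropriate for a given deployment is not addressed here):

import org.apache.spark.SparkConf

// Default is false, which keeps MemoryAllocator.HEAP; true selects MemoryAllocator.UNSAFE.
val conf = new SparkConf().set("spark.unsafe.offHeap", "true")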

core/src/main/scala/org/apache/spark/TaskContext.scala

Lines changed: 6 additions & 0 deletions
@@ -21,6 +21,7 @@ import java.io.Serializable
 
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.executor.TaskMetrics
+import org.apache.spark.unsafe.memory.TaskMemoryManager
 import org.apache.spark.util.TaskCompletionListener
 
 
@@ -133,4 +134,9 @@ abstract class TaskContext extends Serializable {
   /** ::DeveloperApi:: */
   @DeveloperApi
   def taskMetrics(): TaskMetrics
+
+  /**
+   * Returns the manager for this task's managed memory.
+   */
+  private[spark] def taskMemoryManager(): TaskMemoryManager
 }
