
Commit 4f5ac09

Author: Davies Liu
Merge branch 'master' of github.com:apache/spark into R5
2 parents: 41f8184 + 25998e4

325 files changed (+7187, −2302 lines)

Note: for large commits, some file diffs are hidden by default; only a subset of the 325 changed files is shown below.


R/pkg/R/RDD.R

Lines changed: 5 additions & 13 deletions
@@ -85,7 +85,7 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val)
 
   if (!inherits(prev, "PipelinedRDD") || !isPipelinable(prev)) {
     # This transformation is the first in its stage:
-    .Object@func <- func
+    .Object@func <- cleanClosure(func)
     .Object@prev_jrdd <- getJRDD(prev)
     .Object@env$prev_serializedMode <- prev@env$serializedMode
     # NOTE: We use prev_serializedMode to track the serialization mode of prev_JRDD
@@ -94,7 +94,7 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val)
     pipelinedFunc <- function(split, iterator) {
       func(split, prev@func(split, iterator))
     }
-    .Object@func <- pipelinedFunc
+    .Object@func <- cleanClosure(pipelinedFunc)
     .Object@prev_jrdd <- prev@prev_jrdd # maintain the pipeline
     # Get the serialization mode of the parent RDD
     .Object@env$prev_serializedMode <- prev@env$prev_serializedMode
@@ -144,17 +144,13 @@ setMethod("getJRDD", signature(rdd = "PipelinedRDD"),
             return(rdd@env$jrdd_val)
           }
 
-          computeFunc <- function(split, part) {
-            rdd@func(split, part)
-          }
-
           packageNamesArr <- serialize(.sparkREnv[[".packages"]],
                                        connection = NULL)
 
           broadcastArr <- lapply(ls(.broadcastNames),
                                  function(name) { get(name, .broadcastNames) })
 
-          serializedFuncArr <- serialize(computeFunc, connection = NULL)
+          serializedFuncArr <- serialize(rdd@func, connection = NULL)
 
           prev_jrdd <- rdd@prev_jrdd
 
@@ -279,7 +275,7 @@ setMethod("unpersist",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' setCheckpointDir(sc, "checkpoints")
+#' setCheckpointDir(sc, "checkpoint")
 #' rdd <- parallelize(sc, 1:10, 2L)
 #' checkpoint(rdd)
 #'}
@@ -551,11 +547,7 @@ setMethod("mapPartitions",
 setMethod("lapplyPartitionsWithIndex",
           signature(X = "RDD", FUN = "function"),
           function(X, FUN) {
-            FUN <- cleanClosure(FUN)
-            closureCapturingFunc <- function(split, part) {
-              FUN(split, part)
-            }
-            PipelinedRDD(X, closureCapturingFunc)
+            PipelinedRDD(X, FUN)
           })
 
 #' @rdname lapplyPartitionsWithIndex

R/pkg/R/context.R

Lines changed: 1 addition & 1 deletion
@@ -216,7 +216,7 @@ broadcast <- function(sc, object) {
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' setCheckpointDir(sc, "~/checkpoints")
+#' setCheckpointDir(sc, "~/checkpoint")
 #' rdd <- parallelize(sc, 1:2, 2L)
 #' checkpoint(rdd)
 #'}

R/pkg/R/pairRDD.R

Lines changed: 0 additions & 4 deletions
@@ -694,10 +694,6 @@ setMethod("cogroup",
             for (i in 1:rddsLen) {
               rdds[[i]] <- lapply(rdds[[i]],
                                   function(x) { list(x[[1]], list(i, x[[2]])) })
-              # TODO(hao): As issue [SparkR-142] mentions, the right value of i
-              # will not be captured into UDF if getJRDD is not invoked.
-              # It should be resolved together with that issue.
-              getJRDD(rdds[[i]]) # Capture the closure.
             }
             union.rdd <- Reduce(unionRDD, rdds)
             group.func <- function(vlist) {

R/pkg/inst/tests/test_binaryFile.R

Lines changed: 2 additions & 2 deletions
@@ -27,7 +27,7 @@ test_that("saveAsObjectFile()/objectFile() following textFile() works", {
   fileName2 <- tempfile(pattern="spark-test", fileext=".tmp")
   writeLines(mockFile, fileName1)
 
-  rdd <- textFile(sc, fileName1)
+  rdd <- textFile(sc, fileName1, 1)
   saveAsObjectFile(rdd, fileName2)
   rdd <- objectFile(sc, fileName2)
   expect_equal(collect(rdd), as.list(mockFile))
@@ -40,7 +40,7 @@ test_that("saveAsObjectFile()/objectFile() works on a parallelized list", {
   fileName <- tempfile(pattern="spark-test", fileext=".tmp")
 
   l <- list(1, 2, 3)
-  rdd <- parallelize(sc, l)
+  rdd <- parallelize(sc, l, 1)
   saveAsObjectFile(rdd, fileName)
   rdd <- objectFile(sc, fileName)
   expect_equal(collect(rdd), l)

R/pkg/inst/tests/test_rdd.R

Lines changed: 3 additions & 2 deletions
@@ -141,7 +141,8 @@ test_that("PipelinedRDD support actions: cache(), persist(), unpersist(), checkp
   unpersist(rdd2)
   expect_false(rdd2@env$isCached)
 
-  setCheckpointDir(sc, "checkpoints")
+  tempDir <- tempfile(pattern = "checkpoint")
+  setCheckpointDir(sc, tempDir)
   checkpoint(rdd2)
   expect_true(rdd2@env$isCheckpointed)
 
@@ -152,7 +153,7 @@ test_that("PipelinedRDD support actions: cache(), persist(), unpersist(), checkp
   # make sure the data is collectable
   collect(rdd2)
 
-  unlink("checkpoints")
+  unlink(tempDir)
 })
 
 test_that("reduce on RDD", {
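The test above switches from a hard-coded "checkpoints" directory under the working directory to a tempfile()-based one. For readers more familiar with the Scala API, the same pattern looks roughly like the sketch below. This is not part of the commit; the object name, master URL, and app name are illustrative stand-ins, and only public SparkContext/RDD calls are used.

import java.nio.file.Files

import org.apache.spark.{SparkConf, SparkContext}

object CheckpointTempDirSketch {
  def main(args: Array[String]): Unit = {
    // Illustrative local context; any existing SparkContext would do.
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("checkpoint-demo"))

    // Checkpoint into a throwaway temporary directory instead of a fixed
    // "checkpoints" folder under the current working directory.
    val tempDir = Files.createTempDirectory("checkpoint").toString
    sc.setCheckpointDir(tempDir)

    val rdd = sc.parallelize(1 to 10, 2).map(_ * 2)
    rdd.checkpoint()  // mark the RDD for checkpointing
    rdd.count()       // the first action writes the checkpoint files under tempDir

    sc.stop()
  }
}

Using a temporary directory keeps test runs from leaving checkpoint files behind in the source tree, which is exactly what the unlink(tempDir) cleanup in the R test relies on.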

R/pkg/inst/tests/test_textFile.R

Lines changed: 2 additions & 2 deletions
@@ -81,7 +81,7 @@ test_that("textFile() followed by a saveAsTextFile() returns the same content",
   fileName2 <- tempfile(pattern="spark-test", fileext=".tmp")
   writeLines(mockFile, fileName1)
 
-  rdd <- textFile(sc, fileName1)
+  rdd <- textFile(sc, fileName1, 1L)
   saveAsTextFile(rdd, fileName2)
   rdd <- textFile(sc, fileName2)
   expect_equal(collect(rdd), as.list(mockFile))
@@ -93,7 +93,7 @@ test_that("textFile() followed by a saveAsTextFile() returns the same content",
 test_that("saveAsTextFile() on a parallelized list works as expected", {
   fileName <- tempfile(pattern="spark-test", fileext=".tmp")
   l <- list(1, 2, 3)
-  rdd <- parallelize(sc, l)
+  rdd <- parallelize(sc, l, 1L)
   saveAsTextFile(rdd, fileName)
   rdd <- textFile(sc, fileName)
   expect_equal(collect(rdd), lapply(l, function(x) {toString(x)}))

bagel/src/test/resources/log4j.properties

Lines changed: 1 addition & 1 deletion
@@ -24,4 +24,4 @@ log4j.appender.file.layout=org.apache.log4j.PatternLayout
 log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
 
 # Ignore messages below warning level from Jetty, because it's a bit verbose
-log4j.logger.org.eclipse.jetty=WARN
+log4j.logger.org.spark-project.jetty=WARN

bin/load-spark-env.sh

Lines changed: 3 additions & 2 deletions
@@ -20,6 +20,7 @@
 # This script loads spark-env.sh if it exists, and ensures it is only loaded once.
 # spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's
 # conf/ subdirectory.
+FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
 
 if [ -z "$SPARK_ENV_LOADED" ]; then
   export SPARK_ENV_LOADED=1
@@ -41,8 +42,8 @@ fi
 
 if [ -z "$SPARK_SCALA_VERSION" ]; then
 
-  ASSEMBLY_DIR2="$SPARK_HOME/assembly/target/scala-2.11"
-  ASSEMBLY_DIR1="$SPARK_HOME/assembly/target/scala-2.10"
+  ASSEMBLY_DIR2="$FWDIR/assembly/target/scala-2.11"
+  ASSEMBLY_DIR1="$FWDIR/assembly/target/scala-2.10"
 
   if [[ -d "$ASSEMBLY_DIR2" && -d "$ASSEMBLY_DIR1" ]]; then
     echo -e "Presence of build for both scala versions(SCALA 2.10 and SCALA 2.11) detected." 1>&2

conf/log4j.properties.template

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@ log4j.appender.console.layout=org.apache.log4j.PatternLayout
 log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
 
 # Settings to quiet third party logs that are too verbose
-log4j.logger.org.eclipse.jetty=WARN
-log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.spark-project.jetty=WARN
+log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
 log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
 log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO

core/src/main/resources/org/apache/spark/log4j-defaults.properties

Lines changed: 2 additions & 2 deletions
@@ -6,7 +6,7 @@ log4j.appender.console.layout=org.apache.log4j.PatternLayout
 log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
 
 # Settings to quiet third party logs that are too verbose
-log4j.logger.org.eclipse.jetty=WARN
-log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
+log4j.logger.org.spark-project.jetty=WARN
+log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR
 log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
 log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO

core/src/main/scala/org/apache/spark/ContextCleaner.scala

Lines changed: 32 additions & 12 deletions
@@ -22,7 +22,7 @@ import java.lang.ref.{ReferenceQueue, WeakReference}
 import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer}
 
 import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.{RDDCheckpointData, RDD}
 import org.apache.spark.util.Utils
 
 /**
@@ -33,6 +33,7 @@ private case class CleanRDD(rddId: Int) extends CleanupTask
 private case class CleanShuffle(shuffleId: Int) extends CleanupTask
 private case class CleanBroadcast(broadcastId: Long) extends CleanupTask
 private case class CleanAccum(accId: Long) extends CleanupTask
+private case class CleanCheckpoint(rddId: Int) extends CleanupTask
 
 /**
  * A WeakReference associated with a CleanupTask.
@@ -94,12 +95,12 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   @volatile private var stopped = false
 
   /** Attach a listener object to get information of when objects are cleaned. */
-  def attachListener(listener: CleanerListener) {
+  def attachListener(listener: CleanerListener): Unit = {
     listeners += listener
   }
 
   /** Start the cleaner. */
-  def start() {
+  def start(): Unit = {
     cleaningThread.setDaemon(true)
     cleaningThread.setName("Spark Context Cleaner")
     cleaningThread.start()
@@ -108,7 +109,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   /**
    * Stop the cleaning thread and wait until the thread has finished running its current task.
   */
-  def stop() {
+  def stop(): Unit = {
     stopped = true
     // Interrupt the cleaning thread, but wait until the current task has finished before
     // doing so. This guards against the race condition where a cleaning thread may
@@ -121,7 +122,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   }
 
   /** Register a RDD for cleanup when it is garbage collected. */
-  def registerRDDForCleanup(rdd: RDD[_]) {
+  def registerRDDForCleanup(rdd: RDD[_]): Unit = {
     registerForCleanup(rdd, CleanRDD(rdd.id))
   }
 
@@ -130,17 +131,22 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   }
 
   /** Register a ShuffleDependency for cleanup when it is garbage collected. */
-  def registerShuffleForCleanup(shuffleDependency: ShuffleDependency[_, _, _]) {
+  def registerShuffleForCleanup(shuffleDependency: ShuffleDependency[_, _, _]): Unit = {
     registerForCleanup(shuffleDependency, CleanShuffle(shuffleDependency.shuffleId))
   }
 
   /** Register a Broadcast for cleanup when it is garbage collected. */
-  def registerBroadcastForCleanup[T](broadcast: Broadcast[T]) {
+  def registerBroadcastForCleanup[T](broadcast: Broadcast[T]): Unit = {
     registerForCleanup(broadcast, CleanBroadcast(broadcast.id))
   }
 
+  /** Register a RDDCheckpointData for cleanup when it is garbage collected. */
+  def registerRDDCheckpointDataForCleanup[T](rdd: RDD[_], parentId: Int): Unit = {
+    registerForCleanup(rdd, CleanCheckpoint(parentId))
+  }
+
   /** Register an object for cleanup. */
-  private def registerForCleanup(objectForCleanup: AnyRef, task: CleanupTask) {
+  private def registerForCleanup(objectForCleanup: AnyRef, task: CleanupTask): Unit = {
     referenceBuffer += new CleanupTaskWeakReference(task, objectForCleanup, referenceQueue)
   }
 
@@ -164,6 +170,8 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
             doCleanupBroadcast(broadcastId, blocking = blockOnCleanupTasks)
           case CleanAccum(accId) =>
             doCleanupAccum(accId, blocking = blockOnCleanupTasks)
+          case CleanCheckpoint(rddId) =>
+            doCleanCheckpoint(rddId)
         }
       }
     }
@@ -175,7 +183,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   }
 
   /** Perform RDD cleanup. */
-  def doCleanupRDD(rddId: Int, blocking: Boolean) {
+  def doCleanupRDD(rddId: Int, blocking: Boolean): Unit = {
     try {
       logDebug("Cleaning RDD " + rddId)
       sc.unpersistRDD(rddId, blocking)
@@ -187,7 +195,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   }
 
   /** Perform shuffle cleanup, asynchronously. */
-  def doCleanupShuffle(shuffleId: Int, blocking: Boolean) {
+  def doCleanupShuffle(shuffleId: Int, blocking: Boolean): Unit = {
     try {
       logDebug("Cleaning shuffle " + shuffleId)
       mapOutputTrackerMaster.unregisterShuffle(shuffleId)
@@ -200,7 +208,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   }
 
   /** Perform broadcast cleanup. */
-  def doCleanupBroadcast(broadcastId: Long, blocking: Boolean) {
+  def doCleanupBroadcast(broadcastId: Long, blocking: Boolean): Unit = {
     try {
       logDebug(s"Cleaning broadcast $broadcastId")
       broadcastManager.unbroadcast(broadcastId, true, blocking)
@@ -212,7 +220,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   }
 
   /** Perform accumulator cleanup. */
-  def doCleanupAccum(accId: Long, blocking: Boolean) {
+  def doCleanupAccum(accId: Long, blocking: Boolean): Unit = {
     try {
       logDebug("Cleaning accumulator " + accId)
       Accumulators.remove(accId)
@@ -223,6 +231,18 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
     }
   }
 
+  /** Perform checkpoint cleanup. */
+  def doCleanCheckpoint(rddId: Int): Unit = {
+    try {
+      logDebug("Cleaning rdd checkpoint data " + rddId)
+      RDDCheckpointData.clearRDDCheckpointData(sc, rddId)
+      logInfo("Cleaned rdd checkpoint data " + rddId)
+    }
+    catch {
+      case e: Exception => logError("Error cleaning rdd checkpoint data " + rddId, e)
+    }
+  }
+
   private def blockManagerMaster = sc.env.blockManager.master
   private def broadcastManager = sc.env.broadcastManager
   private def mapOutputTrackerMaster = sc.env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
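The new CleanCheckpoint task plugs into the same weak-reference machinery the cleaner already uses for RDDs, shuffles, broadcasts, and accumulators: a CleanupTaskWeakReference ties a cleanup description to a driver-side object, and a daemon thread drains the reference queue as objects get garbage collected. Below is a minimal, self-contained sketch of that pattern, heavily simplified; CleanupRef and CleanupSketch are invented names for illustration, not Spark classes.

import java.lang.ref.{ReferenceQueue, WeakReference}

// Simplified stand-in for Spark's private CleanupTask hierarchy.
sealed trait CleanupTask
case class CleanCheckpoint(rddId: Int) extends CleanupTask

// Pairs a cleanup description with a weak reference to the object whose
// collection should trigger it (analogous to CleanupTaskWeakReference).
class CleanupRef(val task: CleanupTask, referent: AnyRef, queue: ReferenceQueue[AnyRef])
  extends WeakReference[AnyRef](referent, queue)

object CleanupSketch {
  def main(args: Array[String]): Unit = {
    val queue = new ReferenceQueue[AnyRef]
    // Keep the CleanupRef objects strongly reachable so only the referents can be collected.
    var buffer = List.empty[CleanupRef]

    var checkpointedRdd: AnyRef = new Object  // stand-in for a checkpointed RDD
    buffer ::= new CleanupRef(CleanCheckpoint(42), checkpointedRdd, queue)
    println(s"registered ${buffer.size} cleanup task(s)")

    checkpointedRdd = null  // drop the last strong reference to the referent
    System.gc()             // only a hint; GC timing is not guaranteed

    // ContextCleaner runs this loop on a daemon thread; poll once here for brevity.
    Option(queue.remove(1000)).map(_.asInstanceOf[CleanupRef]).foreach { ref =>
      ref.task match {
        case CleanCheckpoint(id) => println(s"would delete checkpoint files of RDD $id")
      }
    }
  }
}

In the real cleaner, the matched CleanCheckpoint task dispatches to doCleanCheckpoint, which calls RDDCheckpointData.clearRDDCheckpointData as shown in the diff above.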
