
Commit 4526acc

George authored and committed
Merge branch 'master' of github.com:apache/spark into SPARK-7422
2 parents: df9538a + 97dee31 · commit 4526acc


148 files changed: +6757 / -1010 lines


R/pkg/R/DataFrame.R

Lines changed: 4 additions & 3 deletions
@@ -150,7 +150,7 @@ setMethod("isLocal",
             callJMethod(x@sdf, "isLocal")
           })
 
-#' ShowDF
+#' showDF
 #'
 #' Print the first numRows rows of a DataFrame
 #'
@@ -170,7 +170,8 @@ setMethod("isLocal",
 setMethod("showDF",
           signature(x = "DataFrame"),
           function(x, numRows = 20) {
-            callJMethod(x@sdf, "showString", numToInt(numRows))
+            s <- callJMethod(x@sdf, "showString", numToInt(numRows))
+            cat(s)
           })
 
 #' show
@@ -187,7 +188,7 @@ setMethod("showDF",
 #' sqlCtx <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
 #' df <- jsonFile(sqlCtx, path)
-#' show(df)
+#' df
 #'}
 setMethod("show", "DataFrame",
           function(object) {
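With this change showDF fetches the formatted table from the JVM DataFrame's showString and cats it, so the rows print as a plain table on R's stdout rather than coming back as an escaped string value. For comparison, a minimal Scala sketch of the equivalent JVM-side printing path (sqlContext and the path are assumptions, Spark 1.x API):

    // Assumes an existing SQLContext named sqlContext; the path is illustrative.
    val df = sqlContext.jsonFile("path/to/file.json")  // Spark 1.x JSON loader
    df.show(20)                                        // prints the first 20 rows to stdout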

R/pkg/R/RDD.R

Lines changed: 2 additions & 2 deletions
@@ -67,8 +67,8 @@ setMethod("initialize", "RDD", function(.Object, jrdd, serializedMode,
 })
 
 setMethod("show", "RDD",
-          function(.Object) {
-              cat(paste(callJMethod(.Object@jrdd, "toString"), "\n", sep=""))
+          function(object) {
+              cat(paste(callJMethod(getJRDD(object), "toString"), "\n", sep=""))
 })
 
 setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val) {

R/pkg/R/group.R

Lines changed: 1 addition & 3 deletions
@@ -102,9 +102,7 @@ setMethod("agg",
         }
       }
       jcols <- lapply(cols, function(c) { c@jc })
-      # the GroupedData.agg(col, cols*) API does not contain grouping Column
-      sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "aggWithGrouping",
-                         x@sgd, listToSeq(jcols))
+      sdf <- callJMethod(x@sgd, "agg", jcols[[1]], listToSeq(jcols[-1]))
     } else {
       stop("agg can only support Column or character")
     }
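The aggregation now calls Spark SQL's GroupedData.agg(col, cols*) directly instead of the SQLUtils.aggWithGrouping helper, which is why the first Column is passed on its own and the remaining ones as a Seq. A minimal Scala sketch of the same call shape (df and its dept/salary columns are assumptions for illustration):

    import org.apache.spark.sql.functions.{avg, max}

    // df is an assumed DataFrame with "dept" and "salary" columns.
    val exprs = Seq(max(df("salary")), avg(df("salary")))

    // agg takes one Column plus a varargs tail -- the same split the R code
    // makes with jcols[[1]] and listToSeq(jcols[-1]).
    val aggregated = df.groupBy("dept").agg(exprs.head, exprs.tail: _*)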

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 2 additions & 1 deletion
@@ -653,7 +653,8 @@ test_that("toJSON() returns an RDD of the correct values", {
 
 test_that("showDF()", {
   df <- jsonFile(sqlCtx, jsonPath)
-  expect_output(showDF(df), "+----+-------+\n| age| name|\n+----+-------+\n|null|Michael|\n| 30| Andy|\n| 19| Justin|\n+----+-------+\n")
+  s <- capture.output(showDF(df))
+  expect_output(s , "+----+-------+\n| age| name|\n+----+-------+\n|null|Michael|\n| 30| Andy|\n| 19| Justin|\n+----+-------+\n")
 })
 
 test_that("isLocal()", {

README.md

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ can be run using:
     ./dev/run-tests
 
 Please see the guidance on how to
-[run all automated tests](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-AutomatedTesting).
+[run tests for a module, or individual tests](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools).
 
 ## A Note About Hadoop Versions
 
core/pom.xml

Lines changed: 0 additions & 47 deletions
@@ -381,35 +381,6 @@
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
     <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
     <plugins>
-      <!-- Unzip py4j so we can include its files in the jar -->
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-antrun-plugin</artifactId>
-        <executions>
-          <execution>
-            <phase>generate-resources</phase>
-            <goals>
-              <goal>run</goal>
-            </goals>
-          </execution>
-        </executions>
-        <configuration>
-          <target>
-            <unzip src="../python/lib/py4j-0.8.2.1-src.zip" dest="../python/build" />
-          </target>
-        </configuration>
-      </plugin>
-      <plugin>
-        <artifactId>maven-clean-plugin</artifactId>
-        <configuration>
-          <filesets>
-            <fileset>
-              <directory>${basedir}/../python/build</directory>
-            </fileset>
-          </filesets>
-          <verbose>true</verbose>
-        </configuration>
-      </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-dependency-plugin</artifactId>
@@ -438,24 +409,6 @@
         </executions>
       </plugin>
     </plugins>
-
-    <resources>
-      <resource>
-        <directory>src/main/resources</directory>
-      </resource>
-      <resource>
-        <directory>../python</directory>
-        <includes>
-          <include>pyspark/*.py</include>
-        </includes>
-      </resource>
-      <resource>
-        <directory>../python/build</directory>
-        <includes>
-          <include>py4j/*.py</include>
-        </includes>
-      </resource>
-    </resources>
   </build>
 
   <profiles>

core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js

Lines changed: 6 additions & 8 deletions
(Generated file; diff not rendered.)

core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js

Lines changed: 33 additions & 47 deletions
@@ -140,8 +140,6 @@ function renderDagViz(forJob) {
     svg.selectAll("#" + nodeId).classed("cached", true);
   });
 
-  // More post-processing
-  drawClusterLabels(svg, forJob);
   resizeSvg(svg);
 }
 
@@ -151,7 +149,7 @@ function renderDagVizForStage(svgContainer) {
   var dot = metadata.select(".dot-file").text();
   var containerId = VizConstants.graphPrefix + metadata.attr("stage-id");
   var container = svgContainer.append("g").attr("id", containerId);
-  renderDot(dot, container, StagePageVizConstants.rankSep);
+  renderDot(dot, container, false);
 
   // Round corners on rectangles
   svgContainer
@@ -209,7 +207,7 @@ function renderDagVizForJob(svgContainer) {
     }
 
     // Actually render the stage
-    renderDot(dot, container, JobPageVizConstants.rankSep);
+    renderDot(dot, container, true);
 
     // Round corners on rectangles
     container
@@ -231,14 +229,14 @@ function renderDagVizForJob(svgContainer) {
 }
 
 /* Render the dot file as an SVG in the given container. */
-function renderDot(dot, container, rankSep) {
+function renderDot(dot, container, forJob) {
   var escaped_dot = dot
     .replace(/&lt;/g, "<")
     .replace(/&gt;/g, ">")
     .replace(/&quot;/g, "\"");
   var g = graphlibDot.read(escaped_dot);
-  g.graph().rankSep = rankSep;
   var renderer = new dagreD3.render();
+  preprocessGraphLayout(g, forJob);
   renderer(container, g);
 }
 
@@ -251,50 +249,38 @@ function graphContainer() { return d3.select("#dag-viz-graph"); }
 function metadataContainer() { return d3.select("#dag-viz-metadata"); }
 
 /*
- * Helper function to create draw a label for each cluster.
- *
- * We need to do this manually because dagre-d3 does not support labeling clusters.
- * In general, the clustering support for dagre-d3 is quite limited at this point.
+ * Helper function to pre-process the graph layout.
+ * This step is necessary for certain styles that affect the positioning
+ * and sizes of graph elements, e.g. padding, font style, shape.
  */
-function drawClusterLabels(svgContainer, forJob) {
-  var clusterLabelSize, stageClusterLabelSize;
+function preprocessGraphLayout(g, forJob) {
+  var nodes = g.nodes();
+  for (var i = 0; i < nodes.length; i++) {
+    var isCluster = g.children(nodes[i]).length > 0;
+    if (!isCluster) {
+      var node = g.node(nodes[i]);
+      if (forJob) {
+        // Do not display RDD name on job page
+        node.shape = "circle";
+        node.labelStyle = "font-size: 0px";
+      } else {
+        node.labelStyle = "font-size: 12px";
+      }
+      node.padding = "5";
+    }
+  }
+  // Curve the edges
+  var edges = g.edges();
+  for (var j = 0; j < edges.length; j++) {
+    var edge = g.edge(edges[j]);
+    edge.lineInterpolate = "basis";
+  }
+  // Adjust vertical separation between nodes
   if (forJob) {
-    clusterLabelSize = JobPageVizConstants.clusterLabelSize;
-    stageClusterLabelSize = JobPageVizConstants.stageClusterLabelSize;
+    g.graph().rankSep = JobPageVizConstants.rankSep;
   } else {
-    clusterLabelSize = StagePageVizConstants.clusterLabelSize;
-    stageClusterLabelSize = StagePageVizConstants.stageClusterLabelSize;
+    g.graph().rankSep = StagePageVizConstants.rankSep;
   }
-  svgContainer.selectAll("g.cluster").each(function() {
-    var cluster = d3.select(this);
-    var isStage = cluster.attr("id").indexOf(VizConstants.stageClusterPrefix) > -1;
-    var labelSize = isStage ? stageClusterLabelSize : clusterLabelSize;
-    drawClusterLabel(cluster, labelSize);
-  });
-}
-
-/*
- * Helper function to draw a label for the given cluster element based on its name.
- *
- * In the process, we need to expand the bounding box to make room for the label.
- * We need to do this because dagre-d3 did not take this into account when it first
- * rendered the bounding boxes. Note that this means we need to adjust the view box
- * of the SVG afterwards since we shifted a few boxes around.
- */
-function drawClusterLabel(d3cluster, fontSize) {
-  var cluster = d3cluster;
-  var rect = d3cluster.select("rect");
-  rect.attr("y", toFloat(rect.attr("y")) - fontSize);
-  rect.attr("height", toFloat(rect.attr("height")) + fontSize);
-  var labelX = toFloat(rect.attr("x")) + toFloat(rect.attr("width")) - fontSize / 2;
-  var labelY = toFloat(rect.attr("y")) + fontSize * 1.5;
-  var labelText = cluster.attr("name").replace(VizConstants.clusterPrefix, "");
-  cluster.append("text")
-    .attr("x", labelX)
-    .attr("y", labelY)
-    .attr("text-anchor", "end")
-    .style("font-size", fontSize + "px")
-    .text(labelText);
 }
 
 /*
@@ -444,7 +430,7 @@ function addTooltipsForRDDs(svgContainer) {
     if (tooltipText) {
      node.select("circle")
        .attr("data-toggle", "tooltip")
-        .attr("data-placement", "right")
+        .attr("data-placement", "bottom")
        .attr("title", tooltipText)
     }
   });

core/src/main/scala/org/apache/spark/api/python/PythonUtils.scala

Lines changed: 8 additions & 1 deletion
@@ -31,7 +31,7 @@ private[spark] object PythonUtils {
   def sparkPythonPath: String = {
     val pythonPath = new ArrayBuffer[String]
     for (sparkHome <- sys.env.get("SPARK_HOME")) {
-      pythonPath += Seq(sparkHome, "python").mkString(File.separator)
+      pythonPath += Seq(sparkHome, "python", "lib", "pyspark.zip").mkString(File.separator)
       pythonPath += Seq(sparkHome, "python", "lib", "py4j-0.8.2.1-src.zip").mkString(File.separator)
     }
     pythonPath ++= SparkContext.jarOfObject(this)
@@ -53,4 +53,11 @@
   def toSeq[T](cols: JList[T]): Seq[T] = {
     cols.toList.toSeq
   }
+
+  /**
+   * Convert java map of K, V into Map of K, V (for calling API with varargs)
+   */
+  def toScalaMap[K, V](jm: java.util.Map[K, V]): Map[K, V] = {
+    jm.toMap
+  }
 }
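The new toScalaMap turns a java.util.Map arriving from the Python or R backend into an immutable Scala Map so it can be handed to Scala APIs that expect a Map (or splatted into varargs). A small self-contained sketch of the same conversion; the option names below are made up for illustration:

    import scala.collection.JavaConversions._

    // Same shape as PythonUtils.toScalaMap: a Java map from the backend side
    // becomes an immutable Scala Map.
    def toScalaMap[K, V](jm: java.util.Map[K, V]): Map[K, V] = jm.toMap

    val javaOptions = new java.util.HashMap[String, String]()
    javaOptions.put("path", "/tmp/people.json")   // illustrative option
    javaOptions.put("mode", "overwrite")          // illustrative option
    val scalaOptions: Map[String, String] = toScalaMap(javaOptions)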

core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala

Lines changed: 41 additions & 11 deletions
@@ -22,22 +22,22 @@ import java.lang.reflect.Method
 import java.security.PrivilegedExceptionAction
 import java.util.{Arrays, Comparator}
 
+import scala.collection.JavaConversions._
+import scala.concurrent.duration._
+import scala.language.postfixOps
+
 import com.google.common.primitives.Longs
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
 import org.apache.hadoop.fs.FileSystem.Statistics
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
 import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
 import org.apache.hadoop.mapred.JobConf
 import org.apache.hadoop.mapreduce.JobContext
 import org.apache.hadoop.security.{Credentials, UserGroupInformation}
 
-import org.apache.spark.{Logging, SparkConf, SparkException}
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.util.Utils
-
-import scala.collection.JavaConversions._
-import scala.concurrent.duration._
-import scala.language.postfixOps
+import org.apache.spark.{Logging, SparkConf, SparkException}
 
 /**
  * :: DeveloperApi ::
@@ -199,13 +199,43 @@ class SparkHadoopUtil extends Logging {
    * that file.
    */
   def listLeafStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = {
-    def recurse(path: Path): Array[FileStatus] = {
-      val (directories, leaves) = fs.listStatus(path).partition(_.isDir)
-      leaves ++ directories.flatMap(f => listLeafStatuses(fs, f.getPath))
+    listLeafStatuses(fs, fs.getFileStatus(basePath))
+  }
+
+  /**
+   * Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the
+   * given path points to a file, return a single-element collection containing [[FileStatus]] of
+   * that file.
+   */
+  def listLeafStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = {
+    def recurse(status: FileStatus): Seq[FileStatus] = {
+      val (directories, leaves) = fs.listStatus(status.getPath).partition(_.isDir)
+      leaves ++ directories.flatMap(f => listLeafStatuses(fs, f))
     }
 
-    val baseStatus = fs.getFileStatus(basePath)
-    if (baseStatus.isDir) recurse(basePath) else Array(baseStatus)
+    if (baseStatus.isDir) recurse(baseStatus) else Seq(baseStatus)
+  }
+
+  def listLeafDirStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = {
+    listLeafDirStatuses(fs, fs.getFileStatus(basePath))
+  }
+
+  def listLeafDirStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = {
+    def recurse(status: FileStatus): Seq[FileStatus] = {
+      val (directories, files) = fs.listStatus(status.getPath).partition(_.isDir)
+      val leaves = if (directories.isEmpty) Seq(status) else Seq.empty[FileStatus]
+      leaves ++ directories.flatMap(dir => listLeafDirStatuses(fs, dir))
+    }
+
+    assert(baseStatus.isDir)
+    recurse(baseStatus)
+  }
+
+  def globPath(pattern: Path): Seq[Path] = {
+    val fs = pattern.getFileSystem(conf)
+    Option(fs.globStatus(pattern)).map { statuses =>
+      statuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq
+    }.getOrElse(Seq.empty[Path])
   }
 
   /**
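The Path-based listLeafStatuses now delegates to a new FileStatus-based overload so the recursion reuses the statuses it already has, listLeafDirStatuses collects only the deepest directories, and globPath expands a glob into fully qualified paths (empty when nothing matches). A rough driver-side usage sketch; the paths below are purely illustrative:

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.{FileSystem, Path}
    import org.apache.spark.deploy.SparkHadoopUtil

    val hadoopUtil = SparkHadoopUtil.get              // process-wide helper instance
    val fs = FileSystem.get(new Configuration())

    // Every file (leaf) under a possibly nested directory tree.
    val leafFiles = hadoopUtil.listLeafStatuses(fs, new Path("/data/events"))

    // Only the deepest directories, e.g. the partition directories of a table layout.
    val leafDirs = hadoopUtil.listLeafDirStatuses(fs, new Path("/data/events"))

    // Expand a glob into fully qualified paths; empty when nothing matches.
    val matches = hadoopUtil.globPath(new Path("/data/events/year=2015/month=*"))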
