
Commit

Merge branch 'master' of https://github.com/apache/spark into sparkr-cran-changes
shivaram committed Jul 14, 2016
2 parents 3299242 + 91575ca commit e4ad2ec
Showing 265 changed files with 4,085 additions and 2,509 deletions.
6 changes: 2 additions & 4 deletions R/pkg/NAMESPACE
@@ -341,9 +341,8 @@ export("partitionBy",
"rowsBetween",
"rangeBetween")

export("window.partitionBy",
"window.orderBy")

export("windowPartitionBy",
"windowOrderBy")

S3method(print, jobj)
S3method(print, structField)
@@ -353,4 +352,3 @@ S3method(structField, character)
S3method(structField, jobj)
S3method(structType, jobj)
S3method(structType, structField)
# window.orderBy window.partitionBy
4 changes: 3 additions & 1 deletion R/pkg/R/SQLContext.R
@@ -48,7 +48,9 @@ getInternalType <- function(x) {
#' @return whatever the target returns
#' @noRd
dispatchFunc <- function(newFuncSig, x, ...) {
funcName <- as.character(sys.call(sys.parent())[[1]])
# When called with SparkR::createDataFrame, sys.call()[[1]] returns c(::, SparkR, createDataFrame)
callsite <- as.character(sys.call(sys.parent())[[1]])
funcName <- callsite[[length(callsite)]]
f <- get(paste0(funcName, ".default"))
# Strip sqlContext from list of parameters and then pass the rest along.
contextNames <- c("org.apache.spark.sql.SQLContext",
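A standalone base-R sketch (not part of this commit) of why taking the last element of the coerced callsite recovers the bare function name for both plain and SparkR::-qualified calls; the variable names below are purely illustrative.

# Plain call: the callsite coerces to a single string.
plain <- as.character(quote(createDataFrame))
plain[[length(plain)]]            # "createDataFrame"

# Namespaced call: the callsite coerces to c("::", "SparkR", "createDataFrame").
namespaced <- as.character(quote(SparkR::createDataFrame))
namespaced[[length(namespaced)]]  # "createDataFrame"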
4 changes: 2 additions & 2 deletions R/pkg/R/WindowSpec.R
@@ -22,10 +22,10 @@ NULL

#' S4 class that represents a WindowSpec
#'
#' WindowSpec can be created by using window.partitionBy() or window.orderBy()
#' WindowSpec can be created by using windowPartitionBy() or windowOrderBy()
#'
#' @rdname WindowSpec
#' @seealso \link{window.partitionBy}, \link{window.orderBy}
#' @seealso \link{windowPartitionBy}, \link{windowOrderBy}
#'
#' @param sws A Java object reference to the backing Scala WindowSpec
#' @export
10 changes: 4 additions & 6 deletions R/pkg/R/generics.R
@@ -779,13 +779,13 @@ setGeneric("rowsBetween", function(x, start, end) { standardGeneric("rowsBetween
#' @export
setGeneric("rangeBetween", function(x, start, end) { standardGeneric("rangeBetween") })

#' @rdname window.partitionBy
#' @rdname windowPartitionBy
#' @export
setGeneric("window.partitionBy", function(x, ...) { standardGeneric("window.partitionBy") })
setGeneric("windowPartitionBy", function(col, ...) { standardGeneric("windowPartitionBy") })

#' @rdname window.orderBy
#' @rdname windowOrderBy
#' @export
setGeneric("window.orderBy", function(x, ...) { standardGeneric("window.orderBy") })
setGeneric("windowOrderBy", function(col, ...) { standardGeneric("windowOrderBy") })

###################### Expression Function Methods ##########################

@@ -1255,7 +1255,6 @@ setGeneric("spark.glm", function(data, formula, ...) { standardGeneric("spark.gl
#' @export
setGeneric("glm")

#' predict
#' @rdname predict
#' @export
setGeneric("predict", function(object, ...) { standardGeneric("predict") })
@@ -1280,7 +1279,6 @@ setGeneric("spark.naiveBayes", function(data, formula, ...) { standardGeneric("s
#' @export
setGeneric("spark.survreg", function(data, formula, ...) { standardGeneric("spark.survreg") })

#' write.ml
#' @rdname write.ml
#' @export
setGeneric("write.ml", function(object, path, ...) { standardGeneric("write.ml") })
44 changes: 35 additions & 9 deletions R/pkg/R/mllib.R
@@ -53,10 +53,34 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj"))
#' @note KMeansModel since 2.0.0
setClass("KMeansModel", representation(jobj = "jobj"))

#' Saves the MLlib model to the input path
#'
#' Saves the MLlib model to the input path. For more information, see the specific
#' MLlib model below.
#' @rdname write.ml
#' @name write.ml
#' @export
#' @seealso \link{spark.glm}, \link{glm}
#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg}
#' @seealso \link{read.ml}
NULL

#' Makes predictions from a MLlib model
#'
#' Makes predictions from a MLlib model. For more information, see the specific
#' MLlib model below.
#' @rdname predict
#' @name predict
#' @export
#' @seealso \link{spark.glm}, \link{glm}
#' @seealso \link{spark.kmeans}, \link{spark.naiveBayes}, \link{spark.survreg}
NULL

#' Generalized Linear Models
#'
#' Fits generalized linear model against a Spark DataFrame. Users can print, make predictions on the
#' produced model and save the model to the input path.
#' Fits generalized linear model against a Spark DataFrame.
#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
#'
#' @param data SparkDataFrame for training.
#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
@@ -146,7 +170,7 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat
})

# Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().
#'

#' @param object A fitted generalized linear model
#' @return \code{summary} returns a summary object of the fitted model, a list of components
#' including at least the coefficients, null/residual deviance, null/residual degrees
@@ -186,7 +210,7 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
})

# Prints the summary of GeneralizedLinearRegressionModel
#'

#' @rdname spark.glm
#' @param x Summary object of fitted generalized linear model returned by \code{summary} function
#' @export
@@ -271,7 +295,8 @@ setMethod("summary", signature(object = "NaiveBayesModel"),
#' K-Means Clustering Model
#'
#' Fits a k-means clustering model against a Spark DataFrame, similarly to R's kmeans().
#' Users can print, make predictions on the produced model and save the model to the input path.
#' Users can call \code{summary} to print a summary of the fitted model, \code{predict} to make
#' predictions on new data, and \code{write.ml}/\code{read.ml} to save/load fitted models.
#'
#' @param data SparkDataFrame for training
#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
@@ -345,7 +370,7 @@ setMethod("fitted", signature(object = "KMeansModel"),
})

# Get the summary of a k-means model
#'

#' @param object A fitted k-means model
#' @return \code{summary} returns the model's coefficients, size and cluster
#' @rdname spark.kmeans
@@ -372,7 +397,7 @@ setMethod("summary", signature(object = "KMeansModel"),
})

# Predicted values based on a k-means model
#'

#' @return \code{predict} returns the predicted values based on a k-means model
#' @rdname spark.kmeans
#' @export
@@ -466,7 +491,7 @@ setMethod("write.ml", signature(object = "AFTSurvivalRegressionModel", path = "c
})

# Saves the generalized linear model to the input path.
#'

#' @param path The directory where the model is saved
#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE
#' which means throw exception if the output path exists.
@@ -484,7 +509,7 @@ setMethod("write.ml", signature(object = "GeneralizedLinearRegressionModel", pat
})

# Save fitted MLlib model to the input path
#'

#' @param path The directory where the model is saved
#' @param overwrite Overwrites or not if the output path already exists. Default is FALSE
#' which means throw exception if the output path exists.
@@ -508,6 +533,7 @@ setMethod("write.ml", signature(object = "KMeansModel", path = "character"),
#' @rdname read.ml
#' @name read.ml
#' @export
#' @seealso \link{write.ml}
#' @examples
#' \dontrun{
#' path <- "path/to/model"
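A short usage sketch (not part of this commit) of the workflow the shared write.ml and predict docs above describe; it assumes a local Spark session and a writable temporary directory, and the data and save path are made up for illustration.

library(SparkR)
sparkR.session()

# Toy training data, purely illustrative.
df <- createDataFrame(data.frame(x = c(1, 2, 3, 4), y = c(2.1, 3.9, 6.2, 7.8)))

model <- spark.glm(df, y ~ x, family = "gaussian")    # fit
summary(model)                                        # print a summary of the fitted model
head(collect(predict(model, df)))                     # make predictions on (new) data

modelPath <- file.path(tempdir(), "spark-glm-model")  # hypothetical save location
write.ml(model, modelPath)                            # save the fitted model
sameModel <- read.ml(modelPath)                       # load it back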
84 changes: 47 additions & 37 deletions R/pkg/R/window.R
@@ -17,42 +17,47 @@

# window.R - Utility functions for defining window in DataFrames

#' window.partitionBy
#' windowPartitionBy
#'
#' Creates a WindowSpec with the partitioning defined.
#'
#' @rdname window.partitionBy
#' @name window.partitionBy
#' @aliases window.partitionBy,character-method
#' @param col A column name or Column by which rows are partitioned to
#' windows.
#' @param ... Optional column names or Columns in addition to col, by
#' which rows are partitioned to windows.
#'
#' @rdname windowPartitionBy
#' @name windowPartitionBy
#' @aliases windowPartitionBy,character-method
#' @export
#' @examples
#' \dontrun{
#' ws <- window.partitionBy("key1", "key2")
#' ws <- windowPartitionBy("key1", "key2")
#' df1 <- select(df, over(lead("value", 1), ws))
#'
#' ws <- window.partitionBy(df$key1, df$key2)
#' ws <- windowPartitionBy(df$key1, df$key2)
#' df1 <- select(df, over(lead("value", 1), ws))
#' }
#' @note window.partitionBy(character) since 2.0.0
setMethod("window.partitionBy",
signature(x = "character"),
function(x, ...) {
#' @note windowPartitionBy(character) since 2.0.0
setMethod("windowPartitionBy",
signature(col = "character"),
function(col, ...) {
windowSpec(
callJStatic("org.apache.spark.sql.expressions.Window",
"partitionBy",
x,
list(...)))
})

#' @rdname window.partitionBy
#' @name window.partitionBy
#' @aliases window.partitionBy,Column-method
#' @rdname windowPartitionBy
#' @name windowPartitionBy
#' @aliases windowPartitionBy,Column-method
#' @export
#' @note window.partitionBy(Column) since 2.0.0
setMethod("window.partitionBy",
signature(x = "Column"),
function(x, ...) {
jcols <- lapply(list(x, ...), function(c) {
#' @note windowPartitionBy(Column) since 2.0.0
setMethod("windowPartitionBy",
signature(col = "Column"),
function(col, ...) {
jcols <- lapply(list(col, ...), function(c) {
c@jc
})
windowSpec(
@@ -61,42 +66,47 @@ setMethod("window.partitionBy",
jcols))
})

#' window.orderBy
#' windowOrderBy
#'
#' Creates a WindowSpec with the ordering defined.
#'
#' @rdname window.orderBy
#' @name window.orderBy
#' @aliases window.orderBy,character-method
#' @param col A column name or Column by which rows are ordered within
#' windows.
#' @param ... Optional column names or Columns in addition to col, by
#' which rows are ordered within windows.
#'
#' @rdname windowOrderBy
#' @name windowOrderBy
#' @aliases windowOrderBy,character-method
#' @export
#' @examples
#' \dontrun{
#' ws <- window.orderBy("key1", "key2")
#' ws <- windowOrderBy("key1", "key2")
#' df1 <- select(df, over(lead("value", 1), ws))
#'
#' ws <- window.orderBy(df$key1, df$key2)
#' ws <- windowOrderBy(df$key1, df$key2)
#' df1 <- select(df, over(lead("value", 1), ws))
#' }
#' @note window.orderBy(character) since 2.0.0
setMethod("window.orderBy",
signature(x = "character"),
function(x, ...) {
#' @note windowOrderBy(character) since 2.0.0
setMethod("windowOrderBy",
signature(col = "character"),
function(col, ...) {
windowSpec(
callJStatic("org.apache.spark.sql.expressions.Window",
"orderBy",
x,
col,
list(...)))
})

#' @rdname window.orderBy
#' @name window.orderBy
#' @aliases window.orderBy,Column-method
#' @rdname windowOrderBy
#' @name windowOrderBy
#' @aliases windowOrderBy,Column-method
#' @export
#' @note window.orderBy(Column) since 2.0.0
setMethod("window.orderBy",
signature(x = "Column"),
function(x, ...) {
jcols <- lapply(list(x, ...), function(c) {
#' @note windowOrderBy(Column) since 2.0.0
setMethod("windowOrderBy",
signature(col = "Column"),
function(col, ...) {
jcols <- lapply(list(col, ...), function(c) {
c@jc
})
windowSpec(
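A brief sketch (not part of this commit) combining the renamed builders with an ordering clause and a window function, mirroring the pattern exercised in the test_sparkSQL.R change further down; it assumes an active Spark session.

library(SparkR)
sparkR.session()

df <- createDataFrame(list(list(1L, "1"), list(2L, "2"), list(1L, "1"), list(2L, "2")),
                      schema = c("key", "value"))

# Partition rows by "key", then order within each partition by "value".
ws <- orderBy(windowPartitionBy("key"), "value")

# Apply a window function over the spec: the next "value" within each partition.
result <- collect(select(df, over(lead("value", 1), ws)))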
2 changes: 1 addition & 1 deletion R/pkg/inst/tests/testthat/jarTest.R
@@ -16,7 +16,7 @@
#
library(SparkR)

sparkSession <- sparkR.session()
sparkR.session()

helloTest <- SparkR:::callJStatic("sparkR.test.hello",
"helloWorld",
2 changes: 1 addition & 1 deletion R/pkg/inst/tests/testthat/packageInAJarTest.R
@@ -17,7 +17,7 @@
library(SparkR)
library(sparkPackageTest)

sparkSession <- sparkR.session()
sparkR.session()

run1 <- myfunc(5L)

13 changes: 7 additions & 6 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -237,7 +237,7 @@ test_that("read csv as DataFrame", {
"Empty,Dummy,Placeholder")
writeLines(mockLinesCsv, csvPath)

df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.string = "Empty")
df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "Empty")
expect_equal(count(df2), 4)
withoutna2 <- na.omit(df2, how = "any", cols = "year")
expect_equal(count(withoutna2), 3)
@@ -2376,25 +2376,25 @@ test_that("gapply() and gapplyCollect() on a DataFrame", {
test_that("Window functions on a DataFrame", {
df <- createDataFrame(list(list(1L, "1"), list(2L, "2"), list(1L, "1"), list(2L, "2")),
schema = c("key", "value"))
ws <- orderBy(window.partitionBy("key"), "value")
ws <- orderBy(windowPartitionBy("key"), "value")
result <- collect(select(df, over(lead("key", 1), ws), over(lead("value", 1), ws)))
names(result) <- c("key", "value")
expected <- data.frame(key = c(1L, NA, 2L, NA),
value = c("1", NA, "2", NA),
stringsAsFactors = FALSE)
expect_equal(result, expected)

ws <- orderBy(window.partitionBy(df$key), df$value)
ws <- orderBy(windowPartitionBy(df$key), df$value)
result <- collect(select(df, over(lead("key", 1), ws), over(lead("value", 1), ws)))
names(result) <- c("key", "value")
expect_equal(result, expected)

ws <- partitionBy(window.orderBy("value"), "key")
ws <- partitionBy(windowOrderBy("value"), "key")
result <- collect(select(df, over(lead("key", 1), ws), over(lead("value", 1), ws)))
names(result) <- c("key", "value")
expect_equal(result, expected)

ws <- partitionBy(window.orderBy(df$value), df$key)
ws <- partitionBy(windowOrderBy(df$value), df$key)
result <- collect(select(df, over(lead("key", 1), ws), over(lead("value", 1), ws)))
names(result) <- c("key", "value")
expect_equal(result, expected)
@@ -2405,7 +2405,8 @@ test_that("createDataFrame sqlContext parameter backward compatibility", {
a <- 1:3
b <- c("a", "b", "c")
ldf <- data.frame(a, b)
df <- suppressWarnings(createDataFrame(sqlContext, ldf))
# Call function with namespace :: operator - SPARK-16538
df <- suppressWarnings(SparkR::createDataFrame(sqlContext, ldf))
expect_equal(columns(df), c("a", "b"))
expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
expect_equal(count(df), 3)
5 changes: 5 additions & 0 deletions common/network-shuffle/pom.xml
@@ -54,6 +54,11 @@
<artifactId>jackson-databind</artifactId>
</dependency>

<dependency>
<groupId>io.dropwizard.metrics</groupId>
<artifactId>metrics-core</artifactId>
</dependency>

<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>