Commit

Merge branch 'master' of https://github.com/apache/spark into SPARK-3…
Fokko committed Dec 4, 2020
2 parents c75cd57 + 976e897 commit 11f3790
Showing 569 changed files with 14,138 additions and 5,126 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
@@ -153,7 +153,7 @@ jobs:
name: "Build modules: ${{ matrix.modules }}"
runs-on: ubuntu-20.04
container:
image: dongjoon/apache-spark-github-action-image:20201015
image: dongjoon/apache-spark-github-action-image:20201025
strategy:
fail-fast: false
matrix:
2 changes: 1 addition & 1 deletion R/CRAN_RELEASE.md
@@ -25,7 +25,7 @@ To release SparkR as a package to CRAN, we would use the `devtools` package. Ple

First, check that the `Version:` field in the `pkg/DESCRIPTION` file is updated. Also, check for stale files not under source control.

Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, eg. `yum -q -y install qpdf`).
Note that while `run-tests.sh` runs `check-cran.sh` (which runs `R CMD check`), it is doing so with `--no-manual --no-vignettes`, which skips a few vignettes or PDF checks - therefore it will be preferred to run `R CMD check` on the source package built manually before uploading a release. Also note that for CRAN checks for pdf vignettes to success, `qpdf` tool must be there (to install it, e.g. `yum -q -y install qpdf`).

To upload a release, we would need to update the `cran-comments.md`. This should generally contain the results from running the `check-cran.sh` script along with comments on status of all `WARNING` (should not be any) or `NOTE`. As a part of `check-cran.sh` and the release process, the vignettes is build - make sure `SPARK_HOME` is set and Spark jars are accessible.
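For concreteness, a minimal sketch of the manual check described above, driven from R (the `SPARK_HOME` path and tarball name below are only examples):

    # Illustrative only: run the full check, including the manual and vignette
    # PDF checks, on a manually built source package; requires qpdf on the PATH.
    Sys.setenv(SPARK_HOME = "/path/to/spark")    # example location
    system("R CMD check SparkR_3.1.0.tar.gz")    # example tarball name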

2 changes: 1 addition & 1 deletion R/install-dev.bat
@@ -26,7 +26,7 @@ MKDIR %SPARK_HOME%\R\lib

rem When you pass the package path directly as an argument to R CMD INSTALL,
rem it takes the path as 'C:\projects\spark\R\..\R\pkg"' as an example at
rem R 4.0. To work around this, directly go to the directoy and install it.
rem R 4.0. To work around this, directly go to the directory and install it.
rem See also SPARK-32074
pushd %SPARK_HOME%\R\pkg\
R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" .
4 changes: 4 additions & 0 deletions R/pkg/NAMESPACE
@@ -202,6 +202,7 @@ exportMethods("%<=>%",
"%in%",
"abs",
"acos",
"acosh",
"add_months",
"alias",
"approx_count_distinct",
@@ -222,6 +223,7 @@ exportMethods("%<=>%",
"array_remove",
"array_repeat",
"array_sort",
"array_to_vector",
"array_transform",
"arrays_overlap",
"array_union",
@@ -232,8 +234,10 @@ exportMethods("%<=>%",
"asc_nulls_last",
"ascii",
"asin",
"asinh",
"assert_true",
"atan",
"atanh",
"atan2",
"avg",
"base64",
6 changes: 3 additions & 3 deletions R/pkg/R/DataFrame.R
@@ -2772,7 +2772,7 @@ setMethod("merge",
#' Creates a list of columns by replacing the intersected ones with aliases
#'
#' Creates a list of columns by replacing the intersected ones with aliases.
#' The name of the alias column is formed by concatanating the original column name and a suffix.
#' The name of the alias column is formed by concatenating the original column name and a suffix.
#'
#' @param x a SparkDataFrame
#' @param intersectedColNames a list of intersected column names of the SparkDataFrame
@@ -3231,7 +3231,7 @@ setMethod("describe",
#' \item stddev
#' \item min
#' \item max
#' \item arbitrary approximate percentiles specified as a percentage (eg, "75\%")
#' \item arbitrary approximate percentiles specified as a percentage (e.g., "75\%")
#' }
#' If no statistics are given, this function computes count, mean, stddev, min,
#' approximate quartiles (percentiles at 25\%, 50\%, and 75\%), and max.
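# Illustrative sketch (not part of the diff above): requesting specific summary
# statistics, including approximate percentiles; assumes an active SparkR session.
df <- createDataFrame(mtcars)
head(summary(df, "count", "min", "25%", "75%", "max"))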
@@ -3743,7 +3743,7 @@ setMethod("histogram",
#'
#' @param x a SparkDataFrame.
#' @param url JDBC database url of the form \code{jdbc:subprotocol:subname}.
#' @param tableName yhe name of the table in the external database.
#' @param tableName the name of the table in the external database.
#' @param mode one of 'append', 'overwrite', 'error', 'errorifexists', 'ignore'
#' save mode (it is 'error' by default)
#' @param ... additional JDBC database connection properties.
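# Illustrative sketch (not part of the diff above): writing a SparkDataFrame to
# an external database over JDBC; the URL, table name, and credentials are
# placeholders, and df is assumed to be an existing SparkDataFrame.
write.jdbc(df, "jdbc:postgresql://localhost/testdb", tableName = "people",
           mode = "overwrite", user = "username", password = "password")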
4 changes: 2 additions & 2 deletions R/pkg/R/RDD.R
@@ -970,7 +970,7 @@ setMethod("takeSample", signature(x = "RDD", withReplacement = "logical",
MAXINT)))))
# If the first sample didn't turn out large enough, keep trying to
# take samples; this shouldn't happen often because we use a big
# multiplier for thei initial size
# multiplier for the initial size
while (length(samples) < total)
samples <- collectRDD(sampleRDD(x, withReplacement, fraction,
as.integer(ceiling(stats::runif(1,
@@ -1512,7 +1512,7 @@ setMethod("glom",
#'
#' @param x An RDD.
#' @param y An RDD.
#' @return a new RDD created by performing the simple union (witout removing
#' @return a new RDD created by performing the simple union (without removing
#' duplicates) of two input RDDs.
#' @examples
#'\dontrun{
2 changes: 1 addition & 1 deletion R/pkg/R/SQLContext.R
@@ -203,7 +203,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) {
})
}

# SPAKR-SQL does not support '.' in column name, so replace it with '_'
# SPARK-SQL does not support '.' in column name, so replace it with '_'
# TODO(davies): remove this once SPARK-2775 is fixed
names <- lapply(names, function(n) {
nn <- gsub(".", "_", n, fixed = TRUE)
4 changes: 2 additions & 2 deletions R/pkg/R/WindowSpec.R
@@ -54,7 +54,7 @@ setMethod("show", "WindowSpec",
#' Defines the partitioning columns in a WindowSpec.
#'
#' @param x a WindowSpec.
#' @param col a column to partition on (desribed by the name or Column).
#' @param col a column to partition on (described by the name or Column).
#' @param ... additional column(s) to partition on.
#' @return A WindowSpec.
#' @rdname partitionBy
@@ -231,7 +231,7 @@ setMethod("rangeBetween",
#' @rdname over
#' @name over
#' @aliases over,Column,WindowSpec-method
#' @family colum_func
#' @family column_func
#' @examples
#' \dontrun{
#' df <- createDataFrame(mtcars)
16 changes: 8 additions & 8 deletions R/pkg/R/column.R
@@ -135,7 +135,7 @@ createMethods()
#' @rdname alias
#' @name alias
#' @aliases alias,Column-method
#' @family colum_func
#' @family column_func
#' @examples
#' \dontrun{
#' df <- createDataFrame(iris)
@@ -161,7 +161,7 @@ setMethod("alias",
#'
#' @rdname substr
#' @name substr
#' @family colum_func
#' @family column_func
#' @aliases substr,Column-method
#'
#' @param x a Column.
@@ -187,7 +187,7 @@ setMethod("substr", signature(x = "Column"),
#'
#' @rdname startsWith
#' @name startsWith
#' @family colum_func
#' @family column_func
#' @aliases startsWith,Column-method
#'
#' @param x vector of character string whose "starts" are considered
@@ -206,7 +206,7 @@ setMethod("startsWith", signature(x = "Column"),
#'
#' @rdname endsWith
#' @name endsWith
#' @family colum_func
#' @family column_func
#' @aliases endsWith,Column-method
#'
#' @param x vector of character string whose "ends" are considered
@@ -224,7 +224,7 @@ setMethod("endsWith", signature(x = "Column"),
#'
#' @rdname between
#' @name between
#' @family colum_func
#' @family column_func
#' @aliases between,Column-method
#'
#' @param x a Column
@@ -251,7 +251,7 @@ setMethod("between", signature(x = "Column"),
# nolint end
#' @rdname cast
#' @name cast
#' @family colum_func
#' @family column_func
#' @aliases cast,Column-method
#'
#' @examples
@@ -300,7 +300,7 @@ setMethod("%in%",
#' Can be a single value or a Column.
#' @rdname otherwise
#' @name otherwise
#' @family colum_func
#' @family column_func
#' @aliases otherwise,Column-method
#' @note otherwise since 1.5.0
setMethod("otherwise",
@@ -440,7 +440,7 @@ setMethod("withField",
#' )
#'
#' # However, if you are going to add/replace multiple nested fields,
#' # it is preffered to extract out the nested struct before
#' # it is preferred to extract out the nested struct before
#' # adding/replacing multiple fields e.g.
#' head(
#' withColumn(
4 changes: 2 additions & 2 deletions R/pkg/R/context.R
@@ -86,7 +86,7 @@ makeSplits <- function(numSerializedSlices, length) {
# For instance, for numSerializedSlices of 22, length of 50
# [1] 0 0 2 2 4 4 6 6 6 9 9 11 11 13 13 15 15 15 18 18 20 20 22 22 22
# [26] 25 25 27 27 29 29 31 31 31 34 34 36 36 38 38 40 40 40 43 43 45 45 47 47 47
# Notice the slice group with 3 slices (ie. 6, 15, 22) are roughly evenly spaced.
# Notice the slice group with 3 slices (i.e. 6, 15, 22) are roughly evenly spaced.
# We are trying to reimplement the calculation in the positions method in ParallelCollectionRDD
if (numSerializedSlices > 0) {
unlist(lapply(0: (numSerializedSlices - 1), function(x) {
@@ -116,7 +116,7 @@ makeSplits <- function(numSerializedSlices, length) {
#' This change affects both createDataFrame and spark.lapply.
#' In the specific one case that it is used to convert R native object into SparkDataFrame, it has
#' always been kept at the default of 1. In the case the object is large, we are explicitly setting
#' the parallism to numSlices (which is still 1).
#' the parallelism to numSlices (which is still 1).
#'
#' Specifically, we are changing to split positions to match the calculation in positions() of
#' ParallelCollectionRDD in Spark.
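# Illustrative sketch (not part of the diff above): the split-position logic the
# note refers to, reproduced for the example of numSerializedSlices = 22 and
# length = 50 (the exact makeSplits body is elided in this hunk).
numSerializedSlices <- 22
len <- 50
keys <- unlist(lapply(0:(numSerializedSlices - 1), function(x) {
  start <- floor(x * len / numSerializedSlices)      # slice start, as in positions()
  end <- floor((x + 1) * len / numSerializedSlices)  # slice end (exclusive)
  rep(start, end - start)                            # tag each element with its slice start
}))
# keys matches the grouping vector shown above: 0 0 2 2 4 4 6 6 6 9 9 11 11 ...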
2 changes: 1 addition & 1 deletion R/pkg/R/deserialize.R
@@ -250,7 +250,7 @@ readDeserializeWithKeysInArrow <- function(inputCon) {

keys <- readMultipleObjects(inputCon)

# Read keys to map with each groupped batch later.
# Read keys to map with each grouped batch later.
list(keys = keys, data = data)
}

69 changes: 66 additions & 3 deletions R/pkg/R/functions.R
@@ -144,7 +144,7 @@ NULL
#' @param y Column to compute on.
#' @param pos In \itemize{
#' \item \code{locate}: a start position of search.
#' \item \code{overlay}: a start postiton for replacement.
#' \item \code{overlay}: a start position for replacement.
#' }
#' @param len In \itemize{
#' \item \code{lpad} the maximum length of each output result.
@@ -357,7 +357,13 @@ NULL
#' @examples
#' \dontrun{
#' df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm")
#' head(select(df, vector_to_array(df$features)))
#' head(
#' withColumn(
#' withColumn(df, "array", vector_to_array(df$features)),
#' "vector",
#' array_to_vector(column("array"))
#' )
#' )
#' }
NULL

@@ -455,6 +461,19 @@ setMethod("acos",
column(jc)
})

#' @details
#' \code{acosh}: Computes inverse hyperbolic cosine of the input column.
#'
#' @rdname column_math_functions
#' @aliases acosh acosh,Column-method
#' @note acosh since 3.1.0
setMethod("acosh",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "acosh", x@jc)
column(jc)
})

#' @details
#' \code{approx_count_distinct}: Returns the approximate number of distinct items in a group.
#'
@@ -522,6 +541,19 @@ setMethod("asin",
column(jc)
})

#' @details
#' \code{asinh}: Computes inverse hyperbolic sine of the input column.
#'
#' @rdname column_math_functions
#' @aliases asinh asinh,Column-method
#' @note asinh since 3.1.0
setMethod("asinh",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "asinh", x@jc)
column(jc)
})

#' @details
#' \code{atan}: Returns the inverse tangent of the given value,
#' as if computed by \code{java.lang.Math.atan()}
@@ -536,6 +568,19 @@ setMethod("atan",
column(jc)
})

#' @details
#' \code{atanh}: Computes inverse hyperbolic tangent of the input column.
#'
#' @rdname column_math_functions
#' @aliases atanh atanh,Column-method
#' @note atanh since 3.1.0
setMethod("atanh",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "atanh", x@jc)
column(jc)
})
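# Illustrative sketch (not part of the diff above): the newly added inverse
# hyperbolic functions applied to Columns; assumes an active SparkR session and
# uses made-up input values.
df <- createDataFrame(data.frame(x = c(1.5, 2.0, 3.0), y = c(-0.5, 0.0, 0.5)))
head(select(df, acosh(df$x), asinh(df$x), atanh(df$y)))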

#' avg
#'
#' Aggregate function: returns the average of the values in a group.
@@ -2879,7 +2924,7 @@ setMethod("shiftRight", signature(y = "Column", x = "numeric"),
})

#' @details
#' \code{shiftRightUnsigned}: (Unigned) shifts the given value numBits right. If the given value is
#' \code{shiftRightUnsigned}: (Unsigned) shifts the given value numBits right. If the given value is
#' a long value, it will return a long value else it will return an integer value.
#'
#' @rdname column_math_functions
@@ -4570,6 +4615,24 @@ setMethod("timestamp_seconds",
column(jc)
})

#' @details
#' \code{array_to_vector} Converts a column of array of numeric type into
#' a column of dense vectors in MLlib
#'
#' @rdname column_ml_functions
#' @aliases array_to_vector array_to_vector,Column-method
#' @note array_to_vector since 3.1.0
setMethod("array_to_vector",
signature(x = "Column"),
function(x) {
jc <- callJStatic(
"org.apache.spark.ml.functions",
"array_to_vector",
x@jc
)
column(jc)
})
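# Illustrative sketch (not part of the diff above): round-tripping between an
# MLlib vector column and a plain array column with the new array_to_vector;
# assumes an active SparkR session and the bundled sample_libsvm_data.txt file.
df <- read.df("data/mllib/sample_libsvm_data.txt", source = "libsvm")
df2 <- withColumn(df, "array", vector_to_array(df$features))
head(select(df2, array_to_vector(column("array"))))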

#' @details
#' \code{vector_to_array} Converts a column of MLlib sparse/dense vectors into
#' a column of dense arrays.
4 changes: 4 additions & 0 deletions R/pkg/R/generics.R
@@ -838,6 +838,10 @@ setGeneric("array_repeat", function(x, count) { standardGeneric("array_repeat")
#' @name NULL
setGeneric("array_sort", function(x) { standardGeneric("array_sort") })

#' @rdname column_ml_functions
#' @name NULL
setGeneric("array_to_vector", function(x) { standardGeneric("array_to_vector") })

#' @rdname column_collection_functions
#' @name NULL
setGeneric("array_transform", function(x, f) { standardGeneric("array_transform") })
2 changes: 1 addition & 1 deletion R/pkg/R/install.R
@@ -289,7 +289,7 @@ sparkCachePath <- function() {
}

# Length of the Spark cache specific relative path segments for each platform
# eg. "Apache\Spark\Cache" is 3 in Windows, or "spark" is 1 in unix
# e.g. "Apache\Spark\Cache" is 3 in Windows, or "spark" is 1 in unix
# Must match sparkCachePath() exactly.
sparkCacheRelPathLength <- function() {
if (is_windows()) {
2 changes: 1 addition & 1 deletion R/pkg/R/mllib_fpm.R
@@ -125,7 +125,7 @@ setMethod("spark.freqItemsets", signature(object = "FPGrowthModel"),
#' The \code{SparkDataFrame} contains five columns:
#' \code{antecedent} (an array of the same type as the input column),
#' \code{consequent} (an array of the same type as the input column),
#' \code{condfidence} (confidence for the rule)
#' \code{confidence} (confidence for the rule)
#' \code{lift} (lift for the rule)
#' and \code{support} (support for the rule)
#' @rdname spark.fpGrowth
4 changes: 2 additions & 2 deletions R/pkg/R/mllib_tree.R
@@ -53,7 +53,7 @@ setClass("DecisionTreeRegressionModel", representation(jobj = "jobj"))
#' @note DecisionTreeClassificationModel since 2.3.0
setClass("DecisionTreeClassificationModel", representation(jobj = "jobj"))

# Create the summary of a tree ensemble model (eg. Random Forest, GBT)
# Create the summary of a tree ensemble model (e.g. Random Forest, GBT)
summary.treeEnsemble <- function(model) {
jobj <- model@jobj
formula <- callJMethod(jobj, "formula")
@@ -73,7 +73,7 @@ summary.treeEnsemble <- function(model) {
jobj = jobj)
}

# Prints the summary of tree ensemble models (eg. Random Forest, GBT)
# Prints the summary of tree ensemble models (e.g. Random Forest, GBT)
print.summary.treeEnsemble <- function(x) {
jobj <- x$jobj
cat("Formula: ", x$formula)
2 changes: 1 addition & 1 deletion R/pkg/R/mllib_utils.R
@@ -18,7 +18,7 @@
# mllib_utils.R: Utilities for MLlib integration

# Integration with R's standard functions.
# Most of MLlib's argorithms are provided in two flavours:
# Most of MLlib's algorithms are provided in two flavours:
# - a specialization of the default R methods (glm). These methods try to respect
# the inputs and the outputs of R's method to the largest extent, but some small differences
# may exist.