
Commit 9d354c2

Merge branch 'master' into SPARK-31102

2 parents d69d271 + a6e6fbf

620 files changed (+24854, -6345 lines)


.asf.yaml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# https://cwiki.apache.org/confluence/display/INFRA/.asf.yaml+features+for+git+repositories
+---
+github:
+  description: "Apache Spark - A unified analytics engine for large-scale data processing"
+  homepage: https://spark.apache.org/
+  labels:
+    - python
+    - scala
+    - r
+    - java
+    - big-data
+    - jdbc
+    - sql
+    - spark
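
Note: per the INFRA page linked in the file header, .asf.yaml lets ASF projects manage GitHub repository metadata (description, homepage, and topic labels) declaratively from the repository itself rather than through the GitHub UI.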

.gitignore

Lines changed: 0 additions & 2 deletions
@@ -18,8 +18,6 @@
 .idea_modules/
 .project
 .pydevproject
-.python-version
-.ruby-version
 .scala_dependencies
 .settings
 /lib/

R/pkg/NAMESPACE

Lines changed: 12 additions & 1 deletion
@@ -28,6 +28,7 @@ importFrom("utils", "download.file", "object.size", "packageVersion", "tail", "u
 
 # S3 methods exported
 export("sparkR.session")
+export("sparkR.init")
 export("sparkR.session.stop")
 export("sparkR.stop")
 export("sparkR.conf")
@@ -41,6 +42,9 @@ export("sparkR.callJStatic")
 
 export("install.spark")
 
+export("sparkRSQL.init",
+       "sparkRHive.init")
+
 # MLlib integration
 exportMethods("glm",
               "spark.glm",
@@ -68,7 +72,10 @@ exportMethods("glm",
               "spark.freqItemsets",
               "spark.associationRules",
               "spark.findFrequentSequentialPatterns",
-              "spark.assignClusters")
+              "spark.assignClusters",
+              "spark.fmClassifier",
+              "spark.lm",
+              "spark.fmRegressor")
 
 # Job group lifecycle management methods
 export("setJobGroup",
@@ -148,6 +155,7 @@ exportMethods("arrange",
               "printSchema",
               "randomSplit",
               "rbind",
+              "registerTempTable",
               "rename",
               "repartition",
               "repartitionByRange",
@@ -345,6 +353,7 @@ exportMethods("%<=>%",
               "over",
               "overlay",
               "percent_rank",
+              "percentile_approx",
               "pmod",
               "posexplode",
               "posexplode_outer",
@@ -430,8 +439,10 @@ export("as.DataFrame",
       "cacheTable",
       "clearCache",
       "createDataFrame",
+      "createExternalTable",
       "createTable",
       "currentDatabase",
+      "dropTempTable",
       "dropTempView",
       "listColumns",
       "listDatabases",

R/pkg/R/DataFrame.R

Lines changed: 26 additions & 0 deletions
@@ -521,6 +521,32 @@ setMethod("createOrReplaceTempView",
             invisible(callJMethod(x@sdf, "createOrReplaceTempView", viewName))
           })
 
+#' (Deprecated) Register Temporary Table
+#'
+#' Registers a SparkDataFrame as a Temporary Table in the SparkSession
+#' @param x A SparkDataFrame
+#' @param tableName A character vector containing the name of the table
+#'
+#' @seealso \link{createOrReplaceTempView}
+#' @rdname registerTempTable-deprecated
+#' @name registerTempTable
+#' @aliases registerTempTable,SparkDataFrame,character-method
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' path <- "path/to/file.json"
+#' df <- read.json(path)
+#' registerTempTable(df, "json_df")
+#' new_df <- sql("SELECT * FROM json_df")
+#'}
+#' @note registerTempTable since 1.4.0
+setMethod("registerTempTable",
+          signature(x = "SparkDataFrame", tableName = "character"),
+          function(x, tableName) {
+            .Deprecated("createOrReplaceTempView")
+            invisible(callJMethod(x@sdf, "createOrReplaceTempView", tableName))
+          })
+
 #' insertInto
 #'
 #' Insert the contents of a SparkDataFrame into a table registered in the current SparkSession.
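
For anyone tracking the deprecation: the restored method only warns and forwards. A minimal sketch of old vs. new usage (the JSON path is a placeholder):

    library(SparkR)
    sparkR.session()

    df <- read.json("path/to/file.json")  # placeholder path

    # Deprecated: emits a .Deprecated() warning, then forwards to the view API
    registerTempTable(df, "json_df")

    # Preferred equivalent
    createOrReplaceTempView(df, "json_df")
    new_df <- sql("SELECT * FROM json_df")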

R/pkg/R/catalog.R

Lines changed: 54 additions & 0 deletions
@@ -17,6 +17,35 @@
 
 # catalog.R: SparkSession catalog functions
 
+#' (Deprecated) Create an external table
+#'
+#' Creates an external table based on the dataset in a data source,
+#' Returns a SparkDataFrame associated with the external table.
+#'
+#' The data source is specified by the \code{source} and a set of options(...).
+#' If \code{source} is not specified, the default data source configured by
+#' "spark.sql.sources.default" will be used.
+#'
+#' @param tableName a name of the table.
+#' @param path the path of files to load.
+#' @param source the name of external data source.
+#' @param schema the schema of the data required for some data sources.
+#' @param ... additional argument(s) passed to the method.
+#' @return A SparkDataFrame.
+#' @rdname createExternalTable-deprecated
+#' @seealso \link{createTable}
+#' @examples
+#'\dontrun{
+#' sparkR.session()
+#' df <- createExternalTable("myjson", path="path/to/json", source="json", schema)
+#' }
+#' @name createExternalTable
+#' @note createExternalTable since 1.4.0
+createExternalTable <- function(tableName, path = NULL, source = NULL, schema = NULL, ...) {
+  .Deprecated("createTable", old = "createExternalTable")
+  createTable(tableName, path, source, schema, ...)
+}
+
 #' Creates a table based on the dataset in a data source
 #'
 #' Creates a table based on the dataset in a data source. Returns a SparkDataFrame associated with
@@ -130,6 +159,31 @@ clearCache <- function() {
   invisible(callJMethod(catalog, "clearCache"))
 }
 
+#' (Deprecated) Drop Temporary Table
+#'
+#' Drops the temporary table with the given table name in the catalog.
+#' If the table has been cached/persisted before, it's also unpersisted.
+#'
+#' @param tableName The name of the SparkSQL table to be dropped.
+#' @seealso \link{dropTempView}
+#' @rdname dropTempTable-deprecated
+#' @examples
+#' \dontrun{
+#' sparkR.session()
+#' df <- read.df(path, "parquet")
+#' createOrReplaceTempView(df, "table")
+#' dropTempTable("table")
+#' }
+#' @name dropTempTable
+#' @note dropTempTable since 1.4.0
+dropTempTable <- function(tableName) {
+  .Deprecated("dropTempView", old = "dropTempTable")
+  if (class(tableName) != "character") {
+    stop("tableName must be a string.")
+  }
+  dropTempView(tableName)
+}
+
 #' Drops the temporary view with the given view name in the catalog.
 #'
 #' Drops the temporary view with the given view name in the catalog.
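
Same pattern for the two restored catalog functions: both warn and delegate to their replacements. A minimal sketch (the parquet path is a placeholder):

    library(SparkR)
    sparkR.session()

    # Deprecated: warns, then calls createTable()
    tbl <- createExternalTable("people", path = "path/to/parquet", source = "parquet")

    # Deprecated: warns, then calls dropTempView()
    df <- read.df("path/to/parquet", "parquet")
    createOrReplaceTempView(df, "people_view")
    dropTempTable("people_view")  # prefer dropTempView("people_view")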

R/pkg/R/functions.R

Lines changed: 58 additions & 6 deletions
@@ -77,7 +77,13 @@ NULL
 #'          days to be added to or subtracted from \code{y}. For class \code{character}, it is
 #'          \itemize{
 #'          \item \code{date_format}: date format specification.
-#'          \item \code{from_utc_timestamp}, \code{to_utc_timestamp}: time zone to use.
+#'          \item \code{from_utc_timestamp}, \code{to_utc_timestamp}: A string detailing
+#'              the time zone ID that the input should be adjusted to. It should be in the format
+#'              of either region-based zone IDs or zone offsets. Region IDs must have the form
+#'              'area/city', such as 'America/Los_Angeles'. Zone offsets must be in the format
+#'              '(+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are supported
+#'              as aliases of '+00:00'. Other short names are not recommended to use
+#'              because they can be ambiguous.
 #'          \item \code{next_day}: day of the week string.
 #'          }
 #' @param ... additional argument(s).
@@ -1410,6 +1416,52 @@ setMethod("quarter",
             column(jc)
           })
 
+#' @details
+#' \code{percentile_approx} Returns the approximate percentile value of
+#' numeric column at the given percentage.
+#'
+#' @param percentage Numeric percentage at which percentile should be computed
+#'                   All values should be between 0 and 1.
+#'                   If length equals to 1 resulting column is of type double,
+#'                   otherwise, array type of double.
+#' @param accuracy A positive numeric literal (default: 10000) which
+#'                 controls approximation accuracy at the cost of memory.
+#'                 Higher value of accuracy yields better accuracy, 1.0/accuracy
+#'                 is the relative error of the approximation.
+#'
+#' @rdname column_aggregate_functions
+#' @aliases percentile_approx percentile_approx,Column-method
+#' @note percentile_approx since 3.1.0
+setMethod("percentile_approx",
+          signature(x = "characterOrColumn", percentage = "numericOrColumn"),
+          function(x, percentage, accuracy = 10000) {
+            col <- if (class(x) == "Column") {
+              x@jc
+            } else {
+              column(x)@jc
+            }
+
+            percentage <- if (class(percentage) == "Column") {
+              percentage@jc
+            } else if (length(percentage) > 1) {
+              do.call(create_array, lapply(percentage, lit))@jc
+            } else {
+              lit(percentage)@jc
+            }
+
+            accuracy <- if (class(accuracy) == "Column") {
+              accuracy@jc
+            } else {
+              lit(as.integer(accuracy))@jc
+            }
+
+            jc <- callJStatic(
+              "org.apache.spark.sql.functions", "percentile_approx",
+              col, percentage, accuracy
+            )
+            column(jc)
+          })
+
 #' @details
 #' \code{reverse}: Returns a reversed string or an array with reverse order of elements.
 #'
@@ -1833,7 +1885,7 @@ setMethod("radians",
 #' @details
 #' \code{to_date}: Converts the column into a DateType. You may optionally specify
 #' a format according to the rules in:
-#' \url{https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html}.
+#' \href{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}{Datetime Pattern}
 #' If the string cannot be parsed according to the specified format (or default),
 #' the value of the column will be null.
 #' By default, it follows casting rules to a DateType if the format is omitted
@@ -1929,7 +1981,7 @@ setMethod("to_csv", signature(x = "Column"),
 #' @details
 #' \code{to_timestamp}: Converts the column into a TimestampType. You may optionally specify
 #' a format according to the rules in:
-#' \url{https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html}.
+#' \href{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}{Datetime Pattern}
 #' If the string cannot be parsed according to the specified format (or default),
 #' the value of the column will be null.
 #' By default, it follows casting rules to a TimestampType if the format is omitted
@@ -2801,8 +2853,8 @@ setMethod("format_string", signature(format = "character", x = "Column"),
 #' \code{from_unixtime}: Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC)
 #' to a string representing the timestamp of that moment in the current system time zone in the JVM
 #' in the given format.
-#' See \href{https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html}{
-#' Customizing Formats} for available options.
+#' See \href{https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html}{
+#' Datetime Pattern} for available options.
 #'
 #' @rdname column_datetime_functions
 #'
@@ -2923,7 +2975,7 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"),
 
 #' @details
 #' \code{rand}: Generates a random column with independent and identically distributed (i.i.d.)
-#' samples from U[0.0, 1.0].
+#' samples uniformly distributed in [0.0, 1.0).
 #' Note: the function is non-deterministic in general case.
 #'
 #' @rdname column_nonaggregate_functions
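
A quick usage sketch of the new percentile_approx wrapper (the data and column names are illustrative):

    library(SparkR)
    sparkR.session()

    df <- createDataFrame(data.frame(v = as.numeric(1:100)))

    # Single percentage: result is a double column (approximate median)
    head(select(df, percentile_approx(df$v, 0.5)))

    # Multiple percentages: result is an array<double> column;
    # lower accuracy trades precision for memory (relative error = 1/accuracy)
    head(select(df, percentile_approx("v", c(0.25, 0.5, 0.75), accuracy = 100)))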

R/pkg/R/generics.R

Lines changed: 20 additions & 0 deletions
@@ -528,6 +528,9 @@ setGeneric("persist", function(x, newLevel) { standardGeneric("persist") })
 #' @rdname printSchema
 setGeneric("printSchema", function(x) { standardGeneric("printSchema") })
 
+#' @rdname registerTempTable-deprecated
+setGeneric("registerTempTable", function(x, tableName) { standardGeneric("registerTempTable") })
+
 #' @rdname rename
 setGeneric("rename", function(x, ...) { standardGeneric("rename") })
 
@@ -1189,6 +1192,11 @@ setGeneric("overlay", function(x, replace, pos, ...) { standardGeneric("overlay"
 #' @name NULL
 setGeneric("percent_rank", function(x = "missing") { standardGeneric("percent_rank") })
 
+#' @rdname column_aggregate_functions
+#' @name NULL
+setGeneric("percentile_approx",
+           function(x, percentage, ...) { standardGeneric("percentile_approx") })
+
 #' @rdname column_math_functions
 #' @name NULL
 setGeneric("pmod", function(y, x) { standardGeneric("pmod") })
@@ -1471,6 +1479,14 @@ setGeneric("spark.als", function(data, ...) { standardGeneric("spark.als") })
 setGeneric("spark.bisectingKmeans",
            function(data, formula, ...) { standardGeneric("spark.bisectingKmeans") })
 
+#' @rdname spark.fmClassifier
+setGeneric("spark.fmClassifier",
+           function(data, formula, ...) { standardGeneric("spark.fmClassifier") })
+
+#' @rdname spark.fmRegressor
+setGeneric("spark.fmRegressor",
+           function(data, formula, ...) { standardGeneric("spark.fmRegressor") })
+
 #' @rdname spark.gaussianMixture
 setGeneric("spark.gaussianMixture",
            function(data, formula, ...) { standardGeneric("spark.gaussianMixture") })
@@ -1539,6 +1555,10 @@ setGeneric("spark.findFrequentSequentialPatterns",
 setGeneric("spark.assignClusters",
            function(data, ...) { standardGeneric("spark.assignClusters") })
 
+#' @rdname spark.lm
+setGeneric("spark.lm",
+           function(data, formula, ...) { standardGeneric("spark.lm") })
+
 #' @param object a fitted ML model object.
 #' @param path the directory where the model is saved.
 #' @param ... additional argument(s) passed to the method.
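
The three new ML generics follow the established spark.* signature (data first, then a formula). A minimal sketch against spark.lm, assuming the implementation added elsewhere in this 620-file commit supports the usual summary()/predict() flow like its sibling estimators:

    library(SparkR)
    sparkR.session()

    training <- createDataFrame(iris)  # SparkR replaces '.' in column names with '_'

    # Linear regression via the new spark.lm generic
    model <- spark.lm(training, Sepal_Length ~ Sepal_Width + Petal_Length)
    summary(model)
    head(predict(model, training))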
