Skip to content

Commit c4008cd

Browse files
authored
Merge branch 'master' into sampling
2 parents 288c124 + d809cee commit c4008cd

File tree

278 files changed

+8696
-2098
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

278 files changed

+8696
-2098
lines changed

.travis.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ dist: trusty
2828
# 2. Choose language and target JDKs for parallel builds.
2929
language: java
3030
jdk:
31-
- oraclejdk7
3231
- oraclejdk8
3332

3433
# 3. Setup cache directory for SBT and Maven.

R/WINDOWS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ To build SparkR on Windows, the following steps are required
66
include Rtools and R in `PATH`.
77

88
2. Install
9-
[JDK7](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html) and set
9+
[JDK8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) and set
1010
`JAVA_HOME` in the system environment variables.
1111

1212
3. Download and install [Maven](http://maven.apache.org/download.html). Also include the `bin`

R/pkg/NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ exportMethods("%in%",
229229
"floor",
230230
"format_number",
231231
"format_string",
232+
"from_json",
232233
"from_unixtime",
233234
"from_utc_timestamp",
234235
"getField",
@@ -327,6 +328,7 @@ exportMethods("%in%",
327328
"toDegrees",
328329
"toRadians",
329330
"to_date",
331+
"to_json",
330332
"to_timestamp",
331333
"to_utc_timestamp",
332334
"translate",

R/pkg/R/DataFrame.R

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2642,6 +2642,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) {
26422642
#'
26432643
#' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame
26442644
#' and another SparkDataFrame. This is equivalent to \code{UNION ALL} in SQL.
2645+
#' Input SparkDataFrames can have different schemas (names and data types).
26452646
#'
26462647
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
26472648
#'
@@ -2685,7 +2686,8 @@ setMethod("unionAll",
26852686

26862687
#' Union two or more SparkDataFrames
26872688
#'
2688-
#' Union two or more SparkDataFrames. This is equivalent to \code{UNION ALL} in SQL.
2689+
#' Union two or more SparkDataFrames by row. As in R's \code{rbind}, this method
2690+
#' requires that the input SparkDataFrames have the same column names.
26892691
#'
26902692
#' Note: This does not remove duplicate rows across the two SparkDataFrames.
26912693
#'
@@ -2709,6 +2711,10 @@ setMethod("unionAll",
27092711
setMethod("rbind",
27102712
signature(... = "SparkDataFrame"),
27112713
function(x, ..., deparse.level = 1) {
2714+
nm <- lapply(list(x, ...), names)
2715+
if (length(unique(nm)) != 1) {
2716+
stop("Names of input data frames are different.")
2717+
}
27122718
if (nargs() == 3) {
27132719
union(x, ...)
27142720
} else {

R/pkg/R/functions.R

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1793,6 +1793,33 @@ setMethod("to_date",
17931793
column(jc)
17941794
})
17951795

1796+
#' to_json
1797+
#'
1798+
#' Converts a column containing a \code{structType} into a Column of JSON string.
1799+
#' Resolving the Column can fail if an unsupported type is encountered.
1800+
#'
1801+
#' @param x Column containing the struct
1802+
#' @param ... additional named properties to control how it is converted, accepts the same options
1803+
#' as the JSON data source.
1804+
#'
1805+
#' @family normal_funcs
1806+
#' @rdname to_json
1807+
#' @name to_json
1808+
#' @aliases to_json,Column-method
1809+
#' @export
1810+
#' @examples
1811+
#' \dontrun{
1812+
#' to_json(df$t, dateFormat = 'dd/MM/yyyy')
1813+
#' select(df, to_json(df$t))
1814+
#'}
1815+
#' @note to_json since 2.2.0
1816+
setMethod("to_json", signature(x = "Column"),
1817+
function(x, ...) {
1818+
options <- varargsToStrEnv(...)
1819+
jc <- callJStatic("org.apache.spark.sql.functions", "to_json", x@jc, options)
1820+
column(jc)
1821+
})
1822+
17961823
#' to_timestamp
17971824
#'
17981825
#' Converts the column into a TimestampType. You may optionally specify a format
@@ -2403,6 +2430,36 @@ setMethod("date_format", signature(y = "Column", x = "character"),
24032430
column(jc)
24042431
})
24052432

2433+
#' from_json
2434+
#'
2435+
#' Parses a column containing a JSON string into a Column of \code{structType} with the specified
2436+
#' \code{schema}. If the string is unparseable, the Column will contain the value NA.
2437+
#'
2438+
#' @param x Column containing the JSON string.
2439+
#' @param schema a structType object to use as the schema when parsing the JSON string.
2440+
#' @param ... additional named properties to control how the json is parsed, accepts the same
2441+
#' options as the JSON data source.
2442+
#'
2443+
#' @family normal_funcs
2444+
#' @rdname from_json
2445+
#' @name from_json
2446+
#' @aliases from_json,Column,structType-method
2447+
#' @export
2448+
#' @examples
2449+
#' \dontrun{
2450+
#' schema <- structType(structField("name", "string"),
2451+
#' select(df, from_json(df$value, schema, dateFormat = "dd/MM/yyyy"))
2452+
#'}
2453+
#' @note from_json since 2.2.0
2454+
setMethod("from_json", signature(x = "Column", schema = "structType"),
2455+
function(x, schema, ...) {
2456+
options <- varargsToStrEnv(...)
2457+
jc <- callJStatic("org.apache.spark.sql.functions",
2458+
"from_json",
2459+
x@jc, schema$jobj, options)
2460+
column(jc)
2461+
})
2462+
24062463
#' from_utc_timestamp
24072464
#'
24082465
#' Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp

R/pkg/R/generics.R

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -991,6 +991,10 @@ setGeneric("format_number", function(y, x) { standardGeneric("format_number") })
991991
#' @export
992992
setGeneric("format_string", function(format, x, ...) { standardGeneric("format_string") })
993993

994+
#' @rdname from_json
995+
#' @export
996+
setGeneric("from_json", function(x, schema, ...) { standardGeneric("from_json") })
997+
994998
#' @rdname from_unixtime
995999
#' @export
9961000
setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime") })
@@ -1265,6 +1269,10 @@ setGeneric("toRadians", function(x) { standardGeneric("toRadians") })
12651269
#' @export
12661270
setGeneric("to_date", function(x, format) { standardGeneric("to_date") })
12671271

1272+
#' @rdname to_json
1273+
#' @export
1274+
setGeneric("to_json", function(x, ...) { standardGeneric("to_json") })
1275+
12681276
#' @rdname to_timestamp
12691277
#' @export
12701278
setGeneric("to_timestamp", function(x, format) { standardGeneric("to_timestamp") })

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@ mockLinesComplexType <-
8888
complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
8989
writeLines(mockLinesComplexType, complexTypeJsonPath)
9090

91+
# For testing map type and struct type in DataFrame
92+
mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
93+
"{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
94+
"{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
95+
mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
96+
writeLines(mockLinesMapType, mapTypeJsonPath)
97+
9198
test_that("calling sparkRSQL.init returns existing SQL context", {
9299
sqlContext <- suppressWarnings(sparkRSQL.init(sc))
93100
expect_equal(suppressWarnings(sparkRSQL.init(sc)), sqlContext)
@@ -466,13 +473,6 @@ test_that("create DataFrame from a data.frame with complex types", {
466473
expect_equal(ldf$an_envir, collected$an_envir)
467474
})
468475

469-
# For test map type and struct type in DataFrame
470-
mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
471-
"{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
472-
"{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
473-
mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
474-
writeLines(mockLinesMapType, mapTypeJsonPath)
475-
476476
test_that("Collect DataFrame with complex types", {
477477
# ArrayType
478478
df <- read.json(complexTypeJsonPath)
@@ -1337,6 +1337,33 @@ test_that("column functions", {
13371337
df <- createDataFrame(data.frame(x = c(2.5, 3.5)))
13381338
expect_equal(collect(select(df, bround(df$x, 0)))[[1]][1], 2)
13391339
expect_equal(collect(select(df, bround(df$x, 0)))[[1]][2], 4)
1340+
1341+
# Test to_json(), from_json()
1342+
df <- read.json(mapTypeJsonPath)
1343+
j <- collect(select(df, alias(to_json(df$info), "json")))
1344+
expect_equal(j[order(j$json), ][1], "{\"age\":16,\"height\":176.5}")
1345+
df <- as.DataFrame(j)
1346+
schema <- structType(structField("age", "integer"),
1347+
structField("height", "double"))
1348+
s <- collect(select(df, alias(from_json(df$json, schema), "structcol")))
1349+
expect_equal(ncol(s), 1)
1350+
expect_equal(nrow(s), 3)
1351+
expect_is(s[[1]][[1]], "struct")
1352+
expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } )))
1353+
1354+
# passing option
1355+
df <- as.DataFrame(list(list("col" = "{\"date\":\"21/10/2014\"}")))
1356+
schema2 <- structType(structField("date", "date"))
1357+
expect_error(tryCatch(collect(select(df, from_json(df$col, schema2))),
1358+
error = function(e) { stop(e) }),
1359+
paste0(".*(java.lang.NumberFormatException: For input string:).*"))
1360+
s <- collect(select(df, from_json(df$col, schema2, dateFormat = "dd/MM/yyyy")))
1361+
expect_is(s[[1]][[1]]$date, "Date")
1362+
expect_equal(as.character(s[[1]][[1]]$date), "2014-10-21")
1363+
1364+
# check for unparseable
1365+
df <- as.DataFrame(list(list("a" = "")))
1366+
expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]], NA)
13401367
})
13411368

13421369
test_that("column binary mathfunctions", {
@@ -1823,6 +1850,13 @@ test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
18231850
expect_equal(count(unioned2), 12)
18241851
expect_equal(first(unioned2)$name, "Michael")
18251852

1853+
df3 <- df2
1854+
names(df3)[1] <- "newName"
1855+
expect_error(rbind(df, df3),
1856+
"Names of input data frames are different.")
1857+
expect_error(rbind(df, df2, df3),
1858+
"Names of input data frames are different.")
1859+
18261860
excepted <- arrange(except(df, df2), desc(df$age))
18271861
expect_is(unioned, "SparkDataFrame")
18281862
expect_equal(count(excepted), 2)
@@ -2558,8 +2592,8 @@ test_that("coalesce, repartition, numPartitions", {
25582592

25592593
df2 <- repartition(df1, 10)
25602594
expect_equal(getNumPartitions(df2), 10)
2561-
expect_equal(getNumPartitions(coalesce(df2, 13)), 5)
2562-
expect_equal(getNumPartitions(coalesce(df2, 7)), 5)
2595+
expect_equal(getNumPartitions(coalesce(df2, 13)), 10)
2596+
expect_equal(getNumPartitions(coalesce(df2, 7)), 7)
25632597
expect_equal(getNumPartitions(coalesce(df2, 3)), 3)
25642598
})
25652599

@@ -2867,5 +2901,7 @@ unlink(parquetPath)
28672901
unlink(orcPath)
28682902
unlink(jsonPath)
28692903
unlink(jsonPathNa)
2904+
unlink(complexTypeJsonPath)
2905+
unlink(mapTypeJsonPath)
28702906

28712907
sparkR.session.stop()

0 commit comments

Comments
 (0)