Skip to content

Commit 66b2046

Browse files
committed
[SPARK-25446][R] Add schema_of_json() and schema_of_csv() to R
## What changes were proposed in this pull request? This PR proposes to expose `schema_of_json` and `schema_of_csv` at R side. **`schema_of_json`**: ```r json <- '{"name":"Bob"}' df <- sql("SELECT * FROM range(1)") head(select(df, schema_of_json(json))) ``` ``` schema_of_json({"name":"Bob"}) 1 struct<name:string> ``` **`schema_of_csv`**: ```r csv <- "Amsterdam,2018" df <- sql("SELECT * FROM range(1)") head(select(df, schema_of_csv(csv))) ``` ``` schema_of_csv(Amsterdam,2018) 1 struct<_c0:string,_c1:int> ``` ## How was this patch tested? Manually tested, unit tests added, documentation manually built and verified. Closes #22939 from HyukjinKwon/SPARK-25446. Authored-by: hyukjinkwon <gurwls223@apache.org> Signed-off-by: hyukjinkwon <gurwls223@apache.org>
1 parent 0166c73 commit 66b2046

File tree

4 files changed

+94
-9
lines changed

4 files changed

+94
-9
lines changed

R/pkg/NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,8 @@ exportMethods("%<=>%",
351351
"row_number",
352352
"rpad",
353353
"rtrim",
354+
"schema_of_csv",
355+
"schema_of_json",
354356
"second",
355357
"sha1",
356358
"sha2",

R/pkg/R/functions.R

Lines changed: 70 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -205,11 +205,18 @@ NULL
205205
#' also supported for the schema.
206206
#' \item \code{from_csv}: a DDL-formatted string
207207
#' }
208-
#' @param ... additional argument(s). In \code{to_json}, \code{to_csv} and \code{from_json},
209-
#' this contains additional named properties to control how it is converted, accepts
210-
#' the same options as the JSON/CSV data source. Additionally \code{to_json} supports
211-
#' the "pretty" option which enables pretty JSON generation. In \code{arrays_zip},
212-
#' this contains additional Columns of arrays to be merged.
208+
#' @param ... additional argument(s).
209+
#' \itemize{
210+
#' \item \code{to_json}, \code{from_json} and \code{schema_of_json}: this contains
211+
#' additional named properties to control how it is converted and accepts the
212+
#' same options as the JSON data source.
213+
#' \item \code{to_json}: it supports the "pretty" option which enables pretty
214+
#' JSON generation.
215+
#' \item \code{to_csv}, \code{from_csv} and \code{schema_of_csv}: this contains
216+
#' additional named properties to control how it is converted and accepts the
217+
#' same options as the CSV data source.
218+
#' \item \code{arrays_zip}, this contains additional Columns of arrays to be merged.
219+
#' }
213220
#' @name column_collection_functions
214221
#' @rdname column_collection_functions
215222
#' @family collection functions
@@ -1771,12 +1778,16 @@ setMethod("to_date",
17711778
#' df2 <- mutate(df2, people_json = to_json(df2$people))
17721779
#'
17731780
#' # Converts a map into a JSON object
1774-
#' df2 <- sql("SELECT map('name', 'Bob')) as people")
1781+
#' df2 <- sql("SELECT map('name', 'Bob') as people")
17751782
#' df2 <- mutate(df2, people_json = to_json(df2$people))
17761783
#'
17771784
#' # Converts an array of maps into a JSON array
17781785
#' df2 <- sql("SELECT array(map('name', 'Bob'), map('name', 'Alice')) as people")
1779-
#' df2 <- mutate(df2, people_json = to_json(df2$people))}
1786+
#' df2 <- mutate(df2, people_json = to_json(df2$people))
1787+
#'
1788+
#' # Converts a map into a pretty JSON object
1789+
#' df2 <- sql("SELECT map('name', 'Bob') as people")
1790+
#' df2 <- mutate(df2, people_json = to_json(df2$people, pretty = TRUE))}
17801791
#' @note to_json since 2.2.0
17811792
setMethod("to_json", signature(x = "Column"),
17821793
function(x, ...) {
@@ -2285,6 +2296,32 @@ setMethod("from_json", signature(x = "Column", schema = "characterOrstructType")
22852296
column(jc)
22862297
})
22872298

2299+
#' @details
2300+
#' \code{schema_of_json}: Parses a JSON string and infers its schema in DDL format.
2301+
#'
2302+
#' @rdname column_collection_functions
2303+
#' @aliases schema_of_json schema_of_json,characterOrColumn-method
2304+
#' @examples
2305+
#'
2306+
#' \dontrun{
2307+
#' json <- "{\"name\":\"Bob\"}"
2308+
#' df <- sql("SELECT * FROM range(1)")
2309+
#' head(select(df, schema_of_json(json)))}
2310+
#' @note schema_of_json since 3.0.0
2311+
setMethod("schema_of_json", signature(x = "characterOrColumn"),
2312+
function(x, ...) {
2313+
if (class(x) == "character") {
2314+
col <- callJStatic("org.apache.spark.sql.functions", "lit", x)
2315+
} else {
2316+
col <- x@jc
2317+
}
2318+
options <- varargsToStrEnv(...)
2319+
jc <- callJStatic("org.apache.spark.sql.functions",
2320+
"schema_of_json",
2321+
col, options)
2322+
column(jc)
2323+
})
2324+
22882325
#' @details
22892326
#' \code{from_csv}: Parses a column containing a CSV string into a Column of \code{structType}
22902327
#' with the specified \code{schema}.
@@ -2315,6 +2352,32 @@ setMethod("from_csv", signature(x = "Column", schema = "characterOrColumn"),
23152352
column(jc)
23162353
})
23172354

2355+
#' @details
2356+
#' \code{schema_of_csv}: Parses a CSV string and infers its schema in DDL format.
2357+
#'
2358+
#' @rdname column_collection_functions
2359+
#' @aliases schema_of_csv schema_of_csv,characterOrColumn-method
2360+
#' @examples
2361+
#'
2362+
#' \dontrun{
2363+
#' csv <- "Amsterdam,2018"
2364+
#' df <- sql("SELECT * FROM range(1)")
2365+
#' head(select(df, schema_of_csv(csv)))}
2366+
#' @note schema_of_csv since 3.0.0
2367+
setMethod("schema_of_csv", signature(x = "characterOrColumn"),
2368+
function(x, ...) {
2369+
if (class(x) == "character") {
2370+
col <- callJStatic("org.apache.spark.sql.functions", "lit", x)
2371+
} else {
2372+
col <- x@jc
2373+
}
2374+
options <- varargsToStrEnv(...)
2375+
jc <- callJStatic("org.apache.spark.sql.functions",
2376+
"schema_of_csv",
2377+
col, options)
2378+
column(jc)
2379+
})
2380+
23182381
#' @details
23192382
#' \code{from_utc_timestamp}: This is a common function for databases supporting TIMESTAMP WITHOUT
23202383
#' TIMEZONE. This function takes a timestamp which is timezone-agnostic, and interprets it as a

R/pkg/R/generics.R

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1206,6 +1206,14 @@ setGeneric("rpad", function(x, len, pad) { standardGeneric("rpad") })
12061206
#' @name NULL
12071207
setGeneric("rtrim", function(x, trimString) { standardGeneric("rtrim") })
12081208

1209+
#' @rdname column_collection_functions
1210+
#' @name NULL
1211+
setGeneric("schema_of_csv", function(x, ...) { standardGeneric("schema_of_csv") })
1212+
1213+
#' @rdname column_collection_functions
1214+
#' @name NULL
1215+
setGeneric("schema_of_json", function(x, ...) { standardGeneric("schema_of_json") })
1216+
12091217
#' @rdname column_aggregate_functions
12101218
#' @name NULL
12111219
setGeneric("sd", function(x, na.rm = FALSE) { standardGeneric("sd") })

R/pkg/tests/fulltests/test_sparkSQL.R

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1620,14 +1620,20 @@ test_that("column functions", {
16201620
expect_equal(collect(select(df, bround(df$x, 0)))[[1]][1], 2)
16211621
expect_equal(collect(select(df, bround(df$x, 0)))[[1]][2], 4)
16221622

1623-
# Test from_csv()
1623+
# Test from_csv(), schema_of_csv()
16241624
df <- as.DataFrame(list(list("col" = "1")))
16251625
c <- collect(select(df, alias(from_csv(df$col, "a INT"), "csv")))
16261626
expect_equal(c[[1]][[1]]$a, 1)
16271627
c <- collect(select(df, alias(from_csv(df$col, lit("a INT")), "csv")))
16281628
expect_equal(c[[1]][[1]]$a, 1)
16291629

1630-
# Test to_json(), from_json()
1630+
df <- as.DataFrame(list(list("col" = "1")))
1631+
c <- collect(select(df, schema_of_csv("Amsterdam,2018")))
1632+
expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
1633+
c <- collect(select(df, schema_of_csv(lit("Amsterdam,2018"))))
1634+
expect_equal(c[[1]], "struct<_c0:string,_c1:int>")
1635+
1636+
# Test to_json(), from_json(), schema_of_json()
16311637
df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
16321638
j <- collect(select(df, alias(to_json(df$people), "json")))
16331639
expect_equal(j[order(j$json), ][1], "[{\"name\":\"Bob\"},{\"name\":\"Alice\"}]")
@@ -1654,6 +1660,12 @@ test_that("column functions", {
16541660
expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 })))
16551661
}
16561662

1663+
df <- as.DataFrame(list(list("col" = "1")))
1664+
c <- collect(select(df, schema_of_json('{"name":"Bob"}')))
1665+
expect_equal(c[[1]], "struct<name:string>")
1666+
c <- collect(select(df, schema_of_json(lit('{"name":"Bob"}'))))
1667+
expect_equal(c[[1]], "struct<name:string>")
1668+
16571669
# Test to_json() supports arrays of primitive types and arrays
16581670
df <- sql("SELECT array(19, 42, 70) as age")
16591671
j <- collect(select(df, alias(to_json(df$age), "json")))

0 commit comments

Comments
 (0)