Skip to content

Commit 9cfe90e

Browse files
neilalex authored and Felix Cheung committed
[SPARK-21727][R] Allow multi-element atomic vector as column type in SparkR DataFrame
## What changes were proposed in this pull request?

A fix to https://issues.apache.org/jira/browse/SPARK-21727, "Operating on an ArrayType in a SparkR DataFrame throws error".

## How was this patch tested?

- Ran tests at R\pkg\tests\run-all.R (see the attached results below).
- Tested the following lines in SparkR, which now seem to execute without error:

```
indices <- 1:4
myDf <- data.frame(indices)
myDf$data <- list(rep(0, 20))
mySparkDf <- as.DataFrame(myDf)
collect(mySparkDf)
```

[2018-01-22 SPARK-21727 Test Results.txt](https://github.com/apache/spark/files/1653535/2018-01-22.SPARK-21727.Test.Results.txt)

felixcheung yanboliang sun-rui shivaram

_The contribution is my original work and I license the work to the project under the project’s open source license_

Author: neilalex <neil@neilalex.com>

Closes #20352 from neilalex/neilalex-sparkr-arraytype.

(cherry picked from commit f54b65c)
Signed-off-by: Felix Cheung <felixcheung@apache.org>
1 parent 3316a9d commit 9cfe90e

File tree

2 files changed

+53
-5
lines changed

2 files changed

+53
-5
lines changed

R/pkg/R/serialize.R

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,14 +30,17 @@
3030
# POSIXct,POSIXlt -> Time
3131
#
3232
# list[T] -> Array[T], where T is one of above mentioned types
33+
# Multi-element vector of any of the above (except raw) -> Array[T]
3334
# environment -> Map[String, T], where T is a native type
3435
# jobj -> Object, where jobj is an object created in the backend
3536
# nolint end
3637

3738
getSerdeType <- function(object) {
3839
type <- class(object)[[1]]
39-
if (type != "list") {
40-
type
40+
if (is.atomic(object) & !is.raw(object) & length(object) > 1) {
41+
"array"
42+
} else if (type != "list") {
43+
type
4144
} else {
4245
# Check if all elements are of same type
4346
elemType <- unique(sapply(object, function(elem) { getSerdeType(elem) }))
@@ -50,9 +53,7 @@ getSerdeType <- function(object) {
5053
}
5154

5255
writeObject <- function(con, object, writeType = TRUE) {
53-
# NOTE: In R vectors have same type as objects. So we don't support
54-
# passing in vectors as arrays and instead require arrays to be passed
55-
# as lists.
56+
# NOTE: In R, a vector has the same type as a single value of that type
# (a scalar is just a length-1 vector).
5657
type <- class(object)[[1]] # class of POSIXlt is c("POSIXlt", "POSIXt")
5758
# Checking types is needed here, since 'is.na' only handles atomic vectors,
5859
# lists and pairlists

R/pkg/tests/fulltests/test_Serde.R

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,53 @@ test_that("SerDe of primitive types", {
3737
expect_equal(class(x), "character")
3838
})
3939

40+
test_that("SerDe of multi-element primitive vectors inside R data.frame", {
  indices <- 1L:3L

  # Embeds the atomic vector `vec` in a list column of an R data.frame, sends
  # it through as.DataFrame() / collect(), and checks that it comes back as a
  # list whose elements have class `elemClass`.
  expectVectorRoundTrip <- function(vec, elemClass) {
    # data.frame with the raw atomic vector in every row of `data`
    vectorDf <- data.frame(indices)
    vectorDf$data <- list(vec)
    collectedDf <- collect(as.DataFrame(vectorDf))

    # Same data.frame, but with the vector spelled out as a list of scalars
    listedDf <- data.frame(indices)
    listedDf$data <- list(as.list(vec))

    expect_equal(collectedDf, listedDf)
    firstCell <- collectedDf[["data"]][[1]]
    expect_equal(class(firstCell), "list")
    expect_equal(class(firstCell[[1]]), elemClass)
  }

  # vector of integers embedded in R data.frame
  expectVectorRoundTrip(rep(0L, 3L), "integer")

  # vector of numeric embedded in R data.frame
  expectVectorRoundTrip(rep(0, 3L), "numeric")

  # vector of logical embedded in R data.frame
  expectVectorRoundTrip(rep(TRUE, 3L), "logical")

  # vector of character embedded in R data.frame
  expectVectorRoundTrip(rep("abc", 3L), "character")
})
86+
4087
test_that("SerDe of list of primitive types", {
4188
x <- list(1L, 2L, 3L)
4289
y <- callJStatic("SparkRHandler", "echo", x)

0 commit comments

Comments
 (0)