[SPARK-21727][R] Allow multi-element atomic vector as column type in SparkR DataFrame

neilalex · Felix Cheung · commit 9cfe90e5abb2 · 2018-01-23T22:31:39.000-08:00
## What changes were proposed in this pull request? A fix to https://issues.apache.org/jira/browse/SPARK-21727, "Operating on an ArrayType in a SparkR DataFrame throws error" ## How was this patch tested? - Ran tests at R\pkg\tests\run-all.R (see below attached results) - Tested the following lines in SparkR, which now seem to execute without error: ``` indices <- 1:4 myDf <- data.frame(indices) myDf$data <- list(rep(0, 20)) mySparkDf <- as.DataFrame(myDf) collect(mySparkDf) ``` [2018-01-22 SPARK-21727 Test Results.txt](https://github.com/apache/spark/files/1653535/2018-01-22.SPARK-21727.Test.Results.txt) felixcheung yanboliang sun-rui shivaram _The contribution is my original work and I license the work to the project under the project’s open source license_ Author: neilalex <neil@neilalex.com> Closes #20352 from neilalex/neilalex-sparkr-arraytype. (cherry picked from commit f54b65c) Signed-off-by: Felix Cheung <felixcheung@apache.org>
diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R
@@ -30,14 +30,17 @@
 # POSIXct,POSIXlt -> Time
 #
 # list[T] -> Array[T], where T is one of above mentioned types
+# Multi-element vector of any of the above (except raw) -> Array[T]
 # environment -> Map[String, T], where T is a native type
 # jobj -> Object, where jobj is an object created in the backend
 # nolint end
 
 getSerdeType <- function(object) {
   type <- class(object)[[1]]
-  if (type != "list") {
-    type
+  if (is.atomic(object) & !is.raw(object) & length(object) > 1) {
+    "array"
+  } else if (type != "list") {
+     type
   } else {
     # Check if all elements are of same type
     elemType <- unique(sapply(object, function(elem) { getSerdeType(elem) }))
@@ -50,9 +53,7 @@ getSerdeType <- function(object) {
 }
 
 writeObject <- function(con, object, writeType = TRUE) {
-  # NOTE: In R vectors have same type as objects. So we don't support
-  # passing in vectors as arrays and instead require arrays to be passed
-  # as lists.
+  # NOTE: In R vectors have same type as objects
   type <- class(object)[[1]]  # class of POSIXlt is c("POSIXlt", "POSIXt")
   # Checking types is needed here, since 'is.na' only handles atomic vectors,
   # lists and pairlists
diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R
@@ -37,6 +37,53 @@ test_that("SerDe of primitive types", {
   expect_equal(class(x), "character")
 })
 
+test_that("SerDe of multi-element primitive vectors inside R data.frame", {
+  # vector of integers embedded in R data.frame
+  indices <- 1L:3L
+  myDf <- data.frame(indices)
+  myDf$data <- list(rep(0L, 3L))
+  mySparkDf <- as.DataFrame(myDf)
+  myResultingDf <- collect(mySparkDf)
+  myDfListedData <- data.frame(indices)
+  myDfListedData$data <- list(as.list(rep(0L, 3L)))
+  expect_equal(myResultingDf, myDfListedData)
+  expect_equal(class(myResultingDf[["data"]][[1]]), "list")
+  expect_equal(class(myResultingDf[["data"]][[1]][[1]]), "integer")
+
+  # vector of numeric embedded in R data.frame
+  myDf <- data.frame(indices)
+  myDf$data <- list(rep(0, 3L))
+  mySparkDf <- as.DataFrame(myDf)
+  myResultingDf <- collect(mySparkDf)
+  myDfListedData <- data.frame(indices)
+  myDfListedData$data <- list(as.list(rep(0, 3L)))
+  expect_equal(myResultingDf, myDfListedData)
+  expect_equal(class(myResultingDf[["data"]][[1]]), "list")
+  expect_equal(class(myResultingDf[["data"]][[1]][[1]]), "numeric")
+
+  # vector of logical embedded in R data.frame
+  myDf <- data.frame(indices)
+  myDf$data <- list(rep(TRUE, 3L))
+  mySparkDf <- as.DataFrame(myDf)
+  myResultingDf <- collect(mySparkDf)
+  myDfListedData <- data.frame(indices)
+  myDfListedData$data <- list(as.list(rep(TRUE, 3L)))
+  expect_equal(myResultingDf, myDfListedData)
+  expect_equal(class(myResultingDf[["data"]][[1]]), "list")
+  expect_equal(class(myResultingDf[["data"]][[1]][[1]]), "logical")
+
+  # vector of character embedded in R data.frame
+  myDf <- data.frame(indices)
+  myDf$data <- list(rep("abc", 3L))
+  mySparkDf <- as.DataFrame(myDf)
+  myResultingDf <- collect(mySparkDf)
+  myDfListedData <- data.frame(indices)
+  myDfListedData$data <- list(as.list(rep("abc", 3L)))
+  expect_equal(myResultingDf, myDfListedData)
+  expect_equal(class(myResultingDf[["data"]][[1]]), "list")
+  expect_equal(class(myResultingDf[["data"]][[1]][[1]]), "character")
+})
+
 test_that("SerDe of list of primitive types", {
   x <- list(1L, 2L, 3L)
   y <- callJStatic("SparkRHandler", "echo", x)