Skip to content

Commit 173c238

Browse files
titicaca authored and Felix Cheung committed
[SPARK-19342][SPARKR] bug fixed in collect method for collecting timestamp column
## What changes were proposed in this pull request? Fix a bug in the `collect` method when collecting a timestamp column. The bug can be reproduced by running the following code and inspecting its output: ``` library(SparkR) sparkR.session(master = "local") df <- data.frame(col1 = c(0, 1, 2), col2 = c(as.POSIXct("2017-01-01 00:00:01"), NA, as.POSIXct("2017-01-01 12:00:01"))) sdf1 <- createDataFrame(df) print(dtypes(sdf1)) df1 <- collect(sdf1) print(lapply(df1, class)) sdf2 <- filter(sdf1, "col1 > 0") print(dtypes(sdf2)) df2 <- collect(sdf2) print(lapply(df2, class)) ``` As the printed output shows, the type of col2 in df2 is unexpectedly converted to numeric when an NA is at the top of the column. This is caused by `do.call(c, list)`: when the list starts with NA, e.g. `do.call(c, list(NA, as.POSIXct("2017-01-01 12:00:01")))`, the class of the result is numeric instead of POSIXct. Therefore, we need to set the class of the vector explicitly. ## How was this patch tested? The patch can be tested manually with the same code above. Author: titicaca <fangzhou.yang@hotmail.com> Closes #16689 from titicaca/sparkr-dev. (cherry picked from commit bc0a0e6) Signed-off-by: Felix Cheung <felixcheung@apache.org>
1 parent e580bb0 commit 173c238

File tree

3 files changed

+43
-4
lines changed

3 files changed

+43
-4
lines changed

R/pkg/R/DataFrame.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -417,7 +417,7 @@ setMethod("coltypes",
417417
type <- PRIMITIVE_TYPES[[specialtype]]
418418
}
419419
}
420-
type
420+
type[[1]]
421421
})
422422

423423
# Find which types don't have mapping to R
@@ -1132,6 +1132,7 @@ setMethod("collect",
11321132
if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") {
11331133
vec <- do.call(c, col)
11341134
stopifnot(class(vec) != "list")
1135+
class(vec) <- PRIMITIVE_TYPES[[colType]]
11351136
df[[colIndex]] <- vec
11361137
} else {
11371138
df[[colIndex]] <- col

R/pkg/R/types.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ PRIMITIVE_TYPES <- as.environment(list(
2929
"string" = "character",
3030
"binary" = "raw",
3131
"boolean" = "logical",
32-
"timestamp" = "POSIXct",
32+
"timestamp" = c("POSIXct", "POSIXt"),
3333
"date" = "Date",
3434
# following types are not SQL types returned by dtypes(). They are listed here for usage
3535
# by checkType() in schema.R.

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1297,9 +1297,9 @@ test_that("column functions", {
12971297

12981298
# Test first(), last()
12991299
df <- read.json(jsonPath)
1300-
expect_equal(collect(select(df, first(df$age)))[[1]], NA)
1300+
expect_equal(collect(select(df, first(df$age)))[[1]], NA_real_)
13011301
expect_equal(collect(select(df, first(df$age, TRUE)))[[1]], 30)
1302-
expect_equal(collect(select(df, first("age")))[[1]], NA)
1302+
expect_equal(collect(select(df, first("age")))[[1]], NA_real_)
13031303
expect_equal(collect(select(df, first("age", TRUE)))[[1]], 30)
13041304
expect_equal(collect(select(df, last(df$age)))[[1]], 19)
13051305
expect_equal(collect(select(df, last(df$age, TRUE)))[[1]], 19)
@@ -2767,6 +2767,44 @@ test_that("Call DataFrameWriter.load() API in Java without path and check argume
27672767
"Unnamed arguments ignored: 2, 3, a.")
27682768
})
27692769

2770+
test_that("Collect on DataFrame when NAs exists at the top of a timestamp column", {
2771+
ldf <- data.frame(col1 = c(0, 1, 2),
2772+
col2 = c(as.POSIXct("2017-01-01 00:00:01"),
2773+
NA,
2774+
as.POSIXct("2017-01-01 12:00:01")),
2775+
col3 = c(as.POSIXlt("2016-01-01 00:59:59"),
2776+
NA,
2777+
as.POSIXlt("2016-01-01 12:01:01")))
2778+
sdf1 <- createDataFrame(ldf)
2779+
ldf1 <- collect(sdf1)
2780+
expect_equal(dtypes(sdf1), list(c("col1", "double"),
2781+
c("col2", "timestamp"),
2782+
c("col3", "timestamp")))
2783+
expect_equal(class(ldf1$col1), "numeric")
2784+
expect_equal(class(ldf1$col2), c("POSIXct", "POSIXt"))
2785+
expect_equal(class(ldf1$col3), c("POSIXct", "POSIXt"))
2786+
2787+
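# Columns with NAs at the top -- NOTE(review): "col1 > 1" keeps only the col1 == 2 row, whose timestamps are non-NA; "col1 > 0" would put the NA row first. Confirm the intended filter.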
2788+
sdf2 <- filter(sdf1, "col1 > 1")
2789+
ldf2 <- collect(sdf2)
2790+
expect_equal(dtypes(sdf2), list(c("col1", "double"),
2791+
c("col2", "timestamp"),
2792+
c("col3", "timestamp")))
2793+
expect_equal(class(ldf2$col1), "numeric")
2794+
expect_equal(class(ldf2$col2), c("POSIXct", "POSIXt"))
2795+
expect_equal(class(ldf2$col3), c("POSIXct", "POSIXt"))
2796+
2797+
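# Columns with only NAs, the type will also be cast to PRIMITIVE_TYPE -- NOTE(review): "col1 == 0" selects the first row, whose timestamps are non-NA; the all-NA row is col1 == 1. Confirm the intended filter.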
2798+
sdf3 <- filter(sdf1, "col1 == 0")
2799+
ldf3 <- collect(sdf3)
2800+
expect_equal(dtypes(sdf3), list(c("col1", "double"),
2801+
c("col2", "timestamp"),
2802+
c("col3", "timestamp")))
2803+
expect_equal(class(ldf3$col1), "numeric")
2804+
expect_equal(class(ldf3$col2), c("POSIXct", "POSIXt"))
2805+
expect_equal(class(ldf3$col3), c("POSIXct", "POSIXt"))
2806+
})
2807+
27702808
unlink(parquetPath)
27712809
unlink(orcPath)
27722810
unlink(jsonPath)

0 commit comments

Comments
 (0)