[SPARK-20727] Skip tests that use Hadoop utils on CRAN Windows
## What changes were proposed in this pull request?

This change skips tests that use the Hadoop libraries when the CRAN check runs on Windows. This handles cases where the Hadoop winutils binaries are missing on the target system. The skipped tests consist of the following (a sketch of the gating pattern is shown after the list):
1. Tests that save and load a model in MLlib
2. Tests that save and load CSV, JSON, and Parquet files in SQL
3. Hive tests
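
Each affected test keeps its body, but the Hadoop-dependent part is wrapped in the new `not_cran_or_windows_with_hadoop()` helper added to `R/pkg/R/utils.R` in this change. A minimal sketch of the pattern, with an illustrative test name and a placeholder `model` object rather than an actual test from the diff:

```r
# Sketch only: "model" stands in for a fitted MLlib model from the test setup.
test_that("example: model save/load is gated on CRAN Windows", {
  if (not_cran_or_windows_with_hadoop()) {
    modelPath <- tempfile(pattern = "spark-example", fileext = ".tmp")
    write.ml(model, modelPath)
    model2 <- read.ml(modelPath)
    expect_equal(summary(model)$coefficients, summary(model2)$coefficients)
    unlink(modelPath)
  }
})
```

When the helper returns FALSE (a CRAN check on Windows without winutils), the gated block is simply not executed.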

## How was this patch tested?

Tested by running on a local Windows VM with HADOOP_HOME unset. Also tested with https://win-builder.r-project.org.

Author: Shivaram Venkataraman <shivaram@cs.berkeley.edu>

Closes apache#17966 from shivaram/sparkr-windows-cran.
shivaram authored and Felix Cheung committed May 23, 2017
1 parent 4dbb63f commit d06610f
Showing 8 changed files with 445 additions and 381 deletions.
16 changes: 16 additions & 0 deletions R/pkg/R/utils.R
@@ -907,3 +907,19 @@ basenameSansExtFromUrl <- function(url) {
isAtomicLengthOne <- function(x) {
is.atomic(x) && length(x) == 1
}

is_cran <- function() {
!identical(Sys.getenv("NOT_CRAN"), "true")
}

is_windows <- function() {
.Platform$OS.type == "windows"
}

hadoop_home_set <- function() {
!identical(Sys.getenv("HADOOP_HOME"), "")
}

not_cran_or_windows_with_hadoop <- function() {
!is_cran() && (!is_windows() || hadoop_home_set())
}
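
A usage sketch of the helpers above (illustrative, not part of the diff): they follow the devtools/testthat convention that NOT_CRAN="true" marks a non-CRAN run, and they are internal SparkR utilities, so the calls below assume the package namespace is loaded. The HADOOP_HOME path is made up for illustration.

```r
# NOT_CRAN unset (or anything other than "true"): is_cran() is TRUE, so the
# gate is closed and Hadoop-dependent tests are skipped.
Sys.unsetenv("NOT_CRAN")
Sys.unsetenv("HADOOP_HOME")
not_cran_or_windows_with_hadoop()       # FALSE

# NOT_CRAN="true": the gate opens on non-Windows systems, and on Windows
# only when HADOOP_HOME points at a Hadoop/winutils installation.
Sys.setenv(NOT_CRAN = "true")
Sys.setenv(HADOOP_HOME = "C:/hadoop")   # illustrative path
not_cran_or_windows_with_hadoop()       # TRUE
```
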
90 changes: 49 additions & 41 deletions R/pkg/inst/tests/testthat/test_mllib_classification.R
@@ -50,15 +50,17 @@ test_that("spark.svmLinear", {
expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected)

# Test model save and load
modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
coefs <- summary(model)$coefficients
coefs2 <- summary(model2)$coefficients
expect_equal(coefs, coefs2)
unlink(modelPath)
if (not_cran_or_windows_with_hadoop()) {
modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
coefs <- summary(model)$coefficients
coefs2 <- summary(model2)$coefficients
expect_equal(coefs, coefs2)
unlink(modelPath)
}

# Test prediction with numeric label
label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
@@ -128,15 +130,17 @@ test_that("spark.logit", {
expect_true(all(abs(setosaCoefs - setosaCoefs) < 0.1))

# Test model save and load
modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
coefs <- summary(model)$coefficients
coefs2 <- summary(model2)$coefficients
expect_equal(coefs, coefs2)
unlink(modelPath)
if (not_cran_or_windows_with_hadoop()) {
modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
coefs <- summary(model)$coefficients
coefs2 <- summary(model2)$coefficients
expect_equal(coefs, coefs2)
unlink(modelPath)
}

# R code to reproduce the result.
# nolint start
@@ -243,19 +247,21 @@ test_that("spark.mlp", {
expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))

# Test model save/load
modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
summary2 <- summary(model2)

expect_equal(summary2$numOfInputs, 4)
expect_equal(summary2$numOfOutputs, 3)
expect_equal(summary2$layers, c(4, 5, 4, 3))
expect_equal(length(summary2$weights), 64)

unlink(modelPath)
if (not_cran_or_windows_with_hadoop()) {
modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
summary2 <- summary(model2)

expect_equal(summary2$numOfInputs, 4)
expect_equal(summary2$numOfOutputs, 3)
expect_equal(summary2$layers, c(4, 5, 4, 3))
expect_equal(length(summary2$weights), 64)

unlink(modelPath)
}

# Test default parameter
model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3))
@@ -354,16 +360,18 @@ test_that("spark.naiveBayes", {
"Yes", "Yes", "No", "No"))

# Test model save/load
modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp")
write.ml(m, modelPath)
expect_error(write.ml(m, modelPath))
write.ml(m, modelPath, overwrite = TRUE)
m2 <- read.ml(modelPath)
s2 <- summary(m2)
expect_equal(s$apriori, s2$apriori)
expect_equal(s$tables, s2$tables)

unlink(modelPath)
if (not_cran_or_windows_with_hadoop()) {
modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp")
write.ml(m, modelPath)
expect_error(write.ml(m, modelPath))
write.ml(m, modelPath, overwrite = TRUE)
m2 <- read.ml(modelPath)
s2 <- summary(m2)
expect_equal(s$apriori, s2$apriori)
expect_equal(s$tables, s2$tables)

unlink(modelPath)
}

# Test e1071::naiveBayes
if (requireNamespace("e1071", quietly = TRUE)) {
112 changes: 60 additions & 52 deletions R/pkg/inst/tests/testthat/test_mllib_clustering.R
@@ -53,18 +53,20 @@ test_that("spark.bisectingKmeans", {
c(0, 1, 2, 3))

# Test model save/load
modelPath <- tempfile(pattern = "spark-bisectingkmeans", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
summary2 <- summary(model2)
expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
expect_equal(summary.model$coefficients, summary2$coefficients)
expect_true(!summary.model$is.loaded)
expect_true(summary2$is.loaded)

unlink(modelPath)
if (not_cran_or_windows_with_hadoop()) {
modelPath <- tempfile(pattern = "spark-bisectingkmeans", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
summary2 <- summary(model2)
expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
expect_equal(summary.model$coefficients, summary2$coefficients)
expect_true(!summary.model$is.loaded)
expect_true(summary2$is.loaded)

unlink(modelPath)
}
})

test_that("spark.gaussianMixture", {
@@ -125,18 +127,20 @@ test_that("spark.gaussianMixture", {
expect_equal(p$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1))

# Test model save/load
modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
stats2 <- summary(model2)
expect_equal(stats$lambda, stats2$lambda)
expect_equal(unlist(stats$mu), unlist(stats2$mu))
expect_equal(unlist(stats$sigma), unlist(stats2$sigma))
expect_equal(unlist(stats$loglik), unlist(stats2$loglik))

unlink(modelPath)
if (not_cran_or_windows_with_hadoop()) {
modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
stats2 <- summary(model2)
expect_equal(stats$lambda, stats2$lambda)
expect_equal(unlist(stats$mu), unlist(stats2$mu))
expect_equal(unlist(stats$sigma), unlist(stats2$sigma))
expect_equal(unlist(stats$loglik), unlist(stats2$loglik))

unlink(modelPath)
}
})

test_that("spark.kmeans", {
@@ -171,18 +175,20 @@ test_that("spark.kmeans", {
expect_true(class(summary.model$coefficients[1, ]) == "numeric")

# Test model save/load
modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
summary2 <- summary(model2)
expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
expect_equal(summary.model$coefficients, summary2$coefficients)
expect_true(!summary.model$is.loaded)
expect_true(summary2$is.loaded)

unlink(modelPath)
if (not_cran_or_windows_with_hadoop()) {
modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
summary2 <- summary(model2)
expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
expect_equal(summary.model$coefficients, summary2$coefficients)
expect_true(!summary.model$is.loaded)
expect_true(summary2$is.loaded)

unlink(modelPath)
}

# Test Kmeans on dataset that is sensitive to seed value
col1 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
@@ -236,22 +242,24 @@ test_that("spark.lda with libsvm", {
expect_true(logPrior <= 0 & !is.na(logPrior))

# Test model save/load
modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
stats2 <- summary(model2)

expect_true(stats2$isDistributed)
expect_equal(logLikelihood, stats2$logLikelihood)
expect_equal(logPerplexity, stats2$logPerplexity)
expect_equal(vocabSize, stats2$vocabSize)
expect_equal(vocabulary, stats2$vocabulary)
expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
expect_equal(logPrior, stats2$logPrior)

unlink(modelPath)
if (not_cran_or_windows_with_hadoop()) {
modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
stats2 <- summary(model2)

expect_true(stats2$isDistributed)
expect_equal(logLikelihood, stats2$logLikelihood)
expect_equal(logPerplexity, stats2$logPerplexity)
expect_equal(vocabSize, stats2$vocabSize)
expect_equal(vocabulary, stats2$vocabulary)
expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
expect_equal(logPrior, stats2$logPrior)

unlink(modelPath)
}
})

test_that("spark.lda with text input", {
16 changes: 9 additions & 7 deletions R/pkg/inst/tests/testthat/test_mllib_fpm.R
@@ -62,15 +62,17 @@ test_that("spark.fpGrowth", {

expect_equivalent(expected_predictions, collect(predict(model, new_data)))

modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp")
write.ml(model, modelPath, overwrite = TRUE)
loaded_model <- read.ml(modelPath)
if (not_cran_or_windows_with_hadoop()) {
modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp")
write.ml(model, modelPath, overwrite = TRUE)
loaded_model <- read.ml(modelPath)

expect_equivalent(
itemsets,
collect(spark.freqItemsets(loaded_model)))
expect_equivalent(
itemsets,
collect(spark.freqItemsets(loaded_model)))

unlink(modelPath)
unlink(modelPath)
}

model_without_numpartitions <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8)
expect_equal(
42 changes: 22 additions & 20 deletions R/pkg/inst/tests/testthat/test_mllib_recommendation.R
@@ -37,29 +37,31 @@ test_that("spark.als", {
tolerance = 1e-4)

# Test model save/load
modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
stats2 <- summary(model2)
expect_equal(stats2$rating, "score")
userFactors <- collect(stats$userFactors)
itemFactors <- collect(stats$itemFactors)
userFactors2 <- collect(stats2$userFactors)
itemFactors2 <- collect(stats2$itemFactors)
if (not_cran_or_windows_with_hadoop()) {
modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp")
write.ml(model, modelPath)
expect_error(write.ml(model, modelPath))
write.ml(model, modelPath, overwrite = TRUE)
model2 <- read.ml(modelPath)
stats2 <- summary(model2)
expect_equal(stats2$rating, "score")
userFactors <- collect(stats$userFactors)
itemFactors <- collect(stats$itemFactors)
userFactors2 <- collect(stats2$userFactors)
itemFactors2 <- collect(stats2$itemFactors)

orderUser <- order(userFactors$id)
orderUser2 <- order(userFactors2$id)
expect_equal(userFactors$id[orderUser], userFactors2$id[orderUser2])
expect_equal(userFactors$features[orderUser], userFactors2$features[orderUser2])
orderUser <- order(userFactors$id)
orderUser2 <- order(userFactors2$id)
expect_equal(userFactors$id[orderUser], userFactors2$id[orderUser2])
expect_equal(userFactors$features[orderUser], userFactors2$features[orderUser2])

orderItem <- order(itemFactors$id)
orderItem2 <- order(itemFactors2$id)
expect_equal(itemFactors$id[orderItem], itemFactors2$id[orderItem2])
expect_equal(itemFactors$features[orderItem], itemFactors2$features[orderItem2])
orderItem <- order(itemFactors$id)
orderItem2 <- order(itemFactors2$id)
expect_equal(itemFactors$id[orderItem], itemFactors2$id[orderItem2])
expect_equal(itemFactors$features[orderItem], itemFactors2$features[orderItem2])

unlink(modelPath)
unlink(modelPath)
}
})

sparkR.session.stop()
(Diffs for the remaining 3 changed files are not shown here.)
