Skip to content

Commit

Permalink
add h2o for internal tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki committed May 14, 2020
1 parent 0e1e77e commit 00c907b
Show file tree
Hide file tree
Showing 11 changed files with 326 additions and 3 deletions.
2 changes: 2 additions & 0 deletions _control/solutions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ spark,join
clickhouse,groupby
cudf,groupby
cudf,join
h2o,groupby
h2o,join
2 changes: 1 addition & 1 deletion _launcher/launcher.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ readret = function(x) {
file.ext = function(x) {
ans = switch(
x,
"data.table"=, "dplyr"="R",
"data.table"=, "dplyr"=, "h2o"="R",
"pandas"=, "cudf"=, "spark"=, "pydatatable"=, "modin"=, "dask"="py",
"clickhouse"="sql",
"juliadf"="jl"
Expand Down
4 changes: 2 additions & 2 deletions _launcher/solution.R
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ if ("quiet" %in% names(args)) {
file.ext = function(x) {
ans = switch(
x,
"data.table"=, "dplyr"="R",
"data.table"=, "dplyr"=, "h2o"="R",
"pandas"=, "cudf"=, "spark"=, "pydatatable"=, "modin"=, "dask"="py",
"clickhouse"="sql",
"juliadf"="jl"
Expand Down Expand Up @@ -169,7 +169,7 @@ setenv(data_name_env, d)

ns = solution.path(s)
ext = file.ext(s)
localcmd = if (ext=="sql") { # sql scripts are using extra exec shell script, related only to clickhouse as of now
localcmd = if (s %in% c("clickhouse","h2o")) { # custom launcher bash script, for clickhouse and h2o
sprintf("exec.sh %s %s", t, d)
} else sprintf("%s-%s.%s", t, ns, ext)
cmd = sprintf("./%s/%s", ns, localcmd)
Expand Down
28 changes: 28 additions & 0 deletions h2o/exec.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
set -e

# Launcher for h2o benchmark scripts.
# Usage: ./h2o/exec.sh <task> <data_name>   e.g. ./h2o/exec.sh groupby G1_1e7_1e2_0_0
# Starts an h2o instance, runs ./h2o/<task>-h2o.R against it, then stops the instance.
# Exits with status 1 on wrong usage, when an instance is already running before start,
# when the instance does not come up, or when it is still running after the stop attempt.

if [ "$#" -ne 2 ]; then
  echo "usage: ./h2o/exec.sh groupby G1_1e7_1e2_0_0";
  exit 1
fi;

# provides h2o_start, h2o_stop and h2o_active
source ./h2o/h2o.sh

# precondition: no h2o instance may be running yet
if h2o_active; then
  echo "h2o instance should not be already running, investigate" >&2
  exit 1
fi

# start h2o
h2o_start "h2o_$1_$2"

# confirm h2o working, giving it one extra grace period before failing
h2o_active || sleep 30
if ! h2o_active; then
  echo "h2o instance should be already running, investigate" >&2
  exit 1
fi

# execute benchmark script; a failure is reported but must not prevent stopping h2o below
./h2o/"$1"-h2o.R || echo "# h2o/exec.sh: benchmark script for $2 terminated with error" >&2

# stop h2o instance
h2o_stop && echo "# h2o/exec.sh: stopping h2o instance finished" || echo "# h2o/exec.sh: stopping h2o instance failed" >&2

# postcondition: the instance must be gone, otherwise signal failure to the caller
if h2o_active; then
  exit 1
fi

142 changes: 142 additions & 0 deletions h2o/groupby-h2o.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#!/usr/bin/env Rscript

# Group-by benchmark for h2o.
# Connects to an h2o instance that must already be listening on port 55888, imports the
# dataset named by the SRC_GRP_LOCAL environment variable, runs each implemented question
# twice and records one write.log() entry per run.
# write.log(), memory_usage() and make_chk() are not defined in this script; presumably
# they come from ./_helpers/helpers.R sourced below -- confirm there.

cat("# groupby-h2o.R\n")

source("./_helpers/helpers.R")

# h2o is loaded from the project-local library, not from the default R library paths
suppressPackageStartupMessages(library("h2o", lib.loc="./h2o/r-h2o", warn.conflicts=FALSE, quietly=TRUE))
# metadata passed unchanged to every write.log() call below
ver = packageVersion("h2o")
git = ""
task = "groupby"
solution = "h2o"
fun = "h2o.group_by"
cache = TRUE
on_disk = FALSE

# startH2O=FALSE: only connect, never launch an instance from R
h = h2o.init(startH2O=FALSE, port=55888)
h2o.no_progress()

data_name = Sys.getenv("SRC_GRP_LOCAL")
src_grp = file.path("data", paste(data_name, "csv", sep="."))
cat(sprintf("loading dataset %s\n", data_name))

x = h2o.importFile(src_grp, col.types=c("string","string","string","int","int","int","int","int","real"))
# the first three columns are imported as "string" because importing them as categorical failed with
# water.parser.ParseDataset$H2OParseException: Exceeded categorical limit on column #3 (using 1-based indexing). Consider reparsing this column as a string.
# https://0xdata.atlassian.net/browse/PUBDEV-7533
print(nrow(x))

task_init = proc.time()[["elapsed"]]
cat("grouping...\n")

# Pattern repeated for every implemented question:
#   t    - elapsed seconds of the h2o.group_by() call, including printing dim() of the result
#   m    - value returned by memory_usage() right after the query
#   chkt - elapsed seconds spent summing the result columns into chk
#   the result frame is removed with h2o.rm() after each run; head/tail are printed after run 2 only

question = "sum v1 by id1" # q1
t = system.time(print(dim(ans<-h2o.group_by(x, by="id1", sum("v1")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(v1=sum(ans[["sum_v1"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.group_by(x, by="id1", sum("v1")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(v1=sum(ans[["sum_v1"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

question = "sum v1 by id1:id2" # q2
t = system.time(print(dim(ans<-h2o.group_by(x, by=c("id1","id2"), sum("v1")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(v1=sum(ans[["sum_v1"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.group_by(x, by=c("id1","id2"), sum("v1")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(v1=sum(ans[["sum_v1"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

question = "sum v1 mean v3 by id3" # q3
t = system.time(print(dim(ans<-h2o.group_by(x, by="id3", sum("v1"), mean("v3")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(v1=sum(ans[["sum_v1"]]), v3=sum(ans[["mean_v3"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.group_by(x, by="id3", sum("v1"), mean("v3")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(v1=sum(ans[["sum_v1"]]), v3=sum(ans[["mean_v3"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

question = "mean v1:v3 by id4" # q4
t = system.time(print(dim(ans<-h2o.group_by(x, by="id4", mean("v1"), mean("v2"), mean("v3")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(v1=sum(ans[["mean_v1"]]), v2=sum(ans[["mean_v2"]]), v3=sum(ans[["mean_v3"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.group_by(x, by="id4", mean("v1"), mean("v2"), mean("v3")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(v1=sum(ans[["mean_v1"]]), v2=sum(ans[["mean_v2"]]), v3=sum(ans[["mean_v3"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

question = "sum v1:v3 by id6" # q5
t = system.time(print(dim(ans<-h2o.group_by(x, by="id6", sum("v1"), sum("v2"), sum("v3")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(v1=sum(ans[["sum_v1"]]), v2=sum(ans[["sum_v2"]]), v3=sum(ans[["sum_v3"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.group_by(x, by="id6", sum("v1"), sum("v2"), sum("v3")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(v1=sum(ans[["sum_v1"]]), v2=sum(ans[["sum_v2"]]), v3=sum(ans[["sum_v3"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

# NOTE(review): from q6 onwards the chk list elements are unnamed, unlike q1-q5;
# whether make_chk() depends on the names cannot be told from this script -- confirm.
question = "median v3 sd v3 by id4 id5" # q6
t = system.time(print(dim(ans<-h2o.group_by(x, by=c("id4","id5"), median("v3"), sd("v3")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["median_v3"]]), sum(ans[["sd_v3"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.group_by(x, by=c("id4","id5"), median("v3"), sd("v3")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["median_v3"]]), sum(ans[["sd_v3"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

# q7, q8 and q9 only set the question label: no query is executed and nothing is logged for them
question = "max v1 - min v2 by id3" # q7


question = "largest two v3 by id6" # q8


question = "regression v1 v2 by id2 id4" # q9


question = "sum v3 count by id1:id6" # q10
t = system.time(print(dim(ans<-h2o.group_by(x, by=c("id1","id2","id3","id4","id5","id6"), sum("v3"), nrow("id1","id2","id3","id4","id5","id6")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["sum_v3"]]), sum(ans[["nrow"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.group_by(x, by=c("id1","id2","id3","id4","id5","id6"), sum("v3"), nrow("id1","id2","id3","id4","id5","id6")))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["sum_v3"]]), sum(ans[["nrow"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

# drop everything still held by the h2o instance, including the imported dataset
h2o.removeAll()

cat(sprintf("grouping finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init))

# when run non-interactively, quit without saving the workspace and report success
if (!interactive()) q("no", status=0)
20 changes: 20 additions & 0 deletions h2o/h2o.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Start an h2o instance in the background.
#   $1 - instance name; also used as the base name of the log files under ./h2o/log
# Returns 1 when no name is given. Otherwise launches java detached (nohup, stdin from
# /dev/null) on base port 55888 and waits 10 seconds before returning.
h2o_start() {
  ((!$#)) && echo "h2o_start require h2o instance name as a parameter" >&2 && return 1
  echo '# h2o_start: starting h2o instance'
  # without the log directory the redirections below fail and java is never launched
  mkdir -p ./h2o/log
  # log paths are quoted so that a name containing whitespace or glob characters
  # does not result in an "ambiguous redirect"
  nohup java -Xmx100G -Xms100G -cp ./h2o/r-h2o/h2o/java/h2o.jar water.H2OApp -name "$1" -baseport 55888 > "./h2o/log/$1.out" 2> "./h2o/log/$1.err" < /dev/null &
  sleep 10
}
# Stop java processes by escalating signals: SIGINT (2), then SIGTERM (15), then SIGKILL (9).
# After each signal: wait 2 seconds, and when a java process is still present wait a further
# grace period (15s, 30s, 60s) before moving on.
# Returns 1 when a java process is still present after the final grace period, 0 otherwise.
# Note: operates on every process named "java", not only the h2o instance.
h2o_stop() {
  echo '# h2o_stop: stopping h2o instance'

  # step 1: interrupt
  if pidof java > /dev/null 2>&1; then
    killall -2 java > /dev/null 2>&1
  fi
  sleep 2
  if pidof java > /dev/null 2>&1; then
    sleep 15
  fi

  # step 2: terminate
  if pidof java > /dev/null 2>&1; then
    killall -15 java > /dev/null 2>&1
  fi
  sleep 2
  if pidof java > /dev/null 2>&1; then
    sleep 30
  fi

  # step 3: kill
  if pidof java > /dev/null 2>&1; then
    killall -9 java > /dev/null 2>&1
  fi
  sleep 2
  if pidof java > /dev/null 2>&1; then
    sleep 60
    if pidof java > /dev/null 2>&1; then
      echo "h2o instance could not be stopped" >&2
      return 1
    fi
  fi

  return 0
}
# Succeed only when a java process exists and a GET request to the h2o REST endpoint
# /3/About on localhost:55888 can be made; all output is discarded.
h2o_active() {
  # no java process at all: not active, skip the HTTP probe
  if ! pidof java > /dev/null 2>&1; then
    return 1
  fi
  # the status of curl becomes the status of the function
  curl -X GET "localhost:55888/3/About" -H "accept: application/json" > /dev/null 2>&1
}

6 changes: 6 additions & 0 deletions h2o/init-h2o.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
set -e

# Upgrade the h2o R package in the project-local library ./h2o/r-h2o to the version
# published in the h2o "latest_stable_R" repository. update.packages() is called only
# when the published version differs from the installed one.
echo 'upgrading h2o...'
Rscript -e '
repo = "http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R"
lib = "./h2o/r-h2o"
published = available.packages(repos=repo, method="curl")
if (published["h2o","Version"] != packageVersion("h2o", lib.loc=lib)) {
  update.packages(lib.loc=lib, repos=repo, method="curl", ask=FALSE, checkBuilt=TRUE, quiet=TRUE)
}
'
111 changes: 111 additions & 0 deletions h2o/join-h2o.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env Rscript

# Join benchmark for h2o.
# Connects to an h2o instance that must already be listening on port 55888, imports the
# left-hand dataset named by the SRC_JN_LOCAL environment variable plus three right-hand
# datasets, runs each join question twice and records one write.log() entry per run.
# write.log(), memory_usage(), make_chk() and join_to_tbls() are not defined in this script;
# presumably they come from ./_helpers/helpers.R sourced below -- confirm there.

cat("# join-h2o.R\n")

source("./_helpers/helpers.R")

# h2o is loaded from the project-local library, not from the default R library paths
suppressPackageStartupMessages(library("h2o", lib.loc="./h2o/r-h2o", warn.conflicts=FALSE, quietly=TRUE))
# metadata passed unchanged to every write.log() call below
ver = packageVersion("h2o")
git = ""
task = "join"
solution = "h2o"
fun = "h2o.merge"
cache = TRUE
on_disk = FALSE

# startH2O=FALSE: only connect, never launch an instance from R
h = h2o.init(startH2O=FALSE, port=55888)
h2o.no_progress()

data_name = Sys.getenv("SRC_JN_LOCAL")
src_jn_x = file.path("data", paste(data_name, "csv", sep="."))
# names of the right-hand datasets are derived from the left-hand dataset name
y_data_name = join_to_tbls(data_name)
src_jn_y = setNames(file.path("data", paste(y_data_name, "csv", sep=".")), names(y_data_name))
# exactly three right-hand datasets are expected; they are used below as small, medium and big
stopifnot(length(src_jn_y)==3L)
cat(sprintf("loading datasets %s\n", paste(c(data_name, y_data_name), collapse=", ")))

x = h2o.importFile(src_jn_x, col.types=c("int","int","int","enum","enum","string","real"))
print(nrow(x))
small = h2o.importFile(src_jn_y[1L], col.types=c("int","enum","real"))
medium = h2o.importFile(src_jn_y[2L], col.types=c("int","int","enum","enum","real"))
big = h2o.importFile(src_jn_y[3L], col.types=c("int","int","int","enum","enum","string","real"))
# print the row count of each right-hand table; the assignment to nul only suppresses auto-printing of the sapply result
sapply(sapply(list(small, medium, big), nrow), print) -> nul

task_init = proc.time()[["elapsed"]]
cat("joining...\n")

# Pattern repeated for every question:
#   t    - elapsed seconds of the h2o.merge() call, including printing dim() of the result
#   m    - value returned by memory_usage() right after the join
#   chkt - elapsed seconds spent summing columns v1 and v2 of the result into chk
#   the result frame is removed with h2o.rm() after each run; head/tail are printed after run 2 only

question = "small inner on int" # q1

t = system.time(print(dim(ans<-h2o.merge(x, small, by="id1"))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.merge(x, small, by="id1"))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

question = "medium inner on int" # q2
t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2"))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2"))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

# q3 keeps all rows of x (all.x=TRUE), so the v2 checksum is summed with na.rm=TRUE
question = "medium outer on int" # q3
t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2", all.x=TRUE))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]], na.rm=TRUE)))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id2", all.x=TRUE))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]], na.rm=TRUE)))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

question = "medium inner on factor" # q4
t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id5"))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.merge(x, medium, by="id5"))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

question = "big inner on int" # q5
t = system.time(print(dim(ans<-h2o.merge(x, big, by="id3"))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
h2o.rm(ans)
t = system.time(print(dim(ans<-h2o.merge(x, big, by="id3"))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-list(sum(ans[["v1"]]), sum(ans[["v2"]])))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=nrow(ans), out_cols=ncol(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
h2o.rm(ans)

# drop everything still held by the h2o instance, including the imported datasets
h2o.removeAll()

cat(sprintf("joining finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init))

# when run non-interactively, quit without saving the workspace and report success
if (!interactive()) q("no", status=0)
5 changes: 5 additions & 0 deletions h2o/setup-h2o.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# One-time setup for the h2o solution: create the log directory and a project-local
# R library, then install the packages into that library.
mkdir -p ./h2o/log ./h2o/r-h2o
# RCurl and jsonlite come from CRAN (presumably required by the h2o R package -- confirm);
# h2o itself comes from the h2o "latest_stable_R" release repository.
Rscript -e '
lib = "./h2o/r-h2o"
install.packages(c("RCurl","jsonlite"), repos="https://cloud.r-project.org", lib=lib)
install.packages("h2o", repos="http://h2o-release.s3.amazonaws.com/h2o/latest_stable_R", method="curl", lib=lib)
'

4 changes: 4 additions & 0 deletions h2o/ver-h2o.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
set -e

# Write the Version and Revision fields of the h2o package DESCRIPTION (taken from the
# project-local library ./h2o/r-h2o) to the files h2o/VERSION and h2o/REVISION.
# A field missing from DESCRIPTION is written as an empty line.
Rscript -e '
desc = system.file(package="h2o", lib.loc="./h2o/r-h2o", "DESCRIPTION")
meta = read.dcf(desc, fields=c("Version","Revision"))
vals = c(meta)
vals[is.na(vals)] = ""
out = file.path("h2o", toupper(colnames(meta)))
for (i in seq_along(vals)) writeLines(vals[i], out[i])
'
Loading

0 comments on commit 00c907b

Please sign in to comment.