From 03a27bb2a5c8479d5a44074d4cd93a807bfc5878 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 28 Oct 2019 14:27:29 +0530 Subject: [PATCH] rework report to single page with more tabsets --- groupby.Rmd | 155 ------------------------------ history.Rmd | 14 +-- index.Rmd | 164 ++++++++++++++++++++++++++------ join.Rmd | 151 ----------------------------- rmarkdown_child/environment.Rmd | 10 -- rmarkdown_child/init.Rmd | 21 ---- rmarkdown_child/status.Rmd | 6 -- rmarkdown_child/timetaken.Rmd | 15 --- run.sh | 4 +- tech.Rmd | 15 +-- 10 files changed, 151 insertions(+), 404 deletions(-) delete mode 100644 groupby.Rmd delete mode 100644 join.Rmd delete mode 100644 rmarkdown_child/environment.Rmd delete mode 100644 rmarkdown_child/init.Rmd delete mode 100644 rmarkdown_child/status.Rmd delete mode 100644 rmarkdown_child/timetaken.Rmd diff --git a/groupby.Rmd b/groupby.Rmd deleted file mode 100644 index b78d66fd..00000000 --- a/groupby.Rmd +++ /dev/null @@ -1,155 +0,0 @@ ---- -title: "Aggregation benchmark" -output: - html_document: - self_contained: no - includes: - in_header: ga.html ---- - -This page presents results of [h2oai.github.io/db-benchmark](./index.html) _groupby_ task benchmark for various datasizes and various data characteristis (cardinality, percentage of missing values, pre-sorted input). There are 10 different questions run for each input data, questions are categorized into two groups. _Basic_ questions refers to set of 5 questions designed by [Matt Dowle](https://twitter.com/MattDowle) (creator of [data.table](https://github.com/Rdatatable/data.table)) in 2014 [here](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping). _Advanced_ questions are 5 new questions meant to cover more complex queries, which are also less obvious to optimize. 
- -```{r opts, echo=FALSE} -knitr::opts_chunk$set(echo=FALSE, cache=FALSE) -``` - -```{r render} -report_name = "groupby" -# Rscript -e 'rmarkdown::render("groupby.Rmd", output_dir="public")' # has to be output_dir='public' as there is hardcode in benchplot for that path -``` - -```{r init, child="rmarkdown_child/init.Rmd"} -``` - -```{r links_plots} -link = function(data_name, q_group, report_name) { - fnam = sprintf("%s.%s.png", data_name, q_group) - path = file.path(report_name, "plots") - sprintf("[%s](%s)", fnam, file.path(path, fnam)) -} -``` - -## Groupby {.tabset .tabset-fade .tabset-pills} - -```{r filter_task} -dt_task = lld[task==report_name] -dt_task = dt_task[substr(data,1,2)=="G1"] ## groupby specific -by_data = function(dt, .in_rows, .task) { - dt = dt[in_rows==as.character(.in_rows)] - if (!nrow(dt)) return(invisible(NULL)) - wide = dcast(dt, data+in_rows+as.integer(as.character(k))+na+sorted+question ~ solution, value.var="time_sec_1") - d = groupingsets(wide[!is.na(question)], by=c("data","in_rows","k","na","sorted","question"), j=lapply(.SD, sum), id=TRUE, sets=list(c("data","in_rows","k","na","sorted","question"), character(0))) - setorderv(d, c("data","question"), na.last=TRUE) - setcolorder(d, c("data","in_rows","k","na","sorted","question")) - d[grouping==63L, c("in_rows"):=list(.in_rows)] - d[, c("grouping","data"):=NULL] - setnames(d, c("in_rows","k","na","sorted"), c("rows","q1_grp.size","NA_pct","pre_sorted")) - kk(d) -} -``` - -Below timings are presented for a single dataset case having random order, no NAs (missing values) and particular cardinality factor (group size question 1 `k=100`). To see timings for other cases scroll down to full timings table. If a solution is missing on particular data size timings table refer to benchplot for a reason and check its speed on smaller data size tab. 
- -```{r o_task_plot, message=FALSE} -path = file.path("public", report_name, "plots") -for (in_rows in c("1e7","1e8","1e9")) { - for (data_name in paste("G1", in_rows, c("1e2_0_0","1e1_0_0","2e0_0_0","1e2_0_1"), sep="_")) { - for (q_group in c("basic","advanced")) { - benchplot(as.numeric(in_rows), task=report_name, data=data_name, timings=dt_task[question_group==q_group], code=groupby.code, exceptions=groupby.exceptions, colors=solution.colors, fnam=paste(data_name, q_group, "png", sep="."), path=path, .interactive=FALSE) - } - } -} -if (dev<-FALSE) { - in_rows = "1e9" - data_name = "G1_1e9_2e0_0_0" - q_group = "advanced" - benchplot(as.numeric(in_rows), task=report_name, data=data_name, timings=dt_task[question_group==q_group], code=groupby.code, exceptions=groupby.exceptions, colors=solution.colors, fnam=paste(data_name, q_group, "png", sep="."), path=path, .interactive=TRUE) -} -``` - -### 0.5 GB - -#### **Set of basic questions** - -![](public/groupby/plots/G1_1e7_1e2_0_0.basic.png) - ---- - -#### **Set of advanced questions** - -![](public/groupby/plots/G1_1e7_1e2_0_0.advanced.png) - ---- - -#### **Details table** - -Plots of all cases can be found at `r dt_task[in_rows=="1e7", .(q_grp_links=paste(link(unique(data), q_group=question_group, report_name=report_name), collapse=", ")), by=question_group][, paste(q_grp_links, collapse=", ")]`. Below first run timings. - -```{r o_task_1e7_table} -by_data(dt_task, "1e7", report_name) -``` - ---- - -### 5 GB - -#### **Set of basic questions** - -![](public/groupby/plots/G1_1e8_1e2_0_0.basic.png) - ---- - -#### **Set of advanced questions** - -![](public/groupby/plots/G1_1e8_1e2_0_0.advanced.png) - ---- - -#### **Details table** - -Plots of all cases can be found at `r dt_task[in_rows=="1e8", .(q_grp_links=paste(link(unique(data), q_group=question_group, report_name=report_name), collapse=", ")), by=question_group][, paste(q_grp_links, collapse=", ")]`. Below first run timings. 
- -```{r o_task_1e8_table} -by_data(dt_task, "1e8", report_name) -``` - ---- - -### 50 GB {.active} - -#### **Set of basic questions** - -![](public/groupby/plots/G1_1e9_1e2_0_0.basic.png) - ---- - -#### **Set of advanced questions** - -![](public/groupby/plots/G1_1e9_1e2_0_0.advanced.png) - ---- - -#### **Details table** - -Plots of all cases can be found at `r dt_task[in_rows=="1e9", .(q_grp_links=paste(link(unique(data), q_group=question_group, report_name=report_name), collapse=", ")), by=question_group][, paste(q_grp_links, collapse=", ")]`. Below first run timings. - -```{r o_task_1e9_table} -by_data(dt_task, "1e9", report_name) -``` - ---- - -## Notes - -- ClickHouse queries were made against `mergetree` table engine, see [#91](https://github.com/h2oai/db-benchmark/issues/91) for details. - -```{r environment, child="rmarkdown_child/environment.Rmd"} -``` - ------- - -```{r timetaken, child="rmarkdown_child/timetaken.Rmd"} -``` - -```{r status, child="rmarkdown_child/status.Rmd"} -``` diff --git a/history.Rmd b/history.Rmd index 2018fc23..e12d71fd 100644 --- a/history.Rmd +++ b/history.Rmd @@ -7,16 +7,14 @@ output: includes: in_header: ga.html --- +```{r render, include=FALSE} +# Rscript -e 'rmarkdown::render("history.Rmd", output_dir="public")' # has to be output_dir='public' as there is hardcode in benchplot for that path +``` ```{r opts, echo=FALSE} knitr::opts_chunk$set(echo=FALSE, cache=FALSE) ``` -```{r render} -report_name = "history" -# Rscript -e 'rmarkdown::render("history.Rmd", output_dir="public")' # has to be output_dir='public' as there is hardcode in benchplot for that path -``` - ```{r init} library(lattice) source("report.R") @@ -49,6 +47,10 @@ p = sapply(setNames(nm=as.character(unique(ld$solution))), simplify = FALSE, fun sapply(seq_along(p), function(i) print(p[[i]], split=c(1, i, 1, length(p)), more=i!=length(p))) -> nul ``` +------ + +Report was generated on: `r format(Sys.time(), usetz=TRUE)`. 
-```{r status, child="rmarkdown_child/status.Rmd"} +```{r status_set_success} +cat("history\n", file=get_report_status_file(), append=TRUE) ``` diff --git a/index.Rmd b/index.Rmd index e9c59e0f..fd8f2feb 100644 --- a/index.Rmd +++ b/index.Rmd @@ -6,52 +6,144 @@ output: includes: in_header: ga.html --- +```{r render, include=FALSE} +# Rscript -e 'rmarkdown::render("index.Rmd", output_dir="public")' # has to be output_dir='public' as there is hardcode in benchplot for that path +``` This page aims to benchmark various database-like tools popular in open-source data science. It runs regularly against very latest versions of these packages and automatically updates. We provide this as a service to both developers of these packages and to users. -We also include the syntax being timed alongside the timing. This way you can immediately see whether you are doing these tasks or not, and if the timing differences matter to you or not. A 10x difference may be irrelevant if that's just 1s vs 0.1s on your data size. The intention is that you click the tab for the size of data you have. Use this page to naviagates to _task_ reports, as of now we have _groupby_ and _join_ tasks. +We also include the syntax being timed alongside the timing. This way you can immediately see whether you are doing these tasks or not, and if the timing differences matter to you or not. A 10x difference may be irrelevant if that's just 1s vs 0.1s on your data size. The intention is that you click the tab for the size of data you have. 
```{r opts, echo=FALSE} knitr::opts_chunk$set(echo=FALSE, cache=FALSE) ``` -```{r render} -report_name = "index" -# Rscript -e 'rmarkdown::render("index.Rmd", output_dir="public")' # has to be output_dir='public' as there is hardcode in benchplot for that path +```{r helpers} +loop_benchplot = function(dt_task, report_name, code, exceptions, colors, data_namev, q_groupv) { + path = file.path("public", report_name, "plots") + for (data_name in data_namev) { + in_rows = strsplit(data_name, "_", fixed=TRUE)[[1L]][2L] + for (q_group in q_groupv) { + benchplot(as.numeric(in_rows), task=report_name, data=data_name, timings=dt_task[question_group==q_group], code=code, exceptions=exceptions, colors=colors, fnam=paste(data_name, q_group, "png", sep="."), path=path, .interactive=FALSE) + } + } +} +link = function(data_name, q_group, report_name) { + fnam = sprintf("%s.%s.png", data_name, q_group) + path = file.path(report_name, "plots") + sprintf("[%s](%s)", fnam, file.path(path, fnam)) +} +hours_took = function(lld) { + lld_script_time = lld[, .(n_script_time_sec=uniqueN(script_time_sec), script_time_sec=unique(script_time_sec)), .(solution, task, data)] + if (nrow(lld_script_time[n_script_time_sec>1L])) + stop("There are multiple different 'script_time_sec' for single solution+task+data on report 'index'") + lld_script_time[, round(sum(script_time_sec, na.rm=TRUE)/60/60, 1)] +} +``` + +```{r init} +source("report.R", chdir=TRUE) +source("helpers.R", chdir=TRUE) +source("report-code.R", chdir=TRUE) +source("benchplot.R", chdir=TRUE) +ld = time_logs() +lld = ld[script_recent==TRUE] +lld_nodename = as.character(unique(lld$nodename)) +if (length(lld_nodename)>1L) + stop(sprintf("There are multiple different 'nodename' to be presented on single report '%s'", "index")) +lld_unfinished = lld[is.na(script_time_sec)] +if (nrow(lld_unfinished)) { + warning(sprintf("Missing solution finish timestamp in logs.csv for '%s' (still running or launcher script killed): %s",
paste(unique(lld_unfinished$task), collapse=","), paste(unique(lld_unfinished$solution), collapse=", "))) +} ``` -```{r init, child="rmarkdown_child/init.Rmd"} +```{r report_groupby} +in_rows = c("1e7","1e8","1e9") +k_na_sort = c("1e2_0_0","1e1_0_0","2e0_0_0","1e2_0_1") +data_name = paste("G1", paste(rep(in_rows, each=length(k_na_sort)), k_na_sort, sep="_"), sep="_") +dt_groupby = lld[task=="groupby"][substr(data,1,2)=="G1"] +loop_benchplot(dt_groupby, report_name="groupby", code=groupby.code, exceptions=groupby.exceptions, colors=solution.colors, data_namev=data_name, q_groupv=c("basic","advanced")) +``` + +```{r report_join} +in_rows = c("1e7","1e8") +k_na_sort = c("NA_0_0") +data_name = paste("J1", paste(rep(in_rows, each=length(k_na_sort)), k_na_sort, sep="_"), sep="_") +dt_join = lld[task=="join"] +loop_benchplot(dt_join, report_name="join", code=join.code, exceptions=join.exceptions, colors=solution.colors, data_namev=data_name, q_groupv=c("basic")) ``` ## Task {.tabset .tabset-fade .tabset-pills} -Plot below presents chosen task, single input data size and _basic_ set of questions. Follow the link for detailed reports. +### groupby {.tabset .tabset-fade .tabset-pills} -### groupby {.active} +Below timings are presented for a single dataset case having random order, no NAs (missing values) and particular cardinality factor (group size question 1 `k=100`). To see timings for other cases click on the links below. If a solution is missing on particular data size timings table refer to benchplot for reasons and check its speed on smaller data size tab. -Full _groupby_ report available at [h2oai.github.io/db-benchmark/groupby.html](./groupby.html).
+#### 0.5 GB {.tabset .tabset-fade .tabset-pills} -```{r o_groupby_plot} -dt_task = lld[task=="groupby" & question_group=="basic"] -fn = "1e9_1e2_0_0" -fnam = paste0("groupby.",fn,".png") -unlink(file.path("public",report_name,"plots", fnam)) -benchplot(1e9, task="groupby", data=paste0("G1_",fn), timings=dt_task, code=groupby.code, exceptions=groupby.exceptions, colors=solution.colors, fnam=fnam, path=file.path("public",report_name,"plots")) -``` -![](public/index/plots/groupby.1e9_1e2_0_0.png) +All data cases can be found at `r dt_groupby[in_rows=="1e7", .(q_grp_links=paste(link(unique(data), q_group=question_group, report_name="groupby"), collapse=", ")), by=question_group][, paste(q_grp_links, collapse=", ")]`. -### join +##### basic {.active} -Full _join_ report available at [h2oai.github.io/db-benchmark/join.html](./join.html). +![](public/groupby/plots/G1_1e7_1e2_0_0.basic.png) -```{r o_join_plot} -dt_task = lld[task=="join" & question_group=="basic"] -fn = "1e8_NA_0_0" -fnam = paste0("join.",fn,".png") -unlink(file.path("public",report_name,"plots", fnam)) -benchplot(1e8, task="join", data=paste0("J1_",fn), timings=dt_task, code=join.code, exceptions=join.exceptions, colors=solution.colors, fnam=fnam, path=file.path("public",report_name,"plots")) -``` -![](public/index/plots/join.1e8_NA_0_0.png) +##### advanced + +![](public/groupby/plots/G1_1e7_1e2_0_0.advanced.png) + +#### 5 GB {.tabset .tabset-fade .tabset-pills} + +All data cases can be found at `r dt_groupby[in_rows=="1e8", .(q_grp_links=paste(link(unique(data), q_group=question_group, report_name="groupby"), collapse=", ")), by=question_group][, paste(q_grp_links, collapse=", ")]`. 
+ +##### basic {.active} + +![](public/groupby/plots/G1_1e8_1e2_0_0.basic.png) + +##### advanced + +![](public/groupby/plots/G1_1e8_1e2_0_0.advanced.png) + +#### 50 GB {.active .tabset .tabset-fade .tabset-pills} + +All data cases can be found at `r dt_groupby[in_rows=="1e9", .(q_grp_links=paste(link(unique(data), q_group=question_group, report_name="groupby"), collapse=", ")), by=question_group][, paste(q_grp_links, collapse=", ")]`. + +##### basic {.active} + +![](public/groupby/plots/G1_1e9_1e2_0_0.basic.png) + +##### advanced + +![](public/groupby/plots/G1_1e9_1e2_0_0.advanced.png) + +### join {.tabset .tabset-fade .tabset-pills} + +Below timings are presented for datasets having random order, no NAs (missing values). Data size on tabs corresponds to the LHS dataset of join, while RHS datasets are of the following sizes: _small_ (LHS/1e6), _medium_ (LHS/1e3), _big_ (LHS). + +#### 0.6 GB {.tabset .tabset-fade .tabset-pills} + +##### basic {.active} + +![](public/join/plots/J1_1e7_NA_0_0.basic.png) + + + +#### 6 GB {.active .tabset .tabset-fade .tabset-pills} + +##### basic {.active} + +![](public/join/plots/J1_1e8_NA_0_0.basic.png) + + + +--- ## Notes @@ -59,8 +151,16 @@ benchplot(1e8, task="join", data=paste0("J1_",fn), timings=dt_task, code=join.co - Data used to generate plots on this website can be obtained from [time.csv](./time.csv) (together with [logs.csv](./logs.csv)). See [report.R](https://github.com/h2oai/db-benchmark/blob/master/report.R) for quick introduction how to work with those. - We ensure that calculations are not deferred by solution. - We also tested that answers produced from different solutions match each others, for details see [answers-validation.R](https://github.com/h2oai/db-benchmark/blob/master/answers-validation.R). +- ClickHouse queries were made against `mergetree` table engine, see [#91](https://github.com/h2oai/db-benchmark/issues/91) for details. 
+ +## Environment configuration + +- R 3.6.0 +- python 3.6 +- Julia 1.0.2 -```{r environment, child="rmarkdown_child/environment.Rmd"} +```{r environment_hardware} +as.data.table(na.omit(fread("nodenames.csv")[lld_nodename, on="nodename", t(.SD)]), keep.rownames=TRUE)[rn!="nodename", .(Component=rn, Value=V1)][, kk(.SD)] ``` ------ @@ -69,14 +169,16 @@ benchplot(1e8, task="join", data=paste0("J1_",fn), timings=dt_task, code=join.co We limit the scope to what can be achieved on a single machine. Laptop size memory (8GB) and server size memory (250GB) are in scope. Out-of-memory using local disk such as NVMe is in scope. Multi-node systems such as Spark running in single machine mode is in scope, too. Machines are getting bigger: EC2 X1 has 2TB RAM and 1TB NVMe disk is under $300. If you can perform the task on a single machine, then perhaps you should. To our knowledge, nobody has yet compared this software in this way and published results too. -## Why this project +## Why db-benchmark? Because we have been asked many times to do so, the first task and initial motivation for this page, was to update the benchmark designed and run by [Matt Dowle](https://twitter.com/MattDowle) (creator of [data.table](https://github.com/Rdatatable/data.table)) in 2014 [here](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping). The methodology and reproducible code can be obtained there. Exact code of this report and benchmark script can be found at [h2oai/db-benchmark](https://github.com/h2oai/db-benchmark) created by [Jan Gorecki](https://github.com/jangorecki) funded by [H2O.ai](https://www.h2o.ai). In case of questions/feedback, feel free to file an issue there. ------ -```{r timetaken, child="rmarkdown_child/timetaken.Rmd"} -``` +Benchmark run took around `r hours_took(lld)` hours. + +Report was generated on: `r format(Sys.time(), usetz=TRUE)`. 
-```{r status, child="rmarkdown_child/status.Rmd"} +```{r status_set_success} +cat("index\n", file=get_report_status_file(), append=TRUE) ``` diff --git a/join.Rmd b/join.Rmd deleted file mode 100644 index ea57d930..00000000 --- a/join.Rmd +++ /dev/null @@ -1,151 +0,0 @@ ---- -title: "Join benchmark" -output: - html_document: - self_contained: no - includes: - in_header: ga.html ---- - -This page presents results of [h2oai.github.io/db-benchmark](./index.html) _join_ task benchmark for various datasizes and various data characteristis. Data size on tabs corresponds to the LHS dataset of join, while RHS datasets are of the following sizes: _small_ (LHS/1e6), _medium_ (LHS/1e3), _big_ (LHS). As of now only 5 _basic_ questions has been implemented, and only 1e7 (0.6 GB) and 1e8 (6 GB) data sizes. 5 _advanced_ questions will follow, as well as 1e9 (60 GB) data size. - -```{r opts, echo=FALSE} -knitr::opts_chunk$set(echo=FALSE, cache=FALSE) -``` - -```{r render} -report_name = "join" -# Rscript -e 'rmarkdown::render("join.Rmd", output_dir="public")' # has to be output_dir='public' as there is hardcode in benchplot for that path -``` - -```{r init, child="rmarkdown_child/init.Rmd"} -``` - -```{r links_plots} -link = function(data_name, q_group, report_name) { - fnam = sprintf("%s.%s.png", data_name, q_group) - path = file.path(report_name, "plots") - sprintf("[%s](%s)", fnam, file.path(path, fnam)) -} -``` - -## Join {.tabset .tabset-fade .tabset-pills} - -```{r filter_task} -dt_task = lld[task=="join"] -by_data = function(dt, .in_rows, .task) { - dt = dt[in_rows==as.character(.in_rows)] - if (!nrow(dt)) return(invisible(NULL)) - wide = dcast(dt, data+in_rows+as.integer(as.character(k))+na+sorted+question ~ solution, value.var="time_sec_1") - d = groupingsets(wide[!is.na(question)], by=c("data","in_rows","k","na","sorted","question"), j=lapply(.SD, sum), id=TRUE, sets=list(c("data","in_rows","k","na","sorted","question"), character(0))) - setorderv(d, 
c("data","question"), na.last=TRUE) - setcolorder(d, c("data","in_rows","k","na","sorted","question")) - d[grouping==63L, c("in_rows"):=list(.in_rows)] - d[, c("grouping","data"):=NULL] - d[, "k" := NULL] ## ignored for join - setnames(d, c("in_rows","na","sorted"), c("rows","NA_pct","pre_sorted")) - kk(d) -} -``` - -Below timings are presented for datasets having random order, no NAs (missing values). - -```{r o_task_plot, message=FALSE} -path = file.path("public", report_name, "plots") -for (in_rows in c("1e7","1e8","1e9")) { - for (data_name in paste("J1", in_rows, c("NA_0_0"), sep="_")) { # single data_name within in_rows, so far - for (q_group in c("basic","advanced")) { - benchplot(as.numeric(in_rows), task=report_name, data=data_name, timings=dt_task[question_group==q_group], code=join.code, exceptions=join.exceptions, colors=solution.colors, fnam=paste(data_name, q_group, "png", sep="."), path=path, .interactive=FALSE) - } - } -} -if (dev<-FALSE) { - in_rows = "1e7" - data_name = "J1_1e7_NA_0_0" - q_group = "basic" - benchplot(as.numeric(in_rows), task=report_name, data=data_name, timings=dt_task[question_group==q_group], code=join.code, exceptions=join.exceptions, colors=solution.colors, fnam=paste(data_name, q_group, "png", sep="."), path=path, .interactive=TRUE) -} -``` - -### 0.6 GB - -#### **Set of basic questions** - -![](public/join/plots/J1_1e7_NA_0_0.basic.png) - ---- - -#### **Set of advanced questions** - -![](public/join/plots/J1_1e7_NA_0_0.advanced.png) - ---- - -#### **Details table** - -Plots of all cases can be found at `r dt_task[in_rows=="1e7", .(q_grp_links=paste(link(unique(data), q_group=question_group, report_name=report_name), collapse=", ")), by=question_group][, paste(q_grp_links, collapse=", ")]`. Below first run timings. 
- -```{r o_task_1e7_table} -by_data(dt_task, "1e7", report_name) -``` - ---- - -### 6 GB {.active} - -#### **Set of basic questions** - -![](public/join/plots/J1_1e8_NA_0_0.basic.png) - ---- - -#### **Set of advanced questions** - -![](public/join/plots/J1_1e8_NA_0_0.advanced.png) - ---- - -#### **Details table** - -Plots of all cases can be found at `r dt_task[in_rows=="1e8", .(q_grp_links=paste(link(unique(data), q_group=question_group, report_name=report_name), collapse=", ")), by=question_group][, paste(q_grp_links, collapse=", ")]`. Below first run timings. - -```{r o_task_1e8_table} -by_data(dt_task, "1e8", report_name) -``` - ---- - -### 60 GB - -#### **Set of basic questions** - -![](public/join/plots/J1_1e9_NA_0_0.basic.png) - ---- - -#### **Set of advanced questions** - -![](public/join/plots/J1_1e9_NA_0_0.advanced.png) - ---- - -#### **Details table** - -Plots of all cases can be found at `r dt_task[in_rows=="1e9", .(q_grp_links=paste(link(unique(data), q_group=question_group, report_name=report_name), collapse=", ")), by=question_group][, paste(q_grp_links, collapse=", ")]`. Below first run timings. 
- -```{r o_task_1e9_table} -by_data(dt_task, "1e9", report_name) -``` - ---- - -```{r environment, child="rmarkdown_child/environment.Rmd"} -``` - ------- - -```{r timetaken, child="rmarkdown_child/timetaken.Rmd"} -``` - -```{r status, child="rmarkdown_child/status.Rmd"} -``` diff --git a/rmarkdown_child/environment.Rmd b/rmarkdown_child/environment.Rmd deleted file mode 100644 index 2ad16073..00000000 --- a/rmarkdown_child/environment.Rmd +++ /dev/null @@ -1,10 +0,0 @@ - -## Environment configuration - -- R 3.6.0 -- python 3.6 -- Julia 1.0.2 - -```{r environment_hardware} -as.data.table(na.omit(fread("../nodenames.csv")[lld_nodename, on="nodename", t(.SD)]), keep.rownames=TRUE)[rn!="nodename", .(Component=rn, Value=V1)][, kk(.SD)] -``` diff --git a/rmarkdown_child/init.Rmd b/rmarkdown_child/init.Rmd deleted file mode 100644 index 3665154f..00000000 --- a/rmarkdown_child/init.Rmd +++ /dev/null @@ -1,21 +0,0 @@ - -```{r init_source_data} -# setwd("rmarkdown_child") -source("../report.R", chdir=TRUE) -source("../helpers.R", chdir=TRUE) -source("../report-code.R", chdir=TRUE) -source("../benchplot.R", chdir=TRUE) -ld = time_logs(path="..") -lld = ld[script_recent==TRUE] -# setwd("..") -``` - -```{r init_validation} -lld_nodename = as.character(unique(lld$nodename)) -if (length(lld_nodename)>1L) - stop(sprintf("There are multiple different 'nodename' to be presented on single report '%s'", report_name)) -lld_unfinished = lld[is.na(script_time_sec)] -if (nrow(lld_unfinished)) { - warning(sprintf("Missing solution finish timestamp in logs.csv for '%s' (still running or launcher script killed): %s", paste(unique(lld_unfinished$task), collapse=","), paste(unique(lld_unfinished$solution), collapse=", "))) -} -``` diff --git a/rmarkdown_child/status.Rmd b/rmarkdown_child/status.Rmd deleted file mode 100644 index a4fb4a23..00000000 --- a/rmarkdown_child/status.Rmd +++ /dev/null @@ -1,6 +0,0 @@ - -Report was generated on: `r format(Sys.time(), usetz=TRUE)`. 
- -```{r status_set_success} -cat(paste0(report_name,"\n"), file=get_report_status_file(path=".."), append=TRUE) -``` diff --git a/rmarkdown_child/timetaken.Rmd b/rmarkdown_child/timetaken.Rmd deleted file mode 100644 index 85d2d450..00000000 --- a/rmarkdown_child/timetaken.Rmd +++ /dev/null @@ -1,15 +0,0 @@ - -```{r timetaken_text_items} -lld_script_time = lld[, .(n_script_time_sec=uniqueN(script_time_sec), script_time_sec=unique(script_time_sec)), .(solution, task, data)] -if (nrow(lld_script_time[n_script_time_sec>1L])) - stop(sprintf("There are multiple different 'script_time_sec' for single solution+task+data on report '%s'", report_name)) -if (report_name=="index") { - what_bench = "Benchmark" - hours_took = lld_script_time[, round(sum(script_time_sec, na.rm=TRUE)/60/60, 1)] -} else { - what_bench = paste(tools::toTitleCase(report_name), "benchmark") - hours_took = lld_script_time[task==report_name, round(sum(script_time_sec, na.rm=TRUE)/60/60, 1)] -} -``` - -`r what_bench` run took around `r hours_took` hours. 
diff --git a/run.sh b/run.sh index a16ca7c3..9cffefa3 100755 --- a/run.sh +++ b/run.sh @@ -60,8 +60,6 @@ if [[ -f ./stop ]]; then echo "# Benchmark run $BATCH has been interrupted after rm -rf ./public rm -f ./report-done $DO_REPORT && Rscript -e 'rmarkdown::render("index.Rmd", output_dir="public")' > ./out/rmarkdown_index.out 2>&1 && echo "# Benchmark index report produced" -$DO_REPORT && Rscript -e 'rmarkdown::render("groupby.Rmd", output_dir="public")' > ./out/rmarkdown_groupby.out 2>&1 && echo "# Benchmark groupby report produced" -$DO_REPORT && Rscript -e 'rmarkdown::render("join.Rmd", output_dir="public")' > ./out/rmarkdown_join.out 2>&1 && echo "# Benchmark join report produced" $DO_REPORT && Rscript -e 'rmarkdown::render("history.Rmd", output_dir="public")' > ./out/rmarkdown_history.out 2>&1 && echo "# Benchmark history report produced" $DO_REPORT && Rscript -e 'rmarkdown::render("tech.Rmd", output_dir="public")' > ./out/rmarkdown_tech.out 2>&1 && echo "# Benchmark tech report produced" @@ -69,7 +67,7 @@ $DO_REPORT && Rscript -e 'rmarkdown::render("tech.Rmd", output_dir="public")' > rm -rf ./db-benchmark.gh-pages $DO_REPORT && $DO_PUBLISH \ && [ -f ./report-done ] \ - && [ $(wc -l report-done | awk '{print $1}') -eq 5 ] \ + && [ $(wc -l report-done | awk '{print $1}') -eq 3 ] \ && [ -f ./token ] \ && ((./publish.sh && echo "# Benchmark results has been published") || echo "# Benchmark publish script failed") diff --git a/tech.Rmd b/tech.Rmd index 21f243b1..8ffeb824 100644 --- a/tech.Rmd +++ b/tech.Rmd @@ -7,16 +7,14 @@ output: includes: in_header: ga.html --- +```{r render, include=FALSE} +# Rscript -e 'rmarkdown::render("tech.Rmd", output_dir="public")' # has to be output_dir='public' as there is hardcode in benchplot for that path +``` ```{r opts, echo=FALSE} knitr::opts_chunk$set(echo=FALSE, cache=FALSE) ``` -```{r render} -report_name = "tech" -# Rscript -e 'rmarkdown::render("tech.Rmd", output_dir="public")' # has to be output_dir='public' as there is 
hardcode in benchplot for that path -``` - ```{r init} library(lattice) source("report.R") @@ -82,5 +80,10 @@ p = sapply(setNames(nm=as.character(unique(ld$solution))), simplify = FALSE, fun sapply(seq_along(p), function(i) print(p[[i]], split=c(1, i, 1, length(p)), more=i!=length(p))) -> nul ``` -```{r status, child="rmarkdown_child/status.Rmd"} +------ + +Report was generated on: `r format(Sys.time(), usetz=TRUE)`. + +```{r status_set_success} +cat("tech\n", file=get_report_status_file(), append=TRUE) ```