Commit 6eec311
multiple report status file
jangorecki committed Jan 2, 2019
1 parent e98d67c commit 6eec311
Showing 3 changed files with 19 additions and 47 deletions.
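This commit replaces the single, empty `report-success` marker file with a shared status file to which each report appends one line per successfully rendered task. The helper script `report.R` that both Rmd files now source is not part of this diff; below is a minimal, hypothetical sketch of the accessor it presumably provides, assuming the `report-done` file name that run.sh manipulates further down:

```r
# Hypothetical sketch -- report.R is not shown in this commit.
# Both index.Rmd and tech.Rmd source it and call get_report_status_file(),
# and run.sh deletes ./report-done before rendering, so presumably:
get_report_status_file = function(path = getwd()) {
  file.path(path, "report-done")
}
```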
21 changes: 4 additions & 17 deletions index.Rmd
@@ -18,23 +18,10 @@ Because we have been asked many times to do so, the first task and initial motiv
```{r load_deps, include=FALSE}
# rm -rf public && Rscript -e 'rmarkdown::render("index.Rmd", output_dir="public")' # has to be output_dir='public' as there is hardcode in benchplot for that path
knitr::opts_chunk$set(echo=FALSE, cache=FALSE)
-library(data.table)
+source("report.R")
+report_status_file = get_report_status_file()
source("helpers.R")
source("benchplot.R") # also creates 'code' for groupby
-fs = function(x) factor(x, levels=unique(x))
-kk = knitr::kable
-report_status_file = "report-success"
-if (file.exists(report_status_file)) file.remove(report_status_file)
```

-```{r exceptions, include=FALSE, eval=FALSE}
-# CURRENTLY NOT USED
-#exceptions = rbindlist(list(
-#  data.table(solution = "pandas", version="0.23.4", task = "groupby", in_rows = 1e9, data=c("G1_1e9_1e2_0_0"), comment = "lack of memory to read csv")#,
-#data.table(solution = "pandas", version="0.23.4", task = "join", in_rows = 1e9, data=c("X1e9_2c-Y1e9_2c"), comment = "lack of memory"),
-#data.table(solution = "dplyr", version="0.7.99.9000", task = "join", in_rows = 1e9, data=c("X1e9_2c-Y1e9_2c"), comment = "Cannot allocate memory"),
-#data.table(solution = "pydatatable", version="0.6.0", task = "join", in_rows = c(1e7,1e8,1e9), data=c("X1e7_2c-Y1e7_2c","X1e8_2c-Y1e8_2c","X1e9_2c-Y1e9_2c"), comment = "not yet implemented")
-#))
-```

```{r load_data, include=FALSE}
@@ -75,7 +62,7 @@ by_data = function(dt, .in_rows, .task) {
stop("no other task defined for decompose_dataname")
}
}
-wide = dt[run==1L, dcast(.SD, fs(data)+fs(question) ~ fs(solution), value.var="time_sec")]
+wide = dt[run==1L, dcast(.SD, ft(data)+ft(question) ~ ft(solution), value.var="time_sec")]
#d = rollup(wide, by=c("data","question"), j=lapply(.SD, sum), id=TRUE) # including sub totals
d = groupingsets(wide, by=c("data","question"), j=lapply(.SD, sum), id=TRUE, sets=list(c("data","question"), character(0)))
setorderv(d, "data", na.last = TRUE)
@@ -192,7 +179,7 @@ hours_took = paste0(hours_took, recent_lg[, .(sec_diff = timestamp[action=="fini
Benchmark run took around `r hours_took` hours.

```{r set_success_state, include=FALSE}
writeLines("", report_status_file)
cat("groupby\n", file=report_status_file, append=TRUE)
```

Report was generated on: `r format(Sys.time(), usetz=TRUE)`.
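The switch from `writeLines()` to `cat(..., append=TRUE)` means the status file now accumulates one line per rendered report instead of being overwritten with an empty marker. An illustration of the intended end state after both reports render, with the `report-done` file name assumed from run.sh below:

```r
# Illustration only: what report-done should contain after a full run.
cat("groupby\n", file = "report-done", append = TRUE)  # appended by index.Rmd
cat("tech\n",    file = "report-done", append = TRUE)  # appended by tech.Rmd
readLines("report-done")
# [1] "groupby" "tech"
```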
9 changes: 7 additions & 2 deletions run.sh
@@ -35,12 +35,17 @@ Rscript ./launcher.R
# publish report for all tasks
rm -f rmarkdown.out
rm -rf public
+rm -f report-done
Rscript -e 'rmarkdown::render("index.Rmd", output_dir="public")' > ./rmarkdown-index.out 2>&1 && echo "# Benchmark report produced"
Rscript -e 'rmarkdown::render("tech.Rmd", output_dir="public")' > ./rmarkdown-tech.out 2>&1 && echo "# Benchmark tech report produced"

-# publish benchmark, only if token file exists
+# publish benchmark, only if reports successfully generated (groupby, tech), token file exists
rm -rf db-benchmark.gh-pages
$DO_PUBLISH && [ -f ./report-success ] && [ -f ./token ] && ((./publish.sh && echo "# Benchmark results has been published") || echo "# Benchmark publish script failed")
$DO_PUBLISH \
&& [ -f ./report-done ] \
&& [ $(wc -l report-done | awk '{print $1}') -eq 2 ] \
&& [ -f ./token ] \
&& ((./publish.sh && echo "# Benchmark results has been published") || echo "# Benchmark publish script failed")
# remove run lock file
rm -f run.lock
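Because each report appends exactly one newline-terminated line, `wc -l` on report-done equals the number of reports that rendered successfully, and publishing is gated on that count being exactly 2 (groupby plus tech). A rough R restatement of the new shell condition, for illustration only:

```r
# Sketch of the run.sh publish gate, not part of the repository:
# publish only when both reports wrote their status line and a token exists.
ok_to_publish = file.exists("report-done") &&
  length(readLines("report-done")) == 2L &&
  file.exists("token")
```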
36 changes: 8 additions & 28 deletions tech.Rmd
@@ -8,10 +8,8 @@ output:

```{r init, echo=FALSE}
knitr::opts_chunk$set(echo=FALSE, cache=FALSE)
-library(data.table)
library(lattice)
-ft = function(x) factor(x, levels=unique(x))
-kk = knitr::kable
+source("report.R")
+report_status_file = get_report_status_file()
```

```{r loading}
@@ -20,30 +18,8 @@ l = fread("~/git/db-benchmark/logs.csv")[nzchar(solution)]
```

```{r cleaning}
-ftdata = function(x) {
-  k=ft(substr(x, 8, 10))
-  in_rows=ft(substr(x, 4, 6))
-  tsorted = function(x) {
-    ans = rep("unsorted", length(x))
-    ans[as.logical(x)] = "sorted"
-    ans
-  }
-  nasorted=ft(sprintf("%s%% NAs, %s", substr(x, 12, 12), tsorted(as.integer(substr(x, 14, 14)))))
-  list(k=k, in_rows=in_rows, nasorted=nasorted)
-}
-l = l[, c(list(nodename=ft(nodename), ibatch=as.integer(ft(as.character(batch))), solution=ft(solution),
-  action=ft(action), stderr=stderr, version=ft(version), git=ft(git), task=ft(task), data=ft(data), timestamp=timestamp, batch=batch),
-  ftdata(data))]
-l[, max_batch:=max(batch, na.rm=TRUE), c("nodename","solution","task","data")]
-l[, is_max_batch:=FALSE][batch==max_batch, is_max_batch:=TRUE][, max_batch:=NULL]
-d = d[, c(list(nodename=ft(nodename), ibatch=as.integer(ft(as.character(batch))), solution=ft(solution),
-  question=ft(question), run=run, version=ft(version), git=ft(git), task=ft(task), data=ft(data),
-  timestamp=timestamp, batch=batch, time_sec=time_sec),
-  ftdata(data))]
-d[, max_batch:=max(batch, na.rm=TRUE), c("nodename","solution","task","data")]
-d[, is_max_batch:=FALSE][batch==max_batch, is_max_batch:=TRUE][, max_batch:=NULL]
+l = clean_logs(l)
+d = clean_time(d)
ld = d[l[action=="start"], on=c("nodename","batch","solution","task","data","in_rows","k","nasorted"), nomatch=NA]
```
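The inline factor-recoding removed above is collapsed into `clean_logs()` and `clean_time()`, which are not shown in this diff but presumably live in report.R and wrap that same code. A sketch of `clean_logs` under that assumption, with `ft` and `ftdata` assumed to move into report.R alongside it:

```r
# Hypothetical: clean_logs presumably wraps the removed code above in report.R.
clean_logs = function(l) {
  l = l[, c(list(nodename=ft(nodename), ibatch=as.integer(ft(as.character(batch))),
                 solution=ft(solution), action=ft(action), stderr=stderr,
                 version=ft(version), git=ft(git), task=ft(task), data=ft(data),
                 timestamp=timestamp, batch=batch),
            ftdata(data))]
  # flag the most recent batch per node/solution/task/data combination
  l[, max_batch := max(batch, na.rm=TRUE), c("nodename","solution","task","data")]
  l[, is_max_batch := FALSE][batch == max_batch, is_max_batch := TRUE][, max_batch := NULL]
  l[]
}
```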

@@ -83,3 +59,7 @@ p = sapply(setNames(nm=as.character(unique(ll$solution))), simplify = FALSE, fun
)
sapply(seq_along(p), function(i) print(p[[i]], split=c(1, i, 1, length(p)), more=i!=length(p))) -> nul
```

+```{r set_success_state, include=FALSE}
+cat("tech\n", file=report_status_file, append=TRUE)
+```
