rolling functions #9

Closed
wants to merge 49 commits into from
5eb8512
rolling functions
jangorecki Apr 24, 2023
16f6d5d
Merge branch 'master' into rollfun
jangorecki Jun 17, 2023
67b7fbf
rollfun task scope polishing, DT implementation
jangorecki Jun 21, 2023
554eedc
comments in datagen
jangorecki Jun 21, 2023
ff52fd8
define udf in q10
jangorecki Jun 22, 2023
7e265ef
rollfun set up, dt and dplyr for now
jangorecki Jun 24, 2023
29ada82
dt to dplyr validation
jangorecki Jun 25, 2023
0df2fbd
Merge branch 'master' into rollfun
jangorecki Jul 2, 2023
b6dd51e
rollfun CI
jangorecki Jul 2, 2023
045b7d5
rollfun questions amended
jangorecki Jul 2, 2023
ec3ebc0
comments and readme
jangorecki Jul 2, 2023
6366a17
Merge branch 'master' into rollfun
jangorecki Jul 18, 2023
96a2eb0
run conf back to master
jangorecki Jul 18, 2023
7ef67c2
pandas rollfun
jangorecki Jul 20, 2023
572085b
done
jangorecki Jul 20, 2023
57e04b2
enable pandas rollfun config
jangorecki Jul 20, 2023
c82a962
use standard R repo for GH Actions
jangorecki Jul 20, 2023
860211d
fix temporary objects in pandass rollfun script
jangorecki Jul 20, 2023
6e6e6f9
frollmin not yet implemented in DT
jangorecki Jul 21, 2023
b91a66a
rollfun pandas script execute
jangorecki Jul 21, 2023
0c92c59
fix chk format for pandas rollfun
jangorecki Jul 23, 2023
b97cb34
duckdb-latest rollfun
jangorecki Jul 23, 2023
d92fd88
fix missing nr and nc
jangorecki Jul 23, 2023
01542d8
nicely close db connection again
jangorecki Jul 23, 2023
e904bdd
duckdb rollfun q8 q9 fix
jangorecki Jul 23, 2023
5e2618e
rollfun report
jangorecki Jul 24, 2023
088e89b
spark rollfun
jangorecki Jul 24, 2023
0da07f9
spark rollfun add missing window size
jangorecki Jul 24, 2023
6c4cd5f
CI smaller data
jangorecki Jul 24, 2023
253d5c6
Merge branch 'master' into rollfun
jangorecki Jul 24, 2023
a18c7c5
missing change to nrow CI for rollfun
jangorecki Jul 24, 2023
e064aab
spark rollfun disable median
jangorecki Jul 25, 2023
fac19b0
spark rollfun fixes
jangorecki Jul 25, 2023
657cee0
duckdb rollfun workaround for partial window
jangorecki Jul 29, 2023
3b60271
q10 update r2 rather than v1, duckdb rollfun
jangorecki Jul 30, 2023
af5991f
q6 update colnames, duckdb rollfun
jangorecki Jul 30, 2023
3eb8394
history report, rollfun
jangorecki Jul 30, 2023
241f593
spark rollfun workaround for partial window
jangorecki Jul 30, 2023
05e1660
rollfun validation post fixes
jangorecki Jul 31, 2023
edea136
dt rollfun q8 q9 needs exact
jangorecki Jul 31, 2023
3c114e5
DT q8 q9 uses algo=fast again as roundoff is way less than tolerance
jangorecki Aug 4, 2023
9b7e57c
readme rollfun
jangorecki Aug 4, 2023
076b573
cleanup dev code
jangorecki Aug 4, 2023
546eb28
report for rollfun
jangorecki Aug 5, 2023
5bb0922
workaround for using branch rather than master
jangorecki Aug 6, 2023
b104f3a
more pkgs required for dplyr script
jangorecki Aug 9, 2023
1fa7420
rollfun timeout exceptions after aws run
jangorecki Aug 12, 2023
03dac72
improve how pretty handles edge cases, seen in R1_1e6_NA_0_1 advanced
jangorecki Aug 12, 2023
e07185e
update DT for added rollmin and rollmedian
jangorecki Sep 26, 2023
4 changes: 4 additions & 0 deletions .github/workflows/regression.yml
@@ -55,6 +55,10 @@ jobs:
shell: bash
run: python3 _utils/prep_solutions.py --task=join && source path.env && TEST_RUN=true ./run.sh

- name: Run mini Rollfun benchmark
shell: bash
run: python3 _utils/prep_solutions.py --task=rollfun && source path.env && TEST_RUN=true ./run.sh

- name: Validate benchmark results
shell: bash
run: ./_utils/validate_no_errors.sh
1 change: 1 addition & 0 deletions README.md
@@ -11,6 +11,7 @@ Contribution and feedback are very welcome!
- [x] groupby
- [x] join
- [x] groupby2014
- [x] rollfun

# Solutions

98 changes: 96 additions & 2 deletions _benchplot/benchplot-dict.R
@@ -249,7 +249,7 @@ groupby.query.exceptions = {list(
"arrow" = list("Expression row_number() <= 2L not supported in Arrow; pulling data into R" = "max v1 - min v2 by id3", "Expression cor(v1, v2, ... is not supported in arrow; pulling data into R" = "regression v1 v2 by id2 id4"),
"duckdb" = list(),
"duckdb-latest" = list(),
"datafusion" = list(),
"datafusion" = list()
)}
groupby.data.exceptions = {list( # exceptions as of run 1575727624
"data.table" = {list(
@@ -468,7 +468,7 @@ join.data.exceptions = {list(
"J1_1e9_NA_5_0","J1_1e9_NA_0_1") # q1 r1
)},
"polars" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1"),
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1")
)},
"arrow" = {list(
"out of memory" = c("J1_1e9_NA_0_0","J1_1e9_NA_5_0","J1_1e9_NA_0_1", "J1_1e8_NA_0_0", "J1_1e8_NA_5_0", "J1_1e8_NA_0_1" )#,
@@ -529,3 +529,97 @@ groupby2014.data.exceptions = {list(
)}
)}
groupby2014.exceptions = task.exceptions(groupby2014.query.exceptions, groupby2014.data.exceptions)

# rollfun ----

rollfun_q_title_fun = function(x) {
stopifnot(c("question","iquestion","out_rows","out_cols","in_rows") %in% names(x),
uniqueN(x, by="iquestion")==nrow(x))
x[, sprintf("Query %s: \"%s\"",
iquestion, as.character(question)),
by = "iquestion"]$V1
}
rollfun.syntax.dict = {list(
"data.table" = {c(
"mean" = "frollmean(x$v1, w)",
"window small" = "frollmean(x$v1, wsmall)",
"window big" = "frollmean(x$v1, wbig)",
"min" = "frollmin(x$v1, w)",
"median" = "frollmedian(x$v1, w)",
"multiroll" = "frollmean(list(x$v1, x$v2), c(w-50L, w+50L))",
"weighted" = "",
"uneven dense" = "frollmean(x$v1, frolladapt(x$id2, w), adaptive=TRUE)",
"uneven sparse" = "frollmean(x$v1, frolladapt(x$id3, w), adaptive=TRUE)",
"regression" = ""
)},
"dplyr" = {c(
"mean" = "slide_mean(x$v1, before=w-1L, complete=TRUE)",
"window small" = "slide_mean(x$v1, before=wsmall-1L, complete=TRUE)",
"window big" = "slide_mean(x$v1, before=wbig-1L, complete=TRUE)",
"min" = "slide_min(x$v1, before=w-1L, complete=TRUE)",
"median" = "",
"multiroll" = "list(slide_mean(x$v1, before=w-51L, complete=TRUE), slide_mean(x$v1, before=w+49L, complete=TRUE), slide_mean(x$v2, before=w-51L, complete=TRUE), slide_mean(x$v2, before=w+49L, complete=TRUE))",
"weighted" = "",
"uneven dense" = "slide_index_mean(x$v1, i=x$id2, before=w-1L, complete=TRUE)",
"uneven sparse" = "slide_index_mean(x$v1, i=x$id3, before=w-1L, complete=TRUE)",
"regression" = ""
)},
"pandas" = {c(
"mean" = "x['v1'].rolling(w).mean()",
"window small" = "x['v1'].rolling(wsmall).mean()",
"window big" = "x['v1'].rolling(wbig).mean()",
"min" = "x['v1'].rolling(w).min()",
"median" = "x['v1'].rolling(w).median()",
"multiroll" = "pd.concat([x[['v1','v2']].rolling(w-50).mean().reset_index(drop=True), x[['v1','v2']].rolling(w+50).mean().reset_index(drop=True)], axis=1)",
"weighted" = "",
"uneven dense" = "{y}.rolling('{w}s').mean()",
"uneven sparse" = "{y}.rolling('{w}s').mean()",
"regression" = ""
)},
"spark" = {c(
"mean" = "select avg(v1) over (order by id1 rows between {w-1} preceding and current row) as v1 from x",
"window small" = "select avg(v1) over (order by id1 rows between {wsmall-1} preceding and current row) as v1 from x",
"window big" = "select avg(v1) over (order by id1 rows between {wbig-1} preceding and current row) as v1 from x",
"min" = "select min(v1) over (order by id1 rows between {w-1} preceding and current row) as v1 from x",
"median" = "select median(v1) over (order by id1 rows between {w-1} preceding and current row) as v1 from x",
"multiroll" = "select avg(v1) over small as v1_small, avg(v1) over big as v1_big, avg(v2) over small as v2_small, avg(v2) over big as v2_big from x window small as (order by id1 rows between {w-51} preceding and current row), big as (order by id1 rows between {w+49} preceding and current row)",
"weighted" = "",
"uneven dense" = "select avg(v1) over (order by id2 range between {w-1} preceding and current row) as v1 from x",
"uneven sparse" = "select avg(v1) over (order by id3 range between {w-1} preceding and current row) as v1 from x",
"regression" = ""
)},
"duckdb-latest" = {c(
"mean" = "SELECT avg(v1) OVER (ORDER BY id1 ROWS BETWEEN {w-1} PRECEDING AND CURRENT ROW) AS v1 FROM x",
"window small" = "SELECT avg(v1) OVER (ORDER BY id1 ROWS BETWEEN {wsmall-1} PRECEDING AND CURRENT ROW) AS v1 FROM x",
"window big" = "SELECT avg(v1) OVER (ORDER BY id1 ROWS BETWEEN {wbig-1} PRECEDING AND CURRENT ROW) AS v1 FROM x",
"min" = "SELECT min(v1) OVER (ORDER BY id1 ROWS BETWEEN {w-1} PRECEDING AND CURRENT ROW) AS v1 FROM x",
"median" = "SELECT median(v1) OVER (ORDER BY id1 ROWS BETWEEN {w-1} PRECEDING AND CURRENT ROW) AS v1 FROM x",
"multiroll" = "SELECT avg(v1) OVER small AS v1_small, avg(v1) OVER big AS v1_big, avg(v2) OVER small AS v2_small, avg(v2) OVER big AS v2_big FROM x WINDOW small AS (ORDER BY id1 ROWS BETWEEN w-51 PRECEDING AND CURRENT ROW), big AS (ORDER BY id1 ROWS BETWEEN w+49 PRECEDING AND CURRENT ROW)",
"weighted" = "",
"uneven dense" = "SELECT avg(v1) OVER (ORDER BY id2 RANGE BETWEEN {w-1} PRECEDING AND CURRENT ROW) AS v1 FROM x",
"uneven sparse" = "SELECT avg(v1) OVER (ORDER BY id3 RANGE BETWEEN {w-1} PRECEDING AND CURRENT ROW) AS v1 FROM x",
"regression" = "SELECT regr_r2(v2, v1) OVER (ORDER BY id1 ROWS BETWEEN {w-1} PRECEDING AND CURRENT ROW) AS r2 FROM x"
)}
)}
rollfun.query.exceptions = {list(
"data.table" = list("not yet implemented" = "weighted", "not yet implemented" = "regression"),
"dplyr" = list("not yet implemented" = "median", "not yet implemented" = "weighted", "not yet implemented" = "regression"),
"pandas" = list("not yet implemented" = "weighted", "not yet implemented" = "regression"),
"spark" = list("not yet implemented" = "median", "not yet implemented" = "weighted", "not yet implemented" = "regression"),
"duckdb-latest" = list("not yet implemented" = "weighted")
)}
rollfun.data.exceptions = {list(
"data.table" = {list(
)},
"dplyr" = {list(
)},
"pandas" = {list(
)},
"spark" = {list(
"timeout" = c("R1_1e7_NA_0_1", "R1_1e8_NA_0_1")
)},
"duckdb-latest" = {list(
"timeout" = c("R1_1e8_NA_0_1")
)}
)}
rollfun.exceptions = task.exceptions(rollfun.query.exceptions, rollfun.data.exceptions)
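The syntax dictionary above maps each rollfun question to per-solution code. As an illustrative sketch of what the pandas entries compute (toy made-up values, not the benchmark's generated data), the basic fixed-window questions and the "uneven" index-based variant behave like this:

```python
import pandas as pd

# toy stand-in for the benchmark's v1 column (values are made up)
x = pd.DataFrame({"v1": [1.0, 2.0, 3.0, 4.0, 6.0]})
w = 3

# fixed-size windows: the first w-1 results are NaN until a full window exists
mean_w = x["v1"].rolling(w).mean()
min_w = x["v1"].rolling(w).min()
assert mean_w.isna().sum() == w - 1
assert mean_w.iloc[2] == 2.0   # mean of 1, 2, 3
assert min_w.iloc[4] == 3.0    # min of 3, 4, 6

# "uneven" variant: an index column is promoted to a datetime index so the
# window spans a value range ("{w}s") rather than a row count
y = x.set_index(pd.to_datetime([1, 2, 4, 7, 8], unit="s"))
mean_t = y["v1"].rolling(f"{w}s").mean()
assert mean_t.iloc[0] == 1.0   # offset windows default to min_periods=1
```

The row-count and offset windows differ in their partial-window handling, which is why several solution scripts in this PR carry explicit workarounds for partial windows.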
3 changes: 2 additions & 1 deletion _benchplot/benchplot.R
@@ -365,7 +365,8 @@ benchplot = function(
}
margins(nsolutions, pending=pending)
x[na_time_sec==FALSE, "max_time" := max(c(time1, time2)), by=c("solution","question")]
lim_x = tail(xlab_labels(max(c(0, x$max_time), na.rm=TRUE)), n=1L)
trunc5 = function(x) trunc(x*1e5)/1e5
lim_x = tail(xlab_labels(trunc5(max(c(0, x$max_time), na.rm=TRUE))), n=1L)
if (lim_x == 0) stop("internal error: lim x is c(0,0), this should be already escaped at the beginning with 'sum(x$na_time_sec)==nrow(x)'")
# get bars Y coordinates, positions only, plot later in bar1
all_y_bars = barplot(rep(NA_real_, length(pad)), horiz=TRUE, xlim=c(0, lim_x), axes=FALSE, xpd=FALSE)
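The `trunc5` helper added in this hunk truncates (rather than rounds) the maximum time before computing the axis limit, so a value such as 0.9999999 cannot round up and push the limit to the next label. A sketch of the same idea in Python (the name simply mirrors the R helper):

```python
import math

def trunc5(x: float) -> float:
    # drop everything past the 5th decimal place; no rounding occurs
    return math.trunc(x * 1e5) / 1e5

assert trunc5(1.234567) == 1.23456
assert trunc5(0.9999999) == 0.99999   # round() would give 1.0
```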
5 changes: 4 additions & 1 deletion _control/data.csv
@@ -20,4 +20,7 @@ join,J1_1e7_NA_0_1,1e7,NA,0,1,1
join,J1_1e8_NA_0_0,1e8,NA,0,0,1
join,J1_1e8_NA_5_0,1e8,NA,5,0,1
join,J1_1e8_NA_0_1,1e8,NA,0,1,1
join,J1_1e9_NA_0_0,1e9,NA,0,0,1
join,J1_1e9_NA_0_0,1e9,NA,0,0,1
rollfun,R1_1e6_NA_0_1,1e6,NA,0,1,1
rollfun,R1_1e7_NA_0_1,1e7,NA,0,1,1
rollfun,R1_1e8_NA_0_1,1e8,NA,0,1,1
10 changes: 10 additions & 0 deletions _control/questions.csv
@@ -19,3 +19,13 @@ groupby2014,sum v1 by id1:id2,basic
groupby2014,sum v1 mean v3 by id3,basic
groupby2014,mean v1:v3 by id4,basic
groupby2014,sum v1:v3 by id6,basic
rollfun,mean,basic
rollfun,window small,basic
rollfun,window big,basic
rollfun,min,basic
rollfun,median,basic
rollfun,multiroll,advanced
rollfun,weighted,advanced
rollfun,uneven dense,advanced
rollfun,uneven sparse,advanced
rollfun,regression,advanced
5 changes: 5 additions & 0 deletions _control/solutions.csv
@@ -2,16 +2,20 @@ solution,task
data.table,groupby
data.table,join
data.table,groupby2014
data.table,rollfun
dplyr,groupby
dplyr,join
dplyr,groupby2014
dplyr,rollfun
pandas,groupby
pandas,join
pandas,groupby2014
pandas,rollfun
pydatatable,groupby
pydatatable,join
spark,groupby
spark,join
spark,rollfun
dask,groupby
dask,join
juliadf,groupby
@@ -28,5 +32,6 @@ duckdb,groupby
duckdb,join
duckdb-latest,groupby
duckdb-latest,join
duckdb-latest,rollfun
datafusion,groupby
datafusion,join
3 changes: 3 additions & 0 deletions _control/timeout.csv
@@ -8,3 +8,6 @@ join,1e9,360
groupby2014,1e7,60
groupby2014,1e8,120
groupby2014,1e9,180
rollfun,1e6,60
rollfun,1e7,120
rollfun,1e8,180
35 changes: 35 additions & 0 deletions _data/rollfun-datagen.R
@@ -0,0 +1,35 @@
# Rscript _data/rollfun-datagen.R 1e6 0 0 1
# Rscript _data/rollfun-datagen.R 1e7 0 0 1
# Rscript _data/rollfun-datagen.R 1e8 0 0 1

args = commandArgs(TRUE)

pretty_sci = function(x) {
tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]]
if(length(tmp)==1L) {
paste0(substr(tmp, 1L, 1L), "e", nchar(tmp)-1L)
} else if(length(tmp)==2L){
paste0(tmp[1L], as.character(as.integer(tmp[2L])))
}
}

library(data.table)
N=as.integer(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L])
stopifnot(nas==0L, sort==1L) ## timeseries data always sorted
set.seed(108)
cat(sprintf("Producing data of %s rows, %s NAs ratio, %s sort flag\n", pretty_sci(N), nas, sort))
DT = list()
DT[["id1"]] = seq.int(N) ## index, do we need it as POSIXct/IDate?
## uneven idx
DT[["id2"]] = sort(sample(N*1.1, N)) ## index dense
DT[["id3"]] = sort(sample(N*2, N)) ## index sparse
DT[["v1"]] = cumprod(rnorm(N, 1, 0.005)) ## more risky asset
DT[["v2"]] = cumprod(rnorm(N, 1, 0.001)) ## less risky asset
DT[["weights"]] = rnorm(n=N, m=1, sd=0.1)

setDT(DT)
file = sprintf("R1_%s_NA_%s_%s.csv", pretty_sci(N), nas, sort)
cat(sprintf("Writing data to %s\n", file))
fwrite(DT, file)
cat(sprintf("Data written to %s, quitting\n", file))
if (!interactive()) quit("no", status=0)
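The `pretty_sci` helper in the generator collapses R's scientific notation (e.g. `1e+06`) into the short `1e6` form used in data file names like `R1_1e6_NA_0_1.csv`. A rough Python equivalent of that intent (a sketch, not a transliteration of the R code):

```python
def pretty_sci(n: int) -> str:
    # render a round count such as 1000000 as "1e6" for use in file names
    mantissa, exp = f"{n:e}".split("e")        # e.g. "1.000000", "+06"
    mantissa = mantissa.rstrip("0").rstrip(".")
    return f"{mantissa}e{int(exp)}"

assert pretty_sci(10**6) == "1e6"
assert pretty_sci(2 * 10**7) == "2e7"
```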
2 changes: 2 additions & 0 deletions _launcher/solution.R
@@ -133,6 +133,8 @@ data.desc = function(task, nrow, k, na, sort) {
prefix = "J1"
} else if (task=="groupby2014") {
prefix = "G0"
} else if (task=="rollfun") {
prefix = "R1"
} else {
stop("undefined task in solution.R data.desc function")
}