From 045b7d5729b4f7c5b705cd647e6ebfde631afcf8 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 2 Jul 2023 19:19:11 +0200 Subject: [PATCH] rollfun questions amended --- _control/questions.csv | 6 +-- datatable/rollfun-datatable.R | 73 +++++++++++++++++--------------- dplyr/rollfun-dplyr.R | 78 ++++++++++++++++------------------- 3 files changed, 78 insertions(+), 79 deletions(-) diff --git a/_control/questions.csv b/_control/questions.csv index 2b09c60b..92f7f8ac 100644 --- a/_control/questions.csv +++ b/_control/questions.csv @@ -19,13 +19,13 @@ groupby2014,sum v1 by id1:id2,basic groupby2014,sum v1 mean v3 by id3,basic groupby2014,mean v1:v3 by id4,basic groupby2014,sum v1:v3 by id6,basic -rollfun,rolling mean,basic +rollfun,mean,basic rollfun,window small,basic rollfun,window big,basic -rollfun,multi vars cols,basic +rollfun,min,basic rollfun,median,basic +rollfun,multiroll,advanced rollfun,weighted,advanced rollfun,uneven dense,advanced rollfun,uneven sparse,advanced rollfun,regression,advanced -rollfun,udf,advanced diff --git a/datatable/rollfun-datatable.R b/datatable/rollfun-datatable.R index 45e3e97b..b653222b 100755 --- a/datatable/rollfun-datatable.R +++ b/datatable/rollfun-datatable.R @@ -37,7 +37,7 @@ cat("rolling...\n") fun = "frollmean" -question = "rolling mean" # q1 +question = "mean" # q1 t = system.time(print(length(ans<-frollmean(x$v1, w))))[["elapsed"]] m = memory_usage() chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] @@ -79,20 +79,24 @@ print(head(ans, 3)) print(tail(ans, 3)) rm(ans) -question = "multi vars cols" # q4 -t = system.time(print(length(ans<-frollmean(list(x$v1, x$v2), c(w-50L, w+50L)))))[["elapsed"]] +fun = "frollmin" + +question = "min" # q4 +t = system.time(print(length(ans<-frollmin(x$v1, w))))[["elapsed"]] m = memory_usage() -chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]] -write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] +write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) rm(ans) -t = system.time(print(length(ans<-frollmean(list(x$v1, x$v2), c(w-50L, w+50L)))))[["elapsed"]] +t = system.time(print(length(ans<-frollmin(x$v1, w))))[["elapsed"]] m = memory_usage() -chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]] -write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -print(lapply(ans, head, 3)) -print(lapply(ans, tail, 3)) +chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] +write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(head(ans, 3)) +print(tail(ans, 3)) rm(ans) +#fun = "frollmedian" + #question = "median" # q5 ## not yet implemeneted #t = system.time(print(length(ans<-frollmedian(x$v1, w))))[["elapsed"]] #m = memory_usage() @@ -107,7 +111,23 @@ rm(ans) #print(tail(ans, 3)) #rm(ans) -#question = "weighted" # q6 ## not yet implemeneted +fun = "frollmean" + +question = "multiroll" # q6 +t = system.time(print(length(ans<-frollmean(list(x$v1, x$v2), c(w-50L, w+50L)))))[["elapsed"]] +m = memory_usage() +chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]] +write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +rm(ans) +t = system.time(print(length(ans<-frollmean(list(x$v1, x$v2), c(w-50L, w+50L)))))[["elapsed"]] +m = memory_usage() +chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]] +write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(lapply(ans, head, 3)) +print(lapply(ans, tail, 3)) +rm(ans) + +#question = "weighted" # q7 ## not yet implemeneted #t = system.time(print(length(ans<-frollmean(x$v1, w, w=x$weights))))[["elapsed"]] #m = memory_usage() #chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] @@ -121,7 +141,9 @@ rm(ans) #print(tail(ans, 3)) #rm(ans) -question = "uneven dense" # q7 +fun = "frollmean" + +question = "uneven dense" # q8 t = system.time(print(length(ans<-frollmean(x$v1, frolladapt(x$id2, w), adaptive=TRUE))))[["elapsed"]] m = memory_usage() chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] @@ -135,7 +157,7 @@ print(head(ans, 3)) print(tail(ans, 3)) rm(ans) -question = "uneven sparse" # q8 +question = "uneven sparse" # q9 t = system.time(print(length(ans<-frollmean(x$v1, frolladapt(x$id3, w), adaptive=TRUE))))[["elapsed"]] m = memory_usage() chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] @@ -149,32 +171,15 @@ print(head(ans, 3)) print(tail(ans, 3)) rm(ans) -#question = "regression" # q9 ## not yet implemeneted -#t = system.time(print(length(ans<-frollmean(x[,c("v1","v2")], c(w-50L, w+50L)))))[["elapsed"]] -#m = memory_usage() -#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] -#write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#rm(ans) -#t = system.time(print(length(ans<-frollmean(x[,c("v1","v2")], c(w-50L, w+50L)))))[["elapsed"]] -#m = memory_usage() -#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] -#write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#print(head(ans, 3)) -#print(tail(ans, 3)) -#rm(ans) +#fun = "frollreg" -#question = "udf" # q10 ## UDF simply does not scale -## compound distance -#udf = function(x) { -# tmp <- range(x) -# tmp[2L]/tmp[1L] -#} -#t = system.time(print(length(ans<-frollapply(x$v1, w, udf, simplify=unlist))))[["elapsed"]] +#question = "regression" # q10 ## not yet implemeneted +#t = system.time(print(length(ans<-frollreg(list(x$v1, x$v2), w))))[["elapsed"]] #m = memory_usage() #chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] #write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) #rm(ans) -#t = system.time(print(length(ans<-frollapply(x$v1, w, udf, simplify=unlist))))[["elapsed"]] +#t = system.time(print(length(ans<-frollreg(list(x$v1, x$v2), w))))[["elapsed"]] #m = memory_usage() #chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] #write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) diff --git a/dplyr/rollfun-dplyr.R b/dplyr/rollfun-dplyr.R index f12c16a7..12c773ba 100755 --- a/dplyr/rollfun-dplyr.R +++ b/dplyr/rollfun-dplyr.R @@ -30,7 +30,7 @@ wbig = nrow(x)/1e2L task_init = proc.time()[["elapsed"]] cat("rolling...\n") -question = "rolling mean" # q1 +question = "mean" # q1 fun = "slide_mean" t = system.time(print(length(ans<-slide_mean(x$v1, before=w-1L, complete=TRUE))))[["elapsed"]] m = memory_usage() @@ -75,25 +75,19 @@ print(head(ans, 3)) print(tail(ans, 3)) rm(ans) -question = "multi vars cols" # q4 -fun = "slide_mean" -t = system.time(print(length(ans<-list( - slide_mean(x$v1, before=w-51L, complete=TRUE), slide_mean(x$v1, before=w+49L, complete=TRUE), - slide_mean(x$v2, before=w-51L, complete=TRUE), slide_mean(x$v2, before=w+49L, complete=TRUE) -))))[["elapsed"]] +question = "min" # q4 +fun = "slide_min" +t = system.time(print(length(ans<-slide_min(x$v1, before=w-1L, complete=TRUE))))[["elapsed"]] m = memory_usage() -chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]] -write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] +write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) rm(ans) -t = system.time(print(length(ans<-list( - slide_mean(x$v1, before=w-51L, complete=TRUE), slide_mean(x$v1, before=w+49L, complete=TRUE), - slide_mean(x$v2, before=w-51L, complete=TRUE), slide_mean(x$v2, before=w+49L, complete=TRUE) -))))[["elapsed"]] +t = system.time(print(length(ans<-slide_min(x$v1, before=w-1L, complete=TRUE))))[["elapsed"]] m = memory_usage() -chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]] -write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -print(lapply(ans, head, 3)) -print(lapply(ans, tail, 3)) +chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] +write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(head(ans, 3)) +print(tail(ans, 3)) rm(ans) #question = "median" # q5 ## not yet implemeneted @@ -111,7 +105,28 @@ rm(ans) #print(tail(ans, 3)) #rm(ans) -#question = "weighted" # q6 ## not yet implemeneted +question = "multiroll" # q6 +fun = "slide_mean" +t = system.time(print(length(ans<-list( + slide_mean(x$v1, before=w-51L, complete=TRUE), slide_mean(x$v1, before=w+49L, complete=TRUE), + slide_mean(x$v2, before=w-51L, complete=TRUE), slide_mean(x$v2, before=w+49L, complete=TRUE) +))))[["elapsed"]] +m = memory_usage() +chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]] +write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +rm(ans) +t = system.time(print(length(ans<-list( + slide_mean(x$v1, before=w-51L, complete=TRUE), slide_mean(x$v1, before=w+49L, complete=TRUE), + slide_mean(x$v2, before=w-51L, complete=TRUE), slide_mean(x$v2, before=w+49L, complete=TRUE) +))))[["elapsed"]] +m = memory_usage() +chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]] +write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) +print(lapply(ans, head, 3)) +print(lapply(ans, tail, 3)) +rm(ans) + +#question = "weighted" # q7 ## not yet implemeneted #fun = "slide_mean" #t = system.time(print(length(ans<-slide_mean(x$v1, before=w-1L, complete=TRUE, w=x$weights))))[["elapsed"]] #m = memory_usage() @@ -126,7 +141,7 @@ rm(ans) #print(tail(ans, 3)) #rm(ans) -question = "uneven dense" # q7 +question = "uneven dense" # q8 fun = "slide_index_mean" t = system.time(print(length(ans<-slide_index_mean(x$v1, i=x$id2, before=w-1L, complete=TRUE))))[["elapsed"]] m = memory_usage() @@ -141,7 +156,7 @@ print(head(ans, 3)) print(tail(ans, 3)) rm(ans) -question = "uneven sparse" # q8 +question = "uneven sparse" # q9 fun = "slide_index_mean" t = system.time(print(length(ans<-slide_index_mean(x$v1, i=x$id3, before=w-1L, complete=TRUE))))[["elapsed"]] m = memory_usage() @@ -156,11 +171,10 @@ print(head(ans, 3)) print(tail(ans, 3)) rm(ans) -#question = "regression" # q9 ## Killed, UDF simply does not scale +#question = "regression" # q10 ## Killed, UDF simply does not scale, needs to be specialized fun #fun = "slide" #t = system.time(print(length(ans<-slide(select(x, v1, v2), ~lm(v2 ~ v1, data=.x), .before=w-1L, .complete=TRUE))))[["elapsed"]] #m = memory_usage() -#TODO #chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] #write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) #rm(ans) @@ -172,26 +186,6 @@ rm(ans) #print(tail(ans, 3)) #rm(ans) -#question = "udf" # q10 ## UDF simply does not scale -## compound distance -#udf = function(x) { -# tmp <- range(x) -# tmp[2L]/tmp[1L] -#} -#fun = "slide" -#t = system.time(print(length(ans<-slide_dbl(x$v1, udf, .before=w-1L, .complete=TRUE))))[["elapsed"]] -#m = memory_usage() -#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] -#write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#rm(ans) -#t = system.time(print(length(ans<-slide_dbl(x$v1, udf, .before=w-1L, .complete=TRUE))))[["elapsed"]] -#m = memory_usage() -#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]] -#write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk) -#print(head(ans, 3)) -#print(tail(ans, 3)) -#rm(ans) - cat(sprintf("rolling finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init)) if( !interactive() ) q("no", status=0)