Skip to content

Commit

Permalink
rollfun questions amended
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki committed Jul 2, 2023
1 parent b6dd51e commit 045b7d5
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 79 deletions.
6 changes: 3 additions & 3 deletions _control/questions.csv
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ groupby2014,sum v1 by id1:id2,basic
groupby2014,sum v1 mean v3 by id3,basic
groupby2014,mean v1:v3 by id4,basic
groupby2014,sum v1:v3 by id6,basic
rollfun,rolling mean,basic
rollfun,mean,basic
rollfun,window small,basic
rollfun,window big,basic
rollfun,multi vars cols,basic
rollfun,min,basic
rollfun,median,basic
rollfun,multiroll,advanced
rollfun,weighted,advanced
rollfun,uneven dense,advanced
rollfun,uneven sparse,advanced
rollfun,regression,advanced
rollfun,udf,advanced
73 changes: 39 additions & 34 deletions datatable/rollfun-datatable.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ cat("rolling...\n")

fun = "frollmean"

question = "rolling mean" # q1
question = "mean" # q1
t = system.time(print(length(ans<-frollmean(x$v1, w))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
Expand Down Expand Up @@ -79,20 +79,24 @@ print(head(ans, 3))
print(tail(ans, 3))
rm(ans)

question = "multi vars cols" # q4
t = system.time(print(length(ans<-frollmean(list(x$v1, x$v2), c(w-50L, w+50L)))))[["elapsed"]]
fun = "frollmin"

question = "min" # q4
t = system.time(print(length(ans<-frollmin(x$v1, w))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
rm(ans)
t = system.time(print(length(ans<-frollmean(list(x$v1, x$v2), c(w-50L, w+50L)))))[["elapsed"]]
t = system.time(print(length(ans<-frollmin(x$v1, w))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(lapply(ans, head, 3))
print(lapply(ans, tail, 3))
chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
rm(ans)

#fun = "frollmedian"

#question = "median" # q5 ## not yet implemented
#t = system.time(print(length(ans<-frollmedian(x$v1, w))))[["elapsed"]]
#m = memory_usage()
Expand All @@ -107,7 +111,23 @@ rm(ans)
#print(tail(ans, 3))
#rm(ans)

#question = "weighted" # q6 ## not yet implemented
# Label written to the `fun` field of the log rows below.
fun = "frollmean"

# q6 "multiroll": a single frollmean call over two inputs (v1, v2) and two
# window sizes (w-50, w+50). The result is treated as a list below
# (ans[[1L]], lapply); presumably one vector per input/window pair -- confirm
# against the data.table frollmean documentation.
question = "multiroll" # q6
# Run 1: time the query; `ans` is assigned inside the timed expression and its
# length is printed.
t = system.time(print(length(ans<-frollmean(list(x$v1, x$v2), c(w-50L, w+50L)))))[["elapsed"]]
m = memory_usage()
# Checksum: sum of each list element ignoring NAs, timed separately from the query.
chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]]
# out_rows is taken from the first element, out_cols from the number of elements.
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Drop the result before repeating the query.
rm(ans)
# Run 2: the same query again, logged with run=2L.
t = system.time(print(length(ans<-frollmean(list(x$v1, x$v2), c(w-50L, w+50L)))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Print the first and last 3 values of every result element.
print(lapply(ans, head, 3))
print(lapply(ans, tail, 3))
rm(ans)

#question = "weighted" # q7 ## not yet implemented
#t = system.time(print(length(ans<-frollmean(x$v1, w, w=x$weights))))[["elapsed"]]
#m = memory_usage()
#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
Expand All @@ -121,7 +141,9 @@ rm(ans)
#print(tail(ans, 3))
#rm(ans)

question = "uneven dense" # q7
fun = "frollmean"

question = "uneven dense" # q8
t = system.time(print(length(ans<-frollmean(x$v1, frolladapt(x$id2, w), adaptive=TRUE))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
Expand All @@ -135,7 +157,7 @@ print(head(ans, 3))
print(tail(ans, 3))
rm(ans)

question = "uneven sparse" # q8
question = "uneven sparse" # q9
t = system.time(print(length(ans<-frollmean(x$v1, frolladapt(x$id3, w), adaptive=TRUE))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
Expand All @@ -149,32 +171,15 @@ print(head(ans, 3))
print(tail(ans, 3))
rm(ans)

#question = "regression" # q9 ## not yet implemented
#t = system.time(print(length(ans<-frollmean(x[,c("v1","v2")], c(w-50L, w+50L)))))[["elapsed"]]
#m = memory_usage()
#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
#write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
#rm(ans)
#t = system.time(print(length(ans<-frollmean(x[,c("v1","v2")], c(w-50L, w+50L)))))[["elapsed"]]
#m = memory_usage()
#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
#write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
#print(head(ans, 3))
#print(tail(ans, 3))
#rm(ans)
#fun = "frollreg"

#question = "udf" # q10 ## UDF simply does not scale
## compound distance
#udf = function(x) {
# tmp <- range(x)
# tmp[2L]/tmp[1L]
#}
#t = system.time(print(length(ans<-frollapply(x$v1, w, udf, simplify=unlist))))[["elapsed"]]
#question = "regression" # q10 ## not yet implemented
#t = system.time(print(length(ans<-frollreg(list(x$v1, x$v2), w))))[["elapsed"]]
#m = memory_usage()
#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
#write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
#rm(ans)
#t = system.time(print(length(ans<-frollapply(x$v1, w, udf, simplify=unlist))))[["elapsed"]]
#t = system.time(print(length(ans<-frollreg(list(x$v1, x$v2), w))))[["elapsed"]]
#m = memory_usage()
#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
#write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
Expand Down
78 changes: 36 additions & 42 deletions dplyr/rollfun-dplyr.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ wbig = nrow(x)/1e2L
task_init = proc.time()[["elapsed"]]
cat("rolling...\n")

question = "rolling mean" # q1
question = "mean" # q1
fun = "slide_mean"
t = system.time(print(length(ans<-slide_mean(x$v1, before=w-1L, complete=TRUE))))[["elapsed"]]
m = memory_usage()
Expand Down Expand Up @@ -75,25 +75,19 @@ print(head(ans, 3))
print(tail(ans, 3))
rm(ans)

question = "multi vars cols" # q4
fun = "slide_mean"
t = system.time(print(length(ans<-list(
slide_mean(x$v1, before=w-51L, complete=TRUE), slide_mean(x$v1, before=w+49L, complete=TRUE),
slide_mean(x$v2, before=w-51L, complete=TRUE), slide_mean(x$v2, before=w+49L, complete=TRUE)
))))[["elapsed"]]
question = "min" # q4
fun = "slide_min"
t = system.time(print(length(ans<-slide_min(x$v1, before=w-1L, complete=TRUE))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
rm(ans)
t = system.time(print(length(ans<-list(
slide_mean(x$v1, before=w-51L, complete=TRUE), slide_mean(x$v1, before=w+49L, complete=TRUE),
slide_mean(x$v2, before=w-51L, complete=TRUE), slide_mean(x$v2, before=w+49L, complete=TRUE)
))))[["elapsed"]]
t = system.time(print(length(ans<-slide_min(x$v1, before=w-1L, complete=TRUE))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(lapply(ans, head, 3))
print(lapply(ans, tail, 3))
chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
print(head(ans, 3))
print(tail(ans, 3))
rm(ans)

#question = "median" # q5 ## not yet implemented
Expand All @@ -111,7 +105,28 @@ rm(ans)
#print(tail(ans, 3))
#rm(ans)

#question = "weighted" # q6 ## not yet implemented
# q6 "multiroll": four separate slide_mean calls (v1 and v2, each with two
# `before` values) collected into one list and timed together.
question = "multiroll" # q6
# Label written to the `fun` field of the log rows below.
fun = "slide_mean"
# Run 1. `before` counts elements preceding the current one, so before=w-51L
# and before=w+49L presumably correspond to window sizes w-50 and w+50 --
# confirm against the slider documentation.
t = system.time(print(length(ans<-list(
slide_mean(x$v1, before=w-51L, complete=TRUE), slide_mean(x$v1, before=w+49L, complete=TRUE),
slide_mean(x$v2, before=w-51L, complete=TRUE), slide_mean(x$v2, before=w+49L, complete=TRUE)
))))[["elapsed"]]
m = memory_usage()
# Checksum: sum of each list element ignoring NAs, timed separately from the query.
chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]]
# out_rows is taken from the first element, out_cols from the number of elements.
write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Drop the result before repeating the query.
rm(ans)
# Run 2: the same query again, logged with run=2L.
t = system.time(print(length(ans<-list(
slide_mean(x$v1, before=w-51L, complete=TRUE), slide_mean(x$v1, before=w+49L, complete=TRUE),
slide_mean(x$v2, before=w-51L, complete=TRUE), slide_mean(x$v2, before=w+49L, complete=TRUE)
))))[["elapsed"]]
m = memory_usage()
chkt = system.time(chk<-lapply(ans, sum, na.rm=TRUE))[["elapsed"]]
write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans[[1L]]), out_cols=length(ans), solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
# Print the first and last 3 values of every result element.
print(lapply(ans, head, 3))
print(lapply(ans, tail, 3))
rm(ans)

#question = "weighted" # q7 ## not yet implemented
#fun = "slide_mean"
#t = system.time(print(length(ans<-slide_mean(x$v1, before=w-1L, complete=TRUE, w=x$weights))))[["elapsed"]]
#m = memory_usage()
Expand All @@ -126,7 +141,7 @@ rm(ans)
#print(tail(ans, 3))
#rm(ans)

question = "uneven dense" # q7
question = "uneven dense" # q8
fun = "slide_index_mean"
t = system.time(print(length(ans<-slide_index_mean(x$v1, i=x$id2, before=w-1L, complete=TRUE))))[["elapsed"]]
m = memory_usage()
Expand All @@ -141,7 +156,7 @@ print(head(ans, 3))
print(tail(ans, 3))
rm(ans)

question = "uneven sparse" # q8
question = "uneven sparse" # q9
fun = "slide_index_mean"
t = system.time(print(length(ans<-slide_index_mean(x$v1, i=x$id3, before=w-1L, complete=TRUE))))[["elapsed"]]
m = memory_usage()
Expand All @@ -156,11 +171,10 @@ print(head(ans, 3))
print(tail(ans, 3))
rm(ans)

#question = "regression" # q9 ## Killed, UDF simply does not scale
#question = "regression" # q10 ## Killed, UDF simply does not scale, needs to be specialized fun
#fun = "slide"
#t = system.time(print(length(ans<-slide(select(x, v1, v2), ~lm(v2 ~ v1, data=.x), .before=w-1L, .complete=TRUE))))[["elapsed"]]
#m = memory_usage()
#TODO
#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
#write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
#rm(ans)
Expand All @@ -172,26 +186,6 @@ rm(ans)
#print(tail(ans, 3))
#rm(ans)

#question = "udf" # q10 ## UDF simply does not scale
## compound distance
#udf = function(x) {
# tmp <- range(x)
# tmp[2L]/tmp[1L]
#}
#fun = "slide"
#t = system.time(print(length(ans<-slide_dbl(x$v1, udf, .before=w-1L, .complete=TRUE))))[["elapsed"]]
#m = memory_usage()
#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
#write.log(run=1L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
#rm(ans)
#t = system.time(print(length(ans<-slide_dbl(x$v1, udf, .before=w-1L, .complete=TRUE))))[["elapsed"]]
#m = memory_usage()
#chkt = system.time(chk<-sum(ans, na.rm=TRUE))[["elapsed"]]
#write.log(run=2L, task=task, data=data_name, in_rows=nrow(x), question=question, out_rows=length(ans), out_cols=1L, solution=solution, version=ver, git=git, fun=fun, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(chk), chk_time_sec=chkt, on_disk=on_disk)
#print(head(ans, 3))
#print(tail(ans, 3))
#rm(ans)

cat(sprintf("rolling finished, took %.0fs\n", proc.time()[["elapsed"]]-task_init))

if( !interactive() ) q("no", status=0)

0 comments on commit 045b7d5

Please sign in to comment.