
Use collapse::pivot in resample.R functions for better performance #18


Draft · wants to merge 1 commit into main

Conversation


@maouw commented Jan 28, 2025

I noticed that the functions in resample.R could probably be sped up by using functions from the collapse package. This still needs a bit more testing, but here is a draft. Is this something that would be useful?
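
For a concrete picture of the change, here is a minimal sketch of the kind of substitution the draft makes. The helper name `widen_nodes` and the exact columns (`subjectID`, `group`, `nodeID`, `dti_fa`) are illustrative and follow the benchmark data below; this is not the actual resample.R code. The idea is simply to gate the reshaping step on the `tractable.use_collapse` option and call `collapse::pivot()` in place of `tidyr::pivot_wider()`:

# Illustrative sketch only -- not the actual resample.R implementation.
widen_nodes <- function(df) {
  if (isTRUE(getOption("tractable.use_collapse", FALSE))) {
    # collapse::pivot() reshapes in compiled code (see benchmarks below)
    collapse::pivot(
      df,
      ids = c("subjectID", "group"),
      names = "nodeID",
      values = "dti_fa",
      how = "wider"
    )
  } else {
    tidyr::pivot_wider(
      df,
      id_cols = c(subjectID, group),
      names_from = nodeID,
      values_from = dti_fa
    )
  }
}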

The benchmark code and results below show clear improvements in both throughput and memory allocation:


> devtools::load_all()

> mk_df <- function(n_subjectID = 20000L, n_nodeID = 40L, n_group = 2L, seed = 0) {
+   if (!is.null(seed)) {
+     set.seed(seed)
+   }
+   expand.grid(subjectID = seq.int(n_subjectID), nodeID = seq.int(n_nodeID), group = seq.int(n_group)) |>
+     dplyr::group_by(subjectID) |>
+     dplyr::mutate(age = 20 + sample.int(30, size = 1), dti_fa = rnorm(dplyr::n())) |>
+     dplyr::arrange(subjectID, nodeID, group) |>
+     dplyr::ungroup()
+ }

> df1 <- mk_df(n_subjectID=1000L, n_nodeID = 10L, n_group=2L)

> df1_factor <- df1 |> dplyr::mutate(nodeID=as.factor(nodeID))

> df2 <- mk_df(n_subjectID=20000L, n_nodeID = 40L, n_group=2L)

> df2_factor <- df2 |> dplyr::mutate(nodeID=as.factor(nodeID))

> # Compare times
> # equality guaranteed by setting tidyr::pivot_longer/tidyr::pivot_wider row order options
> t1 <- bench::mark(
+ 	dplyr = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=FALSE, tractable.pivot_names_vary = "slowest",tractable.pivot_cols_vary="slowest"),bootstrap_df(df1, target = "dti_fa", node_group = "group"))),
+ 	collapse = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df1, target = "dti_fa", node_group = "group")))
+ )

> t1
# A tibble: 2 × 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result   memory    
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>   <list>    
1 dplyr         5.6ms   5.89ms      168.    5.54MB     8.26    61     3      363ms <tibble> <Rprofmem>
2 collapse    775.8µs 845.99µs     1137.    2.87MB    15.5    513     7      451ms <tibble> <Rprofmem>
# ℹ 2 more variables: time <list>, gc <list>

> # Compare times, don't check for equality (result order may differ despite same seed)
> # Also compare against using a factor nodeID, which should be faster
> t1_nocheck <- bench::mark(
+ 	dplyr = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=FALSE),bootstrap_df(df1, target = "dti_fa", node_group = "group"))),
+ 	collapse = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df1, target = "dti_fa", node_group = "group"))),
+ 	collapse_factor = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df1_factor, target = "dti_fa", node_group = "group"))),
+ 	check = FALSE
+ )

> t1_nocheck
# A tibble: 3 × 13
  expression         min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory    
  <bch:expr>     <bch:t> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>    
1 dplyr            5.7ms   5.89ms      169.    4.03MB     8.06    63     3      372ms <NULL> <Rprofmem>
2 collapse       774.2µs 839.23µs     1182.    1.38MB    18.0    524     8      443ms <NULL> <Rprofmem>
3 collapse_fact… 652.4µs 716.48µs     1385.    1.34MB    20.4    611     9      441ms <NULL> <Rprofmem>
# ℹ 2 more variables: time <list>, gc <list>

> # Compare times
> # equality guaranteed by setting tidyr::pivot_longer/tidyr::pivot_wider row order options
> t2 <- bench::mark(
+ 	dplyr = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=FALSE, tractable.pivot_names_vary = "slowest",tractable.pivot_cols_vary="slowest"),bootstrap_df(df2, target = "dti_fa", node_group = "group"))),
+ 	collapse = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df2, target = "dti_fa", node_group = "group")))
+ )

> t2
# A tibble: 2 × 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result   memory    
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>   <list>    
1 dplyr       205.1ms  272.9ms      3.98     287MB     14.6     3    11      754ms <tibble> <Rprofmem>
2 collapse     33.7ms   35.8ms     19.4      105MB     21.1    12    13      617ms <tibble> <Rprofmem>
# ℹ 2 more variables: time <list>, gc <list>

> rm(list=c("df1", "df1_factor"))

> # Compare times, don't check for equality (result order may differ despite same seed)
> # Also compare against using a factor nodeID, which should be faster
> t2_nocheck <- bench::mark(
+ 	dplyr = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=FALSE),bootstrap_df(df2, target = "dti_fa", node_group = "group"))),
+ 	collapse = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df2, target = "dti_fa", node_group = "group"))),
+ 	collapse_factor = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df2_factor, target = "dti_fa", node_group = "group"))),
+ 	check = FALSE
+ )

> t2_nocheck
# A tibble: 3 × 13
  expression          min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory    
  <bch:expr>      <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>    
1 dplyr           189.6ms 206.5ms      4.39   293.3MB     10.2     3     7      684ms <NULL> <Rprofmem>
2 collapse         26.4ms  32.9ms     24.8    105.3MB     19.1    13    10      524ms <NULL> <Rprofmem>
3 collapse_factor  22.8ms  29.8ms     22.5     99.2MB     22.5    13    13      577ms <NULL> <Rprofmem>
# ℹ 2 more variables: time <list>, gc <list>

> rm(list=c("df2", "df2_factor"))

> benchmarkme::get_cpu()
$vendor_id
character(0)

$model_name
[1] "Apple M1"

$no_of_cores
[1] 8


> benchmarkme::get_ram()
17.2 GB

> sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.7.2

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Los_Angeles
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] tractable_0.2.1 testthat_3.2.3  felp_0.6.0     

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1      dplyr_1.1.4           farver_2.1.2          fastmap_1.2.0        
 [5] promises_1.3.2        digest_0.6.37         mime_0.12             lifecycle_1.0.4      
 [9] ellipsis_0.3.2        survival_3.8-3        magrittr_2.0.3        compiler_4.4.2       
[13] rlang_1.1.4           tools_4.4.2           utf8_1.2.4            collapse_2.0.19      
[17] fitdistrplus_1.2-2    htmlwidgets_1.6.4     pkgbuild_1.4.6        curl_6.1.0           
[21] bench_1.1.4           aws.signature_0.6.0   xml2_1.3.6            RColorBrewer_1.1-3   
[25] pkgload_1.4.0         miniUI_0.1.1.1        withr_3.0.2           purrr_1.0.2          
[29] desc_1.4.3            grid_4.4.2            urlchecker_1.0.1      profvis_0.4.0        
[33] xtable_1.8-4          colorspace_2.1-1      ggplot2_3.5.1         scales_1.3.0         
[37] iterators_1.0.14      MASS_7.3-64           cli_3.6.3             generics_0.1.3       
[41] remotes_2.5.0         rstudioapi_0.17.1     httr_1.4.7            tzdb_0.4.0           
[45] sessioninfo_1.2.2     cachem_1.1.0          profmem_0.6.0         stringr_1.5.1        
[49] splines_4.4.2         parallel_4.4.2        base64enc_0.1-3       vctrs_0.6.5          
[53] devtools_2.4.5        Matrix_1.7-1          benchmarkme_1.0.8     gratia_0.10.0        
[57] hms_1.1.3             patchwork_1.3.0       foreach_1.5.2         tidyr_1.3.1          
[61] benchmarkmeData_1.0.4 glue_1.8.0            codetools_0.2-20      ggokabeito_0.1.0     
[65] mvnfast_0.2.8         stringi_1.8.4         gtable_0.3.6          later_1.4.1          
[69] aws.s3_0.3.21         itsadug_2.4.1         munsell_0.5.1         tibble_3.2.1         
[73] pillar_1.10.1         htmltools_0.5.8.1     brio_1.1.5            R6_2.5.1             
[77] doParallel_1.0.17     rprojroot_2.0.4       shiny_1.10.0          lattice_0.22-6       
[81] readr_2.1.5           memoise_2.0.1         httpuv_1.6.15         Rcpp_1.0.14          
[85] nlme_3.1-166          mgcv_1.9-1            fs_1.6.5              forcats_1.0.0        
[89] usethis_3.1.0         plotfunctions_1.4     pkgconfig_2.0.3      
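
The equality checks in t1 and t2 above rely on pinning tidyr's output order via the `tractable.pivot_names_vary` and `tractable.pivot_cols_vary` options. A hedged sketch of how those options could be threaded into the tidyr path (helper names and columns are illustrative, not the actual resample.R code):

# Illustrative only: thread the ordering options into tidyr::pivot_wider() and
# tidyr::pivot_longer() so the dplyr path's row/column order can be made to
# match the collapse path (as done in the equality-checked benchmarks above).
tidy_widen <- function(df) {
  tidyr::pivot_wider(
    df,
    id_cols = c(subjectID, group),
    names_from = nodeID,
    values_from = dti_fa,
    names_vary = getOption("tractable.pivot_names_vary", "fastest")
  )
}

tidy_lengthen <- function(df_wide) {
  tidyr::pivot_longer(
    df_wide,
    cols = -c(subjectID, group),
    names_to = "nodeID",
    values_to = "dti_fa",
    cols_vary = getOption("tractable.pivot_cols_vary", "fastest")
  )
}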

@arokem (Member) commented Jan 30, 2025

Thanks for taking a look! Heads up that this module is not currently on the main path of the most-used public APIs. It's here because we used it to sanity-check some results early in the library's development, and I can see it having some uses down the line (e.g., for checking assumptions), so there are reasons to keep it working well.
