
Use collapse::pivot in resample.R functions for better performance #18


Draft · wants to merge 1 commit into main

Conversation


@maouw commented Jan 28, 2025

I noticed that the functions in resample.R could probably be sped up by using functions from the collapse package. This still needs a bit more testing, but here is a draft. Is this something that would be useful?
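
For a concrete picture of the change, here is a minimal sketch of the kind of substitution the draft makes. The helper name `widen_nodes` and the exact columns (`subjectID`, `group`, `nodeID`, `dti_fa`) are illustrative and follow the benchmark data below; this is not the actual resample.R code. The idea is simply to gate the reshaping step on the `tractable.use_collapse` option and call `collapse::pivot()` in place of `tidyr::pivot_wider()`:

# Illustrative sketch only -- not the actual resample.R implementation.
widen_nodes <- function(df) {
  if (isTRUE(getOption("tractable.use_collapse", FALSE))) {
    # collapse::pivot() reshapes in compiled code (see benchmarks below)
    collapse::pivot(
      df,
      ids = c("subjectID", "group"),
      names = "nodeID",
      values = "dti_fa",
      how = "wider"
    )
  } else {
    tidyr::pivot_wider(
      df,
      id_cols = c(subjectID, group),
      names_from = nodeID,
      values_from = dti_fa
    )
  }
}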

The benchmark code and results below show clear improvements in both throughput and memory allocation:


> devtools::load_all()

> mk_df <- function(n_subjectID = 20000L, n_nodeID = 40L, n_group = 2L, seed = 0) {
+   if (!is.null(seed)) {
+     set.seed(seed)
+   }
+   expand.grid(subjectID = seq.int(n_subjectID), nodeID = seq.int(n_nodeID), group = seq.int(n_group)) |>
+     dplyr::group_by(subjectID) |>
+     dplyr::mutate(age = 20 + sample.int(30, size = 1), dti_fa = rnorm(dplyr::n())) |>
+     dplyr::arrange(subjectID, nodeID, group) |>
+     dplyr::ungroup()
+ }

> df1 <- mk_df(n_subjectID=1000L, n_nodeID = 10L, n_group=2L)

> df1_factor <- df1 |> dplyr::mutate(nodeID=as.factor(nodeID))

> df2 <- mk_df(n_subjectID=20000L, n_nodeID = 40L, n_group=2L)

> df2_factor <- df2 |> dplyr::mutate(nodeID=as.factor(nodeID))

> # Compare times
> # equality guaranteed by setting tidyr::pivot_longer/tidyr::pivot_wider row order options
> t1 <- bench::mark(
+ 	dplyr = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=FALSE, tractable.pivot_names_vary = "slowest",tractable.pivot_cols_vary="slowest"),bootstrap_df(df1, target = "dti_fa", node_group = "group"))),
+ 	collapse = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df1, target = "dti_fa", node_group = "group")))
+ )

> t1
# A tibble: 2 × 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result   memory    
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>   <list>    
1 dplyr         5.6ms   5.89ms      168.    5.54MB     8.26    61     3      363ms <tibble> <Rprofmem>
2 collapse    775.8µs 845.99µs     1137.    2.87MB    15.5    513     7      451ms <tibble> <Rprofmem>
# ℹ 2 more variables: time <list>, gc <list>

> # Compare times, don't check for equality (result order may differ despite same seed)
> # Also compare against using a factor nodeID, which should be faster
> t1_nocheck <- bench::mark(
+ 	dplyr = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=FALSE),bootstrap_df(df1, target = "dti_fa", node_group = "group"))),
+ 	collapse = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df1, target = "dti_fa", node_group = "group"))),
+ 	collapse_factor = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df1_factor, target = "dti_fa", node_group = "group"))),
+ 	check = FALSE
+ )

> t1_nocheck
# A tibble: 3 × 13
  expression         min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory    
  <bch:expr>     <bch:t> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>    
1 dplyr            5.7ms   5.89ms      169.    4.03MB     8.06    63     3      372ms <NULL> <Rprofmem>
2 collapse       774.2µs 839.23µs     1182.    1.38MB    18.0    524     8      443ms <NULL> <Rprofmem>
3 collapse_fact… 652.4µs 716.48µs     1385.    1.34MB    20.4    611     9      441ms <NULL> <Rprofmem>
# ℹ 2 more variables: time <list>, gc <list>

> # Compare times
> # equality guaranteed by setting tidyr::pivot_longer/tidyr::pivot_wider row order options
> t2 <- bench::mark(
+ 	dplyr = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=FALSE, tractable.pivot_names_vary = "slowest",tractable.pivot_cols_vary="slowest"),bootstrap_df(df2, target = "dti_fa", node_group = "group"))),
+ 	collapse = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df2, target = "dti_fa", node_group = "group")))
+ )

> t2
# A tibble: 2 × 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result   memory    
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>   <list>    
1 dplyr       205.1ms  272.9ms      3.98     287MB     14.6     3    11      754ms <tibble> <Rprofmem>
2 collapse     33.7ms   35.8ms     19.4      105MB     21.1    12    13      617ms <tibble> <Rprofmem>
# ℹ 2 more variables: time <list>, gc <list>

> rm(list=c("df1", "df1_factor"))

> # Compare times, don't check for equality (result order may differ despite same seed)
> # Also compare against using a factor nodeID, which should be faster
> t2_nocheck <- bench::mark(
+ 	dplyr = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=FALSE),bootstrap_df(df2, target = "dti_fa", node_group = "group"))),
+ 	collapse = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df2, target = "dti_fa", node_group = "group"))),
+ 	collapse_factor = withr::with_seed(0,withr::with_options(list(tractable.use_collapse=TRUE), bootstrap_df(df2_factor, target = "dti_fa", node_group = "group"))),
+ 	check = FALSE
+ )

> t2_nocheck
# A tibble: 3 × 13
  expression          min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory    
  <bch:expr>      <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list>    
1 dplyr           189.6ms 206.5ms      4.39   293.3MB     10.2     3     7      684ms <NULL> <Rprofmem>
2 collapse         26.4ms  32.9ms     24.8    105.3MB     19.1    13    10      524ms <NULL> <Rprofmem>
3 collapse_factor  22.8ms  29.8ms     22.5     99.2MB     22.5    13    13      577ms <NULL> <Rprofmem>
# ℹ 2 more variables: time <list>, gc <list>

> rm(list=c("df2", "df2_factor"))

> benchmarkme::get_cpu()
$vendor_id
character(0)

$model_name
[1] "Apple M1"

$no_of_cores
[1] 8


> benchmarkme::get_ram()
17.2 GB

> sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.7.2

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Los_Angeles
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] tractable_0.2.1 testthat_3.2.3  felp_0.6.0     

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.1      dplyr_1.1.4           farver_2.1.2          fastmap_1.2.0        
 [5] promises_1.3.2        digest_0.6.37         mime_0.12             lifecycle_1.0.4      
 [9] ellipsis_0.3.2        survival_3.8-3        magrittr_2.0.3        compiler_4.4.2       
[13] rlang_1.1.4           tools_4.4.2           utf8_1.2.4            collapse_2.0.19      
[17] fitdistrplus_1.2-2    htmlwidgets_1.6.4     pkgbuild_1.4.6        curl_6.1.0           
[21] bench_1.1.4           aws.signature_0.6.0   xml2_1.3.6            RColorBrewer_1.1-3   
[25] pkgload_1.4.0         miniUI_0.1.1.1        withr_3.0.2           purrr_1.0.2          
[29] desc_1.4.3            grid_4.4.2            urlchecker_1.0.1      profvis_0.4.0        
[33] xtable_1.8-4          colorspace_2.1-1      ggplot2_3.5.1         scales_1.3.0         
[37] iterators_1.0.14      MASS_7.3-64           cli_3.6.3             generics_0.1.3       
[41] remotes_2.5.0         rstudioapi_0.17.1     httr_1.4.7            tzdb_0.4.0           
[45] sessioninfo_1.2.2     cachem_1.1.0          profmem_0.6.0         stringr_1.5.1        
[49] splines_4.4.2         parallel_4.4.2        base64enc_0.1-3       vctrs_0.6.5          
[53] devtools_2.4.5        Matrix_1.7-1          benchmarkme_1.0.8     gratia_0.10.0        
[57] hms_1.1.3             patchwork_1.3.0       foreach_1.5.2         tidyr_1.3.1          
[61] benchmarkmeData_1.0.4 glue_1.8.0            codetools_0.2-20      ggokabeito_0.1.0     
[65] mvnfast_0.2.8         stringi_1.8.4         gtable_0.3.6          later_1.4.1          
[69] aws.s3_0.3.21         itsadug_2.4.1         munsell_0.5.1         tibble_3.2.1         
[73] pillar_1.10.1         htmltools_0.5.8.1     brio_1.1.5            R6_2.5.1             
[77] doParallel_1.0.17     rprojroot_2.0.4       shiny_1.10.0          lattice_0.22-6       
[81] readr_2.1.5           memoise_2.0.1         httpuv_1.6.15         Rcpp_1.0.14          
[85] nlme_3.1-166          mgcv_1.9-1            fs_1.6.5              forcats_1.0.0        
[89] usethis_3.1.0         plotfunctions_1.4     pkgconfig_2.0.3      
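
The equality checks in t1 and t2 above rely on pinning tidyr's output order via the `tractable.pivot_names_vary` and `tractable.pivot_cols_vary` options. A hedged sketch of how those options could be threaded into the tidyr path (helper names and columns are illustrative, not the actual resample.R code):

# Illustrative only: thread the ordering options into tidyr::pivot_wider() and
# tidyr::pivot_longer() so the dplyr path's row/column order can be made to
# match the collapse path (as done in the equality-checked benchmarks above).
tidy_widen <- function(df) {
  tidyr::pivot_wider(
    df,
    id_cols = c(subjectID, group),
    names_from = nodeID,
    values_from = dti_fa,
    names_vary = getOption("tractable.pivot_names_vary", "fastest")
  )
}

tidy_lengthen <- function(df_wide) {
  tidyr::pivot_longer(
    df_wide,
    cols = -c(subjectID, group),
    names_to = "nodeID",
    values_to = "dti_fa",
    cols_vary = getOption("tractable.pivot_cols_vary", "fastest")
  )
}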

@arokem (Member) commented Jan 30, 2025

Thanks for taking a look! Heads up that this module is not currently on the main path of the most-used public APIs. It's here because we used it to sanity-check some results early in the library's development, and I can see it having some uses down the line (e.g., for checking assumptions), so there are reasons to keep it working well.
