Merged
1 change: 1 addition & 0 deletions .Rbuildignore
@@ -14,3 +14,4 @@ cran-comments\.md
^docs$
^pkgdown$
^codecov\.yml$
COMPILATION INSTRUCTIONS
5 changes: 4 additions & 1 deletion .github/workflows/R-CMD-check.yaml
@@ -10,7 +10,10 @@ name: R-CMD-check

jobs:
R-CMD-check:
runs-on: ubuntu-latest
strategy:
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
runs-on: ${{ matrix.os }}
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
R_KEEP_PKG_SOURCE: yes
23 changes: 23 additions & 0 deletions COMPILATION INSTRUCTIONS
@@ -0,0 +1,23 @@
NOTES FOR BUILDING PACKAGE ON ANGUS' WINDOWS MACHINE

Compilation will not work out of the box on Angus' machine. During compilation
it runs into issues with object files being too big.

Changes to make it work:

Add the following to configure.win

# Add configuration needed for big object files
echo "PKG_CXXFLAGS += -Wa,-mbig-obj" >> ./src/Makevars.win

This allows compilation to succeed; however, the .dll produced will be enormous
(preventing hosting on GitHub) if compiled with some commands.
I have not figured out which commands are the issue, but this includes:
* the default build configuration in load_all(recompile = TRUE)
* roxygenise()

Note that the above work fine if not recompiling C++/Stan code.

To avoid the large .dll problem, make sure all building of the package that
involves recompiling Stan/C++ code happens with the 'Install' or
'Clean and Install' button in RStudio.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: PoolTestR
Title: Prevalence and Regression for Pool-Tested (Group-Tested) Data
Version: 0.2.0.9000
Version: 0.2.1
Authors@R: c(
person("Angus", "McLure", , "angus.mclure@anu.edu.au", role = c("aut", "cre"),
comment = c(ORCID = "0000-0003-2551-3059")),
155 changes: 67 additions & 88 deletions NEWS.md
@@ -1,106 +1,85 @@
# PoolTestR 0.2.1
- Patched a bug that caused `PoolPrev()` to fail with pools bigger than about
  300, with tests (on pools of 1000) to capture this behavior in the future.
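The failure mode with very large pools is consistent with computing the probability that a pool tests negative, (1 − p)^s, directly rather than on the log scale (compare `exp(log1m(p) .* PoolSize)` in the updated Stan model). A minimal Python sketch of the two forms — an illustration only, not the package's actual code path:

```python
import math

# Probability that a pool of size s tests negative, given individual-level
# prevalence p. The log-scale form mirrors exp(log1m(p) .* PoolSize) in the
# updated Stan model and keeps a usable log-probability for downstream
# likelihood sums with very large pools.

def q_pool_naive(p, s):
    return (1.0 - p) ** s

def q_pool_stable(p, s):
    return math.exp(s * math.log1p(-p))

p, s = 0.005, 1000
assert abs(q_pool_naive(p, s) - q_pool_stable(p, s)) < 1e-12
```

Both forms agree here in double precision; the advantage of the log-scale form is that the intermediate `s * log1p(-p)` stays well-scaled even when the pool probability itself underflows.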
# PoolTestR (development version v0.2.0.9000)
- Implemented a test suite for PoolTestR. This currently covers mostly the
  input checking functions, with just a couple of tests for `PoolPrev()` and
  `HierPoolPrev()`.
- New function `CheckInputData()` allows users to test the formatting of their
  input data prior to estimating prevalence, checking (among other things) the
  class of input variables, whether input columns exist, and whether
  columns/rows have incorrect or missing values.
- New function `PrepareClusterData()` allows users to test the
  hierarchical/clustered sampling scheme prior to estimating prevalence, to
  ensure data is formatted correctly according to the assumptions underlying
  PoolTestR. This function tests whether any values in the hierarchy/sampling
  columns are missing, tests the nesting of clustered variables, and tests
  whether each location has a unique identifier.
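As a rough illustration of the nesting test described above — a hypothetical Python sketch with made-up column names, not PoolTestR's implementation:

```python
# Hypothetical sketch of a nesting check: in a valid nested design, every
# level of the inner grouping variable (e.g. "Village") must belong to
# exactly one level of the outer grouping variable (e.g. "Region").

def check_nesting(rows, outer, inner):
    """Return inner levels that appear under more than one outer level."""
    seen = {}
    for r in rows:
        seen.setdefault(r[inner], set()).add(r[outer])
    return sorted(lvl for lvl, outers in seen.items() if len(outers) > 1)

rows = [
    {"Region": "A", "Village": "V1"},
    {"Region": "A", "Village": "V2"},
    {"Region": "B", "Village": "V2"},  # V2 is not uniquely nested
]
assert check_nesting(rows, "Region", "Village") == ["V2"]
```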

# PoolTestR v0.2.0 (Release date: 2024-12-05)
Caitlin Cherryh has joined the development team and has been working on
improving readability of outputs, documentation, and testing.

This update includes an option for `PoolPrev()` to skip the calculation of
Bayesian estimates. When using `bayesian = FALSE`, only MLE and likelihood ratio
confidence intervals will be calculated, substantially speeding up this function
(perhaps 100-fold).

This update also removes one source of bias from prevalence estimates returned
for any hierarchical models. This affects the results of `HierPoolPrev()` and
`getPrevalence()` applied to models with random effects. Under the update,
prevalence estimates will typically slightly increase, though the difference
will not be notable if the sample size is large and there is little clustering.

Previous estimates of prevalence did not marginalise out the random effects when
calculating population-level prevalence, but as of this version, random effects
are marginalised out. Due to the complexity introduced by this bias correction,
we no longer support specifying nested surveys as `~(1|Layer1/Layer2)` and
recommend using the format `~(1|Layer1) + (1|Layer2)`, which should be
equivalent as long as each level in `Layer2` is unique --- i.e. the format
already required for `HierPoolPrev()`.
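A small numerical illustration (Python, not package code) of why marginalising out a logit-scale random effect changes — and, at low prevalence, typically slightly increases — the population-level estimate relative to plugging in the mean effect:

```python
import math

def invlogit(x):
    return 1.0 / (1.0 + math.exp(-x))

def marginal_prevalence(mu, sd, n=2001, half_width=8.0):
    # Deterministic grid approximation of E[invlogit(mu + u)], u ~ Normal(0, sd):
    # weight each grid point by the (unnormalised) normal density.
    total, wsum = 0.0, 0.0
    for i in range(n):
        u = sd * (-half_width + 2.0 * half_width * i / (n - 1))
        w = math.exp(-0.5 * (u / sd) ** 2)
        total += w * invlogit(mu + u)
        wsum += w
    return total / wsum

mu, sd = -2.0, 1.0                      # low prevalence, moderate clustering
plug_in = invlogit(mu)                  # ignores the random effect
marginal = marginal_prevalence(mu, sd)  # marginalises it out
assert marginal > plug_in
```

With little clustering (`sd` near 0) the two estimates coincide, matching the note above that the difference is not notable when there is little clustering.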

Due to the complexity introduced by this bias correction, the way of specifying
priors for `HierPoolPrev()` has been updated. Priors for `HierPoolPrev()` are
now placed directly on the real-scale (logit-transformed) parameters, rather
than on prevalence directly. We have also updated the default priors on
regression parameters for `PoolRegBayes()`, as we believe the previous priors
were too diffuse (`normal(0,100)`). The defaults for the centered predictors are
now `student(6,0,1.5)`.

`HierPoolPrev()` now has functionality to return estimates of intracluster
correlation coefficients (ICC) at one or more levels of clustering.

`HierPoolPrev()` and `PoolPrev()` now have custom output classes (inheriting
from tibble, the previous class for these outputs). This has allowed us
(Caitlin) to add pretty-print functions for these outputs, which are much more
human readable. Saving the output with `write.csv()` or similar will still
return a detailed, machine-readable output.

Both `PoolPrev()` and `HierPoolPrev()` have been updated to improve point
estimates. New default function values have been added for both `PoolPrev()` and
`HierPoolPrev()`. The default for the new `robust` parameter is `robust = TRUE`,
which means the point estimate of prevalence is the posterior median. In both
functions, the default value is `all.negative.pools = 'zero'`, meaning that
when all pools are negative, the point estimate and the lower bound of the
interval will be 0.
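A quick illustration (a Python sketch with a stand-in Beta posterior, not PoolTestR's actual posterior) of why the posterior median is a more robust point estimate when the posterior for prevalence is right-skewed, as it is when few pools are positive:

```python
# Stand-in right-skewed posterior for prevalence: Beta(0.5, 10) on a fine
# grid (chosen only to show the skew effect, NOT PoolTestR's posterior).

n = 20001
xs = [i / (n - 1) for i in range(1, n - 1)]       # open interval (0, 1)
dens = [x ** -0.5 * (1.0 - x) ** 9 for x in xs]   # unnormalised density
Z = sum(dens)
post_mean = sum(x * d for x, d in zip(xs, dens)) / Z

cdf, post_median = 0.0, None
for x, d in zip(xs, dens):
    cdf += d / Z
    if cdf >= 0.5:
        post_median = x
        break

# The long right tail pulls the mean upward; the median (robust = TRUE)
# is pulled less by the tail.
assert post_median < post_mean
```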

# PoolTestR v0.1.3 (Release date: 2022-07-XX)
This is a patch to fix a bug affecting `PoolPrev()`. The bug affected the
maximum likelihood estimates (MLE) and likelihood ratio confidence intervals
(LR-CIs) of prevalence when the default Jeffreys prior was being used. The bug
would usually make the MLE and LR-CIs much closer to the Bayesian estimates
than they should have been. As both sets of estimates are valid, the results
will still have been approximately correct.

This patch also includes an option, `replicate.poolscreen` (defaulting to
`FALSE`), for `PoolPrev()`. This option changes the way the likelihood ratio
confidence intervals are calculated. With `replicate.poolscreen = TRUE`,
`PoolPrev()` will more closely reproduce the results produced by Poolscreen. We
believe that our implementation of these intervals is more correct, so we
recommend that users continue to use the default (`replicate.poolscreen =
FALSE`), but this option may be helpful for those trying to compare results
across the two programs.
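For intuition, a hedged Python sketch (not PoolTestR's implementation) of a prior-free MLE and likelihood-ratio CI for `n` equal-sized pools — the quantities that, per the bug above, should not depend on the Bayesian prior:

```python
import math

# Sketch only: MLE and 95% likelihood-ratio CI for prevalence p from
# n equal-sized pools of size s, of which k tested positive.

def loglik(p, k, n, s):
    q_pool = (1.0 - p) ** s  # P(pool of size s tests negative)
    return k * math.log(1.0 - q_pool) + (n - k) * s * math.log(1.0 - p)

def mle(k, n, s):
    # Closed form for equal pool sizes: 1 - (proportion negative)^(1/s)
    return 1.0 - (1.0 - k / n) ** (1.0 / s)

def lr_ci(k, n, s, crit=3.841):  # chi-square(1) 95% critical value
    p_hat = mle(k, n, s)
    ll_hat = loglik(p_hat, k, n, s)

    def bound(outside, inside):
        # Bisect between a point outside the CI and the MLE (inside it)
        for _ in range(200):
            mid = (outside + inside) / 2
            if 2 * (ll_hat - loglik(mid, k, n, s)) > crit:
                outside = mid
            else:
                inside = mid
        return (outside + inside) / 2

    return bound(1e-9, p_hat), bound(1 - 1e-9, p_hat)

k, n, s = 5, 100, 10   # 5 positive pools out of 100 pools of 10
lo, hi = lr_ci(k, n, s)
assert lo < mle(k, n, s) < hi
```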

# PoolTestR v0.1.2 (Release date: 2021-07-XX)
We have published a paper about PoolTestR in *Environmental Modelling and Software*
now available at https://doi.org/10.1016/j.envsoft.2021.105158. If you find this
package useful, please let us know and/or cite our paper!

A couple of bug fixes:

* Corrections to the Jeffreys prior in `PoolPrev()`
* Improved numerical stability of hierarchical models -- the previous implementation was causing initialisation of MCMC to fail in some edge cases

A few improvements:

* Allow user to specify level of confidence intervals and credible intervals (default level is 0.95, for 95% intervals)
* Option for `getPrevalence()` to return the posterior median as the point estimate (instead of the posterior mean) for Bayesian models fit with `PoolRegBayes()`
* Implement `PoolRegBayes()` with a logit link function as custom family in `brms`. This allows for better post-processing for the results of `PoolRegBayes()` -- e.g. simulating from the model, leave-one-out cross-validation, posterior predictive checks. See `brms` for details
* Allow users to pass more control variables to MCMC sampling routines across `PoolRegBayes()`, `HierPoolPrev()`, and `PoolPrev()`
* Allow users to specify the scale parameter for the half-Cauchy hyper-prior on the standard deviations of the random effect terms in `HierPoolPrev()`. Also reduced the default value of this hyper-prior scale from 25 (very diffuse) to 2 (weakly informative). This is now very comparable to the equivalent default hyper-prior for `brms` models, including those fit using `PoolRegBayes()` (i.e. a half-t distribution with three degrees of freedom)

# PoolTestR v0.1.1 (Release date: 2021-02-13)

Minor patch so that the package works across more platforms (namely Solaris).


# PoolTestR v0.1.0 (Release date: 2021-02-08)

This is our first official release! Please see the GitHub site (https://github.com/AngusMcLure/PoolTestR#pooltestr) for a basic crash course on using the package. An upcoming (open-access) journal article will go into further detail. A preprint can be accessed at https://arxiv.org/abs/2012.05405. I'll post a link to the article when published.
4 changes: 3 additions & 1 deletion R/PoolPrev.R
@@ -115,7 +115,9 @@ PoolPrev <- function(data,result,poolSize,...,

useJefferysPrior <- is.null(prior)
if(bayesian){
if(!useJefferysPrior && (is.null(prior$alpha) || is.null(prior$beta) || is.null(prior$beta))){
if(!useJefferysPrior && (is.null(prior$alpha) ||
is.null(prior$beta) ||
is.null(prior$absent))){
stop("If not using the default prior (NULL), prior$alpha, prior$beta, and prior$absent must all be specified.")
}
if(!useJefferysPrior && (length(prior$alpha) != 1 ||
4 changes: 4 additions & 0 deletions configure.win
@@ -3,3 +3,7 @@
# Generated by rstantools. Do not edit by hand.

"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "rstantools::rstan_config()"


# Add configuration needed for big object files
echo "PKG_CXXFLAGS += -Wa,-mbig-obj" >> ./src/Makevars.win
12 changes: 6 additions & 6 deletions inst/stan/HierPoolPrevIndividualSD.stan
@@ -38,28 +38,28 @@ parameters {
}
model{
int k;
vector[N] ps; //1 - prevalence at the pool level (adjusted for pool size)
vector[N] qpool; //1 - prevalence at the pool level (adjusted for pool size)
vector[TotalGroups] au; //actual group effects
k = 1;
for(l in 1:L){
au[k:(k+NumGroups[l]-1)] = u[k:(k+NumGroups[l]-1)] * group_sd[l];
k = k + NumGroups[l];
}

//This code is an equivalent way of calculating ps (though less efficient)
//This code is an equivalent way of calculating qpool (though less efficient)

//vector[N] los; //log odds at the individual level at that site
//los = Intercept + Z * au;
//ps = exp(-log1p_exp(los) .* PoolSize);
//qpool = exp(-log1p_exp(los) .* PoolSize);

//Also equivalent to
//ps = exp(log1m_inv_logit(Intercept + Z * au) .* PoolSize);
ps = exp(log1m_inv_logit(Intercept + csr_matrix_times_vector(N,TotalGroups,Zw,Zv,Zu,au)) .* PoolSize);
//qpool = exp(log1m_inv_logit(Intercept + Z * au) .* PoolSize);
qpool = exp(log1m_inv_logit(Intercept + csr_matrix_times_vector(N,TotalGroups,Zw,Zv,Zu,au)) .* PoolSize);

Intercept ~ student_t(InterceptNu, InterceptMu, InterceptSigma);
group_sd ~ student_t(GroupSDNu, GroupSDMu, GroupSDSigma);
u ~ std_normal();
FlippedResult ~ bernoulli(ps);
FlippedResult ~ bernoulli(qpool);
}
generated quantities{
real total_group_sd;
12 changes: 6 additions & 6 deletions inst/stan/HierPoolPrevTotalSD.stan
@@ -43,26 +43,26 @@ transformed parameters{
}
model{
int k;
vector[N] ps; //1 - prevalence at the pool level (adjusted for pool size)
vector[N] qpool; //1 - prevalence at the pool level (adjusted for pool size)
vector[TotalGroups] au; //actual group effects
k = 1;
for(l in 1:L){
au[k:(k+NumGroups[l]-1)] = u[k:(k+NumGroups[l]-1)] * group_sd[l];
k = k + NumGroups[l];
}

//This code is an equivalent way of calculating ps (though less efficient)
//This code is an equivalent way of calculating qpool (though less efficient)

//vector[N] los; //log odds at the individual level at that site
//los = Intercept + Z * au;
//ps = exp(-log1p_exp(los) .* PoolSize);
//qpool = exp(-log1p_exp(los) .* PoolSize);

//Also equivalent to
//ps = exp(log1m_inv_logit(logit(p) + Z * au) .* PoolSize);
ps = exp(log1m_inv_logit(Intercept + csr_matrix_times_vector(N,TotalGroups,Zw,Zv,Zu,au)) .* PoolSize);
//qpool = exp(log1m_inv_logit(logit(p) + Z * au) .* PoolSize);
qpool = exp(log1m_inv_logit(Intercept + csr_matrix_times_vector(N,TotalGroups,Zw,Zv,Zu,au)) .* PoolSize);

Intercept ~ student_t(InterceptNu, InterceptMu, InterceptSigma);
total_group_sd ~ student_t(GroupSDNu, GroupSDMu, GroupSDSigma);
u ~ std_normal();
FlippedResult ~ bernoulli(ps);
FlippedResult ~ bernoulli(qpool);
}
47 changes: 32 additions & 15 deletions inst/stan/PoolPrev.stan
@@ -5,31 +5,48 @@ data {
real<lower=0> PriorAlpha;
real<lower=0> PriorBeta;
int<lower=0, upper=1> JeffreysPrior;
}
transformed data{
array[N] int<lower=0, upper=1> FlippedResult;
//vector[N] LogPoolSize2;
vector[N] PoolSizeLess2;
vector[N] PoolSizeSq;


for(n in 1:N){
// We can avoid '1 - ' operation later if we predict negative pools rather than positive ones
FlippedResult[n] = 1 - Result[n];
}

// LogPoolSize2 = log(PoolSize) * 2;
PoolSizeLess2 = PoolSize - 2;
PoolSizeSq = PoolSize ^ 2.0;


}
parameters {
real<lower=0, upper=1> p;
}
transformed parameters{
array[N] real<lower=0, upper=1> ps;
real q;
q = 1-p;
for(n in 1:N){
real PS = PoolSize[n];
ps[n] = 1-q^PS;
}
vector<lower=0, upper=1>[N] qpool;
//qpool = q ^ PoolSize; //the line below is equivalent to this
qpool = exp(log1m(p) .* PoolSize); // probability of a negative pool
}
model{
if(JeffreysPrior){
real s;
s = 0;
for(n in 1:N){
real PS = PoolSize[n];
s += PS ^ 2.0 * q ^ (PS - 2) / (1 - q ^ PS);
}
target += log(s)/2;
// real logq; // log of 1-p
vector[N] s;
s = PoolSizeSq .* (1 - p) ^ PoolSizeLess2 ./ (1 - (1 - p) ^ PoolSize);
target += log(sum(s))/2;
// Below should be equivalent to the above commented lines and more stable,
// but I have gotten undiagnosable errors so have reverted to the above
// (which seems stable enough anyway)
// logq = log1m(p);
// s = LogPoolSize2 + logq .* PoolSizeLess2 - log1m_exp(logq .* PoolSize);
// target += log_sum_exp(s)/2;
}else{
p ~ beta(PriorAlpha,PriorBeta);
}
Result ~ bernoulli(ps);
FlippedResult ~ bernoulli(qpool);
}

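The direct form of the Jeffreys-prior term above and the commented-out log-scale form should agree numerically; a quick Python cross-check of that equivalence (an illustration using the same formulas, not part of the package):

```python
import math

# Cross-check of the two Jeffreys-prior forms in PoolPrev.stan:
# direct:    s = PS^2 * (1-p)^(PS-2) / (1 - (1-p)^PS),  target += log(sum(s))/2
# log-scale: s = 2*log(PS) + logq*(PS-2) - log1m_exp(logq*PS),
#            target += log_sum_exp(s)/2

def log_target_direct(p, pool_sizes):
    q = 1.0 - p
    s = sum(ps ** 2 * q ** (ps - 2) / (1.0 - q ** ps) for ps in pool_sizes)
    return math.log(s) / 2

def log_target_logscale(p, pool_sizes):
    logq = math.log1p(-p)  # log1m(p)
    terms = [2 * math.log(ps) + logq * (ps - 2)
             - math.log1p(-math.exp(logq * ps)) for ps in pool_sizes]
    m = max(terms)  # log_sum_exp, stabilised by the max term
    return (m + math.log(sum(math.exp(t - m) for t in terms))) / 2

p, pools = 0.02, [5, 10, 100, 300]
assert abs(log_target_direct(p, pools) - log_target_logscale(p, pools)) < 1e-9
```

This only checks agreement in double precision at moderate pool sizes; it says nothing about the undiagnosed errors mentioned in the comment that led to reverting to the direct form in Stan.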
1 change: 1 addition & 0 deletions src/Makevars.win
@@ -8,3 +8,4 @@ PKG_CXXFLAGS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::Cx
PKG_LIBS = $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "RcppParallel::RcppParallelLibs()") $(shell "$(R_HOME)/bin$(R_ARCH_BIN)/Rscript" -e "StanHeaders:::LdFlags()")

CXX_STD = CXX17
PKG_CXXFLAGS += -Wa,-mbig-obj
Binary file modified src/PoolTestR.dll