Merge pull request #207 from DistanceDevelopment/Issue_194

LHMarshall · web-flow · commit 7b2bfa12604f · 2025-06-18T13:12:41.000+01:00
Closes Issue #194
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -16,7 +16,7 @@ Description: A simple way of fitting detection functions to distance sampling
     Horvitz-Thompson-like estimator) if survey area information is provided. See
     Miller et al. (2019) <doi:10.18637/jss.v089.i01> for more information on
     methods and <https://distancesampling.org/resources/vignettes.html> for example analyses.
-Version: 2.0.0.9011
+Version: 2.0.0.9012
 URL: https://github.com/DistanceDevelopment/Distance/
 BugReports: https://github.com/DistanceDevelopment/Distance/issues
 Language: en-GB
diff --git a/NAMESPACE b/NAMESPACE
@@ -49,8 +49,10 @@ importFrom(dplyr,inner_join)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,mutate_if)
+importFrom(dplyr,pull)
 importFrom(dplyr,reframe)
 importFrom(dplyr,select)
+importFrom(dplyr,summarize)
 importFrom(dplyr,summarize_all)
 importFrom(dplyr,ungroup)
 importFrom(dplyr,vars)
@@ -61,6 +63,7 @@ importFrom(mrds,ddf)
 importFrom(mrds,dht)
 importFrom(rlang,.data)
 importFrom(stats,AIC)
+importFrom(stats,aggregate)
 importFrom(stats,as.formula)
 importFrom(stats,logLik)
 importFrom(stats,median)
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,7 @@
 
 * Fixes issue with print dht2 when multipliers are a data.frame (Issue #179)
 * Fixes bug when including a uniform with no adjustment terms in the summarize_ds_models function (Issue #180)
+* dht2 can now deal with strata where there are no observations (Issue #194)
 * Users can alternatively pass a list of models to summarize_ds_models rather than passing them individually. (Issue #149)
 * Truncation distances greater than the largest cutpoint value for binned data are no longer permitted as these cause fitting issues. (Issue #175)
 * print.dht_result now displays estimates for groups as well as individuals by default when group size is present. (Issue #178)
diff --git a/R/ER_var_f.R b/R/ER_var_f.R
@@ -25,12 +25,14 @@ ER_var_f <- function(erdat, innes, binomial_var=FALSE){
 
     # sort the data if we use O2/O3 estimators
     if(any(erdat$er_est %in% c("O2", "O3"))){
-      warning("Using O2 or O3 encounter rate variance estimator, assuming that sorting on Sample.Label is meaningful")
+      warning("Using O2 or O3 encounter rate variance estimator, assuming that sorting on Sample.Label is meaningful", immediate. = TRUE, call. = FALSE)
       if(!is.numeric(erdat$Sample.Label)){
-        warning("Additionally, Sample.Label is not numeric, this may cause additional issues")
+        warning("Additionally, Sample.Label is not numeric, this may cause additional issues", immediate. = TRUE, call. = FALSE)
       }
+      # Add in a column indexing the original order
+      erdat$.orig.order <- 1:nrow(erdat)
       erdat <- erdat %>%
-        mutate(.originalorder = 1:nrow(erdat)) %>%
+        # Sort by sample label
         arrange(.data$Sample.Label)
     }
 
diff --git a/R/dht2.R b/R/dht2.R
@@ -649,7 +649,9 @@ dht2 <- function(ddf, observations=NULL, transects=NULL, geo_strat=NULL,
                                       var(.data$size, na.rm=TRUE)/
                                        sum(!is.na(.data$size)),
                                       0),
-             group_mean     = mean(.data$size, na.rm=TRUE)) %>%
+             group_mean     = if_else(.data$n_observations>0,
+                                      mean(.data$size, na.rm=TRUE),
+                                      1)) %>%
       # report n as n_observations
       mutate(n = .data$n_observations)
   # if we didn't have any areas, then set to 1 and estimate density
@@ -745,11 +747,13 @@ if(mult){
       mutate(ER = .data$n/.data$Effort)
   }
   res <- res %>%
-    mutate(ER_CV = sqrt(.data$ER_var)/.data$ER,
+    mutate(ER_CV = if_else(.data$ER == 0, 0, 
+                           sqrt(.data$ER_var)/.data$ER),
            ER_df = compute_df(.data$k, type=.data$er_est)) %>%
     # calculate stratum abundance estimate
     mutate(Abundance = (.data$Area/.data$Covered_area) * .data$Nc) %>%
-    mutate(df_CV = sqrt(.data$df_var)/.data$Abundance) %>%
+    mutate(df_CV = if_else(.data$Abundance == 0, 0,
+                           sqrt(.data$df_var)/.data$Abundance)) %>%
     mutate(group_CV = if_else(.data$group_var==0, 0,
                               sqrt(.data$group_var)/.data$group_mean))
 
@@ -1093,7 +1097,7 @@ if(mult){
 
   # warn if we only had one transect in one or more strata
   if(any(res$k == 1)){
-    warning("One or more strata have only one transect, cannot calculate empirical encounter rate variance")
+    warning("One or more strata have only one transect, cannot calculate empirical encounter rate variance", call. = FALSE)
   }
 
   # fix area == covered area for compatibility with mrds::dht
diff --git a/R/varNhat.R b/R/varNhat.R
@@ -1,6 +1,7 @@
 # Calculate the variance contribution of the detection function
 #  to abundance estimates
-#' @importFrom dplyr reframe
+#' @importFrom dplyr reframe group_by across all_of summarize pull
+#' @importFrom stats aggregate
 varNhat <- function(data, model){
 
   # format the data
@@ -22,14 +23,23 @@ varNhat <- function(data, model){
     reframe(Covered_area = sum(.data$Covered_area),
               Area         = .data$Area) %>%
     distinct()
+  
+  # Add column giving number of obs per stratum
+  n_obs <- aggregate(!is.na(data$object), 
+                     by = data[strat_vars], 
+                     FUN = sum)
+  names(n_obs)[length(n_obs)] <- "n_obs"
+  
+  # Merge into grp_dat
+  grp_dat <- left_join(grp_dat, n_obs, by = strat_vars)
 
   # join the per-stratum data onto the frame
   data$Covered_area <- NULL
   data$Area <- NULL
   data <- left_join(data, grp_dat, by=strat_vars)
 
   # function to calculate Nhat
-  dhtd <- function(par, data, model){
+  dhtd <- function(par, data, model, grp_dat){
     # set par
     model$par <- par
 
@@ -42,6 +52,10 @@ varNhat <- function(data, model){
       reframe(N = (.data$Area/.data$Covered_area) *
                      sum(.data$Nc, na.rm=TRUE)) %>%
       distinct()
+    
+    # Include an NA for strata with no obs and then convert to 0.
+    res <- left_join(grp_dat, res, by = colnames(grp_dat)[1])
+    res$N[is.na(res$N)] <- 0
 
     res$N
   }
@@ -54,7 +68,7 @@ varNhat <- function(data, model){
 
   # calculate variance
   dm <- DeltaMethod(model$par, dhtd, vcov, sqrt(.Machine$double.eps),
-                    model=model, data=data)
+                    model=model, data=data, grp_dat=grp_dat)
   attr(dm, "vardat_str") <- vardat_str
 
   # fiddle with variance data.frame
diff --git a/R/variance_contributions.R b/R/variance_contributions.R
@@ -18,8 +18,9 @@ variance_contributions <- function(res){
   # get the total
   CV_cont$Total <- sqrt(rowSums(CV_cont^2))
 
-  # make that into percentages
-  CV_cont <- (CV_cont^2/CV_cont[["Total"]]^2)*100
+  # make that into percentages (for non zero rows)
+  index <- which(CV_cont[["Total"]]!=0)
+  CV_cont[index,] <- (CV_cont[index,]^2/CV_cont[index,"Total"]^2)*100
   CV_cont[["Total"]] <- NULL
 
   # zero ER contributions if only one sample
diff --git a/tests/testthat/test_dht2.R b/tests/testthat/test_dht2.R
@@ -260,3 +260,63 @@ test_that("can accept data.frame and scalar sampling fractions", {
   minke_dht2
   
 })
+
+
+test_that("Deal with strata with no detections", {
+  
+  # Add a third Region with two samples but no observations.
+  # Adding two samples instead of just one to prevent a warning unstable ER var
+  new.row1 <- minke[1, ] %>%
+    mutate(
+      Region.Label = "East",
+      Area = 10000,
+      Sample.Label = max(minke$Sample.Label) + 1,
+      Effort = 10,
+      distance = NA,
+      object = NA
+    )
+  
+  new.row2 <- minke[1, ] %>%
+    mutate(
+      Region.Label = "East",
+      Area = 10000,
+      Sample.Label = max(minke$Sample.Label) + 2,
+      Effort = 10,
+      distance = NA,
+      object = NA
+    )
+  
+  minke <- rbind(minke, new.row1, new.row2)
+  # Fit a detection function
+  detfn <- ds(data=minke, truncation=1.5, key="hr", adjustment=NULL)
+  
+  result <- dht2(ddf = detfn,
+                 flatfile = minke,
+                 strat_formula =  ~ Region.Label,
+                 stratification = "geographical")
+  
+  expect_true(is(result, "dht_result"))
+  
+  # Check values in output
+  new.row3 <- minke[1, ] %>%
+    mutate(
+      Region.Label = "East",
+      Area = 10000,
+      Sample.Label = max(minke$Sample.Label) + 3,
+      Effort = 10,
+      distance = NA,
+      object = NA
+    )
+  
+  minke2 <- rbind(minke, new.row1, new.row2, new.row3)
+  detfn <- ds(data=minke2, truncation=1.5, key="hr", adjustment=NULL, optimizer = "R")
+  dht2.results <- dht2(ddf = detfn,
+                       flatfile = minke2,
+                       strat_formula =  ~ Region.Label,
+                       stratification = "geographical")
+  
+  expect_equal(dht2.results$Abundance[1], 0)
+  expect_equal(dht2.results$Abundance_se[4], 4834.4118, , tolerance = 0.0001)
+  expect_equal(dht2.results$df[4],15.25496, tolerance = 0.00001)
+  
+})
diff --git a/tests/testthat/test_systematic_var2.R b/tests/testthat/test_systematic_var2.R
@@ -56,8 +56,9 @@ cu <- 1/(cu[3]/(cu[1]*cu[2]))
 
 test_that("no strat",{
 
-  df <- ds(Systematic_variance_2, convert_units=cu)
-
+  # No adjustments to avoid model selection which generates a monotonicity warning. (The HN model with no adjustments is selected based on minimum AIC any way.)
+  df <- ds(Systematic_variance_2, nadj = 0, 
+           convert_units=cu, truncation = 10)
 
   # fiddle with region labels
   obs.table$Region.Label <- NULL
diff --git a/tests/testthat/test_variance.R b/tests/testthat/test_variance.R
@@ -10,8 +10,10 @@ test_that("variance 2",{
   convert.units <- Systematic_variance_2_units
   cu <- convert.units[, 3]
   cu <- 1/(cu[3]/(cu[1]*cu[2]))
-  # first fit a model
-  sysvar_df <- ds(Systematic_variance_2, adjustment="cos", convert_units=cu)
+  # first fit a model - to avoid warnings use no adjustments (HN model was selected any way based on minimum AIC)
+  sysvar_df <- ds(Systematic_variance_2, 
+                  nadj = 0, 
+                  convert_units=cu)
 
   systematic_var2 <- Systematic_variance_2
   unflat <- unflatten(systematic_var2)
@@ -30,9 +32,9 @@ test_that("variance 2",{
   unflat$obs.table$Sample.Label <- as.numeric(unflat$obs.table$Sample.Label)
   unflat$data$Sample.Label <- as.numeric(unflat$data$Sample.Label)
 
-  # fit the detection function
-  sysvar_df <- ds(unflat$data, adjustment="cos", convert_units=cu)
-
+  # fit the detection function (again avoid adjustment terms to supress warning - no adjustments is selected by AIC any way)
+  sysvar_df <- ds(unflat$data, nadj = 0, convert_units=cu)
+  
   ## no stratification
   #Nhat_nostrat_eff <- dht2(sysvar_df, transects=unflat$sample.table,
   #                             geo_strat=unflat$region.table, observations=unflat$obs.table,