signaturescience · vpnagraj · Nov 3, 2024 · Nov 3, 2024 · Nov 3, 2024
diff --git a/R/fiphde-package.R b/R/fiphde-package.R
@@ -136,5 +136,6 @@ if(getRversion() >= "2.15.1")  utils::globalVariables(c(".",
                                                         "target_name",
                                                         "target_prob",
                                                         "nnetar",
+                                                        "mean_flu_admits",
                                                         "."))
 
diff --git a/R/prep.R b/R/prep.R
@@ -9,6 +9,8 @@
 #' @param trim Named list with elements for epiyear and epiweek corresponding to the minimum epidemiological week to retain; defaults to `list(epiyear=2020, epiweek=43)`, which is the first date of report in the healthdata.gov hospitalization data; if set to `NULL` the data will not be trimmed
 #' @param remove_incomplete Logical as to whether or not to remove the last week if incomplete; defaults is `TRUE`.
 #' @param min_per_week The minimum number of flu.admits per week needed to retain that state. Default removes states with less than 1 flu admission per week over the last 30 days.
+#' @param augment Logical as to whether or not the data should be augmented with NHSN hospitalizations imputed backwards in time (see 'Details' for more); default is `FALSE`.
+#' @param augment_stop Date at which the time series imputation data should stop; yyyy-mm-dd format; only used if "augment" is `TRUE` default is `"2020-10-18"`
 #' @return A `tibble` with hospitalization data summarized to epiyear/epiweek with the following columns:
 #'
 #' - **abbreviation**: Abbreviation for the location
@@ -25,6 +27,10 @@
 #' - **hosp_mean**: Estimate of historical flu hospitalization rate for the given epidemiological week
 #' - **hosp_rank**: Rank of the given epidemiological week in terms of flu hospitalizations across season (1 being highest average activity)
 #'
+#' @details
+#'
+#' The preparation for the weekly flu hospitalization data includes an option to "augment" the input time series. The augmentation is based on an extended time series that was developed with an imputation approach. The extended time series estimates flu hospitalizations at the state-level in years before NHSN reporting became available. If the user decides to include the imputed data, then the time series is extended backwards in time from the "augment_stop" date (defaults to October 18, 2020). The prepended data augmentation is formatted to match the true NSHN reporting. For more details on the data augmentation approach, refer to the publication: <https://www.medrxiv.org/content/10.1101/2024.07.31.24311314v1>.
+#'
 #' @export
 #' @examples
 #' \dontrun{
@@ -38,12 +44,28 @@ prep_hdgov_hosp <- function(hdgov_hosp,
                             statesonly=TRUE,
                             trim=list(epiyear=2020, epiweek=43),
                             remove_incomplete=TRUE,
-                            min_per_week=1) {
+                            min_per_week=1,
+                            augment = FALSE,
+                            augment_stop = "2020-10-18") {
   # What's the last date you have data on? You'll need this to chop the data later on.
   last_date <- max(hdgov_hosp$date)
 
   # Summarize to epiyear, epiweek
   message("Summarizing to epiyear/epiweek")
+
+  ## if the augment option is set then prepend with the imputed data
+  if(augment) {
+    hdgov_hosp <-
+      nhsn_imputed %>%
+      dplyr::select(state = location, date, flu.admits = mean_flu_admits) %>%
+      ## NOTE: augmentation includes data that extends beyond the date we have used to trim HHS data previously
+      ## this line (in combination with filter in the next line) ...
+      ## ... will ensure that the data pulled from healthdata.gov is used for all dates after augment_stop date
+      dplyr::filter(date <= as.Date(augment_stop)) %>%
+      dplyr::bind_rows(dplyr::filter(hdgov_hosp, date > as.Date(augment_stop)),.) %>%
+      dplyr::arrange(state, date)
+  }
+
   hweek <- hdgov_hosp %>%
     dplyr::rename(location=state) %>%
     dplyr::mutate(epiyear=lubridate::epiyear(date), epiweek=lubridate::epiweek(date), .after=date) %>%

diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/data-raw/control_imputed_summaries.rds b/data-raw/control_imputed_summaries.rds
diff --git a/data-raw/generate_sysdata.R b/data-raw/generate_sysdata.R
@@ -75,18 +75,23 @@ hosp <- fiphde:::hospitalizations(surveillance_area = "flusurv", region="all")
 hosp
 hospstats <-
   hosp %>%
-  filter(age_label == "Overall") %>%
+  dplyr::filter(age_label=="Overall") %>%
+  dplyr::filter(race_label=="Overall") %>%
+  dplyr::filter(sexid == 0) %>%
+  dplyr::filter(name == "FluSurv-NET") %>%
+  ## get the overall for influenza a/b
+  dplyr::filter(flutype == 0)  %>%
   filter(year %>% between(2010, 2021)) %>%
   mutate("counts" = as.numeric(weeklyrate)*3300) %>%
-  dplyr::select(c("wk_start", "counts", "year_wk_num")) %>%
-  rename(epiweek = year_wk_num) %>%
+  dplyr::select(c("weekstart", "counts", "weeknumber")) %>%
+  rename(epiweek = weeknumber) %>%
   group_by(epiweek) %>%
-  summarise(min = min(counts),
+  summarise(min = min(counts, na.rm = TRUE),
             lowhinge = IQR(counts, 0.25),
-            med = median(counts),
+            med = median(counts, na.rm=TRUE),
             uprhinge = IQR(counts, 0.75),
-            max = max(counts),
-            mean = mean(counts))
+            max = max(counts, na.rm=TRUE),
+            mean = mean(counts, na.rm = TRUE))
 hospstats
 
 # Get weighted and unweighted ILI (2010-2019), summarize by epiweek
@@ -107,8 +112,13 @@ ilisum
 hospsum <-
   hosp %>%
   filter(year %>% between(2010, 2019)) %>%
-  filter(age_label=="Overall") %>%
-  rename(epiweek=year_wk_num) %>%
+  dplyr::filter(age_label=="Overall") %>%
+  dplyr::filter(race_label=="Overall") %>%
+  dplyr::filter(sexid == 0) %>%
+  dplyr::filter(name == "FluSurv-NET") %>%
+  ## get the overall for influenza a/b
+  dplyr::filter(flutype == 0)  %>%
+  rename(epiweek=weeknumber) %>%
   group_by(epiweek) %>%
   summarize(hosp_mean=mean(weeklyrate)) %>%
   mutate(hosp_rank=rank(hosp_mean) %>% as.integer())
@@ -183,7 +193,7 @@ vd$hosp_fitfor <- ts_fit_forecast(vd$prepped_hosp_tsibble,
                                covariates=TRUE)
 
 # Format for submission
-vd$formatted_list <- format_for_submission(vd$hosp_fitfor$tsfor)
+vd$formatted_list <- format_for_submission(vd$hosp_fitfor$tsfor, method = "ts", format = "legacy")
 
 # CREG ILI data - stuff in vd$ is created here and saved
 # # Original: no time limit and nowcast
@@ -289,9 +299,13 @@ if (!file.exists(here::here("data-raw/ilinearby.csv"))) {
   ilinearby <- read_csv(here::here("data-raw/ilinearby.csv"), col_types="cciid")
 }
 
+## read in imputed data
+nhsn_imputed <- readRDS(here::here("data-raw/control_imputed_summaries.rds"))
+
 # Write package data ------------------------------------------------------
 
 usethis::use_data(locations,
+                  nhsn_imputed,
                   legacy_rate_change,
                   hubverse_rate_change,
                   q,

diff --git a/inst/testdata/testdata-create.R b/inst/testdata/testdata-create.R
@@ -35,10 +35,10 @@ hosp_fitfor <- ts_fit_forecast(prepped_tsibble,
                                covariates=TRUE)
 hosp_fitfor
 
-prepped_forecast_ts <- format_for_submission(hosp_fitfor$tsfor, method = "ts")
+prepped_forecast_ts <- format_for_submission(hosp_fitfor$tsfor, method = "ts", format = "legacy")
 prepped_forecast_ts
 
-prepped_forecast_ts_cat <- forecast_categorical(prepped_forecast_ts$ensemble, prepped_hosp)
+prepped_forecast_ts_cat <- forecast_categorical(prepped_forecast_ts$ensemble, prepped_hosp, format = "legacy", method = "interpolation")
 prepped_forecast_ts_cat
 
 forcplot <- plot_forecast(prepped_tsibble, prepped_forecast_ts$ensemble)

diff --git a/inst/testdata/testdata.rd b/inst/testdata/testdata.rd
diff --git a/man/prep_hdgov_hosp.Rd b/man/prep_hdgov_hosp.Rd