signaturescience · vpnagraj · Dec 10, 2024 · Dec 5, 2024 · Dec 5, 2024 · Dec 5, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -10,17 +10,20 @@ export(get_cdc_clin)
 export(get_cdc_hosp)
 export(get_cdc_ili)
 export(get_hdgov_hosp)
+export(get_nhsn_weekly)
 export(get_nowcast_ili)
 export(glm_wrap)
 export(is_monday)
 export(make_tsibble)
 export(mmwr_week_to_date)
 export(mnz)
 export(mnz_replace)
+export(ns_impute)
 export(plot_forecast)
 export(plot_forecast_categorical)
 export(pois_forc)
 export(prep_hdgov_hosp)
+export(prep_nhsn_weekly)
 export(replace_ili_nowcast)
 export(smoothie)
 export(this_monday)

diff --git a/R/fiphde-package.R b/R/fiphde-package.R
@@ -137,5 +137,18 @@ if(getRversion() >= "2.15.1")  utils::globalVariables(c(".",
                                                         "target_prob",
                                                         "nnetar",
                                                         "mean_flu_admits",
+                                                        "Geographic aggregation",
+                                                        "Total Influenza Admissions",
+                                                        "max_reporting",
+                                                        "flu.admits.cov.perc",
+                                                        "hosp_mean",
+                                                        "hosp_rank",
+                                                        "ili_mean",
+                                                        "ili_rank",
+                                                        "Week Ending Date",
+                                                        "Percent Hospitals Reporting Influenza Admissions",
+                                                        "Percent Hospitals Reporting  Influenza Admissions",
+                                                        "Number Hospitals Reporting Influenza Admissions",
+                                                        "Number Hospitals Reporting  Influenza Admissions",
                                                         "."))
 
diff --git a/R/prep.R b/R/prep.R
@@ -173,3 +173,112 @@ make_tsibble <- function(df, epiyear, epiweek, key=location) {
     tsibble::as_tsibble(index=yweek, key={{key}})
   return(out)
 }
+
+#' @title Prepare NHSN weekly data
+#'
+#' @description This function prepares data retrieved from the weekly aggregated NHSN hospital respiratory data API. The data must be first retrieved using [prep_nhsn_weekly]. Once pulled from the API, this function will conditionally adjust partial reporting and add extended time series data (see 'Details' for more information). The preparation also includes joining to internal data prepared to estimate the historical severity of each epiweek.
+#'
+#' @param dat Weekly hospital utilization data from [prep_nhsn_weekly]
+#' @param adjust_partial Logical as to whether or not the partial reporting should be adjusted (see 'Details' for more); default is `TRUE`.
+#' @param trim Named list with elements for epiyear and epiweek corresponding to the minimum epidemiological week to retain; default is set to `NULL` the data will not be trimmed; to override the default use a named list (e.g., `list(epiyear=2020, epiweek=43)`)
+#' @param statesonly Logical as to whether or not the data should be limited to states and DC (i.e., no other territories included); default is `TRUE`.
+#' @param augment Logical as to whether or not the data should be augmented with NHSN hospitalizations imputed backwards in time (see 'Details' for more); default is `FALSE`.
+#' @param augment_stop Date at which the time series imputation data should stop; yyyy-mm-dd format; only used if "augment" is `TRUE` default is `"2020-10-18"`
+#'
+#' @details
+#'
+#' The weekly aggregated data from NHSN includes locations that may have incomplete coverage of hospitals reporting (see <https://data.cdc.gov/Public-Health-Surveillance/Weekly-Hospital-Respiratory-Data-HRD-Metrics-by-Ju/mpgq-jmmr/about_data> for more information). The preparation in this function includes an optional step triggered by the "adjust_partial" argument to find the maximum coverage at any time point for each location, then adjusts the reported counts by a factor of X / Y_t, where X is the maximum coverage and Y_t is the coverage at time point t. If the coverage for the given week is near or equal to the maximum observed coverage, then the counts will have little to no effect on the counts. Note that this should be used with caution, as it is possible that some locations may have non-uniform reporting behaviors, especially during non-mandatory NHSN reporting windows. In other words, the counts may be adjusted using reported values from healthcare facilities that may be of a different size, serve different communities, or otherwise have different characteristics than the facilities that did not report.
+#'
+#' The preparation for the weekly flu hospitalization data includes an option to "augment" the input time series. The augmentation is based on an extended time series that was developed with an imputation approach. The extended time series estimates flu hospitalizations at the state-level in years before NHSN reporting became available. If the user decides to include the imputed data, then the time series is extended backwards in time from the "augment_stop" date (defaults to October 18, 2020). The prepended data augmentation is formatted to match the true NSHN reporting. For more details on the data augmentation approach, refer to the publication: <https://www.medrxiv.org/content/10.1101/2024.07.31.24311314v1>.
+#'
+#' @return A `tibble` with hospitalization data summarized to epiyear/epiweek with the following columns:
+#'
+#' - **abbreviation**: Abbreviation for the location
+#' - **location**: FIPS code for the location
+#' - **week_start**: Date of beginning (Sunday) of the given epidemiological week
+#' - **monday**: Date of Monday of the given epidemiological week
+#' - **week_end**: Date of end (Saturday) of the given epidemiological week
+#' - **epiyear**: Year of reporting (in epidemiological week calendar)
+#' - **epiweek**: Week of reporting (in epidemiological week calendar)
+#' - **flu.admits**: Count of flu cases among admitted patients on previous day
+#' - **flu.admits.cov**: Coverage (number of hospitals reporting) for incident flu cases
+#' - **ili_mean**: Estimate of historical ILI activity for the given epidemiological week
+#' - **ili_rank**: Rank of the given epidemiological week in terms of ILI activity across season (1 being highest average activity)
+#' - **hosp_mean**: Estimate of historical flu hospitalization rate for the given epidemiological week
+#' - **hosp_rank**: Rank of the given epidemiological week in terms of flu hospitalizations across season (1 being highest average activity)
+#'
+#' @export
+prep_nhsn_weekly <- function(dat,
+                             adjust_partial = TRUE,
+                             trim = NULL,
+                             statesonly = TRUE,
+                             augment = FALSE,
+                             augment_stop = "2020-10-18") {
+
+  ## fix "USA" abbreviation to be consistent with fiphde:::locations "US"
+  dat <-
+    dat %>%
+    dplyr::mutate(abbreviation = ifelse(abbreviation == "USA", "US", abbreviation))
+
+  if(adjust_partial) {
+    dat <-
+      dat %>%
+      dplyr::group_by(abbreviation) %>%
+      ## find the max reporting percentage and adjust up to that
+      dplyr::mutate(max_reporting = max(flu.admits.cov.perc, na.rm = TRUE)) %>%
+      dplyr::ungroup() %>%
+      ## TODO: double check this logic
+      ## performs the adjustment
+      dplyr::mutate(flu.admits = round((max_reporting/flu.admits.cov.perc)*flu.admits))
+  } else {
+    dat <-
+      dat %>%
+      dplyr::mutate(flu.admits = round(flu.admits))
+  }
+
+  ## if the augment option is set then prepend with the imputed data
+  if(augment) {
+    dat <-
+      ## start with internal imputation object
+      nhsn_imputed %>%
+      dplyr::select(abbreviation = location, week_start = date, flu.admits = mean_flu_admits) %>%
+      ## NOTE: augmentation includes data that extends beyond the date we have used to trim HHS data previously
+      ## this line (in combination with filter in the next line) ...
+      ## ... will ensure that the data pulled from the NHSN API is used for all dates after augment_stop date
+      dplyr::mutate(week_end = week_start + 6) %>%
+      dplyr::select(-week_start) %>%
+      dplyr::filter(week_end <= as.Date(augment_stop)) %>%
+      dplyr::bind_rows(dplyr::filter(dat, week_end > as.Date(augment_stop)),.) %>%
+      dplyr::arrange(abbreviation, week_end)
+  }
+
+
+  dat <-
+    dat %>%
+    dplyr::select(abbreviation, flu.admits, week_end, flu.admits.cov) %>%
+    dplyr::left_join(locations) %>%
+    dplyr::mutate(epiyear=lubridate::epiyear(week_end), epiweek=lubridate::epiweek(week_end)) %>%
+    dplyr::mutate(week_start = mmwr_week_to_date(epiyear, epiweek, 1)) %>%
+    dplyr::mutate(monday = mmwr_week_to_date(epiyear, epiweek, 2)) %>%
+    dplyr::left_join(historical_severity, by="epiweek")
+
+  # Trim to US+states+DC only
+  if(statesonly) {
+    message("Filtering to US+DC+States only")
+    dat <-
+      dat %>%
+      dplyr::filter(location %in% c("US", stringr::str_pad(1:56, width=2, pad="0")))
+  }
+
+  # Trim to desired start
+  if (!is.null(trim) && is.list(trim)) {
+    message(sprintf("Trimming data to start at %s", MMWRweek::MMWRweek2Date(trim$epiyear, trim$epiweek)))
+    dat <-
+      dat %>%
+      dplyr::filter(week_start >= MMWRweek::MMWRweek2Date(trim$epiyear, trim$epiweek))
+  }
+
+  dat %>%
+    dplyr::select(abbreviation, location, week_start, monday, week_end, epiyear, epiweek, flu.admits, flu.admits.cov, ili_mean, ili_rank, hosp_mean, hosp_rank)
+
+}
diff --git a/R/retrieve.R b/R/retrieve.R
@@ -909,3 +909,36 @@ who_nrevss <- function(region = c("national", "hhs", "census", "state"), years =
 
 }
 
+#' Retrieve weekly NHSN flu hospitalization data
+#'
+#' This function retrieves wweekly aggregated NHSN hospital respiratory data API. The function was written to use the default API endpoint (see description of "endpoint" argument and link in references). Note that this endpoint includes data flagged as "preliminary". All reported weekly aggregates include the number of facilities reporting. In the weeks between April 28, 2024 and November 02, 2024 the NHSN flu hospitalization signal was not required to be reported.
+#'
+#' @param endpoint URL to data.cdc.gov endpoint; default is `"https://data.cdc.gov/api/views/mpgq-jmmr/rows.csv"`
+#'
+#' @return A `tibble` with the following columns:
+#'
+#' - **abbreviation**: Abbreviation of the state or US aggregate
+#' - **week_end**: End date for the epiweek/epiyear being reproted
+#' - **flu.admits**: Count of incident flu cases among hospitalized patients
+#' - **flu.admits.cov**: Coverage (number of hospitals reporting) for incident flu cases
+#' - **flu.admits.cov.perc**: Coverage (percentage of hospitals reporting) for incident flu cases
+#'
+#' @export
+#'
+#' @references <https://data.cdc.gov/Public-Health-Surveillance/Weekly-Hospital-Respiratory-Data-HRD-Metrics-by-Ju/mpgq-jmmr/about_data>
+#'
+#'
+get_nhsn_weekly <- function(endpoint = "https://data.cdc.gov/api/views/mpgq-jmmr/rows.csv") {
+
+  dat <-
+    ## read from endpoint
+    readr::read_csv(endpoint) %>%
+    ## NOTE: the mpgq-jmmr has issues with spaces in the names for two columns
+    dplyr::rename(`Percent Hospitals Reporting Influenza Admissions` = `Percent Hospitals Reporting  Influenza Admissions`) %>%
+    dplyr::rename(`Number Hospitals Reporting Influenza Admissions` = `Number Hospitals Reporting  Influenza Admissions`) %>%
+    ## downselect to just a few columns
+    dplyr::select(abbreviation = `Geographic aggregation`, week_end = `Week Ending Date`, flu.admits = `Total Influenza Admissions`, flu.admits.cov = `Number Hospitals Reporting Influenza Admissions`, flu.admits.cov.perc = `Percent Hospitals Reporting Influenza Admissions`)
+
+  dat
+
+}
diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/R/utils.R b/R/utils.R
@@ -639,3 +639,141 @@ round_preserve <- function(x, digits = 0) {
   y / up
 }
 
+#' @title Non-seasonal flu hospitalization imputation
+#'
+#' @description
+#'
+#' This unexported helper function is used to create a "non-seasonal", location-specific imputation estimate for weekly NHSN flu hospitalization counts. The imputation approach was motivated by the change in reporting requirements for the NHSN hospital respiratory disease metrics, which became option from April 2024 to November 2024. This function includes four different approaches (see 'Details' for more) to adjusting and/or filling the gap in reporting state-level flu hospitalizations.
+#'
+#' @param dat A `tibble` with hospitalization data prepared either by [prep_hdgov_hosp] or [prep_nhsn_weekly]
+#' @param location FIPS code for location to impute
+#' @param method Imputation method to use; must be one of `"val"`, `"diff"`, `"median"`, or `"partial"` (see 'Details' for more); default is `"val"`.
+#' @param begin_date Start date for imputation in YYYY-MM-DD format; default is `"2024-04-28"`
+#' @param end_date End date for imputation in YYYY-MM-DD; default is `"2024-11-02"`
+#'
+#' @details
+#' There are four possible methods for imputing non-seasonal weeks implemented in this function:
+#'
+#' - "val": Random sampling from a vector of values including all flu hospitalizations reported weeks between June-October 2022 and June-October 2023 for the given location; first and last values are defined as median of the random sample and the closest un-imputed value (i.e., the week before imputation begins and the week after imputation ends)
+#' - "diff": Random sampling from a vector of differences in flu hospitalizations reported weeks between June-October 2022 and June-October 2023 for the given location
+#' - "median": Median of 2022 and 2023 values reported for the given epiweek
+#' - "partial": Uses the `adjust_partial=TRUE` flag for the [prep_nhsn_weekly] for weeks in the date range specified
+#'
+#' @return A `tibble` with the same structure as the input for the "dat" argument, but with weeks between "begin_date" and "end_date" imputed.
+#'
+#' @export
+#'
+#'
+ns_impute <- function(dat, location, method = "val", begin_date = "2024-04-28", end_date = "2024-11-02") {
+
+  ## enforce date-ness for date math below
+  begin_date <- as.Date(begin_date)
+  end_date <- as.Date(end_date)
+
+  ## correct for the beginning date being the *week start* (i.e., sunday)
+  ## all of the dates will be oriented towards *week end* (i.e., saturday)
+  true_begin <- begin_date + 6
+
+  ## get a sequence of the beginning and end weeks for imputation
+  imputed_weeks <- seq(true_begin, end_date, by = 7)
+
+  ## restrict data to the location of interest
+  tmp_dat <-
+    dat %>%
+    dplyr::filter(.data$location == .env$location) %>%
+    dplyr::filter(!dplyr::between(.data$week_end, true_begin, end_date))
+
+  if(method %in% c("diff","val")) {
+    ## pull non-seasonal values for 2022/2023
+    ## NOTE: for now this is hardcoded as anything between start of June and start of October)
+    nonseasonal_vals <-
+      tmp_dat %>%
+      dplyr::filter(dplyr::between(.data$week_start, as.Date("2022-06-01"), as.Date("2022-10-01")) | dplyr::between(.data$week_start, as.Date("2023-06-01"), as.Date("2023-10-01"))) %>%
+      dplyr::pull("flu.admits")
+
+    ## use observed non-seasonal values to find observed point-to-point differences
+    nonseasonal_diffs <- diff(nonseasonal_vals)
+
+    ## get the last value to use with the imputed differences and smoothing below
+    last_left_val <-
+      tmp_dat %>%
+      dplyr::filter(.data$week_end == as.Date(begin_date - 1)) %>%
+      dplyr::pull(flu.admits)
+
+    ## get the first value of restarted reporting to use with smoothing below
+    first_right_val <-
+      tmp_dat %>%
+      dplyr::filter(.data$week_end == as.Date(end_date + 7)) %>%
+      dplyr::pull(flu.admits)
+
+    if(method == "diff") {
+      ## randomly sample from non-seasonal differences for the length of the sequence of weeks to impute
+      imputed_diffs <- sample(nonseasonal_diffs, length(imputed_weeks), replace = TRUE)
+
+      ## get the cumulative sum of the differences added to the last value
+      ## the -1 index removes the last value so we dont repeat that in the time series when we stitch together
+      imputed_ts <- cumsum(c(last_left_val,imputed_diffs))[-1]
+    } else if (method == "val") {
+      imputed_ts <- sample(nonseasonal_vals, length(imputed_weeks), replace = TRUE)
+
+      ## smooth left edge by taking mean of non seasonal impute and the "last left value" ...
+      ## ... i.e., the last reported week before non xseasonal impute begins
+      imputed_ts[1] <- stats::median(c(imputed_ts[1],last_left_val))
+      ## smooth right edge by taking mean of non seasonal impute and the "last right value" ...
+      ## ... i.e., the first reported week after non seasonal impute ends
+      imputed_ts[length(imputed_ts)] <- stats::median(c(imputed_ts[length(imputed_ts)],first_right_val))
+    }
+  } else if (method == "median") {
+
+    ## get the epiweeks for the week end dates to impute
+    ews <- lubridate::epiweek(imputed_weeks)
+
+    ## use a median imputation approach for equivalent weeks in 2022 and 2023
+    imputed_ts <-
+      tmp_dat %>%
+      ## make sure we dont use data prior to NHSN flu hosps being required fields
+      dplyr::filter(week_end >= as.Date("2022-04-01")) %>%
+      dplyr::filter(epiweek %in% ews) %>%
+      dplyr::group_by(epiweek) %>%
+      ## just take the median at each epiweek
+      dplyr::summarise(flu.admits = stats::median(flu.admits, na.rm = TRUE)) %>%
+      dplyr::pull(flu.admits)
+  } else if (method == "partial") {
+
+    ## get partially reported data from NHSN weekly aggregates
+    ## prep to format as imputed ts
+    partial_dat <-
+      get_nhsn_weekly() %>%
+      prep_nhsn_weekly(adjust_partial = TRUE) %>%
+      ## the data has other weeks in it so we need to filter for the date range of interest
+      dplyr::filter(dplyr::between(week_end, begin_date,end_date)) %>%
+      dplyr::filter(.data$location == .env$location) %>%
+      dplyr::select(abbreviation, flu.admits, week_end)
+
+    imputed_ts <-
+      partial_dat %>%
+      dplyr::pull(flu.admits)
+
+  }
+
+
+  ## truncate at zero so there are no negative counts
+  imputed_ts <- ifelse(imputed_ts < 0, 0, imputed_ts)
+
+  res <-
+    dplyr::tibble(
+      location = .env$location,
+      abbreviation = unique(tmp_dat$abbreviation),
+      flu.admits = imputed_ts,
+      week_end = imputed_weeks
+    ) %>%
+    dplyr::mutate(week_start = week_end - 6,
+                  epiyear = lubridate::epiyear(week_end),
+                  epiweek = lubridate::epiweek(week_end),
+                  monday = week_start + 1) %>%
+    dplyr::bind_rows(., tmp_dat) %>%
+    dplyr::arrange(week_end)
+
+  return(res)
+}
+