insightsengineering · shajoezhu · Dec 14, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 9, 2025
@@ -46,6 +46,7 @@ setClassUnion("SubsetDef", c("expression", "logical", "integer", "numeric"))
 
 setClassUnion("integerOrNULL", c("NULL", "integer"))
 setClassUnion("characterOrNULL", c("NULL", "character"))
+setClassUnion("characterOrList", c("list", "character"))
 
 ## should XXX [splits, s_values, sval_labels, subset(?)] be a data.frame?
 setClass("TreePos", representation(
@@ -60,10 +61,29 @@ validity = function(object) {
 }
 )
 
+setOldClass(c("FormatList", "list"))
+
+FormatList <- function(..., .list = list(...)) {
+  if (!is.list(.list)) {
+    .list <- list(.list)
+  }
+  valid <- vapply(.list, is, class2 = "FormatSpec", TRUE)
+  if (!are(.list, "FormatSpec")) {
+    stop(
+      "Attempted to construct FormatList with elements that are not ",
+      "FormatSpec compatible. This should not happen, please contact ",
+      "the maintainers."
+    )
+  }
+
+  class(.list) <- c("FormatList", "list")
+  .list
+}
+
 setClassUnion("functionOrNULL", c("NULL", "function"))
 setClassUnion("listOrNULL", c("NULL", "list"))
 ## TODO (?) make "list" more specific, e.g FormatList, or FunctionList?
-setClassUnion("FormatSpec", c("NULL", "character", "function", "list"))
+setClassUnion("FormatSpec", c("NULL", "character", "function", "list", "FormatList"))
 setClassUnion("ExprOrNULL", c("NULL", "expression"))
 
 setClass("ValueWrapper", representation(
@@ -133,7 +153,7 @@ setClass("Split",
     name = "character",
     split_label = "character",
     split_format = "FormatSpec",
-    split_na_str = "character",
+    split_na_str = "characterOrList",
     split_label_position = "character",
     ## NB this is the function which is applied to
     ## get the content rows for the CHILDREN of this
@@ -633,7 +653,9 @@ setClass("VAnalyzeSplit",
   representation(
     default_rowlabel = "character",
     include_NAs = "logical",
-    var_label_position = "character"
+    var_label_position = "character",
+    row_formats_var = "characterOrNULL",
+    row_na_strs_var = "characterOrNULL"
   )
 )
 
@@ -672,7 +694,9 @@ AnalyzeVarSplit <- function(var,
                             indent_mod = 0L,
                             label_pos = "default",
                             cvar = "",
-                            section_div = NA_character_) {
+                            section_div = NA_character_,
+                            formats_var = NULL,
+                            na_strs_var = NULL) {
   check_ok_label(split_label)
   label_pos <- match.arg(label_pos, c("default", label_pos_values))
   if (!any(nzchar(defrowlab))) {
@@ -701,7 +725,9 @@ AnalyzeVarSplit <- function(var,
     page_title_prefix = NA_character_,
     child_section_div = section_div,
     child_show_colcounts = FALSE,
-    child_colcount_format = NA_character_
+    child_colcount_format = NA_character_,
+    row_formats_var = formats_var,
+    row_na_strs_var = na_strs_var
   ) ## no content_extra_args
 }
 
@@ -823,7 +849,9 @@ AnalyzeMultiVars <- function(var,
                              child_labels = c("default", "topleft", "visible", "hidden"),
                              child_names = var,
                              cvar = "",
-                             section_div = NA_character_) {
+                             section_div = NA_character_,
+                             formats_var = NULL,
+                             na_strs_var = NULL) {
   ## NB we used to resolve to strict TRUE/FALSE for label visibillity
   ## in this function but that was too greedy for repeated
   ## analyze calls, so that now occurs in the tabulation machinery
@@ -842,26 +870,59 @@ AnalyzeMultiVars <- function(var,
     ##        split_format = .repoutlst(split_format, nv)
     inclNAs <- .repoutlst(inclNAs, nv)
     section_div_if_multivar <- if (length(var) > 1) NA_character_ else section_div
-    pld <- mapply(AnalyzeVarSplit,
-      var = var,
-      split_name = child_names,
-      split_label = split_label,
-      afun = afun,
-      defrowlab = defrowlab,
-      cfun = cfun,
-      cformat = cformat,
-      ##                     split_format = split_format,
-      inclNAs = inclNAs,
-      MoreArgs = list(
-        extra_args = extra_args,
-        indent_mod = indent_mod,
-        label_pos = show_kidlabs,
-        split_format = split_format,
-        split_na_str = split_na_str,
-        section_div = section_div_if_multivar
-      ), ## rvis),
-      SIMPLIFY = FALSE
+
+    moreargs <- list(
+      extra_args = extra_args,
+      indent_mod = indent_mod,
+      label_pos = show_kidlabs,
+      section_div = section_div_if_multivar,
+      formats_var = formats_var,
+      na_strs_var = na_strs_var
     )
+    mv_list_case <- is.list(split_format) &&
+      all(var %in% names(split_format)) &&
+      all(vapply(split_format, is, class2 = "FormatList", TRUE))
+    if (mv_list_case) { # diff format list for each var
+      stopifnot(all(var %in% names(split_na_str)))
+      ## split_value does *not* go  in more args, not constant across vars
+      pld <- mapply(
+        AnalyzeVarSplit,
+        var = var,
+        split_name = child_names,
+        split_label = split_label,
+        afun = afun,
+        defrowlab = defrowlab,
+        cfun = cfun,
+        cformat = cformat,
+        ## in case they're in the wrong order for some insane reason
+        split_format = split_format[var],
+        split_na_str = split_na_str[var],
+        inclNAs = inclNAs,
+        MoreArgs = moreargs, ## rvis),
+        SIMPLIFY = FALSE
+      )
+    } else { # not diff lists for each var
+      ## split format goes in more args because its constant across vars
+      pld <- mapply(
+        AnalyzeVarSplit,
+        var = var,
+        split_name = child_names,
+        split_label = split_label,
+        afun = afun,
+        defrowlab = defrowlab,
+        cfun = cfun,
+        cformat = cformat,
+        inclNAs = inclNAs,
+        MoreArgs = c(
+          moreargs,
+          list(
+            split_format = split_format,
+            split_na_str = split_na_str
+          )
+        ), ## rvis),
+        SIMPLIFY = FALSE
+      )
+    }
   } else {
     ## we're combining existing splits here
     pld <- unlist(lapply(.payload, .uncompound))

@@ -98,6 +98,14 @@ gen_args <- function(df, alt_counts_df, spl, pos, tt, tr, verbose, colwidths, ob
 #'   functions. See [formatters::list_valid_format_labels()] for a list of all available format strings.
 #' @param format_na_str (`string`)\cr string which should be displayed when formatted if this cell's value(s)
 #'   are all `NA`.
+#' @param formats_var (`string` or `NULL`)\cr `NULL` (the default) or the name of the list column containing named
+#' lists of default formats to use. These will be applied with the same precedence as the `format` argument; i.e.,
+#' they will not override formats (other than `"default"`) set within the afun.
+#' Cannot be used simultaneously with `format`.
+#' @param na_strs_var (`string` or `NULL`)\cr `NULL` (the default) or the name of the list column containing named
+#' lists of default NA strings to use. These will be applied with the same precedence as the `format` argument; i.e.,
+#' they will not override formats (other than `"default"`) set within the afun.
+#' Cannot be used simultaneously with `format`. Cannot be used if `formats_var` is `NULL`.
 #' @param indent_mod (`numeric`)\cr modifier for the default indent position for the structure created by this
 #'   function (subtable, content table, or row) *and all of that structure's children*. Defaults to 0, which
 #'   corresponds to the unmodified default behavior.
@@ -157,7 +165,7 @@ lyt_args <- function(lyt, var, vars, label, labels_var, varlabels, varnames, spl
                      var_labels, cvar,
                      table_names, topleft, align, page_by, page_prefix,
                      format_na_str, section_div, na_str, show_colcounts,
-                     colcount_format, parent_name) {
+                     colcount_format, parent_name, formats_var, na_strs_var) {
   NULL
 }
 

@@ -1067,12 +1067,81 @@ NULL
 #'   divider will be overridden by a split-level section divider when
 #'   both apply to the same position in the rendered output.
 #'
-#' @inherit split_cols_by return
-#'
-#' @details
-#' When non-`NULL`, `format` is used to specify formats for all generated rows, and can be a character vector, a
-#' function, or a list of functions. It will be repped out to the number of rows once this is calculated during the
-#' tabulation process, but will be overridden by formats specified within `rcell` calls in `afun`.
+#' @details When `length(vars) > 1` and when two calls to `analyze`
+#'     are done in sequence (the second with the default `nested =
+#'     TRUE`), the analyses will be combined into a multi-variable
+#'     analysis that will be reflected in the row structure of the
+#'     resulting table. In these cases, the default is to show the
+#'     label describing the variable analyzed for each of the
+#'     resulting subtables, while that is hidden by default in
+#'     one-variable cases.
+#'
+#' # Specifying Default Formatting Behavior
+#'
+#' *Default* formatting behavior for rows generated by `afun` can be
+#' specified by one of `format` or `formats_var`. In both cases, these
+#' default formatting instructions *will not* supersede formatting
+#' specified from within `afun` at either the `rcell` or `in_rows`
+#' call levels; They will only apply to rows/cells whose formatting as
+#' returned by `afun` is either `NULL` or `"default"`.  When
+#' non-`NULL`, `format` is used to specify formats for all generated
+#' rows, and can be a character vector, a function, or a list of
+#' functions. It will be repped out to the number of rows once this is
+#' calculated during the tabulation process, but will be overridden by
+#' formats specified within `rcell` calls in `afun`.
+#'
+#' `format` can accept a format label string (see
+#' [formatters::list_valid_format_labels()]), a formatting function, an
+#' unnamed list, or a named list.
+#'
+#' When `format` is an unnamed list - or a named list where not all
+#' values of `vars` appear in the names - its elements will be repped
+#' out to the number of rows generated by `afun` (separately) within
+#' each row facet `afun` is applied within. **This includes recycling
+#' behavior, even in the case where the number of rows is not cleanly
+#' divisible by the number of specified formats**. This behavior is
+#' retained largely for legacy reasons and switching to the new
+#' named-list behavior is advised where applicable.
+#'
+#' When `format` is a named list whose names contain all values in
+#' `vars`, the elements of `format` are taken to be specific to the
+#' analysis of the corresponding variable; this allows us to specify a
+#' multi-variable analysis where e.g., the different variables are
+#' analyzed by the same `afun` but have different levels of
+#' measurement precision (and thus different formatting needs). In
+#' this case the var-specific formatting can be a single format (label
+#' string or function) or can be a named list whose names will be
+#' matched up to those of the rows generated by applying `afun` in
+#' each row facet. Matching of formats to rows is performed the same
+#' as in the `formats_var` case and is described below.
+#'
+#' When `formats_var` is non-`NULL`, it specifies the name of a list
+#' column containing formatting instructions for one or more rows
+#' `afun` will generate when applied within a row facet. This can be
+#' used when the analysis results for a single variable (e.g., `value`
+#' or `AVAL` in long-form data) should be formatted differently within
+#' different row facets (e.g., when faceting on `statistic` or
+#' `PARAMCD`). The value of `df[[formats_var]]` is assumed without
+#' verification to be constant within each row facet `afun` is applied
+#' within, and the first (list) value of the column within the row
+#' facet data will be used.
+#'
+#' In the `formats_var` case as well as the case of `format` being a
+#' named list containing the values of `vars`, after rows are created
+#' during tabulation, the default formats are matched and applied to
+#' them as follows:
+#'
+#' 1. When the generated row's name (as given by `obj_name`) matches
+#'   a name in the list, the corresponding default format is applied,
+#' 2. for those without exact matches, the default format whose name
+#'   provides *the best partial match* to each row name is applied,
+#' 3. For those without default format names that partially match
+#'   the row name, no default format is applied.
+#'
+#' Note carefully that in (2), it is the names of the list of formats
+#' that are partially matching the row names not the other way around.
+#'
+#' # The Analysis Function
 #'
 #' The analysis function (`afun`) should take as its first parameter either `x` or `df`. Whichever of these the
 #' function accepts will change the behavior when tabulation is performed as follows:
@@ -1086,6 +1155,8 @@ NULL
 #' other parameters which, *if and only if* present in the formals, will be passed to the function by the tabulation
 #' machinery. These are listed and described in [additional_fun_params].
 #'
+#' @inherit split_cols_by return
+#'
 #' @note None of the arguments described in [additional_fun_params] can be overridden via `extra_args` or when calling
 #'   [make_afun()]. `.N_col` and `.N_total` can be overridden via the `col_counts` argument to [build_table()].
 #'   Alternative values for the others must be calculated within `afun` based on a combination of extra arguments and
@@ -1124,7 +1195,9 @@ analyze <- function(lyt,
                     table_names = vars,
                     parent_name = NULL,
                     format = NULL,
+                    formats_var = NULL,
                     na_str = NA_character_,
+                    na_strs_var = NULL,
                     nested = TRUE,
                     ## can't name this na_rm symbol conflict with possible afuns!!
                     inclNAs = FALSE,
@@ -1134,6 +1207,19 @@ analyze <- function(lyt,
                     section_div = NA_character_) {
   show_labels <- match.arg(show_labels)
   subafun <- substitute(afun)
+  if (!is.null(format) && !is.null(formats_var)) {
+    stop(
+      "Cannot use 'format' and 'formats_var' arguments at ",
+      "the same time. Please choose one method for specifying ",
+      "default formatting."
+    )
+  } else if (is.null(formats_var) && !is.null(na_strs_var)) {
+    stop(
+      "Cannot use 'na_strs_var' (got ",
+      na_strs_var,
+      ") without using 'formats_var'."
+    )
+  }
   # R treats a single NA value as a logical atomic. The below
   # maps all the NAs in `var_labels` to NA_character_ required by `Split`
   # and avoids the error when `var_labels` is just c(NA).
@@ -1159,6 +1245,17 @@ analyze <- function(lyt,
     defrowlab <- var_labels
   }
 
+  ## hook up the new hotness
+  var_format_lists <- length(vars) > 1 &&
+    is.list(format) &&
+    all(vars %in% names(format))
+
+  if (var_format_lists) {
+    format <- lapply(format, function(x) FormatList(.list = x))
+    if (is.character(na_str)) {
+      na_str <- lapply(format, function(x) na_str)
+    }
+  }
   spl <- AnalyzeMultiVars(vars, var_labels,
     afun = afun,
     split_format = format,
@@ -1170,7 +1267,9 @@ analyze <- function(lyt,
     child_names = table_names,
     child_labels = show_labels,
     section_div = section_div,
-    split_name = parent_name
+    split_name = parent_name,
+    formats_var = formats_var,
+    na_strs_var = na_strs_var
   )
 
   if (nested && (is(last_rowsplit(lyt), "VAnalyzeSplit") || is(last_rowsplit(lyt), "AnalyzeMultiVars"))) {