Improved the extract_eq for the glm class

1. Fixed the problem that error terms keep showing up in the model equation. 2. Added support for the offset term 3. Now support basic glm eqn display for all distributions. 4. Fixed the issue that log is also a pattern match to "logistic" and "logit" 5. Fixed the issue that "link" is saved in "info" instead of "family".
training-datalab · Aug 29, 2020 · d8246b3 · d8246b3
1 parent 7e8a559
commit d8246b3
Show file tree

Hide file tree

Showing 10 changed files with 134 additions and 60 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -42,7 +42,8 @@ Depends: R (>= 3.3.0)
 Imports: 
     broom (>= 0.7.0),
     stats,
-    knitr
+    knitr,
+    utils
 Suggests: 
     texPreview,
     testthat (>= 2.1.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -14,7 +14,6 @@ S3method(extract_lhs,clm)
 S3method(extract_lhs,glm)
 S3method(extract_lhs,lm)
 S3method(extract_lhs,polr)
-S3method(extract_lhs2,glm)
 S3method(format,equation)
 S3method(knit_print,equation)
 S3method(modify_lhs_for_link,clm)

diff --git a/R/create_eq.R b/R/create_eq.R
@@ -15,8 +15,8 @@ create_eq <- function(lhs,...) {
 #' @inheritParams extract_eq
 #' @noRd
 
-create_eq.default <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
-                              fix_signs, model, intercept, greek,
+create_eq.default <- function(model, lhs, rhs, ital_vars, use_coefs, coef_digits,
+                              fix_signs, intercept, greek,
                               raw_tex) {
   rhs$final_terms <- create_term(rhs, ital_vars)
 
@@ -39,23 +39,27 @@ create_eq.default <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
 #' @export
 #' @noRd
 #' @inheritParams extract_eq
-create_eq.glm <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
-                              fix_signs, model, intercept, greek, raw_tex) {
+create_eq.glm <- function(model, lhs, rhs, ital_vars, use_coefs, coef_digits,
+                              fix_signs, intercept, greek, raw_tex) {
   rhs$final_terms <- create_term(rhs, ital_vars)
 
   if (use_coefs) {
     rhs$final_terms <- add_coefs(rhs, rhs$final_terms, coef_digits)
   } else {
     rhs$final_terms <- add_greek(rhs, rhs$final_terms, greek, intercept, raw_tex)
   }
+  if (!is.null(model$offset)){
+    rhs <- rbind(rhs, c(rep(NA, (dim(rhs)[2]-1)),
+                        add_tex_ital(utils::tail(names(attr(model$terms, "dataClasses")),1), ital_vars)))
+  }
 
   list(lhs = list(lhs), rhs = list(rhs$final_terms))
 }
 
 #' @export
 #' @noRd
-create_eq.polr <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
-                           fix_signs, model, ...) {
+create_eq.polr <- function(model, lhs, rhs, ital_vars, use_coefs, coef_digits,
+                           fix_signs, ...) {
   rhs$final_terms <- create_term(rhs, ital_vars)
 
   if (use_coefs) {
@@ -74,8 +78,8 @@ create_eq.polr <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
 
 #' @export
 #' @noRd
-create_eq.clm <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
-                          fix_signs, model, ...) {
+create_eq.clm <- function(model, lhs, rhs, ital_vars, use_coefs, coef_digits,
+                          fix_signs, ...) {
   rhs$final_terms <- create_term(rhs, ital_vars)
 
   if (use_coefs) {
@@ -88,6 +92,7 @@ create_eq.clm <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
   rhs_final <- lapply(splt$intercept$final_terms, function(x) {
     c(x, splt$location$final_terms)
   })
+
   attributes(lhs) <- NULL
   list(lhs = lhs, rhs = rhs_final)
 }

diff --git a/R/extract_eq.R b/R/extract_eq.R
@@ -102,13 +102,13 @@ extract_eq <- function(model, intercept = "alpha", greek = "beta",
   lhs <- extract_lhs(model, ital_vars, show_distribution, use_coefs)
   rhs <- extract_rhs(model)
 
-  eq_raw <- create_eq(lhs,
+  eq_raw <- create_eq(model,
+                      lhs,
                       rhs,
                       ital_vars,
                       use_coefs,
                       coef_digits,
                       fix_signs,
-                      model,
                       intercept,
                       greek,
                       raw_tex)

diff --git a/R/extract_lhs.R b/R/extract_lhs.R
@@ -60,10 +60,42 @@ extract_lhs.lm <- function(model, ital_vars,
 #' @return A character string
 #' @noRd
 
-extract_lhs.glm <- function(model, ital_vars, show_distribution, ...) {
+extract_lhs.glm <- function(model, ital_vars, show_distribution, use_coefs, ...) {
   if (show_distribution) {
-    return(extract_lhs2.glm(model, ital_vars))
+    if (model$family$family == "binomial"){
+      return(extract_lhs2_binomial(model, ital_vars, use_coefs))
+    } else {
+      message("This distribution is not presently supported; the distribution assumption
+      will not be displayed")
+      lhs <- all.vars(formula(model))[1]
+      full_lhs <- paste("E(", add_tex_ital_v(lhs, ital_vars), ")")
+      if (use_coefs){
+        full_lhs <- add_hat(full_lhs)
+      }
+      full_lhs <- modify_lhs_for_link(model, full_lhs)
+      class(full_lhs) <- c("character", class(model))
+      return(full_lhs)
+      }
   }
+  if (model$family$family == "binomial"){
+    return(extract_lhs_binomial(model, ital_vars, use_coefs))
+    } else {
+      lhs <- all.vars(formula(model))[1]
+      full_lhs <- paste("E(", add_tex_ital(lhs, ital_vars), ")")
+      if (use_coefs){
+        full_lhs <- add_hat(full_lhs)
+      }
+      full_lhs <- modify_lhs_for_link(model, full_lhs)
+      class(full_lhs) <- c("character", class(model))
+      return(full_lhs)
+    }
+}
+
+#' @return A character string
+#' @keywords internal
+#' @noRd
+
+extract_lhs_binomial <- function(model, ital_vars, use_coefs){
   lhs <- all.vars(formula(model))[1]
 
   # This returns a 1x1 data.frame
@@ -76,23 +108,24 @@ extract_lhs.glm <- function(model, ital_vars, show_distribution, ...) {
   ss_escaped <- escape_tex(ss)
 
   if (is.na(ss)) {
-    full_lhs <- add_tex_ital_v(lhs_escaped, ital_vars)
+    full_lhs <- paste("P(", add_tex_ital_v(lhs_escaped, ital_vars), ")")
   } else {
-    full_lhs <- paste(add_tex_ital_v(lhs_escaped, ital_vars),
+    full_lhs <- paste("P(", add_tex_ital_v(lhs_escaped, ital_vars),
                       "=",
-                      add_tex_ital_v(ss_escaped, ital_vars))
+                      add_tex_ital_v(ss_escaped, ital_vars), ")")
+  }
+  if (use_coefs){
+    full_lhs <- add_hat(full_lhs)
   }
 
   full_lhs <- modify_lhs_for_link(model, full_lhs)
   class(full_lhs) <- c("character", class(model))
   full_lhs
 }
 
-#' @export
 #' @keywords internal
 #' @noRd
-
-extract_lhs2.glm <- function(model, ital_vars, ...) {
+extract_lhs2_binomial <- function(model, ital_vars, ...){
   outcome <- all.vars(formula(model))[1]
   n <- unique(model$model$`(weights)`)
   if (is.null(n)) {
@@ -114,32 +147,27 @@ extract_lhs2.glm <- function(model, ital_vars, ...) {
   outcome_escaped <- escape_tex(outcome)
   ss_escaped <- escape_tex(ss)
 
-  if (is.na(ss)) {
-    lhs <- add_tex_ital_v(outcome_escaped, ital_vars)
-  } else {
-    lhs <- paste0(add_tex_ital_v(outcome_escaped, ital_vars),
-                  add_tex_subscripts(
-                    add_tex_ital_v(ss_escaped, ital_vars)
-                  )
-    )
-  }
+  lhs <- add_tex_ital_v(outcome_escaped, ital_vars)
+  p <- paste0("\\operatorname{prob}",
+              add_tex_subscripts(
+                paste0(
+                add_tex_ital_v(outcome_escaped, ital_vars), " = ",
+                  add_tex_ital_v(ss_escaped, ital_vars)
+                  )))
 
-  rhs <- paste0("B\\left(",
-                "\\operatorname{prob} = \\hat{P},",
-                "\\operatorname{size} = ", n,
+  rhs <- paste0("Bernoulli\\left(", p,
+                 "= \\hat{P}",
                 "\\right)")
 
   topline <- paste(lhs, "&\\sim", rhs)
-  second_line <- "\\log\\left[ \\frac {\\hat{P}}{1 - \\hat{P}} \\right]"
 
-  full_lhs <- paste(topline, "\\\\\n", second_line, "\n")
+  second_line <- modify_lhs_for_link(model, "\\hat{P}")
 
-  class(full_lhs) <- c("character", class(model))
+  full_lhs <- paste(topline, "\\\\\n", second_line, "\n")
   full_lhs
 }
 
 
-
 #' Extract left-hand side of a polr object
 #'
 #' Extract a string of the outcome/dependent/y variable with the appropriate
@@ -160,7 +188,7 @@ extract_lhs.polr <- function(model, ital_vars, ...) {
 
   lhs <- lapply(strsplit(lhs_escaped, "\\|"), add_tex_ital_v, ital_vars)
   lhs <- lapply(lhs, paste, collapse = " \\geq ")
-
+  lhs <- lapply(lhs, function(.x) paste0("P( ", .x , " )"))
   full_lhs <- lapply(lhs, function(.x) modify_lhs_for_link(model, .x))
 
   class(full_lhs) <- c("list", class(model))
@@ -188,7 +216,7 @@ extract_lhs.clm <- function(model, ital_vars, ...) {
 
   lhs <- lapply(strsplit(lhs_escaped, "\\|"), add_tex_ital_v, ital_vars)
   lhs <- lapply(lhs, paste, collapse = " \\geq ")
-
+  lhs <- lapply(lhs, function(.x) paste("P(", .x , ")"))
   full_lhs <- lapply(lhs, function(.x) modify_lhs_for_link(model, .x))
 
   class(full_lhs) <- c("list", class(model))
@@ -213,6 +241,9 @@ modify_lhs_for_link.glm <- function(model, lhs) {
     model$family$link <- "identity"
   }
   matched_row_bool <- grepl(model$family$link, link_function_df$link_name)
+  if (sum(matched_row_bool)>1){
+    matched_row_bool[1] <- FALSE
+  }
   filtered_link_formula <- link_function_df[matched_row_bool, "link_formula"]
   gsub("y", lhs, filtered_link_formula, fixed = TRUE)
 }
@@ -234,9 +265,12 @@ modify_lhs_for_link.clm <- function(model, lhs) {
   if (!(any(grepl(model$info$link, link_function_df$link_name)))) {
     message("This link function is not presently supported; using an identity
               function instead")
-    model$family$link <- "identity"
+    model$info$link <- "identity"
   }
   matched_row_bool <- grepl(model$info$link, link_function_df$link_name)
+  if (sum(matched_row_bool) > 1){
+    matched_row_bool[1] <- FALSE
+  }
   filtered_link_formula <- link_function_df[matched_row_bool, "link_formula"]
   gsub("y", lhs, filtered_link_formula, fixed = TRUE)
 }
@@ -253,11 +287,11 @@ link_name <- c("logit, logistic",
 
 # not sure how to address this one: quasi(link = "identity", variance = "constant")
 
-link_formula <- c("\\log\\left[ \\frac { P( y ) }{ 1 - P( y ) } \\right]",
-                  "P(y)",
-                  "\\frac { 1 }{ P( y ) }",
+link_formula <- c("\\log\\left[ \\frac { y }{ 1 - y } \\right]",
+                  "y",
+                  "\\frac { 1 }{ y }",
                   # "\\frac { 1 }{ 1/{ y }^{ 2 } } ", # inverse gaussian - correct?
-                  "\\log ( { y )} ",
+                  "\\log ({ y }) ",
                   "y") # are the parentheses italicized here?
 
 link_function_df <- data.frame(link_name, link_formula,

diff --git a/equatiomatic.Rproj b/equatiomatic.Rproj
@@ -13,7 +13,6 @@ RnwWeave: Sweave
 LaTeX: pdfLaTeX
 
 AutoAppendNewline: Yes
-StripTrailingWhitespace: Yes
 
 BuildType: Package
 PackageUseDevtools: Yes

diff --git a/tests/testthat/.Rapp.history b/tests/testthat/.Rapp.history
diff --git a/tests/testthat/test-clm.R b/tests/testthat/test-clm.R
@@ -32,14 +32,14 @@ test_that("Ordered models with clm work", {
 \\end{aligned}"
 
   actual_nowrap_probit <- "\\begin{aligned}
-P(\\operatorname{A} \\geq \\operatorname{B}) &= \\Phi[\\alpha_{1} + \\beta_{1}(\\operatorname{continuous\\_1}) + \\beta_{2}(\\operatorname{continuous\\_2})] \\\\
-P(\\operatorname{B} \\geq \\operatorname{C}) &= \\Phi[\\alpha_{2} + \\beta_{1}(\\operatorname{continuous\\_1}) + \\beta_{2}(\\operatorname{continuous\\_2})]
+P( \\operatorname{A} \\geq \\operatorname{B} ) &= \\Phi[\\alpha_{1} + \\beta_{1}(\\operatorname{continuous\\_1}) + \\beta_{2}(\\operatorname{continuous\\_2})] \\\\
+P( \\operatorname{B} \\geq \\operatorname{C} ) &= \\Phi[\\alpha_{2} + \\beta_{1}(\\operatorname{continuous\\_1}) + \\beta_{2}(\\operatorname{continuous\\_2})]
 \\end{aligned}"
 
   actual_wrap_probit <- "\\begin{aligned}
-P(\\operatorname{A} \\geq \\operatorname{B}) &= \\Phi[\\alpha_{1} + \\beta_{1}(\\operatorname{continuous\\_1})\\ + \\\\
+P( \\operatorname{A} \\geq \\operatorname{B} ) &= \\Phi[\\alpha_{1} + \\beta_{1}(\\operatorname{continuous\\_1})\\ + \\\\
 &\\qquad\\ \\beta_{2}(\\operatorname{continuous\\_2})] \\\\
-P(\\operatorname{B} \\geq \\operatorname{C}) &= \\Phi[\\alpha_{2} + \\beta_{1}(\\operatorname{continuous\\_1})\\ + \\\\
+P( \\operatorname{B} \\geq \\operatorname{C} ) &= \\Phi[\\alpha_{2} + \\beta_{1}(\\operatorname{continuous\\_1})\\ + \\\\
 &\\qquad\\ \\beta_{2}(\\operatorname{continuous\\_2})]
 \\end{aligned}"
 

diff --git a/tests/testthat/test-glm.R b/tests/testthat/test-glm.R
@@ -28,15 +28,15 @@ test_that("Probit regression works", {
                       family = binomial(link = "probit"))
 
   tex <- extract_eq(model_probit)
-  actual <- "P(\\operatorname{outcome} = \\operatorname{1}) = \\Phi[\\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})]"
+  actual <- "P( \\operatorname{outcome} = \\operatorname{1} ) = \\Phi[\\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})]"
 
   expect_equal(tex, equation_class(actual),
                label = "basic equation builds correctly")
 
   # Everything works when there are no categorical variables
   model_gaussian <- glm(mpg ~ cyl + disp, data = mtcars, family = gaussian())
   tex <- extract_eq(model_gaussian)
-  actual <- "\\operatorname{mpg} = \\alpha + \\beta_{1}(\\operatorname{cyl}) + \\beta_{2}(\\operatorname{disp})"
+  actual <- "E( \\operatorname{mpg} ) = \\alpha + \\beta_{1}(\\operatorname{cyl}) + \\beta_{2}(\\operatorname{disp})"
 
   expect_equal(tex, equation_class(actual),
                label = "equation sans categorical variables builds correctly")
@@ -66,13 +66,15 @@ test_that("Distribution-based equations work", {
   tex_probit <- extract_eq(model_probit, show_distribution = TRUE)
 
   actual_logit <- "\\begin{aligned}
-\\operatorname{outcome}_{\\operatorname{1}} &\\sim B\\left(\\operatorname{prob} = \\hat{P},\\operatorname{size} = 300\\right) \\\\
- \\log\\left[ \\frac {\\hat{P}}{1 - \\hat{P}} \\right] \n &= \\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})
+\\operatorname{outcome} &\\sim Bernoulli\\left(\\operatorname{prob}_{\\operatorname{outcome} = \\operatorname{1}}= \\hat{P}\\right) \\\\
+ \\log\\left[ \\frac { \\hat{P} }{ 1 - \\hat{P} } \\right] 
+ &= \\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})
 \\end{aligned}"
 
   actual_probit <- "\\begin{aligned}
-\\operatorname{outcome}_{\\operatorname{1}} &\\sim B\\left(\\operatorname{prob} = \\hat{P},\\operatorname{size} = 300\\right) \\\\
- \\log\\left[ \\frac {\\hat{P}}{1 - \\hat{P}} \\right] \n &= \\Phi[\\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})]
+\\operatorname{outcome} &\\sim Bernoulli\\left(\\operatorname{prob}_{\\operatorname{outcome} = \\operatorname{1}}= \\hat{P}\\right) \\\\
+ \\hat{P} 
+ &= \\Phi[\\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})]
 \\end{aligned}"
 
   expect_equal(tex_logit, equation_class(actual_logit),
@@ -85,8 +87,7 @@ test_that("Distribution-based equations work", {
   model_gaussian <- glm(mpg ~ cyl + disp, data = mtcars, family = gaussian())
   tex <- extract_eq(model_gaussian, show_distribution = TRUE)
   actual <- "\\begin{aligned}
-\\operatorname{mpg} &\\sim B\\left(\\operatorname{prob} = \\hat{P},\\operatorname{size} = 32\\right) \\\\
- \\log\\left[ \\frac {\\hat{P}}{1 - \\hat{P}} \\right] \n &= \\alpha + \\beta_{1}(\\operatorname{cyl}) + \\beta_{2}(\\operatorname{disp})
+E( \\operatorname{mpg} ) &= \\alpha + \\beta_{1}(\\operatorname{cyl}) + \\beta_{2}(\\operatorname{disp})
 \\end{aligned}"
 
   expect_equal(tex, equation_class(actual),
@@ -106,3 +107,38 @@ test_that("Weights work", {
 
   expect_warning(extract_eq(model_logit, show_distribution = TRUE))
 })
+
+test_that("non-binomial regression works", {
+  set.seed(1234)
+  df <- data.frame(outcome = sample(0:15, 300, replace = TRUE),
+                   categorical = rep(letters[1:3], 100),
+                   continuous_1 = rnorm(300, 100, 1),
+                   continuous_2 = rnorm(300, 50, 5))
+
+  model_log <- glm(outcome ~ ., data = df,
+                   family = poisson(link = "log"))
+  tex_log <- extract_eq(model_log)
+  actual_log <- "\\log ({ E( \\operatorname{outcome} ) })  = \\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})"
+
+  expect_equal(tex_log, equation_class(actual_log),
+               label = "basic log equation builds correctly")
+
+  expect_message(extract_eq(model_log, show_distribution = TRUE))
+
+  model_id <- glm(outcome ~ ., data = df,
+                  family = gaussian(link = "identity"))
+  tex_id <- extract_eq(model_log)
+  actual_id <- "\\log ({ E( \\operatorname{outcome} ) })  = \\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})"
+
+  expect_equal(tex_id, equation_class(actual_id),
+               label = "basic identity equation builds correctly")
+
+  model_offset <- glm(outcome ~ . + offset(rep(1, 300)), data = df,
+                      family = poisson(link = "log"))
+  tex_offset <- extract_eq(model_offset)
+  actual_offset <- "\\log ({ E( \\operatorname{outcome} ) })  = \\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2}) + \\operatorname{offset(rep(1, 300))}"
+
+  expect_equal(tex_offset, equation_class(actual_offset),
+               label = "basic equation with offset builds correctly")
+})
+