Skip to content

Commit

Permalink
Improved the extract_eq for the glm class
Browse files Browse the repository at this point in the history
1. Fixed the problem that error terms keep showing up in the model equation. 
2. Added support for the offset term
3. Now support basic glm eqn display for all distributions. 
4. Fixed the issue that log is also a pattern match to "logistic" and "logit" 
5. Fixed the issue that "link" is saved in "info" instead of "family".
  • Loading branch information
thomas-fung committed Aug 29, 2020
1 parent 7e8a559 commit d8246b3
Show file tree
Hide file tree
Showing 10 changed files with 134 additions and 60 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ Depends: R (>= 3.3.0)
Imports:
broom (>= 0.7.0),
stats,
knitr
knitr,
utils
Suggests:
texPreview,
testthat (>= 2.1.0),
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ S3method(extract_lhs,clm)
S3method(extract_lhs,glm)
S3method(extract_lhs,lm)
S3method(extract_lhs,polr)
S3method(extract_lhs2,glm)
S3method(format,equation)
S3method(knit_print,equation)
S3method(modify_lhs_for_link,clm)
Expand Down
21 changes: 13 additions & 8 deletions R/create_eq.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ create_eq <- function(lhs,...) {
#' @inheritParams extract_eq
#' @noRd

create_eq.default <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
fix_signs, model, intercept, greek,
create_eq.default <- function(model, lhs, rhs, ital_vars, use_coefs, coef_digits,
fix_signs, intercept, greek,
raw_tex) {
rhs$final_terms <- create_term(rhs, ital_vars)

Expand All @@ -39,23 +39,27 @@ create_eq.default <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
#' @export
#' @noRd
#' @inheritParams extract_eq
create_eq.glm <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
fix_signs, model, intercept, greek, raw_tex) {
create_eq.glm <- function(model, lhs, rhs, ital_vars, use_coefs, coef_digits,
fix_signs, intercept, greek, raw_tex) {
rhs$final_terms <- create_term(rhs, ital_vars)

if (use_coefs) {
rhs$final_terms <- add_coefs(rhs, rhs$final_terms, coef_digits)
} else {
rhs$final_terms <- add_greek(rhs, rhs$final_terms, greek, intercept, raw_tex)
}
if (!is.null(model$offset)){
rhs <- rbind(rhs, c(rep(NA, (dim(rhs)[2]-1)),
add_tex_ital(utils::tail(names(attr(model$terms, "dataClasses")),1), ital_vars)))
}

list(lhs = list(lhs), rhs = list(rhs$final_terms))
}

#' @export
#' @noRd
create_eq.polr <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
fix_signs, model, ...) {
create_eq.polr <- function(model, lhs, rhs, ital_vars, use_coefs, coef_digits,
fix_signs, ...) {
rhs$final_terms <- create_term(rhs, ital_vars)

if (use_coefs) {
Expand All @@ -74,8 +78,8 @@ create_eq.polr <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,

#' @export
#' @noRd
create_eq.clm <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
fix_signs, model, ...) {
create_eq.clm <- function(model, lhs, rhs, ital_vars, use_coefs, coef_digits,
fix_signs, ...) {
rhs$final_terms <- create_term(rhs, ital_vars)

if (use_coefs) {
Expand All @@ -88,6 +92,7 @@ create_eq.clm <- function(lhs, rhs, ital_vars, use_coefs, coef_digits,
rhs_final <- lapply(splt$intercept$final_terms, function(x) {
c(x, splt$location$final_terms)
})

attributes(lhs) <- NULL
list(lhs = lhs, rhs = rhs_final)
}
Expand Down
4 changes: 2 additions & 2 deletions R/extract_eq.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,13 @@ extract_eq <- function(model, intercept = "alpha", greek = "beta",
lhs <- extract_lhs(model, ital_vars, show_distribution, use_coefs)
rhs <- extract_rhs(model)

eq_raw <- create_eq(lhs,
eq_raw <- create_eq(model,
lhs,
rhs,
ital_vars,
use_coefs,
coef_digits,
fix_signs,
model,
intercept,
greek,
raw_tex)
Expand Down
96 changes: 65 additions & 31 deletions R/extract_lhs.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,42 @@ extract_lhs.lm <- function(model, ital_vars,
#' @return A character string
#' @noRd

extract_lhs.glm <- function(model, ital_vars, show_distribution, ...) {
extract_lhs.glm <- function(model, ital_vars, show_distribution, use_coefs, ...) {
if (show_distribution) {
return(extract_lhs2.glm(model, ital_vars))
if (model$family$family == "binomial"){
return(extract_lhs2_binomial(model, ital_vars, use_coefs))
} else {
message("This distribution is not presently supported; the distribution assumption
will not be displayed")
lhs <- all.vars(formula(model))[1]
full_lhs <- paste("E(", add_tex_ital_v(lhs, ital_vars), ")")
if (use_coefs){
full_lhs <- add_hat(full_lhs)
}
full_lhs <- modify_lhs_for_link(model, full_lhs)
class(full_lhs) <- c("character", class(model))
return(full_lhs)
}
}
if (model$family$family == "binomial"){
return(extract_lhs_binomial(model, ital_vars, use_coefs))
} else {
lhs <- all.vars(formula(model))[1]
full_lhs <- paste("E(", add_tex_ital(lhs, ital_vars), ")")
if (use_coefs){
full_lhs <- add_hat(full_lhs)
}
full_lhs <- modify_lhs_for_link(model, full_lhs)
class(full_lhs) <- c("character", class(model))
return(full_lhs)
}
}

#' @return A character string
#' @keywords internal
#' @noRd

extract_lhs_binomial <- function(model, ital_vars, use_coefs){
lhs <- all.vars(formula(model))[1]

# This returns a 1x1 data.frame
Expand All @@ -76,23 +108,24 @@ extract_lhs.glm <- function(model, ital_vars, show_distribution, ...) {
ss_escaped <- escape_tex(ss)

if (is.na(ss)) {
full_lhs <- add_tex_ital_v(lhs_escaped, ital_vars)
full_lhs <- paste("P(", add_tex_ital_v(lhs_escaped, ital_vars), ")")
} else {
full_lhs <- paste(add_tex_ital_v(lhs_escaped, ital_vars),
full_lhs <- paste("P(", add_tex_ital_v(lhs_escaped, ital_vars),
"=",
add_tex_ital_v(ss_escaped, ital_vars))
add_tex_ital_v(ss_escaped, ital_vars), ")")
}
if (use_coefs){
full_lhs <- add_hat(full_lhs)
}

full_lhs <- modify_lhs_for_link(model, full_lhs)
class(full_lhs) <- c("character", class(model))
full_lhs
}

#' @export
#' @keywords internal
#' @noRd

extract_lhs2.glm <- function(model, ital_vars, ...) {
extract_lhs2_binomial <- function(model, ital_vars, ...){
outcome <- all.vars(formula(model))[1]
n <- unique(model$model$`(weights)`)
if (is.null(n)) {
Expand All @@ -114,32 +147,27 @@ extract_lhs2.glm <- function(model, ital_vars, ...) {
outcome_escaped <- escape_tex(outcome)
ss_escaped <- escape_tex(ss)

if (is.na(ss)) {
lhs <- add_tex_ital_v(outcome_escaped, ital_vars)
} else {
lhs <- paste0(add_tex_ital_v(outcome_escaped, ital_vars),
add_tex_subscripts(
add_tex_ital_v(ss_escaped, ital_vars)
)
)
}
lhs <- add_tex_ital_v(outcome_escaped, ital_vars)
p <- paste0("\\operatorname{prob}",
add_tex_subscripts(
paste0(
add_tex_ital_v(outcome_escaped, ital_vars), " = ",
add_tex_ital_v(ss_escaped, ital_vars)
)))

rhs <- paste0("B\\left(",
"\\operatorname{prob} = \\hat{P},",
"\\operatorname{size} = ", n,
rhs <- paste0("Bernoulli\\left(", p,
"= \\hat{P}",
"\\right)")

topline <- paste(lhs, "&\\sim", rhs)
second_line <- "\\log\\left[ \\frac {\\hat{P}}{1 - \\hat{P}} \\right]"

full_lhs <- paste(topline, "\\\\\n", second_line, "\n")
second_line <- modify_lhs_for_link(model, "\\hat{P}")

class(full_lhs) <- c("character", class(model))
full_lhs <- paste(topline, "\\\\\n", second_line, "\n")
full_lhs
}



#' Extract left-hand side of a polr object
#'
#' Extract a string of the outcome/dependent/y variable with the appropriate
Expand All @@ -160,7 +188,7 @@ extract_lhs.polr <- function(model, ital_vars, ...) {

lhs <- lapply(strsplit(lhs_escaped, "\\|"), add_tex_ital_v, ital_vars)
lhs <- lapply(lhs, paste, collapse = " \\geq ")

lhs <- lapply(lhs, function(.x) paste0("P( ", .x , " )"))
full_lhs <- lapply(lhs, function(.x) modify_lhs_for_link(model, .x))

class(full_lhs) <- c("list", class(model))
Expand Down Expand Up @@ -188,7 +216,7 @@ extract_lhs.clm <- function(model, ital_vars, ...) {

lhs <- lapply(strsplit(lhs_escaped, "\\|"), add_tex_ital_v, ital_vars)
lhs <- lapply(lhs, paste, collapse = " \\geq ")

lhs <- lapply(lhs, function(.x) paste("P(", .x , ")"))
full_lhs <- lapply(lhs, function(.x) modify_lhs_for_link(model, .x))

class(full_lhs) <- c("list", class(model))
Expand All @@ -213,6 +241,9 @@ modify_lhs_for_link.glm <- function(model, lhs) {
model$family$link <- "identity"
}
matched_row_bool <- grepl(model$family$link, link_function_df$link_name)
if (sum(matched_row_bool)>1){
matched_row_bool[1] <- FALSE
}
filtered_link_formula <- link_function_df[matched_row_bool, "link_formula"]
gsub("y", lhs, filtered_link_formula, fixed = TRUE)
}
Expand All @@ -234,9 +265,12 @@ modify_lhs_for_link.clm <- function(model, lhs) {
if (!(any(grepl(model$info$link, link_function_df$link_name)))) {
message("This link function is not presently supported; using an identity
function instead")
model$family$link <- "identity"
model$info$link <- "identity"
}
matched_row_bool <- grepl(model$info$link, link_function_df$link_name)
if (sum(matched_row_bool) > 1){
matched_row_bool[1] <- FALSE
}
filtered_link_formula <- link_function_df[matched_row_bool, "link_formula"]
gsub("y", lhs, filtered_link_formula, fixed = TRUE)
}
Expand All @@ -253,11 +287,11 @@ link_name <- c("logit, logistic",

# not sure how to address this one: quasi(link = "identity", variance = "constant")

link_formula <- c("\\log\\left[ \\frac { P( y ) }{ 1 - P( y ) } \\right]",
"P(y)",
"\\frac { 1 }{ P( y ) }",
link_formula <- c("\\log\\left[ \\frac { y }{ 1 - y } \\right]",
"y",
"\\frac { 1 }{ y }",
# "\\frac { 1 }{ 1/{ y }^{ 2 } } ", # inverse gaussian - correct?
"\\log ( { y )} ",
"\\log ({ y }) ",
"y") # are the parentheses italicized here?

link_function_df <- data.frame(link_name, link_formula,
Expand Down
1 change: 0 additions & 1 deletion equatiomatic.Rproj
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
Expand Down
Empty file added tests/testthat/.Rapp.history
Empty file.
8 changes: 4 additions & 4 deletions tests/testthat/test-clm.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@ test_that("Ordered models with clm work", {
\\end{aligned}"

actual_nowrap_probit <- "\\begin{aligned}
P(\\operatorname{A} \\geq \\operatorname{B}) &= \\Phi[\\alpha_{1} + \\beta_{1}(\\operatorname{continuous\\_1}) + \\beta_{2}(\\operatorname{continuous\\_2})] \\\\
P(\\operatorname{B} \\geq \\operatorname{C}) &= \\Phi[\\alpha_{2} + \\beta_{1}(\\operatorname{continuous\\_1}) + \\beta_{2}(\\operatorname{continuous\\_2})]
P( \\operatorname{A} \\geq \\operatorname{B} ) &= \\Phi[\\alpha_{1} + \\beta_{1}(\\operatorname{continuous\\_1}) + \\beta_{2}(\\operatorname{continuous\\_2})] \\\\
P( \\operatorname{B} \\geq \\operatorname{C} ) &= \\Phi[\\alpha_{2} + \\beta_{1}(\\operatorname{continuous\\_1}) + \\beta_{2}(\\operatorname{continuous\\_2})]
\\end{aligned}"

actual_wrap_probit <- "\\begin{aligned}
P(\\operatorname{A} \\geq \\operatorname{B}) &= \\Phi[\\alpha_{1} + \\beta_{1}(\\operatorname{continuous\\_1})\\ + \\\\
P( \\operatorname{A} \\geq \\operatorname{B} ) &= \\Phi[\\alpha_{1} + \\beta_{1}(\\operatorname{continuous\\_1})\\ + \\\\
&\\qquad\\ \\beta_{2}(\\operatorname{continuous\\_2})] \\\\
P(\\operatorname{B} \\geq \\operatorname{C}) &= \\Phi[\\alpha_{2} + \\beta_{1}(\\operatorname{continuous\\_1})\\ + \\\\
P( \\operatorname{B} \\geq \\operatorname{C} ) &= \\Phi[\\alpha_{2} + \\beta_{1}(\\operatorname{continuous\\_1})\\ + \\\\
&\\qquad\\ \\beta_{2}(\\operatorname{continuous\\_2})]
\\end{aligned}"

Expand Down
52 changes: 44 additions & 8 deletions tests/testthat/test-glm.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ test_that("Probit regression works", {
family = binomial(link = "probit"))

tex <- extract_eq(model_probit)
actual <- "P(\\operatorname{outcome} = \\operatorname{1}) = \\Phi[\\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})]"
actual <- "P( \\operatorname{outcome} = \\operatorname{1} ) = \\Phi[\\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})]"

expect_equal(tex, equation_class(actual),
label = "basic equation builds correctly")

# Everything works when there are no categorical variables
model_gaussian <- glm(mpg ~ cyl + disp, data = mtcars, family = gaussian())
tex <- extract_eq(model_gaussian)
actual <- "\\operatorname{mpg} = \\alpha + \\beta_{1}(\\operatorname{cyl}) + \\beta_{2}(\\operatorname{disp})"
actual <- "E( \\operatorname{mpg} ) = \\alpha + \\beta_{1}(\\operatorname{cyl}) + \\beta_{2}(\\operatorname{disp})"

expect_equal(tex, equation_class(actual),
label = "equation sans categorical variables builds correctly")
Expand Down Expand Up @@ -66,13 +66,15 @@ test_that("Distribution-based equations work", {
tex_probit <- extract_eq(model_probit, show_distribution = TRUE)

actual_logit <- "\\begin{aligned}
\\operatorname{outcome}_{\\operatorname{1}} &\\sim B\\left(\\operatorname{prob} = \\hat{P},\\operatorname{size} = 300\\right) \\\\
\\log\\left[ \\frac {\\hat{P}}{1 - \\hat{P}} \\right] \n &= \\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})
\\operatorname{outcome} &\\sim Bernoulli\\left(\\operatorname{prob}_{\\operatorname{outcome} = \\operatorname{1}}= \\hat{P}\\right) \\\\
\\log\\left[ \\frac { \\hat{P} }{ 1 - \\hat{P} } \\right]
&= \\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})
\\end{aligned}"

actual_probit <- "\\begin{aligned}
\\operatorname{outcome}_{\\operatorname{1}} &\\sim B\\left(\\operatorname{prob} = \\hat{P},\\operatorname{size} = 300\\right) \\\\
\\log\\left[ \\frac {\\hat{P}}{1 - \\hat{P}} \\right] \n &= \\Phi[\\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})]
\\operatorname{outcome} &\\sim Bernoulli\\left(\\operatorname{prob}_{\\operatorname{outcome} = \\operatorname{1}}= \\hat{P}\\right) \\\\
\\hat{P}
&= \\Phi[\\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})]
\\end{aligned}"

expect_equal(tex_logit, equation_class(actual_logit),
Expand All @@ -85,8 +87,7 @@ test_that("Distribution-based equations work", {
model_gaussian <- glm(mpg ~ cyl + disp, data = mtcars, family = gaussian())
tex <- extract_eq(model_gaussian, show_distribution = TRUE)
actual <- "\\begin{aligned}
\\operatorname{mpg} &\\sim B\\left(\\operatorname{prob} = \\hat{P},\\operatorname{size} = 32\\right) \\\\
\\log\\left[ \\frac {\\hat{P}}{1 - \\hat{P}} \\right] \n &= \\alpha + \\beta_{1}(\\operatorname{cyl}) + \\beta_{2}(\\operatorname{disp})
E( \\operatorname{mpg} ) &= \\alpha + \\beta_{1}(\\operatorname{cyl}) + \\beta_{2}(\\operatorname{disp})
\\end{aligned}"

expect_equal(tex, equation_class(actual),
Expand All @@ -106,3 +107,38 @@ test_that("Weights work", {

expect_warning(extract_eq(model_logit, show_distribution = TRUE))
})

test_that("non-binomial regression works", {
set.seed(1234)
df <- data.frame(outcome = sample(0:15, 300, replace = TRUE),
categorical = rep(letters[1:3], 100),
continuous_1 = rnorm(300, 100, 1),
continuous_2 = rnorm(300, 50, 5))

model_log <- glm(outcome ~ ., data = df,
family = poisson(link = "log"))
tex_log <- extract_eq(model_log)
actual_log <- "\\log ({ E( \\operatorname{outcome} ) }) = \\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})"

expect_equal(tex_log, equation_class(actual_log),
label = "basic log equation builds correctly")

expect_message(extract_eq(model_log, show_distribution = TRUE))

model_id <- glm(outcome ~ ., data = df,
family = gaussian(link = "identity"))
tex_id <- extract_eq(model_log)
actual_id <- "\\log ({ E( \\operatorname{outcome} ) }) = \\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2})"

expect_equal(tex_id, equation_class(actual_id),
label = "basic identity equation builds correctly")

model_offset <- glm(outcome ~ . + offset(rep(1, 300)), data = df,
family = poisson(link = "log"))
tex_offset <- extract_eq(model_offset)
actual_offset <- "\\log ({ E( \\operatorname{outcome} ) }) = \\alpha + \\beta_{1}(\\operatorname{categorical}_{\\operatorname{b}}) + \\beta_{2}(\\operatorname{categorical}_{\\operatorname{c}}) + \\beta_{3}(\\operatorname{continuous\\_1}) + \\beta_{4}(\\operatorname{continuous\\_2}) + \\operatorname{offset(rep(1, 300))}"

expect_equal(tex_offset, equation_class(actual_offset),
label = "basic equation with offset builds correctly")
})

Loading

0 comments on commit d8246b3

Please sign in to comment.