# This setup script is run at the top of each Quarto report subsection to load
# libraries, data, and other objects needed for the report. It only loads
# objects that don't already exist in the environment, so it can be re-run
# idempotently without reloading data sets and libraries
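#
# The idempotent pattern used throughout is a simple existence check, e.g.:
#   if (!exists("training_data")) {
#     training_data <- read_parquet(paths$input$training$local)
#   }
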
## Libraries and paths ---------------------------------------------------------
library(purrr)
library(here)
# Source the helper files, which also load the main libraries
purrr::walk(list.files(here::here("R"), "\\.R$", full.names = TRUE), source)
# Load reporting-only R libraries
suppressPackageStartupMessages({
  reporting_libs <- "Config/renv/profiles/reporting/dependencies"
  purrr::walk(
    strsplit(read_yaml(here::here("DESCRIPTION"))[[reporting_libs]], ", ")[[1]],
    library,
    character.only = TRUE
  )
})
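# NOTE: Given the strsplit() on ", " above, the DESCRIPTION field is assumed to
# hold a comma-separated package list, e.g. (package names are illustrative):
#   Config/renv/profiles/reporting/dependencies: ggplot2, gt, leaflet
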
# TODO: Workaround for a weird Arrow bug with SIGPIPE. Needs a permanent fix
# https://github.com/apache/arrow/issues/32026
cpp11::cpp_source(code = "
  #include <csignal>
  #include <cpp11.hpp>

  [[cpp11::register]] void ignore_sigpipes() {
    signal(SIGPIPE, SIG_IGN);
  }
")
ignore_sigpipes()
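# Note that cpp11::cpp_source() compiles the snippet above at call time, so the
# first run of this script incurs a brief compilation step
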
# Initialize a dictionary of file paths. See misc/file_dict.csv for details
paths <- model_file_dict(model_params$run_id, model_params$year)
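# `paths` is a nested list keyed by stage and file, accessed like:
#   paths$input$training$local   # local parquet path for the training set
#   paths$output$metadata$local  # local parquet path for the run metadata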

## Input data ------------------------------------------------------------------
# Ingest training set used for this run from DVC bucket (if not local)
if (!exists("training_data")) {
training_data <- read_parquet(paths$input$training$local)
}
# Load assessment set used for this run
if (!exists("assessment_data")) {
assessment_data <- read_parquet(paths$input$assessment$local)
}
# Load characteristics data
if (!exists("chars_data")) {
chars_data <- read_parquet(paths$input$char$local)
}

## Output data -----------------------------------------------------------------
# Grab metadata to check that the output data aligns with the report params
metadata <- read_parquet(paths$output$metadata$local)
if (metadata$run_id != params$run_id) {
  stop(
    "Local run outputs do NOT match the requested run_id. You ",
    "should run model_fetch_run() to fetch model outputs from S3"
  )
}
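# If the check above fails, fetch the outputs first. A sketch, assuming
# model_fetch_run() takes the run ID and year (see the R/ helpers for the
# actual signature):
#   model_fetch_run(params$run_id, params$year)
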
# Extract hash values from dvc.lock file
dvc_lock_values <- sapply(
  read_yaml(here::here("dvc.lock"))$stages$ingest$outs,
  function(x) x$md5
)
metadata_dvc_md5 <- metadata %>%
  select(starts_with("dvc_md5"))

# Compare hash for each metadata value with the corresponding dvc.lock value
comparison_results <- purrr::map2_lgl(
  metadata_dvc_md5,
  dvc_lock_values,
  ~ .x == .y
)
# Check if all hashes match between metadata and dvc.lock
if (!all(comparison_results)) {
  stop(
    "Hash values between the dvc.lock file and the metadata ",
    "do not match. Potential mismatch in input and output data."
  )
}
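# For reference, the dvc.lock structure traversed above looks roughly like this
# (the path and hash shown are illustrative):
#   stages:
#     ingest:
#       outs:
#         - path: input/training_data.parquet
#           md5: 5f2b7c...
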
# Get the triad of the run to use for filtering
run_triad <- tools::toTitleCase(metadata$assessment_triad)
run_triad_code <- ccao::town_dict %>%
  filter(triad_name == run_triad) %>%
  distinct(triad_code) %>%
  pull(triad_code)
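# e.g. metadata$assessment_triad = "north" becomes run_triad = "North", which
# is then matched against ccao::town_dict to get its single triad code
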
# Load model object and recipe
model_fit <- lightsnip::lgbm_load(paths$output$workflow_fit$local)
model_recipe <- readRDS(paths$output$workflow_recipe$local)
# Load model-generated output data sets
if (!exists("assessment_card")) {
assessment_card <- read_parquet(paths$output$assessment_card$local)
}
if (!exists("assessment_pin")) {
assessment_pin <- read_parquet(paths$output$assessment_pin$local)
}
if (!exists("test_card")) {
test_card <- read_parquet(paths$output$test_card$local)
}
if (!exists("model_performance_test")) {
model_performance_test <-
arrow::read_parquet(paths$output$performance_test$local)
}
if (!exists("model_performance_test_linear")) {
model_performance_test_linear <-
arrow::read_parquet(paths$output$performance_test_linear$local)
}
if (!exists("model_performance_quantile_test")) {
# nolint start: object_length_linter
model_performance_quantile_test <-
arrow::read_parquet(paths$output$performance_quantile_test$local)
}
if (!exists("model_performance_quantile_test_linear")) {
model_performance_quantile_test_linear <-
arrow::read_parquet(paths$output$performance_quantile_test_linear$local)
# nolint end
}
if (!exists("model_performance_assessment")) {
model_performance_assessment <-
arrow::read_parquet(paths$output$performance_assessment$local)
}
if (!exists("model_parameter_final")) {
model_parameter_final <-
arrow::read_parquet(paths$output$parameter_final$local)
}
if (!exists("feat_imp_df")) {
feat_imp_df <- read_parquet(paths$output$feature_importance$local)
}
# Load SHAP data if it exists (only exists for important runs)
if (file.exists(paths$output$shap$local) && metadata$shap_enable) {
  shap_df <- read_parquet(paths$output$shap$local)
  shap_exists <- nrow(shap_df) > 0
} else {
  shap_exists <- FALSE
}
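# Downstream report chunks should gate any SHAP figures on this flag, e.g.:
#   if (shap_exists) { ... render SHAP-based plots using shap_df ... }
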
# Add colors to re-use across plots
plot_colors <- list(
  "sales" = "#66c2a5",
  "main" = "#5773b3",
  "linear" = "#fc8d62",
  "met" = "#d3f2c2",
  "bg_line" = "#2C3E50"
)
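# Example usage in a ggplot2 scale (a sketch; the legend labels here are
# hypothetical, actual plots live in the report modules):
#   scale_color_manual(values = c(
#     "Model" = plot_colors$main,
#     "Linear Baseline" = plot_colors$linear
#   ))
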
# Helper function to shorten dollar values for x-axis tick labels
shorten_number <- function(x) {
  scales::dollar(x, accuracy = 1, scale = 1 / 1000, suffix = "K")
}
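# Example: shorten_number(1500000) returns "$1,500K"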

# Populate the metadata / data set summaries used in the text of each module.
# Variables prefixed with m_ are used directly in the report text
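# They are interpolated via inline Quarto code chunks, e.g. (sentence shown is
# illustrative):
#   "The test set contains `r m_test_n_sales` sales from `r m_test_min_date`
#   to `r m_test_max_date`."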
m_test_min_date <- min(test_card$meta_sale_date)
m_test_max_date <- max(test_card$meta_sale_date)
m_test_n_sales <- test_card %>%
  nrow() %>%
  scales::comma()
m_test_n_sales_triad <- test_card %>%
  filter(meta_triad_code == run_triad_code) %>%
  nrow() %>%
  scales::comma()
m_test_n_sales_prop <- (
  nrow(filter(test_card, meta_triad_code == run_triad_code)) /
    nrow(test_card)
) %>%
  scales::percent(accuracy = 0.01)
m_test_med_sp <- test_card$meta_sale_price %>%
  median() %>%
  scales::dollar()
m_test_split_prop <- scales::percent(
  1 - metadata$cv_split_prop,
  accuracy = 0.01
)

m_train_min_date <- min(training_data$meta_sale_date)
m_train_max_date <- max(training_data$meta_sale_date)
m_train_n_sales <- training_data %>%
  nrow() %>%
  scales::comma()
m_train_n_sales_triad <- training_data %>%
  filter(meta_triad_code == run_triad_code) %>%
  nrow() %>%
  scales::comma()
m_train_n_sales_prop <- (
  nrow(filter(training_data, meta_triad_code == run_triad_code)) /
    nrow(training_data)
) %>%
  scales::percent(accuracy = 0.01)
m_train_med_sp <- training_data$meta_sale_price %>%
  median() %>%
  scales::dollar()
m_train_n_outliers <- training_data$sv_is_outlier %>%
  sum() %>%
  scales::comma()
m_train_n_outliers_prop <-
  (sum(training_data$sv_is_outlier) / nrow(training_data)) %>%
  scales::percent(accuracy = 0.01)

m_assess_min_date <- min(assessment_pin$sale_ratio_study_date, na.rm = TRUE)
m_assess_max_date <- max(assessment_pin$sale_ratio_study_date, na.rm = TRUE)
m_assess_n_sales <- sum(!is.na(assessment_pin$sale_ratio_study_price)) %>%
  scales::comma()
m_assess_med_sp <- assessment_pin$sale_ratio_study_price %>%
  median(na.rm = TRUE) %>%
  scales::dollar()
m_assess_stage_far <- paste(
  metadata$ratio_study_far_year,
  metadata$ratio_study_far_stage
)
m_assess_stage_near <- paste(
  metadata$ratio_study_near_year,
  metadata$ratio_study_near_stage
)