Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix shaps and comps to work with multi-card solution #360

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pipeline/02-assess.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ assessment_card_data_mc <- assessment_card_data_pred %>%
# that prediction as the PIN value. For > 3 cards, we predict each card with
# its original square footage then sum the predictions to get the PIN value
group_by(meta_pin) %>%
arrange(meta_pin, desc(char_bldg_sf)) %>%
arrange(meta_pin, desc(char_bldg_sf), meta_card_num) %>%
mutate(
pred_pin_card_sum = ifelse(
meta_pin_num_cards > 3,
Expand Down
97 changes: 87 additions & 10 deletions pipeline/04-interpret.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,32 @@ if (shap_enable || comp_enable) {
# PINs) that need values. Will use the trained model to calc SHAP values
assessment_data <- as_tibble(read_parquet(paths$input$assessment$local))

# Aggregate square footage to building level. We do this to ensure consistent
# shap values and comps based on the generated prediction for multi-card pins.
# More details in multi-card handling in the assess stage.
# https://github.com/ccao-data/model-res-avm/issues/358

# Persist sort order
assessment_data_ordered <- assessment_data %>%
group_by(meta_pin) %>%
arrange(desc(char_bldg_sf), meta_card_num) %>%
mutate(sqft_card_num_sort = row_number()) %>%
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This column sqft_card_num_sort is persisted throughout the shaps and comps calculations to maintain the correct ordering

ungroup()

assessment_data <- assessment_data_ordered %>%
mutate(
char_bldg_sf = ifelse(
ind_pin_is_multicard & meta_pin_num_cards %in% c(2, 3),
sum(char_bldg_sf),
char_bldg_sf
),
.by = meta_pin
)

# Run the saved recipe on the assessment data to format it for prediction
assessment_data_prepped <- recipes::bake(
object = lgbm_final_full_recipe,
new_data = assessment_data,
new_data = assessment_data %>% select(-sqft_card_num_sort),
all_predictors()
)
}
Expand Down Expand Up @@ -77,13 +99,33 @@ if (shap_enable) {
shap_values_final <- assessment_data %>%
select(
meta_year, meta_pin, meta_card_num,
township_code = meta_township_code
meta_pin_num_cards,
township_code = meta_township_code,
sqft_card_num_sort
) %>%
bind_cols(shap_values_tbl) %>%
select(
meta_year, meta_pin, meta_card_num, pred_card_shap_baseline_fmv,
meta_year, meta_pin, meta_card_num, sqft_card_num_sort,
meta_pin_num_cards, pred_card_shap_baseline_fmv,
all_of(params$model$predictor$all), township_code
) %>%
group_by(meta_pin) %>%
# Replicate sort used in the assess stage to ensure the same card's chars
# are used across the assess stage and interpret stage
arrange(sqft_card_num_sort) %>%
group_modify(~ {
shap_cols <- c("pred_card_shap_baseline_fmv", params$model$predictor$all)
# If the first row indicates 2 or 3 cards,
# duplicate its SHAP values across the group
# https://github.com/ccao-data/model-res-avm/issues/358#issue-2897826811
if (.x$meta_pin_num_cards[1] %in% c(2, 3)) {
.x[shap_cols] <- .x[rep(1, nrow(.x)), shap_cols]
}
.x
}) %>%
arrange(meta_pin, meta_card_num) %>%
ungroup() %>%
select(-meta_pin_num_cards, -sqft_card_num_sort) %>%
write_parquet(paths$output$shap$local)
} else {
# If SHAP creation is disabled, we still need to write an empty stub file
Expand Down Expand Up @@ -124,17 +166,37 @@ if (comp_enable) {

# Filter target properties for only the current triad, to speed up the comps
# algorithm
comp_assessment_data <- assessment_data %>%
comp_assessment_data_preprocess <- assessment_data %>%
filter(
meta_township_code %in% (
ccao::town_dict %>%
filter(triad_name == tools::toTitleCase(params$assessment$triad)) %>%
pull(township_code)
)
)

# Multi-card handling. For multi-card pins with 2-3 cards, we predict by
# aggregating the bldg_sf to a single card, and using that card to predict
# which becomes the value for the multi-card pin. Since we don't predict on the
# other cards, we set them aside for comp generation, to re-attach them later
multicard_pins <- comp_assessment_data_preprocess %>%
filter(meta_pin_num_cards %in% c(2, 3))

selected_cards <- multicard_pins %>%
group_by(meta_pin) %>%
arrange(sqft_card_num_sort) %>%
slice(1) %>%
ungroup()

singlecard_pins <- comp_assessment_data_preprocess %>%
filter(!meta_pin %in% multicard_pins$meta_pin)

comp_assessment_data_intermediate <-
bind_rows(singlecard_pins, selected_cards)

comp_assessment_data_prepped <- recipes::bake(
object = lgbm_final_full_recipe,
new_data = comp_assessment_data,
new_data = comp_assessment_data_intermediate,
all_predictors()
)

Expand Down Expand Up @@ -254,14 +316,29 @@ if (comp_enable) {
) %>%
select(-starts_with("comp_idx_")) %>%
cbind(
pin = comp_assessment_data$meta_pin,
card = comp_assessment_data$meta_card_num
pin = comp_assessment_data_intermediate$meta_pin,
card = comp_assessment_data_intermediate$meta_card_num
) %>%
relocate(pin, card)

# Combine the comp indexes and scores into one dataframe and write to a file
cbind(comps[[1]], comps[[2]]) %>%
write_parquet(paths$output$comp$local)
final_comps_data <- cbind(comps[[1]], comps[[2]])

# Grab removed multi-card cards, re-add them, and assign them the comps data
# that we used for the main frankencard used for prediction
removed_cards <- multicard_pins %>%
anti_join(selected_cards, by = c("meta_pin", "meta_card_num")) %>%
select(meta_pin, meta_card_num)

removed_cards_comps <- removed_cards %>%
rename(pin = meta_pin, card = meta_card_num) %>%
left_join(final_comps_data %>% select(-card), by = "pin")

complete_comps_data <-
bind_rows(final_comps_data, removed_cards_comps) %>%
arrange(pin, card)

# Save final combined comps data
arrow::write_parquet(complete_comps_data, paths$output$comp$local)
} else {
# If comp creation is disabled, we still need to write an empty stub file
# so DVC doesn't complain
Expand Down