Commit

init fen-update branch
jacob-gg committed Feb 7, 2023
1 parent a8b02c9 commit e395209
Showing 31 changed files with 374,479 additions and 490,525 deletions.
8 changes: 3 additions & 5 deletions .gitignore
@@ -1,8 +1,6 @@
.Rhistory
.DS_Store
*.DS_Store
*.Rproj.user
*.Rproj

civilcases/
processed-data/
.Rproj.user
data/
processed-data/
416 changes: 0 additions & 416 deletions advisory-committee-doc.Rmd

This file was deleted.

541 changes: 0 additions & 541 deletions advisory-committee-doc.html

This file was deleted.

489 changes: 0 additions & 489 deletions clean-eviction-data.R

This file was deleted.

143 changes: 143 additions & 0 deletions clean.R
@@ -0,0 +1,143 @@
##############################################################
# Eviction case data cleaning script #
# Authors: Jacob Goldstein-Greenwood, Michele Claibourn #
# GitHub: jacob-gg, mclaibourn #
# Last revised: 2023-02-07 #
##############################################################

######################## Instructions ########################
# 1. Check the modifiable user preset variables below
# 2. With those set, the code should run all the way through
# using data in the general format provided by the LSC
case_id_var <- 'c2dp_case_id'
data_directory <- 'data'
output_directory <- 'processed-data'
##############################################################

# Packages
required <- c('devtools', 'dplyr', 'lubridate', 'virginiaequitycenter/ECtools')
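# handle_package(): installs a package (via devtools::install_github() when the name contains a '/',
# otherwise from CRAN if not already installed), then attaches it with library()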
handle_package <- function(pkg) {
  if (grepl(x = pkg, pattern = '\\/')) { devtools::install_github(pkg) }
  else if (!(pkg %in% installed.packages())) { install.packages(pkg) }
  pkg <- sub(x = pkg, pattern = '.+\\/', replacement = '')
  library(pkg, character.only = TRUE)
}
lapply(required, handle_package)

# Read data
keywords <- c('case', 'defendant', 'hearing', 'judgment', 'plaintiff')
files <- dir(data_directory)
dat_list <- lapply(seq_along(keywords), function(x) read.csv(paste0(data_directory, '/', files[grepl(x = files, pattern = keywords[x])])))
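# Note: this assumes that data_directory contains exactly one CSV whose filename matches each keyword above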

# For variable names that are duplicated across data frames, prefix them with the name of their source data frame
var_names <- unlist(sapply(dat_list, function(x) colnames(x)), use.names = F)
duplicated_var_names <- unique(var_names[duplicated(var_names)]) %>% .[. != case_id_var]
dat_list <- lapply(seq_along(dat_list), function(z) {
  nms <- colnames(dat_list[[z]])
  nms[nms %in% duplicated_var_names] <- paste0(keywords[z], '_', nms[nms %in% duplicated_var_names])
  colnames(dat_list[[z]]) <- nms
  dat_list[[z]]
})
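# e.g., if the case and hearing files both contained a column named 'date' (hypothetical), it would become
# 'case_date' and 'hearing_date', respectively; the case ID column is left un-prefixed so it can serve as the merge key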

# Handle duplicated case IDs and set names of data list elements
########################### Canary ###########################
# As of 2023-02-06, the data contain at least one duplicated case ID; these lines drop all rows associated with duplicated IDs, albeit in a brutish way
# This will be updated to a more principled, multi-outcome system down the line
duplicated_case_ids <- unique(dat_list[[which(keywords == 'case')]][[case_id_var]][duplicated(dat_list[[which(keywords == 'case')]][[case_id_var]])])
dat_list <- lapply(dat_list, function(x) x[!(x[[case_id_var]] %in% duplicated_case_ids), ])
names(dat_list) <- keywords
##############################################################

# Aggregate
source('functions_aggregation.R')
dat_list[['case']] <- case_aggregator(dat_list[['case']])
dat_list[['defendant']] <- defendant_aggregator(dat_list[['defendant']])
dat_list[['plaintiff']] <- plaintiff_aggregator(dat_list[['plaintiff']])
dat_list[['judgment']] <- judgment_aggregator(dat_list[['judgment']])
dat_list[['hearing']] <- hearing_aggregator(dat_list[['hearing']])

# Merge
cases <- Reduce(function(x, y) merge(x, y, by = case_id_var, all = TRUE), dat_list)

# Extract years of case filings
cases$filed_year <- extract_year(cases$filed_date, expect_modern = TRUE, return_numeric = FALSE)

########################### Canary ###########################
# Currently, we keep only cases filed in 2018 or later
cases$filed_year <- as.numeric(cases$filed_year)
cases <- cases[cases$filed_year >= 2018, ]
##############################################################

# Extract quarters of case filings
cases$filed_quarter <- assign_quarter(cases$filed_date, return_QX = TRUE)

# Standardize names
cases$defendant_name <- standardize_name(cases$defendant_name, case_out = 'upper')
cases$plaintiff_name <- standardize_name(cases$plaintiff_name, case_out = 'upper')

# Correct punctuation spacing in names
cases$defendant_name <- correct_punctuation_spacing(cases$defendant_name)
cases$plaintiff_name <- correct_punctuation_spacing(cases$plaintiff_name)

# Remove commas before business identifiers in plaintiff names
# Drawn from: https://en.wikipedia.org/wiki/List_of_legal_entity_types_by_country#United_States
# LC, LLC, PLC, PLLC, LP, LLP, LLLP, CO, CO OP, COOP, CORP, CP, LTD, INC, PC, PBC, FSB, NA, L3C
cases$plaintiff_name <- stringi::stri_replace_all(cases$plaintiff_name,
regex = ', (?=(P?LL?C|LL?L?P|CO( ?OP|RP)?|CP|LTD|INC|PB?C|FSB|NA|L3C)$)',
replacement = ' ')
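# e.g., a plaintiff name like 'ACME PROPERTIES, LLC' (hypothetical) becomes 'ACME PROPERTIES LLC';
# a comma not followed by a business identifier at the end of the name (e.g., 'SMITH, JOHN') is left unchanged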

# Trim middle initials
# max_nchar_names <- max(c(max(nchar(cases$defendant_name), max(nchar(cases$plaintiff_name)))))
# pattern <- paste0('(?<=^[A-Za-z ]{1,', max_nchar_names, '}, [A-Za-z ]{1,', max_nchar_names, '}) [A-Za-z]{1}$')
# cases$defendant_name <- stringi::stri_replace_all(cases$defendant_name, regex = pattern, replacement = '')
# cases$plaintiff_name <- stringi::stri_replace_all(cases$plaintiff_name, regex = pattern, replacement = '')

# Expand common housing acronyms
cases$defendant_name <- expand_shorthand(cases$defendant_name, type = 'housing', case_out = 'upper')
cases$plaintiff_name <- expand_shorthand(cases$plaintiff_name, type = 'housing', case_out = 'upper')
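# e.g., a shorthand like 'APT' might expand to 'APARTMENT' (hypothetical example; the actual mappings are defined by expand_shorthand(), presumably in ECtools)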

# Extract ZIP Codes
cases$defendant_zip <- extract_zip(cases$defendant_address, if_multiple = 'first', must_follow_state = TRUE)
cases$plaintiff_zip <- extract_zip(cases$plaintiff_address, if_multiple = 'first', must_follow_state = TRUE)
# Convert non-VA ZIPs to NA
va_zips <- as.character(c(20100:20199, 22000:24699)) # https://en.wikipedia.org/wiki/List_of_ZIP_Code_prefixes
cases$defendant_zip <- ifelse(cases$defendant_zip %in% va_zips, cases$defendant_zip, NA)
cases$plaintiff_zip <- ifelse(cases$plaintiff_zip %in% va_zips, cases$plaintiff_zip, NA)

# Identify and remove true duplicates (note that this process uses plaintiff_name and defendant_name, which have been cleaned)
duplicate_check_vars <- c('filed_date', 'judgment', 'costs', 'attorney_fees', 'principal_amount',
'other_amount', 'plaintiff_name', 'defendant_name', 'defendant_zip')
cases <- remove_duplicates_df(dat = cases, column_names = duplicate_check_vars, save_removed_rows_as = 'removed')
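# Note: remove_duplicates_df() appears to save the dropped rows under the name given by save_removed_rows_as
# ('removed' here); the log at the bottom of the script references that object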

# Identify serial cases
source('functions_serial_cases.R')
cases <- id_serials(cases)

# Identify non-residential defendants
cases$defendant_non_residential <- identify_non_residential(cases$defendant_name)
########################### Canary ###########################
# Un-flag cases with "OCCUPANT(S)" in the primary defendant name (likely residential, e.g., "ANY AND ALL OCCUPANTS")
cases$defendant_non_residential <- ifelse(grepl(x = cases$defendant_name, pattern = '(?i)\\boccupants?\\b'), FALSE, cases$defendant_non_residential)
# Un-flag cases with "ESTATE OF" in the defendant names (likely residential, e.g., "ESTATE OF JANE SMITH")
cases$defendant_non_residential <- ifelse(grepl(x = cases$defendant_name, pattern = '(?i)\\bestate of?\\b'), FALSE, cases$defendant_non_residential)
##############################################################

# Write out resulting data
if (dir.exists(output_directory) == FALSE) { dir.create(output_directory) }
write.csv(cases, file = paste0(output_directory, '/cases.txt'), row.names = FALSE)
cases_residential_only <- cases[cases$defendant_non_residential == FALSE, ]
write.csv(cases_residential_only, file = paste0(output_directory, '/cases_residential_only.txt'), row.names = FALSE)

# Log file
out <- c('run_date' = as.character(Sys.Date()),
'time_finished' = format(Sys.time(), '%R'),
'n_residential_cases' = nrow(cases_residential_only),
'n_cases' = nrow(cases),
'min_year_residential_cases' = min(cases_residential_only$filed_year, na.rm = T),
'max_year_residential_cases' = max(cases_residential_only$filed_year, na.rm = T),
'n_serial_residential_cases' = sum(cases_residential_only$serial_filing, na.rm = T),
'n_true_duplicates_removed' = nrow(removed),
'n_duplicate_case_ids_removed' = length(duplicated_case_ids),
'duplicate_case_ids_removed' = paste0(duplicated_case_ids, collapse = ', '))
writeLines(text = paste0(names(out), ': ', out), con = 'log.txt')

Empty file removed cleaning-notes.txt
Empty file.
106 changes: 0 additions & 106 deletions data-non-residential-regex.R

This file was deleted.
