Commit

init fen-update branch
jacob-gg committed Feb 7, 2023
1 parent a8b02c9 commit e395209
Showing 31 changed files with 374,479 additions and 490,525 deletions.
8 changes: 3 additions & 5 deletions .gitignore
@@ -1,8 +1,6 @@
.Rhistory
.DS_Store
*.DS_Store
*.Rproj.user
*.Rproj

civilcases/
processed-data/
.Rproj.user
data/
processed-data/
416 changes: 0 additions & 416 deletions advisory-committee-doc.Rmd

This file was deleted.

541 changes: 0 additions & 541 deletions advisory-committee-doc.html

This file was deleted.

489 changes: 0 additions & 489 deletions clean-eviction-data.R

This file was deleted.

143 changes: 143 additions & 0 deletions clean.R
@@ -0,0 +1,143 @@
##############################################################
# Eviction case data cleaning script #
# Authors: Jacob Goldstein-Greenwood, Michele Claibourn #
# GitHub: jacob-gg, mclaibourn #
# Last revised: 2023-02-07 #
##############################################################

######################## Instructions ########################
# 1. Check the modifiable user preset variables below
# 2. With those set, the code should run all the way through
# using data in the general format provided by the LSC
case_id_var <- 'c2dp_case_id'
data_directory <- 'data'
output_directory <- 'processed-data'
##############################################################

# Packages
required <- c('devtools', 'dplyr', 'lubridate', 'virginiaequitycenter/ECtools')
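# handle_package(): installs a package (via devtools::install_github() when the name contains a '/',
# otherwise from CRAN if not already installed), then attaches it with library()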
handle_package <- function(pkg) {
  if (grepl(x = pkg, pattern = '\\/')) { devtools::install_github(pkg) }
  else if (!(pkg %in% installed.packages())) { install.packages(pkg) }
  pkg <- sub(x = pkg, pattern = '.+\\/', replacement = '')
  library(pkg, character.only = TRUE)
}
lapply(required, handle_package)

# Read data
keywords <- c('case', 'defendant', 'hearing', 'judgment', 'plaintiff')
files <- dir(data_directory)
dat_list <- lapply(seq_along(keywords), function(x) read.csv(paste0(data_directory, '/', files[grepl(x = files, pattern = keywords[x])])))
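# Note: this assumes that data_directory contains exactly one CSV whose filename matches each keyword above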

# For variable names that are duplicated across data frames, prefix them with the name of their source data frame
var_names <- unlist(sapply(dat_list, function(x) colnames(x)), use.names = F)
duplicated_var_names <- unique(var_names[duplicated(var_names)]) %>% .[. != case_id_var]
dat_list <- lapply(seq_along(dat_list), function(z) {
  nms <- colnames(dat_list[[z]])
  nms[nms %in% duplicated_var_names] <- paste0(keywords[z], '_', nms[nms %in% duplicated_var_names])
  colnames(dat_list[[z]]) <- nms
  dat_list[[z]]
})
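# e.g., if the case and hearing files both contained a column named 'date' (hypothetical), it would become
# 'case_date' and 'hearing_date', respectively; the case ID column is left un-prefixed so it can serve as the merge key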

# Handle duplicated case IDs and set names of data list elements
########################### Canary ###########################
# As of 2023-02-06, the data contain at least one duplicated case ID; these lines drop all rows associated with duplicated IDs, albeit in a brutish way
# This will be updated to a more principled, multi-outcome system down the line
duplicated_case_ids <- unique(dat_list[[which(keywords == 'case')]][[case_id_var]][duplicated(dat_list[[which(keywords == 'case')]][[case_id_var]])])
dat_list <- lapply(dat_list, function(x) x[!(x[[case_id_var]] %in% duplicated_case_ids), ])
names(dat_list) <- keywords
##############################################################

# Aggregate
source('functions_aggregation.R')
dat_list[['case']] <- case_aggregator(dat_list[['case']])
dat_list[['defendant']] <- defendant_aggregator(dat_list[['defendant']])
dat_list[['plaintiff']] <- plaintiff_aggregator(dat_list[['plaintiff']])
dat_list[['judgment']] <- judgment_aggregator(dat_list[['judgment']])
dat_list[['hearing']] <- hearing_aggregator(dat_list[['hearing']])

# Merge
cases <- Reduce(function(x, y) merge(x, y, by = case_id_var, all = TRUE), dat_list)

# Extract years of case filings
cases$filed_year <- extract_year(cases$filed_date, expect_modern = TRUE, return_numeric = FALSE)

########################### Canary ###########################
# Currently, we keep only cases filed in 2018 or later
cases$filed_year <- as.numeric(cases$filed_year)
cases <- cases[cases$filed_year >= 2018, ]
##############################################################

# Extract quarters of case filings
cases$filed_quarter <- assign_quarter(cases$filed_date, return_QX = TRUE)

# Standardize names
cases$defendant_name <- standardize_name(cases$defendant_name, case_out = 'upper')
cases$plaintiff_name <- standardize_name(cases$plaintiff_name, case_out = 'upper')

# Correct punctuation spacing in names
cases$defendant_name <- correct_punctuation_spacing(cases$defendant_name)
cases$plaintiff_name <- correct_punctuation_spacing(cases$plaintiff_name)

# Remove commas before business identifiers in plaintiff names
# Drawn from: https://en.wikipedia.org/wiki/List_of_legal_entity_types_by_country#United_States
# LC, LLC, PLC, PLLC, LP, LLP, LLLP, CO, CO OP, COOP, CORP, CP, LTD, INC, PC, PBC, FSB, NA, L3C
cases$plaintiff_name <- stringi::stri_replace_all(cases$plaintiff_name,
regex = ', (?=(P?LL?C|LL?L?P|CO( ?OP|RP)?|CP|LTD|INC|PB?C|FSB|NA|L3C)$)',
replacement = ' ')
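# e.g., a plaintiff name like 'ACME PROPERTIES, LLC' (hypothetical) becomes 'ACME PROPERTIES LLC';
# a comma not followed by a business identifier at the end of the name (e.g., 'SMITH, JOHN') is left unchanged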

# Trim middle initials
# max_nchar_names <- max(c(max(nchar(cases$defendant_name), max(nchar(cases$plaintiff_name)))))
# pattern <- paste0('(?<=^[A-Za-z ]{1,', max_nchar_names, '}, [A-Za-z ]{1,', max_nchar_names, '}) [A-Za-z]{1}$')
# cases$defendant_name <- stringi::stri_replace_all(cases$defendant_name, regex = pattern, replacement = '')
# cases$plaintiff_name <- stringi::stri_replace_all(cases$plaintiff_name, regex = pattern, replacement = '')

# Expand common housing acronyms
cases$defendant_name <- expand_shorthand(cases$defendant_name, type = 'housing', case_out = 'upper')
cases$plaintiff_name <- expand_shorthand(cases$plaintiff_name, type = 'housing', case_out = 'upper')
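# e.g., a shorthand like 'APT' might expand to 'APARTMENT' (hypothetical example; the actual mappings are defined by expand_shorthand(), presumably in ECtools)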

# Extract ZIP Codes
cases$defendant_zip <- extract_zip(cases$defendant_address, if_multiple = 'first', must_follow_state = TRUE)
cases$plaintiff_zip <- extract_zip(cases$plaintiff_address, if_multiple = 'first', must_follow_state = TRUE)
# Convert non-VA ZIPs to NA
va_zips <- as.character(c(20100:20199, 22000:24699)) # https://en.wikipedia.org/wiki/List_of_ZIP_Code_prefixes
cases$defendant_zip <- ifelse(cases$defendant_zip %in% va_zips, cases$defendant_zip, NA)
cases$plaintiff_zip <- ifelse(cases$plaintiff_zip %in% va_zips, cases$plaintiff_zip, NA)

# Identify and remove true duplicates (note that this process uses plaintiff_name and defendant_name, which have been cleaned)
duplicate_check_vars <- c('filed_date', 'judgment', 'costs', 'attorney_fees', 'principal_amount',
'other_amount', 'plaintiff_name', 'defendant_name', 'defendant_zip')
cases <- remove_duplicates_df(dat = cases, column_names = duplicate_check_vars, save_removed_rows_as = 'removed')
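# Note: remove_duplicates_df() appears to save the dropped rows under the name given by save_removed_rows_as
# ('removed' here); the log at the bottom of the script references that object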

# Identify serial cases
source('functions_serial_cases.R')
cases <- id_serials(cases)

# Identify non-residential defendants
cases$defendant_non_residential <- identify_non_residential(cases$defendant_name)
########################### Canary ###########################
# Un-flag cases with "OCCUPANT(S)" in the primary defendant name (likely residential, e.g., "ANY AND ALL OCCUPANTS")
cases$defendant_non_residential <- ifelse(grepl(x = cases$defendant_name, pattern = '(?i)\\boccupants?\\b'), FALSE, cases$defendant_non_residential)
# Un-flag cases with "ESTATE OF" in the defendant names (likely residential, e.g., "ESTATE OF JANE SMITH")
cases$defendant_non_residential <- ifelse(grepl(x = cases$defendant_name, pattern = '(?i)\\bestate of?\\b'), FALSE, cases$defendant_non_residential)
##############################################################

# Write out resulting data
if (dir.exists(output_directory) == FALSE) { dir.create(output_directory) }
write.csv(cases, file = paste0(output_directory, '/cases.txt'), row.names = FALSE)
cases_residential_only <- cases[cases$defendant_non_residential == FALSE, ]
write.csv(cases_residential_only, file = paste0(output_directory, '/cases_residential_only.txt'), row.names = FALSE)

# Log file
out <- c('run_date' = as.character(Sys.Date()),
'time_finished' = format(Sys.time(), '%R'),
'n_residential_cases' = nrow(cases_residential_only),
'n_cases' = nrow(cases),
'min_year_residential_cases' = min(cases_residential_only$filed_year, na.rm = T),
'max_year_residential_cases' = max(cases_residential_only$filed_year, na.rm = T),
'n_serial_residential_cases' = sum(cases_residential_only$serial_filing, na.rm = T),
'n_true_duplicates_removed' = nrow(removed),
'n_duplicate_case_ids_removed' = length(duplicated_case_ids),
'duplicate_case_ids_removed' = paste0(duplicated_case_ids, collapse = ', '))
writeLines(text = paste0(names(out), ': ', out), con = 'log.txt')

Empty file removed cleaning-notes.txt
Empty file.
106 changes: 0 additions & 106 deletions data-non-residential-regex.R

This file was deleted.
