-
Notifications
You must be signed in to change notification settings - Fork 2
/
clean.R
169 lines (147 loc) · 9.94 KB
/
clean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
##############################################################
# Eviction case data cleaning script #
# Authors: Jacob Goldstein-Greenwood, Michele Claibourn #
# GitHub: jacob-gg, mclaibourn #
# Last revised: 2024-04-19 #
##############################################################
######################## Instructions ########################
# 1. Check the modifiable user-preset variables below
# 2. With those set, the code should run all the way through
# using data in the general format provided by the LSC
case_id_var <- 'c2dp_case_id'
data_directory <- 'data/original'
data_updates_directory <- 'data/updates'
output_directory <- 'processed-data'
##############################################################
# Packages
required <- c('devtools', 'dplyr', 'lubridate', 'virginiaequitycenter/ECtools')
handle_package <- function(pkg) {
if (grepl(x = pkg, pattern = '\\/')) { devtools::install_github(pkg) }
else if (!(pkg %in% installed.packages())) { install.packages(pkg) }
pkg <- sub(x = pkg, pattern = '.+\\/', replacement = '')
library(pkg, character.only = TRUE)
}
lapply(required, function(x) handle_package(x))
# Read data
keywords <- c('case', 'defendant', 'hearing', 'judgment', 'plaintiff')
files <- dir(data_directory)
dat_list <- lapply(seq_along(keywords), function(x) read.csv(paste0(data_directory, '/', files[grepl(x = files, pattern = keywords[x])])))
# Update original data with LSC periodic updates (updated dispositions, etc.)
# This function relies on the existence of the dat_list, case_id_var, and keywords objects
updated_data_folders <- dir(data_updates_directory)
update_data <- function(folder, kywd = keywords) {
update_files <- dir(paste0(data_updates_directory, '/', folder))
update_dat_list <- lapply(seq_along(kywd), function(x) read.csv(paste0(data_updates_directory, '/', folder, '/', update_files[grepl(x = update_files, pattern = kywd[x])])))
needs_updating <- update_dat_list[[which(kywd == 'case')]][[case_id_var]][update_dat_list[[which(kywd == 'case')]][[case_id_var]] %in% dat_list[[which(kywd == 'case')]][[case_id_var]]]
dat_list <- lapply(seq_along(dat_list), function(x) eval(parse(text = paste0("dat_list[[", x, "]][(dat_list[[", x, "]][[case_id_var]] %in% needs_updating) == F, ]"))))
dat_list <- lapply(seq_along(dat_list), function(x) rbind(dat_list[[x]], update_dat_list[[x]]))
dat_list
}
for (i in 1:length(updated_data_folders)) { dat_list <- update_data(updated_data_folders[i]) }
# For variable names that are duplicated across data frames, prefix them with the name of their source data frame
var_names <- unlist(sapply(dat_list, function(x) colnames(x)), use.names = F)
duplicated_var_names <- unique(var_names[duplicated(var_names)]) %>% .[. != case_id_var]
dat_list <- lapply(seq_along(dat_list), function(z) {
nms <- colnames(dat_list[[z]])
nms[nms %in% duplicated_var_names] <- paste0(keywords[z], '_', nms[nms %in% duplicated_var_names])
colnames(dat_list[[z]]) <- nms
dat_list[[z]]
})
# Handle duplicated case IDs and set names of data list elements
# These lines handle duplicated cases in a fairly unprincipled way (dropping cases with duplicated IDs)
# This will be updated to a more principled, multi-outcome system down the line
duplicated_case_ids <- unique(dat_list[[which(keywords == 'case')]][[case_id_var]][duplicated(dat_list[[which(keywords == 'case')]][[case_id_var]])])
dat_list <- lapply(seq_along(dat_list), function(x) eval(parse(text = paste0("dat_list[[", x, "]][(dat_list[[", x, "]][[case_id_var]] %in% duplicated_case_ids) == F, ]"))))
names(dat_list) <- keywords
# Aggregate
source('functions_aggregation.R')
dat_list[['case']] <- case_aggregator(dat_list[['case']])
dat_list[['defendant']] <- defendant_aggregator(dat_list[['defendant']])
dat_list[['plaintiff']] <- plaintiff_aggregator(dat_list[['plaintiff']])
dat_list[['judgment']] <- judgment_aggregator(dat_list[['judgment']])
dat_list[['hearing']] <- hearing_aggregator(dat_list[['hearing']])
# Merge
cases <- Reduce(function(x, y) merge(x, y, by = case_id_var, all = TRUE), dat_list)
########################### Canary ###########################
# Richmond city cases vary in their court listing; some are associated with each of the
# following: Richmond City General District Court; Richmond-Civil General District Court. For now,
# we convert all to the former. Note: We currently *do not* update FIPS, which also vary (760 and 763),
# as we do not use FIPS as a grouping variable in the cleaning, summarizing, or app code.
cases[cases$county == 'Richmond-Civil General District Court', 'county'] <- 'Richmond City General District Court'
##############################################################
# Extract years of case filings
cases$filed_year <- extract_year(cases$filed_date, expect_modern = TRUE)
# Extract quarters of case filings
cases$filed_quarter <- assign_quarter(cases$filed_date, return_QX = TRUE)
# Only subset cases through certain date if a valid date is given in setup.txt
if ('setup.txt' %in% dir()) {
setup <- readLines('setup.txt')
use_cases_through <- setup[which(setup == 'use-data-through:') + 1]
if (grepl(pattern = '\\d{4}-\\d{2}-\\d{2}', x = use_cases_through)) {
cases <- cases[cases$filed_date <= use_cases_through, ]
}
}
# Only work with cases from 2018 onward
cases$filed_year <- as.numeric(cases$filed_year)
cases <- cases[cases$filed_year >= 2018, ]
####################### Notes on names #######################
# We use LSC cleaned_party_name for plantiff names, but we still apply our standardization and cleaning
# process to defendant names (for purposes of, e.g., IDing serial cases and cases against residential defendants),
# and we convert LSC cleaned_party_name entries to uppercase
##############################################################
# Uppercase LSC clean_party_name (plaintiff name) entries and save as plaintiff_name
cases$plaintiff_name <- toupper(cases$clean_party_name)
# Correct punctuation spacing in defendant names
cases$defendant_name <- correct_punctuation_spacing(cases$defendant_name)
# Standardize defendant names
cases$defendant_name <- standardize_name(cases$defendant_name, case_out = 'upper')
# Expand common housing acronyms in defendant names
cases$defendant_name <- expand_shorthand(cases$defendant_name, type = 'housing', case_out = 'upper')
# Extract ZIP Codes
cases$defendant_zip <- extract_zip(cases$defendant_address, if_multiple = 'first', must_follow_state = TRUE)
cases$plaintiff_zip <- extract_zip(cases$plaintiff_address, if_multiple = 'first', must_follow_state = TRUE)
# Convert non-VA ZIPs to NA
va_zips <- as.character(c(20100:20199, 22000:24699)) # https://en.wikipedia.org/wiki/List_of_ZIP_Code_prefixes
cases$defendant_zip <- ifelse(cases$defendant_zip %in% va_zips, cases$defendant_zip, NA)
cases$plaintiff_zip <- ifelse(cases$plaintiff_zip %in% va_zips, cases$plaintiff_zip, NA)
# Identify and remove true duplicates (note that this process uses plaintiff_name and defendant_name, which have been cleaned)
duplicate_check_vars <- c('filed_date', 'judgment', 'costs', 'attorney_fees', 'principal_amount',
'other_amount', 'plaintiff_name', 'defendant_name', 'defendant_zip')
cases <- remove_duplicates_df(dat = cases, column_names = duplicate_check_vars, save_removed_rows_as = 'removed')
# Identify serial cases
source('functions_serial_cases.R')
cases <- id_serials(cases)
# Identify non-residential defendants
cases$defendant_non_residential <- identify_non_residential(cases$defendant_name)
# Un-flag cases with "OCCUPANT(S)" in the primary defendant name (likely residential, e.g., "ANY AND ALL OCCUPANTS")
cases$defendant_non_residential <- ifelse(grepl(x = cases$defendant_name, pattern = '(?i)\\boccupants?\\b'), FALSE, cases$defendant_non_residential)
# Un-flag cases with "ESTATE OF" in the defendant names (likely residential, e.g., "ESTATE OF JANE SMITH")
cases$defendant_non_residential <- ifelse(grepl(x = cases$defendant_name, pattern = '(?i)\\bestate of?\\b'), FALSE, cases$defendant_non_residential)
# Identify non-residential plaintiffs
cases$plaintiff_non_residential <- identify_non_residential(cases$plaintiff_name)
# Write out resulting data
if (dir.exists(output_directory) == FALSE) { dir.create(output_directory) }
write.csv(cases, file = paste0(output_directory, '/cases.txt'), row.names = FALSE)
cases_residential_only <- cases[cases$defendant_non_residential == FALSE, ]
write.csv(cases_residential_only, file = paste0(output_directory, '/cases_residential_only.txt'), row.names = FALSE)
# Log file
out <- c('run_date' = as.character(Sys.Date()),
'time_finished' = format(Sys.time(), '%R'),
'n_residential_cases' = nrow(cases_residential_only),
'n_cases' = nrow(cases),
'min_year_residential_cases' = min(cases_residential_only$filed_year, na.rm = T),
'max_year_residential_cases' = max(cases_residential_only$filed_year, na.rm = T),
'n_serial_residential_cases' = sum(cases_residential_only$serial_filing, na.rm = T),
'n_true_duplicates_removed' = nrow(removed),
'n_duplicate_case_ids_removed' = length(duplicated_case_ids),
'duplicate_case_ids_removed' = paste0(duplicated_case_ids, collapse = ', '))
writeLines(con = paste0('log.txt'), text = paste0(names(out), ': ', out))
##############################################################
################### Miscellaneous old code ###################
# Remove commas before business identifiers in plaintiff names
# Drawn from: https://en.wikipedia.org/wiki/List_of_legal_entity_types_by_country#United_States
# LC, LLC, PLLC, LP, LLP, LLLP, CO, CO OP, COOP, CORP, CP, LTD, INC, PB, PBD, FSB, NA, L3C
# cases$plaintiff_name <- stringi::stri_replace_all(cases$plaintiff_name,
# regex = ', (?=(P?LL?C|LL?L?P|CO( ?OP|RP)?|CP|LTD|INC|PB?C|FSB|NA|L3C)$)',
# replacement = ' ')
# Rendered (at least temporarily) obsolete by the switch to using plaintiff names cleaned by LSC