-
Notifications
You must be signed in to change notification settings - Fork 0
/
01-get-example-data.R
140 lines (106 loc) · 4.46 KB
/
01-get-example-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
### Download Raw Data for Governance Factor Analysis
# This file downloads the raw data needed to do factor analysis on the 2013 990 data
# This data is used as an example of how to use the package
library(data.table)
library(dplyr)
# Year(s) of data to download: our training set is all 990 filers from 2013.
# The download sections below iterate over this vector, one csv per year.
years <- 2013
### Part IV ----------------------------------------------------------
# Downloads the F9-P04-T00-REQUIRED-SCHEDULES csv for every entry of `years`
# and stacks the results into `dat_all_4`.

# columns to read: the record identifiers plus the Part IV fields we use
keep_cols_part4 <- c(
  "OBJECTID", "URL", "RETURN_VERSION", "ORG_EIN", "RETURN_TYPE",
  "F9_04_AFS_IND_X", "F9_04_AFS_CONSOL_X",
  "F9_04_BIZ_TRANSAC_DTK_X", "F9_04_BIZ_TRANSAC_DTK_FAM_X",
  "F9_04_BIZ_TRANSAC_DTK_ENTITY_X",
  "F9_04_CONTR_NONCSH_MT_25K_X",
  "F9_04_CONTR_ART_HIST_X"
)

# read one table per year. Iterating over `years` itself (instead of
# 1:length(years)) means an empty `years` triggers no download at all rather
# than building a URL from the missing element years[1].
dat_4 <- lapply(years, function(yr) {
  link <- paste0("https://nccs-efile.s3.us-east-1.amazonaws.com/parsed/F9-P04-T00-REQUIRED-SCHEDULES-", yr, ".csv")
  fread(link, select = keep_cols_part4)
})

# clean up data: `year` is the first four characters of RETURN_VERSION read
# as a number; rows whose version year is later than the last requested year
# are dropped
dat_all_4 <-
  rbindlist(dat_4) %>%
  mutate(year = as.numeric(substr(RETURN_VERSION, 1, 4))) %>%
  filter(year <= max(years))
### Part VI Data ----------------------------------------------------------
# Downloads the F9-P06-T00-GOVERNANCE csv for every entry of `years` and
# stacks the results into `dat_all_6`.

# columns to read: the record identifiers plus the Part VI fields we use
keep_cols_part6 <- c(
  "OBJECTID", "URL", "RETURN_VERSION", "ORG_EIN", "RETURN_TYPE",
  "F9_06_GVRN_NUM_VOTING_MEMB",
  "F9_06_GVRN_NUM_VOTING_MEMB_IND",
  "F9_06_GVRN_DTK_FAMBIZ_RELATION_X",
  "F9_06_GVRN_DELEGATE_MGMT_DUTY_X",
  "F9_06_GVRN_DOC_GVRN_BODY_X",
  "F9_06_POLICY_FORM990_GVRN_BODY_X",
  "F9_06_POLICY_COI_X",
  "F9_06_POLICY_COI_DISCLOSURE_X",
  "F9_06_POLICY_COI_MONITOR_X",
  "F9_06_POLICY_WHSTLBLWR_X",
  "F9_06_POLICY_DOC_RETENTION_X",
  "F9_06_POLICY_COMP_PROCESS_CEO_X",
  "F9_06_DISCLOSURE_AVBL_OTH_X",
  "F9_06_DISCLOSURE_AVBL_OTH_WEB_X",
  "F9_06_DISCLOSURE_AVBL_REQUEST_X",
  "F9_06_DISCLOSURE_AVBL_OWN_WEB_X"
)

# read one table per year. Iterating over `years` itself (instead of
# 1:length(years)) means an empty `years` triggers no download at all rather
# than building a URL from the missing element years[1].
dat_6 <- lapply(years, function(yr) {
  link <- paste0("https://nccs-efile.s3.us-east-1.amazonaws.com/parsed/F9-P06-T00-GOVERNANCE-", yr, ".csv")
  fread(link, select = keep_cols_part6)
})

# clean up data: `year` is the first four characters of RETURN_VERSION read
# as a number; rows whose version year is later than the last requested year
# are dropped
dat_all_6 <-
  rbindlist(dat_6) %>%
  mutate(year = as.numeric(substr(RETURN_VERSION, 1, 4))) %>%
  filter(year <= max(years))
### Part XII Data ---------------------------------
# Downloads the F9-P12-T00-FINANCIAL-REPORTING csv for every entry of `years`
# and stacks the results into `dat_all_12`.

# columns to read: the record identifiers plus the three accounting-method
# fields (only these are kept, not the whole table)
keep_cols_part12 <- c(
  "OBJECTID", "URL", "RETURN_VERSION", "ORG_EIN", "RETURN_TYPE",
  "F9_12_FINSTAT_METHOD_ACC_OTH",
  "F9_12_FINSTAT_METHOD_ACC_ACCRU_X",
  "F9_12_FINSTAT_METHOD_ACC_CASH_X"
)

# read one table per year. Iterating over `years` itself (instead of
# 1:length(years)) means an empty `years` triggers no download at all rather
# than building a URL from the missing element years[1].
dat_12 <- lapply(years, function(yr) {
  link <- paste0("https://nccs-efile.s3.us-east-1.amazonaws.com/parsed/F9-P12-T00-FINANCIAL-REPORTING-", yr, ".csv")
  fread(link, select = keep_cols_part12)
})

# clean up data: `year` is the first four characters of RETURN_VERSION read
# as a number; rows whose version year is later than the last requested year
# are dropped
dat_all_12 <-
  rbindlist(dat_12) %>%
  mutate(year = as.numeric(substr(RETURN_VERSION, 1, 4))) %>%
  filter(year <= max(years))
### Schedule M Data -------------------------------------
# Downloads the SM-P01-T00-NONCASH-CONTRIBUTIONS csv for every entry of
# `years` and stacks the results into `dat_all_M`.

# columns to read: the record identifiers plus the one Schedule M field we use
keep_cols_partM <- c(
  "OBJECTID", "URL", "RETURN_VERSION", "ORG_EIN", "RETURN_TYPE",
  "SM_01_REVIEW_PROCESS_UNUSUAL_X"
)

# read one table per year. Iterating over `years` itself (instead of
# 1:length(years)) means an empty `years` triggers no download at all rather
# than building a URL from the missing element years[1].
dat_M <- lapply(years, function(yr) {
  link <- paste0("https://nccs-efile.s3.us-east-1.amazonaws.com/parsed/SM-P01-T00-NONCASH-CONTRIBUTIONS-", yr, ".csv")
  fread(link, select = keep_cols_partM)
})

# clean up data: `year` is the first four characters of RETURN_VERSION read
# as a number; rows whose version year is later than the last requested year
# are dropped
dat_all_M <-
  rbindlist(dat_M) %>%
  mutate(year = as.numeric(substr(RETURN_VERSION, 1, 4))) %>%
  filter(year <= max(years))
### Merge all parts
# key columns present in every per-part table
vars.bind <- c("OBJECTID", "URL", "RETURN_VERSION", "ORG_EIN", "year", "RETURN_TYPE")

# fold the four tables together left to right, joining each on the shared
# keys, then keep only the rows whose RETURN_TYPE is not "990EZ"
dat_example <-
  Reduce(
    function(left, right) merge(left, right, by = vars.bind),
    list(dat_all_4, dat_all_6, dat_all_12, dat_all_M)
  ) %>%
  filter(RETURN_TYPE != "990EZ")
### Write the merged table to disk as the training data
# stores the object under its own name, `dat_example`, in the .rda file
save(list = "dat_example", file = "data/dat_example.rda")