-
Notifications
You must be signed in to change notification settings - Fork 0
/
acfrs_govname_ncesid.Rmd
282 lines (230 loc) · 14.4 KB
/
acfrs_govname_ncesid.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
---
title: "Untitled"
output: html_document
date: '2022-08-31'
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(threadr)
```
# Round 1
```{r}
acfrs_file_name <- rio::import("data/sd_list.xlsx") %>%
dplyr::mutate(state = str_split(acfrs_file_name, " ",simplify = TRUE)[ ,1]) %>%
mutate(name = str_remove_all(acfrs_file_name, "2020.pdf"),
name = str_sub(name, 3)) %>% arrange(name)
```
## ACFRs
```{r}
acfrs_school_districts <- readRDS("data_from_dbsite.RDS") %>%
rename(acfrs_original_name = name) %>%
filter(category == "School District") %>%
mutate(name = str_to_lower(acfrs_original_name)) %>%
#select(-c(census_id, has_unconfirmed, year, category, id)) %>%
select(state, acfrs_original_name, total_liabilities, name) %>%
mutate(name = str_remove_all(name, "no\\.|#|'")) %>%
mutate(name = str_replace_all(name, "/", " ")) %>%
mutate(name = str_replace_all(name, "\\.", " ")) %>%
mutate(name = str_replace_all(name, "-", " ")) %>%
mutate(name = str_remove_all(name, "(community consolidated school district)|(community consolidated schools district)|(joint unified school district)|(center unified school district)"),
#name = str_remove_all(name, ""),
name = str_remove_all(name, "(consolidated high school district)|(consolidated independent school district)"),
name = str_remove_all(name, "(union high school district)|(city school district)|(union elementary school district)|(union school district)|(county unified school district)|(joint unified school district)"), # cali
name = str_remove_all(name, "(county school district)|(county independent school district)"),
name = str_remove_all(name, "(community unit school district)|(community unit district)"),
name = str_remove_all(name, "(public school district)|(public schools district)|(independent school district)|(district school board)"),
#OH
name = str_remove_all(name, "(union exempted village school district)|(exempted village school district)|(county joint vocational school district)"),
name = str_remove_all(name, "(high school district)|(local school district)"),
name = str_remove_all(name, "educational service district"),
name = str_remove_all(name, "fractional township"),
name = str_remove_all(name, "(the school districts of)|(board of education)|(the school board of)|(public school system)"),
name = str_remove_all(name, "unified school district"),
name = str_remove_all(name, "(community school district)|(comm unit school)|(community school dist)"),
name = str_remove_all(name,"(elementary school district)|(elementary scool district)"),
name = str_remove_all(name,"public schools"),
name = str_remove_all(name,"grade school district"),
name = str_remove_all(name, "(school district)|(comm sch dist)|(elem sch dist)|(sch dist)|(ind sch dist)")) %>%
mutate(name = str_squish(name))
```
## Govt name in NCES
```{r}
# File Paul sent to Marc: "dataformarc" file. Email Sep 7, 2022
govname_nces_id <- rio::import("data/censusID_necesID_link.xlsx") %>%
# name in this file is government unit name - also the name in file "Govt_Units_2021_Final.xlsx", sheet 3 + sheet 4
rename(gov_unit_original_name = name) %>%
mutate(gov_unit_original_name = str_to_lower(gov_unit_original_name)) %>%
mutate(name = str_trim(gov_unit_original_name)) %>%
rename(ncesid = `NCES Agency Identification Number`,
censusid = idcensus) %>%
select(censusid, ncesid, gov_unit_original_name, name)
# NCES list only has 13,713
nces <- rio::import(here::here("data", "ncesdata_DBBFFFC.xlsx"), skip = 14) %>%
select(`NCES District ID`, `District Name`, `County Name*`, City, State, `Students*`) %>%
rename(nces_original_name = `District Name`,
county_nces = `County Name*`,
state = State,
student = `Students*`,
ncesid = `NCES District ID`,
city_nces = City
)
nces %>% write_csv("nces_census.csv")
```
## Cleaning govt unit name
```{r}
census_gov_unit <- nces %>% left_join(govname_nces_id) %>% #write_csv("nces_census.csv")
# Only get those 13,713 in NCES to match with ACFRs
mutate(name = str_replace_all(name, "\\.", " ")) %>%
mutate(name = str_replace_all(name, "/", " ")) %>%
mutate(name = str_replace_all(name, "-|&", " ")) %>%
mutate(name = str_remove_all(name, "(community consolidated school district)"),
name = str_remove_all(name, "(community unit school district)|(community unit)"),
name = str_remove_all(name, "consolidated school district"),
name = str_remove_all(name, "county school district"),
name = str_remove_all(name, "community consolidated schools district"),
name = str_remove_all(name, "(community high school district)|(high school district)"),
name = str_remove_all(name, "community consolidated school district"),
name = str_remove_all(name, "consolidated high school district"),
name = str_remove_all(name,"(city unified sch dist)|(joint unified school district)"),
name = str_remove_all(name, "(unified school district)|(union high school dist)|(co office of ed)|(unified sch dist)|(unified school dist)|(union elem sch dist)|(co unif sch dist)|(union elementary sch dist)"),
name = str_remove_all(name,"(public school district)|(public schools)|(unit school district)|(union school district)"),
name = str_remove_all(name,"(elementary school district)|(elementary scool district)|(elem school district)|(elementary school dist)"),
# Ohio
name = str_remove_all(name, "(local school district)|(local sch dist)|(local school dist)|(jt voc sch dist)|(exempted sch dist)|(city sch dist)|(ex vlg sch dist)|(union sch dist)|(ex vlg school dist)"),
name = str_remove_all(name, "co jt voc sch dist"),
name = str_remove_all(name, "(community school district)|(community unit district)|(comm college district)|(uni sch dist)|(un sch dist)"),
name = str_remove_all(name, "(co ind sch dist)|(unif school dist)|(unif sch dist)|(union elem sch dt)|(jt unified sch dist)|(jt union high school dist)|(elem sch district)|(jt elem sch dist)"), #
name = str_remove_all(name, "(district school board)|(ind sch district)|(ind sch dist)|(cons sch dist)|(ind school district)"),
name = str_remove_all(name, "(school district)|(district)|(comm sch dist)|(elem sch dist)|(sch dist)|(fr t h school district)|(elem sch dt)|(union el sch d)|(jt uni sch dist)")
) %>%
# Texas
mutate(name = ifelse(state == "TX", str_remove_all(name, "[0-9]"), name)) %>%
mutate(name = str_squish(name))
round1 <- acfrs_school_districts %>% left_join(census_gov_unit) %>% drop_na(censusid)
#round1 %>% filter(enrollment == 0)
round1 %>% select(nces_original_name, acfrs_original_name, gov_unit_original_name)
```
# Round 2
## ACFRs
```{r}
acfrs_sd_2 <- acfrs_school_districts %>% filter(!acfrs_original_name %in% round1$acfrs_original_name)%>%
mutate(name = str_replace_all(name, "-|,|&|#|_", " ")) %>%
mutate(name = str_remove_all(name, "(school district of the city of)|(consolidated school district)|(consolidated schools)|(community schools)|intermediate|(office of education)|(city sch dist)|(independent public school district)"),
name = str_remove_all(name, "^(the)"),
name = str_remove_all(name, "central|(union free)|(counties boces)|(county board of cooperative educational services)|centre|(community school)|(community high school)"),
#Michigan
name = str_remove_all(name, "(union free school district)|(city school district)|(board of cooperative educational services of)|(schools)|township|(district schools)|(union schools)|(public school of)"),
name = str_remove_all(name, "public school"),
name = str_remove_all(name, "^of "),
name = str_remove_all(name, "(school)|county|consolidated|(isd)|( joint)|( district)|( community)|(union districit)$")) %>%
mutate(name = ifelse(state == "OK", str_replace_all(name, " 00", " "), name)) %>%
mutate(name = ifelse(state == "OK", str_replace_all(name, "( i )|(c0)|( c)|( c )|( 0)|( 1 )", " "), name)) %>%
mutate(name = ifelse(state == "OK", str_replace_all(name, "( ity)|( 0)", " "), name)) %>%
mutate(name = str_remove_all(name, "number|(independent)")) %>%
mutate(name = str_replace_all(name, "( d )|( no )|( o[0-9])", " ")) %>%
mutate(name = str_squish(name))
```
## Gov Units
```{r}
census_sd_2 <- census_gov_unit %>% filter(!gov_unit_original_name %in% round1$gov_unit_original_name) %>%
mutate(name = str_replace_all(name, "-|,|&|#|_", " "),
name = str_remove_all(name, "'")) %>%
mutate(name = ifelse(state == "MI", str_remove_all(name, "[0-9]"), name)) %>%
mutate(name = str_remove_all(name, "(consolidated school district)|(cons school)|(joint community college)|(county community school corporation)"),
name = str_remove_all(name, "(union free school district)|(uf sch dist)|(central sch dist)|(union free)|(ctl high school dist)|(pt ool dist)|central|(centre union free school dist)"),
name = str_remove_all(name, "(city school dist)|(central sch)|(comm college)|(community college)|(ctl sch dist)|(ctl school dist)|(co comm coll)|(community high school)|(pub sch dist)|(comm sch dist)"),
name = str_remove_all(name, "(school district)|central|( ool dist)|(u f school dist)|(ctl sch)|(uf school dist)|(school dist)|(comm schs)|(city sch dist)|(township sch dist)"),
name = str_remove_all(name,"(community sch dist)|(comm school dist)"),
name = str_remove_all(name, "^of "),
name = str_remove_all(name, "(u f)|(twp)$"),
name = str_remove_all(name, "(uf)|(isd)$"),
name = str_remove_all(name, "( ool)|( pt)|( csd)$"),
name = str_remove_all(name, "( ctl)|(c s d)|( schs)$"),
name = str_remove_all(name, "(comm schools)|(schs dist)|(public school)|(consolidated school)|(community schools)|(joint union)"),
name = str_remove_all(name,
"( schools)|( public)|(co schools)|(township)|(pub)|(twp)|( comm)|( community)|( cmty)|(twp f)|(pub fr)|(consol)|( sch)|( scools)|(college)|(township f)|(twp fr)|( co)|( union)|( joint)$"),
name = str_replace_all(name, "( i 00)", " ")) %>%
mutate(name = str_squish(name))
```
```{r}
round2 <- acfrs_sd_2 %>% left_join(census_sd_2) %>% drop_na(censusid)
round1_2 <- round1 %>% rbind(round2)
```
# Round 3
```{r}
# after round 2, how many each state has left NOT matched
acfrs_sd_2 %>% left_join(census_sd_2) %>% filter(is.na(censusid)) %>% count(state) %>% arrange(desc(n))
## after round 2, how many ACFRS left in total NOT matched
acfrs_sd_3 <- acfrs_school_districts %>% filter(!acfrs_original_name %in% round1_2$acfrs_original_name)
```
```{r}
# after round 2, how many census left in total NOT matched
census_sd_3 <- census_gov_unit %>% filter(!gov_unit_original_name %in% round1_2$gov_unit_original_name)
```
Now need to match acfrs_sd_3 and census_sd_3
```{r}
acfrs_sd_3_clean <- acfrs_sd_3 %>% #filter(state == "ME") %>% arrange(name) %>%
mutate(name = str_remove_all(name, "(community schools district)|(county schools district)"),
name = str_remove_all(name, "(school disrict)|(community schools)|(community schools)"),
name = str_remove_all(name, "( r)|( county)|( consolidated)$")) %>%
mutate(name = ifelse(state == "NE", str_remove_all(name, " [0-9]+$"), name)) %>%
mutate(name = str_remove_all(name, "( municipal)|( city)|( union)$")) %>%
mutate(name = str_squish(name))
```
```{r}
census_sd_3_clean <- census_sd_3 %>%
filter(student > 0) %>%
#filter(state == "ME") %>%
mutate(name = str_remove_all(name,"(ind school dist)|(independent rict)|(community college dist)|(community college)|(br school dist)")) %>%
mutate(name = str_remove_all(name, "( rict )|(county unified school system)"),
name = str_remove_all(name, "(city sd)|(city pub[0-9])|(city pub)"),
name = str_replace_all(name, "( 0)", " "),
name = str_replace_all(name, "serv", "service")) %>%
# name = str_remove_all(name, " [0-9]+$")) %>% #filter(str_detect(name, "city pub"))
mutate(name = ifelse(state == "NE", str_remove_all(name, " [0-9]+$"), name)) %>%
mutate(name = str_remove_all(name,"( college)|( independent)|( cons)|( i s)|( rict)|( co cons)|( school)|( city)|( co)|( comm)|( ind sh)|( indep)$"),
name = str_remove(name, "(olidated)|( munc)$")
) %>%
mutate(name = str_squish(name))
```
```{r}
round_3 <- acfrs_sd_3_clean %>% left_join(census_sd_3_clean) %>% drop_na(censusid)
# NOT matched after round 3
acfrs_sd_3_clean %>% filter(!acfrs_original_name %in% round_3$acfrs_original_name) %>% count(state) %>% arrange(desc(n))
filter(state == "TX") %>% arrange(name) #filter(str_detect(name, "goose"))
```
```{r}
round123 <- round1_2 %>% rbind(round_3)
round123 %>% select(state, acfrs_original_name, nces_original_name, gov_unit_original_name, name, ncesid, censusid, county_nces, city_nces, student) #%>% write_csv("acfrs_necs_census_matched.csv")
length(unique(round123$name))
round123 %>%
filter(duplicated(name, total_liabilities)) %>% arrange(name)%>% filter(str_detect(acfrs_original_name, "Bangor"))
```
# Round 4
```{r}
acfrs_school_districts %>% filter(!acfrs_original_name %in% round123$acfrs_original_name) %>% count(state) %>% arrange(desc(n))
```
## CA
```{r}
ca_acfrs <- acfrs_school_districts %>% filter(!acfrs_original_name %in% round123$acfrs_original_name) %>% filter(state == "CA") %>% arrange(name) %>%
mutate(name = str_remove_all(name, "(county special education local plan area)"),
name = str_remove_all(name, "(valley)|(park)|(creek)|(joint)$")
) %>%
mutate(name = case_when(acfrs_original_name == "E L Segundo Unified School District" ~ "el segundo",
TRUE ~ name),
name = str_squish(name))
```
##
```{r}
ca_census <- census_gov_unit %>% filter(!gov_unit_original_name %in% round123$gov_unit_original_name) %>% filter(state == "CA") %>% arrange(name) %>%
mutate(name = str_remove_all(name, "(val)|(pk unif sch dis)|(jt high)|(jt uhs dist)|(co spl schs oper by co supt)|(school dist)|(joint unified)")) %>%
mutate(name = case_when(gov_unit_original_name == "calexico unif sch dist" ~ "calexico",
TRUE ~ name)) %>%
mutate(name = str_remove_all(name, "( jt)|( union elem)|(creek el)|( ctr)|(union)|(joint)|(union h)$"),
name = str_squish(name))
ca_acfrs %>% left_join(ca_census) %>% drop_na(censusid)
```