acfrs_govname_ncesid.Rmd

---
title: "Untitled"
output: html_document
date: '2022-08-31'
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(threadr)
```

# Round 1

```{r}
acfrs_file_name <- rio::import("data/sd_list.xlsx") %>% 
  dplyr::mutate(state = str_split(acfrs_file_name, " ",simplify = TRUE)[ ,1]) %>% 
  mutate(name = str_remove_all(acfrs_file_name, "2020.pdf"),
        name = str_sub(name, 3)) %>% arrange(name)
```


## ACFRs
```{r}
acfrs_school_districts <- readRDS("data_from_dbsite.RDS") %>% 
  rename(acfrs_original_name = name) %>% 
  filter(category == "School District") %>% 
  mutate(name = str_to_lower(acfrs_original_name)) %>% 
  #select(-c(census_id, has_unconfirmed, year, category, id)) %>% 
  select(state, acfrs_original_name, total_liabilities, name) %>% 
  
  
  mutate(name = str_remove_all(name, "no\\.|#|'")) %>% 
  mutate(name = str_replace_all(name, "/", " ")) %>%
  mutate(name = str_replace_all(name, "\\.", " ")) %>% 
  mutate(name = str_replace_all(name, "-", " ")) %>% 
  
  mutate(name = str_remove_all(name, "(community consolidated school district)|(community consolidated schools district)|(joint unified school district)|(center unified school district)"),
         #name = str_remove_all(name, ""),
         name = str_remove_all(name, "(consolidated high school district)|(consolidated independent school district)"),
         name = str_remove_all(name, "(union high school district)|(city school district)|(union elementary school district)|(union school district)|(county unified school district)|(joint unified school district)"), # cali
         name = str_remove_all(name, "(county school district)|(county independent school district)"),
         name = str_remove_all(name, "(community unit school district)|(community unit district)"),
         name = str_remove_all(name, "(public school district)|(public schools district)|(independent school district)|(district school board)"),
         #OH
         name = str_remove_all(name, "(union exempted village school district)|(exempted village school district)|(county joint vocational school district)"), 
         name = str_remove_all(name, "(high school district)|(local school district)"),
         name = str_remove_all(name, "educational service district"),
         name = str_remove_all(name, "fractional township"), 
         name = str_remove_all(name, "(the school districts of)|(board of education)|(the school board of)|(public school system)"),
         name = str_remove_all(name, "unified school district"),
         name = str_remove_all(name, "(community school district)|(comm unit school)|(community school dist)"),
         name = str_remove_all(name,"(elementary school district)|(elementary scool district)"),
         name = str_remove_all(name,"public schools"),
         name = str_remove_all(name,"grade school district"),
         name = str_remove_all(name, "(school district)|(comm sch dist)|(elem sch dist)|(sch dist)|(ind sch dist)")) %>% 
  mutate(name = str_squish(name))
```

## Govt name in NCES
```{r}
# File Paul sent to Marc: "dataformarc" file. Email Sep 7, 2022
govname_nces_id <- rio::import("data/censusID_necesID_link.xlsx") %>% 
  # name in this file is government unit name - also the name in file "Govt_Units_2021_Final.xlsx", sheet 3 + sheet 4
  rename(gov_unit_original_name = name) %>% 
  mutate(gov_unit_original_name = str_to_lower(gov_unit_original_name)) %>% 
  mutate(name = str_trim(gov_unit_original_name)) %>% 
  rename(ncesid = `NCES Agency Identification Number`, 
         censusid = idcensus) %>% 
select(censusid, ncesid, gov_unit_original_name, name) 

# NCES list only has 13,713
nces <- rio::import(here::here("data", "ncesdata_DBBFFFC.xlsx"), skip = 14) %>% 
  select(`NCES District ID`, `District Name`, `County Name*`, City, State, `Students*`) %>% 
  rename(nces_original_name = `District Name`,
    county_nces = `County Name*`, 
    state = State, 
    student = `Students*`, 
    ncesid = `NCES District ID`,
    city_nces = City
   ) 
nces %>% write_csv("nces_census.csv")
```

## Cleaning govt unit name
```{r}
census_gov_unit <- nces %>% left_join(govname_nces_id) %>% #write_csv("nces_census.csv")
  
  # Only get those 13,713 in NCES to match with ACFRs
  mutate(name = str_replace_all(name, "\\.", " ")) %>% 
  mutate(name = str_replace_all(name, "/", " ")) %>% 
  mutate(name = str_replace_all(name, "-|&", " ")) %>% 
  
  mutate(name = str_remove_all(name, "(community consolidated school district)"),
         name = str_remove_all(name, "(community unit school district)|(community unit)"),
         name = str_remove_all(name, "consolidated school district"), 
         name = str_remove_all(name, "county school district"),
         name = str_remove_all(name, "community consolidated schools district"),
         name = str_remove_all(name, "(community high school district)|(high school district)"), 
         name = str_remove_all(name, "community consolidated school district"),
         name = str_remove_all(name, "consolidated high school district"),
         name = str_remove_all(name,"(city unified sch dist)|(joint unified school district)"), 
        name = str_remove_all(name, "(unified school district)|(union high school dist)|(co office of ed)|(unified sch dist)|(unified school dist)|(union elem sch dist)|(co unif sch dist)|(union elementary sch dist)"),
         
         name = str_remove_all(name,"(public school district)|(public schools)|(unit school district)|(union school district)"),
         name = str_remove_all(name,"(elementary school district)|(elementary scool district)|(elem school district)|(elementary school dist)"),
        # Ohio
        name = str_remove_all(name, "(local school district)|(local sch dist)|(local school dist)|(jt voc sch dist)|(exempted sch dist)|(city sch dist)|(ex vlg sch dist)|(union sch dist)|(ex vlg school dist)"), 
        name = str_remove_all(name, "co jt voc sch dist"),
        
         name = str_remove_all(name, "(community school district)|(community unit district)|(comm college district)|(uni sch dist)|(un sch dist)"),
         name = str_remove_all(name, "(co ind sch dist)|(unif school dist)|(unif sch dist)|(union elem sch dt)|(jt unified sch dist)|(jt union high school dist)|(elem sch district)|(jt elem sch dist)"), #
         name = str_remove_all(name, "(district school board)|(ind sch district)|(ind sch dist)|(cons sch dist)|(ind school district)"),
         
         name = str_remove_all(name, "(school district)|(district)|(comm sch dist)|(elem sch dist)|(sch dist)|(fr t h school district)|(elem sch dt)|(union el sch d)|(jt uni sch dist)")
         ) %>% 
  
     # Texas 
  mutate(name = ifelse(state == "TX", str_remove_all(name, "[0-9]"), name)) %>% 
  mutate(name = str_squish(name))

round1 <- acfrs_school_districts %>% left_join(census_gov_unit) %>% drop_na(censusid)
#round1 %>% filter(enrollment == 0)

round1 %>% select(nces_original_name, acfrs_original_name, gov_unit_original_name)
```

# Round 2 
## ACFRs
```{r}
acfrs_sd_2 <- acfrs_school_districts %>% filter(!acfrs_original_name %in% round1$acfrs_original_name)%>% 
  mutate(name = str_replace_all(name, "-|,|&|#|_", " ")) %>%  
  mutate(name = str_remove_all(name, "(school district of the city of)|(consolidated school district)|(consolidated schools)|(community schools)|intermediate|(office of education)|(city sch dist)|(independent public school district)"),
         name = str_remove_all(name, "^(the)"),
    name = str_remove_all(name, "central|(union free)|(counties boces)|(county board of cooperative educational services)|centre|(community school)|(community high school)"),
  #Michigan
  name = str_remove_all(name, "(union free school district)|(city school district)|(board of cooperative educational services of)|(schools)|township|(district schools)|(union schools)|(public school of)"),
  
  name = str_remove_all(name, "public school"),
  name = str_remove_all(name, "^of "),
  name = str_remove_all(name, "(school)|county|consolidated|(isd)|( joint)|( district)|( community)|(union districit)$")) %>% 
  mutate(name = ifelse(state == "OK", str_replace_all(name, " 00", " "), name)) %>% 
  mutate(name = ifelse(state == "OK", str_replace_all(name, "( i )|(c0)|( c)|( c )|( 0)|( 1 )", " "), name)) %>%
mutate(name = ifelse(state == "OK", str_replace_all(name, "( ity)|( 0)", " "), name)) %>%
  mutate(name = str_remove_all(name, "number|(independent)")) %>% 
  mutate(name = str_replace_all(name, "( d )|( no )|( o[0-9])", " ")) %>% 
  mutate(name = str_squish(name))
```

## Gov Units
```{r}
census_sd_2 <- census_gov_unit %>% filter(!gov_unit_original_name %in% round1$gov_unit_original_name) %>% 
mutate(name = str_replace_all(name, "-|,|&|#|_", " "),
       name = str_remove_all(name, "'")) %>%  
  mutate(name = ifelse(state == "MI", str_remove_all(name, "[0-9]"), name)) %>% 
mutate(name = str_remove_all(name, "(consolidated school district)|(cons school)|(joint community college)|(county community school corporation)"),
       
       name = str_remove_all(name, "(union free school district)|(uf sch dist)|(central sch dist)|(union free)|(ctl high school dist)|(pt ool dist)|central|(centre union free school dist)"),
  name = str_remove_all(name, "(city school dist)|(central sch)|(comm college)|(community college)|(ctl sch dist)|(ctl school dist)|(co comm coll)|(community high school)|(pub sch dist)|(comm sch dist)"),
  name = str_remove_all(name, "(school district)|central|( ool dist)|(u f school dist)|(ctl sch)|(uf school dist)|(school dist)|(comm schs)|(city sch dist)|(township sch dist)"),
  
  name = str_remove_all(name,"(community sch dist)|(comm school dist)"),
  name = str_remove_all(name, "^of "),
  
  name = str_remove_all(name, "(u f)|(twp)$"),
  name = str_remove_all(name, "(uf)|(isd)$"),
  name = str_remove_all(name, "( ool)|( pt)|( csd)$"),
  name = str_remove_all(name, "( ctl)|(c s d)|( schs)$"),
  
  name = str_remove_all(name, "(comm schools)|(schs dist)|(public school)|(consolidated school)|(community schools)|(joint union)"),
  name = str_remove_all(name, 
"( schools)|( public)|(co schools)|(township)|(pub)|(twp)|( comm)|( community)|( cmty)|(twp f)|(pub fr)|(consol)|( sch)|( scools)|(college)|(township f)|(twp fr)|( co)|( union)|( joint)$"),
  name = str_replace_all(name, "( i 00)", " ")) %>% 

  mutate(name = str_squish(name)) 

```

```{r}
round2 <- acfrs_sd_2 %>% left_join(census_sd_2) %>% drop_na(censusid) 
round1_2 <- round1 %>% rbind(round2)
```

# Round 3
```{r}
# after round 2, how many each state has left NOT matched
acfrs_sd_2 %>% left_join(census_sd_2) %>% filter(is.na(censusid)) %>% count(state) %>% arrange(desc(n))

## after round 2, how many ACFRS left in total NOT matched
acfrs_sd_3 <- acfrs_school_districts %>% filter(!acfrs_original_name %in% round1_2$acfrs_original_name) 
```

```{r}
# after round 2, how many census left in total NOT matched
census_sd_3 <- census_gov_unit %>% filter(!gov_unit_original_name %in% round1_2$gov_unit_original_name)

```

Now need to match acfrs_sd_3 and census_sd_3
```{r}
acfrs_sd_3_clean <- acfrs_sd_3 %>% #filter(state == "ME") %>% arrange(name) %>% 
  mutate(name = str_remove_all(name, "(community schools district)|(county schools district)"),
    name = str_remove_all(name, "(school disrict)|(community schools)|(community schools)"),
         
         name = str_remove_all(name, "( r)|( county)|( consolidated)$")) %>% 
    mutate(name = ifelse(state == "NE", str_remove_all(name, " [0-9]+$"), name)) %>% 
  mutate(name = str_remove_all(name, "( municipal)|( city)|( union)$")) %>% 
  mutate(name = str_squish(name))
```

```{r}
census_sd_3_clean <- census_sd_3 %>% 
  filter(student > 0) %>% 
  #filter(state == "ME") %>% 
  mutate(name = str_remove_all(name,"(ind school dist)|(independent rict)|(community college dist)|(community college)|(br school dist)")) %>% 
  mutate(name = str_remove_all(name, "( rict )|(county unified school system)"),
         name = str_remove_all(name, "(city sd)|(city pub[0-9])|(city pub)"),
         
    name = str_replace_all(name, "( 0)", " "),
         name = str_replace_all(name, "serv", "service")) %>% 
   # name = str_remove_all(name, " [0-9]+$")) %>% #filter(str_detect(name, "city pub"))
  mutate(name = ifelse(state == "NE", str_remove_all(name, " [0-9]+$"), name)) %>% 

mutate(name = str_remove_all(name,"( college)|( independent)|( cons)|( i s)|( rict)|( co cons)|( school)|( city)|( co)|( comm)|( ind sh)|( indep)$"),
  name = str_remove(name, "(olidated)|( munc)$")
  ) %>% 
   mutate(name = str_squish(name))
```

```{r}
round_3 <- acfrs_sd_3_clean %>% left_join(census_sd_3_clean) %>% drop_na(censusid)
# NOT matched after round 3
acfrs_sd_3_clean %>% filter(!acfrs_original_name %in% round_3$acfrs_original_name) %>% count(state) %>% arrange(desc(n))
  filter(state == "TX") %>% arrange(name)  #filter(str_detect(name, "goose"))
```

```{r}
round123 <- round1_2 %>% rbind(round_3) 

round123 %>% select(state, acfrs_original_name, nces_original_name, gov_unit_original_name, name, ncesid, censusid, county_nces, city_nces, student) #%>% write_csv("acfrs_necs_census_matched.csv")

length(unique(round123$name))
round123 %>% 
  filter(duplicated(name, total_liabilities)) %>% arrange(name)%>% filter(str_detect(acfrs_original_name, "Bangor"))
```


# Round 4

```{r}
acfrs_school_districts %>% filter(!acfrs_original_name %in% round123$acfrs_original_name) %>% count(state) %>% arrange(desc(n))
```

## CA
```{r}
ca_acfrs <- acfrs_school_districts %>% filter(!acfrs_original_name %in% round123$acfrs_original_name) %>% filter(state == "CA") %>% arrange(name) %>% 
mutate(name = str_remove_all(name, "(county special education local plan area)"),
       name = str_remove_all(name, "(valley)|(park)|(creek)|(joint)$")
       ) %>% 
  
  mutate(name = case_when(acfrs_original_name == "E L Segundo Unified School District" ~ "el segundo",
                          TRUE ~ name),
         name = str_squish(name))
         
```
##
```{r}
ca_census <- census_gov_unit %>% filter(!gov_unit_original_name %in% round123$gov_unit_original_name) %>% filter(state == "CA") %>% arrange(name) %>% 
  mutate(name = str_remove_all(name, "(val)|(pk unif sch dis)|(jt high)|(jt uhs dist)|(co spl schs oper by co supt)|(school dist)|(joint unified)")) %>% 
  mutate(name = case_when(gov_unit_original_name == "calexico unif sch dist" ~ "calexico",
         TRUE ~ name)) %>% 
    mutate(name = str_remove_all(name, "( jt)|( union elem)|(creek el)|( ctr)|(union)|(joint)|(union h)$"),
           name = str_squish(name))
ca_acfrs %>% left_join(ca_census) %>% drop_na(censusid)           
```