Skip to content

Commit

Permalink
Merge pull request #156 from declanrjb/master
Browse files Browse the repository at this point in the history
data and parsers for chester, clearfield, and cumberland
  • Loading branch information
dwillis authored Aug 19, 2024
2 parents aa20dcf + cff9d3b commit fa507c1
Show file tree
Hide file tree
Showing 6 changed files with 32,356 additions and 0 deletions.
18,685 changes: 18,685 additions & 0 deletions 2024/20240423__pa__primary__chester__precinct.csv

Large diffs are not rendered by default.

5,391 changes: 5,391 additions & 0 deletions 2024/20240423__pa__primary__clearfield__precinct.csv

Large diffs are not rendered by default.

7,583 changes: 7,583 additions & 0 deletions 2024/20240423__pa__primary__cumberland__precinct.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import pdfplumber\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"def extract_votes_from_row(data_row,header):\n",
" votes_ls = re.search(r'[0-9]+\\s[0-9]+\\s[0-9]+\\s[0-9]+',data_row).group(0).split(' ')\n",
" votes = {}\n",
" for i in range(0,len(header)):\n",
" votes[header[i]] = [votes_ls[i]]\n",
" df = pd.DataFrame(votes)\n",
" df['candidate'] = re.search(r'[^0-9]*',data_row).group(0).strip()\n",
" return df\n",
"\n",
"def extract_votes(data_rows,header):\n",
" return pd.concat([extract_votes_from_row(data_row,header) for data_row in data_rows])\n",
"\n",
"def extract_data_rows(table_rows):\n",
" data_rows = []\n",
" for row in table_rows:\n",
" if not re.search(r'[0-9]+\\s[0-9]+\\s[0-9]+\\s[0-9]+',row) is None:\n",
" data_rows.append(row)\n",
"\n",
" return data_rows\n",
"\n",
"def extract_box_data(page,bbox):\n",
" data_section = page.crop(bbox)\n",
" data_text = data_section.extract_text()\n",
" table_rows = data_text.split('\\n')\n",
" race_title = table_rows[0]\n",
"\n",
" table_header = [x.replace('\\n',' ') for x in list(filter(lambda x: len(x) > 0,data_section.extract_table()[0]))]\n",
" \n",
" data_rows = extract_data_rows(table_rows)\n",
"\n",
" df = extract_votes(data_rows,table_header)\n",
"\n",
" if not re.search(r'^[A-Z]{,3}',race_title) is None:\n",
" df['party'] = re.search(r'^[A-Z]{,3}',race_title).group(0)\n",
" if not re.search(r'[0-9]+[a-z]{2}\\sDistrict',race_title) is None:\n",
" df['district'] = re.search(r'[0-9]+[a-z]{2}\\sDistrict',race_title).group(0)\n",
" if not re.search(r'\\s\\D+',race_title) is None:\n",
" df['office'] = re.search(r'\\s\\D+',race_title).group(0).strip()\n",
"\n",
" return df\n",
"\n",
"def extract_precinct_name(page,strip_start=80,strip_height=25):\n",
" return page.crop((0,strip_start,page.width,strip_start+strip_height)).extract_text()\n",
"\n",
"def extract_page_data(page):\n",
" vote_headers = page.search('Vote For')\n",
" all_data = []\n",
" i = 0\n",
" while i < len(vote_headers):\n",
" if i < len(vote_headers) - 1:\n",
" pair = vote_headers[i:i+2]\n",
" bbox = (0,pair[0]['bottom']-30,page.width,pair[1]['top']-20)\n",
" else:\n",
" bbox = (0,vote_headers[i]['bottom']-30,page.width,page.height)\n",
" temp = extract_box_data(page,bbox)\n",
" all_data.append(temp)\n",
" i += 1\n",
"\n",
" df = pd.concat(all_data)\n",
" df['precinct'] = extract_precinct_name(page)\n",
" return df\n",
"\n",
"def extract_statistics(page):\n",
" if len(page.search('STATISTICS')) > 0:\n",
" bbox = (0,page.search('STATISTICS')[0]['bottom'],page.width,page.search('STATISTICS')[0]['bottom'] + 150)\n",
" stats_text = page.crop(bbox).extract_text()\n",
"\n",
" stats = pd.DataFrame({\n",
" 'Registered Voters': [re.search(r'Registered Voters - Total ([0-9]*)',stats_text).group(1)],\n",
" 'Ballots Cast': [re.search(r'Ballots Cast - Total ([0-9]*)',stats_text).group(1)]\n",
" })\n",
"\n",
" stats = stats.melt().rename(columns={\n",
" 'variable':'office',\n",
" 'value':'votes'\n",
" })\n",
"\n",
" stats['precinct'] = extract_precinct_name(page)\n",
"\n",
" return stats\n",
" else:\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"file = '../data_2024/primary/Chester PA 2024Primary_Official_Precinct_Results.pdf'\n",
"pdf = pdfplumber.open(file)\n",
"county_name = 'Chester'"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"df = pd.concat([extract_page_data(page) for page in pdf.pages])"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"df['county'] = county_name"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"df = df.rename(columns={\n",
" 'TOTAL':'votes',\n",
" 'Election Day':'election_day',\n",
" 'Provisional Votes':'provisional',\n",
" 'Mail Votes':'absentee'\n",
"})\n",
"df = df[df['candidate'].apply(lambda x: x not in ['Total Votes Cast','Overvotes','Undervotes','Contest Totals'])]"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"stats_df = pd.concat([extract_statistics(page) for page in pdf.pages if len(page.search('STATISTICS')) > 0])\n",
"stats_df['county'] = county_name"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"df = pd.concat([df,stats_df])\n",
"df['district'] = df['district'].str.extract('(\\d+)')\n",
"df = df.fillna('')\n",
"df = df.reset_index()\n",
"df = df[['county','precinct','office','district','party','candidate','votes','election_day','provisional','absentee']]"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"df['candidate'] = df['candidate'].str.title()\n",
"df['office'] = df['office'].replace('President of the United States','President')\n",
"df['office'] = df['office'].replace('United States Senator','U.S. Senate')\n",
"df['office'] = df['office'].replace('Representative in Congress','U.S. House')\n",
"df['office'] = df['office'].replace('Senator in the General Assembly','State Senate')\n",
"df['office'] = df['office'].replace('Representative in the General Assembly','General Assembly')"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"df = df.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(f'../data_cleaned/20240423__pa__primary__{county_name.lower()}__precinct.csv',index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit fa507c1

Please sign in to comment.