-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #156 from declanrjb/master
data and parsers for chester, clearfield, and cumberland
- Loading branch information
Showing
6 changed files
with
32,356 additions
and
0 deletions.
There are no files selected for viewing
18,685 changes: 18,685 additions & 0 deletions
18,685
2024/20240423__pa__primary__chester__precinct.csv
Large diffs are not rendered by default.
Oops, something went wrong.
5,391 changes: 5,391 additions & 0 deletions
5,391
2024/20240423__pa__primary__clearfield__precinct.csv
Large diffs are not rendered by default.
Oops, something went wrong.
7,583 changes: 7,583 additions & 0 deletions
7,583
2024/20240423__pa__primary__cumberland__precinct.csv
Large diffs are not rendered by default.
Oops, something went wrong.
226 changes: 226 additions & 0 deletions
226
parsers/2024-primary_parsers/pa_chester_primary_2024_results_parser.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,226 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 64, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import pandas as pd\n", | ||
"import pdfplumber\n", | ||
"import re" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 65, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def extract_votes_from_row(data_row,header):\n", | ||
"    \"\"\"Parse one results line into a DataFrame row.\n", | ||
"\n", | ||
"    The line must contain four consecutive whitespace-separated counts; the\n", | ||
"    counts are paired with the names in `header` by position and kept as\n", | ||
"    strings. The text before the first digit becomes the `candidate` value.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        data_row: One line of text taken from a results table.\n", | ||
"        header: Column names for the counts, in the order they appear.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A DataFrame with a column per `header` entry plus `candidate`.\n", | ||
"\n", | ||
"    Raises:\n", | ||
"        AttributeError: If the line has no run of four counts.\n", | ||
"        IndexError: If `header` has more entries than counts were found.\n", | ||
"    \"\"\"\n", | ||
"    counts_match = re.search(r'[0-9]+\\s[0-9]+\\s[0-9]+\\s[0-9]+',data_row)\n", | ||
"    # The pattern allows any whitespace between counts, so split on any\n", | ||
"    # whitespace as well; splitting on a literal space failed on tabs.\n", | ||
"    votes_ls = counts_match.group(0).split()\n", | ||
"    votes = {column: [votes_ls[i]] for i, column in enumerate(header)}\n", | ||
"    df = pd.DataFrame(votes)\n", | ||
"    df['candidate'] = re.search(r'[^0-9]*',data_row).group(0).strip()\n", | ||
"    return df\n", | ||
"\n", | ||
"def extract_votes(data_rows,header):\n", | ||
"    \"\"\"Stack the parsed form of every data row into one DataFrame.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        data_rows: Lines of text that each hold a label and vote counts.\n", | ||
"        header: Column names for the counts, passed through to the row parser.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        The row-wise concatenation of the per-line frames, or an empty frame\n", | ||
"        with the expected columns when `data_rows` is empty.\n", | ||
"    \"\"\"\n", | ||
"    # pd.concat raises ValueError on an empty list, so a contest box with no\n", | ||
"    # parseable rows would otherwise abort the whole run.\n", | ||
"    if not data_rows:\n", | ||
"        return pd.DataFrame(columns=list(header) + ['candidate'])\n", | ||
"    return pd.concat([extract_votes_from_row(data_row,header) for data_row in data_rows])\n", | ||
"\n", | ||
"def extract_data_rows(table_rows):\n", | ||
"    \"\"\"Keep only the lines that contain four consecutive vote counts.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        table_rows: Lines of text from one contest box.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A list of the matching lines, in their original order.\n", | ||
"    \"\"\"\n", | ||
"    counts_pattern = re.compile(r'[0-9]+\\s[0-9]+\\s[0-9]+\\s[0-9]+')\n", | ||
"    return [line for line in table_rows if counts_pattern.search(line) is not None]\n", | ||
"\n", | ||
"def extract_box_data(page,bbox):\n", | ||
"    \"\"\"Parse one contest box on a page into a DataFrame of vote rows.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        page: Page object providing `crop`; the cropped region must provide\n", | ||
"            `extract_text` and `extract_table`.\n", | ||
"        bbox: Bounding box handed to `page.crop`.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        The frame built by `extract_votes`, with `party`, `district` and\n", | ||
"        `office` columns added when the matching pattern is found in the\n", | ||
"        first text line of the box.\n", | ||
"    \"\"\"\n", | ||
"    data_section = page.crop(bbox)\n", | ||
"    table_rows = data_section.extract_text().split('\\n')\n", | ||
"    # The first text line of the box is treated as the contest title.\n", | ||
"    race_title = table_rows[0]\n", | ||
"\n", | ||
"    # Drop blank header cells. Testing truthiness instead of len() also skips\n", | ||
"    # cells that pdfplumber reports as None, which used to raise TypeError.\n", | ||
"    header_cells = data_section.extract_table()[0]\n", | ||
"    table_header = [cell.replace('\\n',' ') for cell in header_cells if cell]\n", | ||
"\n", | ||
"    data_rows = extract_data_rows(table_rows)\n", | ||
"    df = extract_votes(data_rows,table_header)\n", | ||
"\n", | ||
"    # Up to three leading capitals are read as the party code.\n", | ||
"    # NOTE: the pattern also matches zero letters, so `party` is always set\n", | ||
"    # (possibly to an empty string).\n", | ||
"    party_match = re.search(r'^[A-Z]{,3}',race_title)\n", | ||
"    if party_match is not None:\n", | ||
"        df['party'] = party_match.group(0)\n", | ||
"    district_match = re.search(r'[0-9]+[a-z]{2}\\sDistrict',race_title)\n", | ||
"    if district_match is not None:\n", | ||
"        df['district'] = district_match.group(0)\n", | ||
"    # Office is the first whitespace-led run of non-digit characters.\n", | ||
"    office_match = re.search(r'\\s\\D+',race_title)\n", | ||
"    if office_match is not None:\n", | ||
"        df['office'] = office_match.group(0).strip()\n", | ||
"\n", | ||
"    return df\n", | ||
"\n", | ||
"def extract_precinct_name(page,strip_start=80,strip_height=25):\n", | ||
"    \"\"\"Return the text found in a full-width horizontal strip of the page.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        page: Page object providing `width` and `crop`.\n", | ||
"        strip_start: Top edge of the strip.\n", | ||
"        strip_height: Height of the strip.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        Whatever `extract_text` yields for the cropped strip; callers store it\n", | ||
"        in the `precinct` column.\n", | ||
"    \"\"\"\n", | ||
"    strip_bbox = (0,strip_start,page.width,strip_start+strip_height)\n", | ||
"    name_strip = page.crop(strip_bbox)\n", | ||
"    return name_strip.extract_text()\n", | ||
"\n", | ||
"def extract_page_data(page):\n", | ||
"    \"\"\"Parse every contest box on a page and tag the rows with the precinct.\n", | ||
"\n", | ||
"    Each hit for the text 'Vote For' marks one contest box. A box runs from\n", | ||
"    just above its own marker to just above the next marker, or to the bottom\n", | ||
"    of the page for the last one.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        page: Page object providing `search`, `crop`, `width` and `height`.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A DataFrame of all contest rows on the page with a `precinct` column.\n", | ||
"        It is empty when the page has no 'Vote For' marker.\n", | ||
"    \"\"\"\n", | ||
"    vote_headers = page.search('Vote For')\n", | ||
"    # pd.concat raises ValueError on an empty list, so a page without any\n", | ||
"    # contest marker would otherwise abort the whole run.\n", | ||
"    if not vote_headers:\n", | ||
"        return pd.DataFrame(columns=['precinct'])\n", | ||
"\n", | ||
"    all_data = []\n", | ||
"    for i, marker in enumerate(vote_headers):\n", | ||
"        # Start 30 units above the marker's bottom edge; presumably this pulls\n", | ||
"        # the contest title line into the box (TODO confirm against the PDF).\n", | ||
"        top = marker['bottom']-30\n", | ||
"        if i < len(vote_headers) - 1:\n", | ||
"            # Stop 20 units above the top of the next marker.\n", | ||
"            bottom = vote_headers[i+1]['top']-20\n", | ||
"        else:\n", | ||
"            bottom = page.height\n", | ||
"        all_data.append(extract_box_data(page,(0,top,page.width,bottom)))\n", | ||
"\n", | ||
"    df = pd.concat(all_data)\n", | ||
"    df['precinct'] = extract_precinct_name(page)\n", | ||
"    return df\n", | ||
"\n", | ||
"def extract_statistics(page):\n", | ||
"    \"\"\"Read the registered-voter and ballots-cast totals from a page.\n", | ||
"\n", | ||
"    Args:\n", | ||
"        page: Page object providing `search`, `crop` and `width`.\n", | ||
"\n", | ||
"    Returns:\n", | ||
"        A two-row DataFrame with `office` ('Registered Voters' and\n", | ||
"        'Ballots Cast'), `votes` and `precinct` columns, or None when the\n", | ||
"        page has no 'STATISTICS' text.\n", | ||
"\n", | ||
"    Raises:\n", | ||
"        AttributeError: If a total is missing from the text under the marker.\n", | ||
"    \"\"\"\n", | ||
"    # Search once and reuse the hit instead of re-scanning the page.\n", | ||
"    stats_hits = page.search('STATISTICS')\n", | ||
"    if len(stats_hits) == 0:\n", | ||
"        return None\n", | ||
"\n", | ||
"    # The totals are read from the 150 units directly below the marker.\n", | ||
"    stats_top = stats_hits[0]['bottom']\n", | ||
"    stats_text = page.crop((0,stats_top,page.width,stats_top + 150)).extract_text()\n", | ||
"\n", | ||
"    def read_total(label):\n", | ||
"        # Allow thousands separators so '1,234' is not cut short at the comma.\n", | ||
"        raw_total = re.search(label + r' - Total ([0-9,]*)',stats_text).group(1)\n", | ||
"        return raw_total.replace(',','')\n", | ||
"\n", | ||
"    stats = pd.DataFrame({\n", | ||
"        'Registered Voters': [read_total('Registered Voters')],\n", | ||
"        'Ballots Cast': [read_total('Ballots Cast')]\n", | ||
"    })\n", | ||
"\n", | ||
"    stats = stats.melt().rename(columns={\n", | ||
"        'variable':'office',\n", | ||
"        'value':'votes'\n", | ||
"    })\n", | ||
"\n", | ||
"    stats['precinct'] = extract_precinct_name(page)\n", | ||
"\n", | ||
"    return stats" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 66, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Relative path to the county's precinct results PDF.\n", | ||
"file = '../data_2024/primary/Chester PA 2024Primary_Official_Precinct_Results.pdf'\n", | ||
"pdf = pdfplumber.open(file)\n", | ||
"# Used below for the county column and for the output file name.\n", | ||
"county_name = 'Chester'" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 68, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Parse every page of the PDF and stack the per-page frames.\n", | ||
"df = pd.concat([extract_page_data(page) for page in pdf.pages])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 69, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Stamp the county name on every row.\n", | ||
"df['county'] = county_name" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 70, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Map the table headings onto the output column names.\n", | ||
"df = df.rename(columns={\n", | ||
"    'TOTAL':'votes',\n", | ||
"    'Election Day':'election_day',\n", | ||
"    'Provisional Votes':'provisional',\n", | ||
"    'Mail Votes':'absentee'\n", | ||
"})\n", | ||
"# Drop the summary lines so that only candidate rows remain.\n", | ||
"df = df[~df['candidate'].isin(['Total Votes Cast','Overvotes','Undervotes','Contest Totals'])]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 71, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Only pages that contain a STATISTICS box contribute statistics rows.\n", | ||
"stats_df = pd.concat(\n", | ||
"    [extract_statistics(page) for page in pdf.pages if len(page.search('STATISTICS')) > 0]\n", | ||
").assign(county=county_name)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 72, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Append the statistics rows to the contest rows.\n", | ||
"df = pd.concat([df,stats_df])\n", | ||
"# Keep only the district number. The pattern is a raw string so the digit\n", | ||
"# class is not parsed as an invalid string escape; expand=False returns a\n", | ||
"# Series for the single capture group.\n", | ||
"df['district'] = df['district'].str.extract(r'(\\d+)', expand=False)\n", | ||
"df = df.fillna('')\n", | ||
"# The old index carries no information, so discard it rather than keeping it\n", | ||
"# as a column that the selection below would drop anyway.\n", | ||
"df = df.reset_index(drop=True)\n", | ||
"df = df[['county','precinct','office','district','party','candidate','votes','election_day','provisional','absentee']]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 73, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df['candidate'] = df['candidate'].str.title()\n", | ||
"# Replace each full office title with its short label in one pass.\n", | ||
"df['office'] = df['office'].replace({\n", | ||
"    'President of the United States':'President',\n", | ||
"    'United States Senator':'U.S. Senate',\n", | ||
"    'Representative in Congress':'U.S. House',\n", | ||
"    'Senator in the General Assembly':'State Senate',\n", | ||
"    'Representative in the General Assembly':'General Assembly'\n", | ||
"})" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 74, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Remove rows that are identical in every column.\n", | ||
"df = df.drop_duplicates()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 76, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Write the cleaned results to CSV without the index column.\n", | ||
"df.to_csv(f'../data_cleaned/20240423__pa__primary__{county_name.lower()}__precinct.csv',index=False)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.8" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.