Skip to content

Commit

Permalink
Merge pull request #53 from datamade/feature/52-optimize-court-call-scrape
Browse files Browse the repository at this point in the history

Optimize court call scrape
  • Loading branch information
antidipyramid authored Apr 5, 2024
2 parents fa5d2fe + ec28b3c commit 0dc2b64
Showing 1 changed file with 99 additions and 20 deletions.
119 changes: 99 additions & 20 deletions courtscraper/spiders/court_calls.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging
from datetime import datetime, timedelta
from collections import defaultdict
from itertools import chain

from scrapy import Spider, Request
from scrapy.http import FormRequest
Expand All @@ -17,6 +19,7 @@ class CourtCallSpider(Spider):

def __init__(self, **kwargs):
    """Set up per-run scrape state, then delegate to the base Spider.

    ``failures`` tracks scrape failures (populated via the spider's
    error callback); ``case_calendars`` caches each case number's
    calendar value so a case's detail page is requested at most once.
    """
    self.failures, self.case_calendars = set(), {}
    super().__init__(**kwargs)

def next_business_days(self, n):
Expand All @@ -25,8 +28,7 @@ def next_business_days(self, n):
current_date = datetime.today()
count = 0
while count <= n:
day = str(current_date.day).zfill(2) # Zero pad the date
yield f"{current_date.month}/{day}/{current_date.year}"
yield f"{current_date.month}/{current_date.day}/{current_date.year}"

next_date = current_date + timedelta(days=1)
while next_date.weekday() > 4:
Expand Down Expand Up @@ -67,7 +69,7 @@ def start_requests(self):
"action": "waitForSelector",
"selector": {
"type": "css",
"value": "#MainContent_dtTxt",
"value": "#MainContent_ddlDivisionCode",
},
"timeout": 5,
"onError": "return",
Expand Down Expand Up @@ -112,9 +114,12 @@ def start_requests(self):
"date": date,
"result_page_num": 1,
"division": division,
"calendars": {},
"priority": -1,
},
errback=self.handle_error,
callback=self.parse_results_page,
priority=-1,
)

def has_page_num(self, n, response):
Expand All @@ -136,7 +141,7 @@ def has_results(self, response):
" criteria.')]]"
)
if no_results:
return None
return False

return True

Expand All @@ -147,31 +152,99 @@ def get_court_calls(self, response):
results_table = tree.xpath("//table[@id='MainContent_grdRecords']")[0]
rows = results_table.xpath(".//tr")
headers = rows[0].xpath(".//a/text()")

court_calls = defaultdict(list)
case_details_to_fetch = []
already_requested_case = set()
for result_num, row in enumerate(rows[1:-1]):
cells = row.xpath(".//td/text()")
if cells:
court_call = dict(zip(headers, cells))
case_num = court_call["Case Number"]

if case_num in self.case_calendars:
# Only get a case's calendar value once
court_call["Calendar"] = self.case_calendars[case_num]
court_call["hash"] = dict_hash(court_call)
elif case_num not in already_requested_case:
# We need to remember what position this case occupies
# in the results list to request the detail page
case_details_to_fetch.append((case_num, result_num))
already_requested_case.add(case_num)

court_calls[case_num].append(court_call)

try:
case_num, result_num = case_details_to_fetch.pop()
except IndexError:
# We already have calendar values for all the cases on this page
yield from chain.from_iterable(court_calls.values())
return

# Get calendar value from the case detail page
form_data = self.extract_form(response, "//form[@id='ctl01']")
form_data["__EVENTTARGET"] = "ctl00$MainContent$grdRecords"
form_data["__EVENTARGUMENT"] = f"Select${result_num}"
yield FormRequest.from_response(
response,
meta={"court_call": court_call},
formxpath="//form[@id='ctl01']",
formdata=form_data,
callback=self.parse_calendar,
dont_click=True,
)
form_data = self.extract_form(response, "//form[@id='ctl01']")
form_data["__EVENTTARGET"] = "ctl00$MainContent$grdRecords"
form_data["__EVENTARGUMENT"] = f"Select${result_num}"
yield FormRequest.from_response(
response,
meta={
"current_case": case_num,
"case_details_to_fetch": case_details_to_fetch,
"court_calls": court_calls,
"result_page_form": form_data,
"result_page_response": response,
"priority": response.meta["priority"] - 1,
},
formxpath="//form[@id='ctl01']",
formdata=form_data,
callback=self.parse_calendar,
dont_click=True,
priority=response.meta["priority"] - 1,
)

def parse_calendar(self, response):
    """Attach the calendar value (and hash) to every court call for a case.

    Reads the "Calendar" span from a case-detail response, stamps it onto
    all of the case's queued court-call dicts, caches it in
    ``self.case_calendars``, and then either yields the finished court
    calls for the page or requests the detail page for the next case on
    the ``case_details_to_fetch`` stack.

    :param response: case-detail page response; its ``meta`` carries the
        current case number, the remaining (case_num, result_num) stack,
        the accumulated ``court_calls`` mapping, and the results-page
        form/response needed to issue the next detail request.
    :yields: court-call dicts (when the stack is exhausted) or the next
        ``FormRequest`` for a case-detail page.
    """
    calendar = response.xpath("//span[@id='MainContent_lblCalendar']/text()").get()

    # Every result row for this case gets the same calendar value; the
    # hash must be computed after "Calendar" is set so it reflects it.
    current_case_calls = response.meta["court_calls"][response.meta["current_case"]]
    for call in current_case_calls:
        call["Calendar"] = calendar
        call["hash"] = dict_hash(call)

    # Cache so later results pages never re-fetch this case's detail page.
    self.case_calendars[response.meta["current_case"]] = calendar

    if not response.meta["case_details_to_fetch"]:
        # We've got the calendar value of all of the results
        # on the current page
        yield from chain.from_iterable(response.meta["court_calls"].values())
    else:
        # Request the case detail for the next case on our stack
        next_case_num, next_result_num = response.meta[
            "case_details_to_fetch"
        ].pop()

        # Re-use the results-page form, swapping in the next row index.
        form_data = response.meta["result_page_form"]
        form_data["__EVENTARGUMENT"] = f"Select${next_result_num}"
        yield FormRequest.from_response(
            response.meta["result_page_response"],
            meta={
                "current_case": next_case_num,
                "case_details_to_fetch": response.meta["case_details_to_fetch"],
                "court_calls": response.meta["court_calls"],
                "result_page_form": form_data,
                "result_page_response": response.meta["result_page_response"],
                "priority": response.meta["priority"] - 1,
            },
            formxpath="//form[@id='ctl01']",
            formdata=form_data,
            callback=self.parse_calendar,
            dont_click=True,
            # Deeper detail requests sink in priority so earlier pages finish first.
            priority=response.meta["priority"] - 1,
        )

        # Lazy %-style args avoid formatting when INFO is disabled.
        logging.info(
            "Fetching calendar for case %s...", response.meta["current_case"]
        )

def extract_form(self, response, form_xpath):
"""
Expand Down Expand Up @@ -232,13 +305,19 @@ def parse_results_page(self, response):
key: response.meta[key] for key in ["date", "result_page_num", "division"]
}

logging.info(
f"Requesting page {next_page_num} of cases from "
f"{response.meta['division']} on {response.meta['date']}..."
)
yield FormRequest.from_response(
response,
meta=prev_meta | {"result_page_num": next_page_num},
meta=prev_meta
| {"result_page_num": next_page_num, "priority": -next_page_num * 100},
formxpath="//form[@id='ctl01']",
formdata=next_page_form_data,
callback=self.parse_results_page,
dont_click=True,
priority=-next_page_num * 100,
)

def _failing_responses(self, response):
Expand Down

0 comments on commit 0dc2b64

Please sign in to comment.