Skip to content

Commit

Permalink
Merge pull request #53 from datamade/feature/52-optimize-court-call-scrape
Browse files Browse the repository at this point in the history

Optimize court call scrape
  • Loading branch information
antidipyramid authored Apr 5, 2024
2 parents fa5d2fe + ec28b3c commit 0dc2b64
Showing 1 changed file with 99 additions and 20 deletions.
119 changes: 99 additions & 20 deletions courtscraper/spiders/court_calls.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import logging
from datetime import datetime, timedelta
from collections import defaultdict
from itertools import chain

from scrapy import Spider, Request
from scrapy.http import FormRequest
Expand All @@ -17,6 +19,7 @@ class CourtCallSpider(Spider):

def __init__(self, **kwargs):
    """Set up per-run scrape state, then delegate to the base Spider.

    ``failures`` tracks scrape failures (populated via the spider's
    error callback); ``case_calendars`` caches each case number's
    calendar value so a case's detail page is requested at most once.
    """
    self.failures, self.case_calendars = set(), {}
    super().__init__(**kwargs)

def next_business_days(self, n):
Expand All @@ -25,8 +28,7 @@ def next_business_days(self, n):
current_date = datetime.today()
count = 0
while count <= n:
day = str(current_date.day).zfill(2) # Zero pad the date
yield f"{current_date.month}/{day}/{current_date.year}"
yield f"{current_date.month}/{current_date.day}/{current_date.year}"

next_date = current_date + timedelta(days=1)
while next_date.weekday() > 4:
Expand Down Expand Up @@ -67,7 +69,7 @@ def start_requests(self):
"action": "waitForSelector",
"selector": {
"type": "css",
"value": "#MainContent_dtTxt",
"value": "#MainContent_ddlDivisionCode",
},
"timeout": 5,
"onError": "return",
Expand Down Expand Up @@ -112,9 +114,12 @@ def start_requests(self):
"date": date,
"result_page_num": 1,
"division": division,
"calendars": {},
"priority": -1,
},
errback=self.handle_error,
callback=self.parse_results_page,
priority=-1,
)

def has_page_num(self, n, response):
Expand All @@ -136,7 +141,7 @@ def has_results(self, response):
" criteria.')]]"
)
if no_results:
return None
return False

return True

Expand All @@ -147,31 +152,99 @@ def get_court_calls(self, response):
results_table = tree.xpath("//table[@id='MainContent_grdRecords']")[0]
rows = results_table.xpath(".//tr")
headers = rows[0].xpath(".//a/text()")

court_calls = defaultdict(list)
case_details_to_fetch = []
already_requested_case = set()
for result_num, row in enumerate(rows[1:-1]):
cells = row.xpath(".//td/text()")
if cells:
court_call = dict(zip(headers, cells))
case_num = court_call["Case Number"]

if case_num in self.case_calendars:
# Only get a case's calendar value once
court_call["Calendar"] = self.case_calendars[case_num]
court_call["hash"] = dict_hash(court_call)
elif case_num not in already_requested_case:
# We need to remember what position this case occupies
# in the results list to request the detail page
case_details_to_fetch.append((case_num, result_num))
already_requested_case.add(case_num)

court_calls[case_num].append(court_call)

try:
case_num, result_num = case_details_to_fetch.pop()
except IndexError:
# We already have calendar values for all the cases on this page
yield from chain.from_iterable(court_calls.values())
return

# Get calendar value from the case detail page
form_data = self.extract_form(response, "//form[@id='ctl01']")
form_data["__EVENTTARGET"] = "ctl00$MainContent$grdRecords"
form_data["__EVENTARGUMENT"] = f"Select${result_num}"
yield FormRequest.from_response(
response,
meta={"court_call": court_call},
formxpath="//form[@id='ctl01']",
formdata=form_data,
callback=self.parse_calendar,
dont_click=True,
)
form_data = self.extract_form(response, "//form[@id='ctl01']")
form_data["__EVENTTARGET"] = "ctl00$MainContent$grdRecords"
form_data["__EVENTARGUMENT"] = f"Select${result_num}"
yield FormRequest.from_response(
response,
meta={
"current_case": case_num,
"case_details_to_fetch": case_details_to_fetch,
"court_calls": court_calls,
"result_page_form": form_data,
"result_page_response": response,
"priority": response.meta["priority"] - 1,
},
formxpath="//form[@id='ctl01']",
formdata=form_data,
callback=self.parse_calendar,
dont_click=True,
priority=response.meta["priority"] - 1,
)

def parse_calendar(self, response):
    """Attach the calendar value (and hash) to every court call for a case.

    Reads the "Calendar" span from a case-detail response, stamps it onto
    all of the case's queued court-call dicts, caches it in
    ``self.case_calendars``, and then either yields the finished court
    calls for the page or requests the detail page for the next case on
    the ``case_details_to_fetch`` stack.

    :param response: case-detail page response; its ``meta`` carries the
        current case number, the remaining (case_num, result_num) stack,
        the accumulated ``court_calls`` mapping, and the results-page
        form/response needed to issue the next detail request.
    :yields: court-call dicts (when the stack is exhausted) or the next
        ``FormRequest`` for a case-detail page.
    """
    calendar = response.xpath("//span[@id='MainContent_lblCalendar']/text()").get()

    # Every result row for this case gets the same calendar value; the
    # hash must be computed after "Calendar" is set so it reflects it.
    current_case_calls = response.meta["court_calls"][response.meta["current_case"]]
    for call in current_case_calls:
        call["Calendar"] = calendar
        call["hash"] = dict_hash(call)

    # Cache so later results pages never re-fetch this case's detail page.
    self.case_calendars[response.meta["current_case"]] = calendar

    if not response.meta["case_details_to_fetch"]:
        # We've got the calendar value of all of the results
        # on the current page
        yield from chain.from_iterable(response.meta["court_calls"].values())
    else:
        # Request the case detail for the next case on our stack
        next_case_num, next_result_num = response.meta[
            "case_details_to_fetch"
        ].pop()

        # Re-use the results-page form, swapping in the next row index.
        form_data = response.meta["result_page_form"]
        form_data["__EVENTARGUMENT"] = f"Select${next_result_num}"
        yield FormRequest.from_response(
            response.meta["result_page_response"],
            meta={
                "current_case": next_case_num,
                "case_details_to_fetch": response.meta["case_details_to_fetch"],
                "court_calls": response.meta["court_calls"],
                "result_page_form": form_data,
                "result_page_response": response.meta["result_page_response"],
                "priority": response.meta["priority"] - 1,
            },
            formxpath="//form[@id='ctl01']",
            formdata=form_data,
            callback=self.parse_calendar,
            dont_click=True,
            # Deeper detail requests sink in priority so earlier pages finish first.
            priority=response.meta["priority"] - 1,
        )

        # Lazy %-style args avoid formatting when INFO is disabled.
        logging.info(
            "Fetching calendar for case %s...", response.meta["current_case"]
        )

def extract_form(self, response, form_xpath):
"""
Expand Down Expand Up @@ -232,13 +305,19 @@ def parse_results_page(self, response):
key: response.meta[key] for key in ["date", "result_page_num", "division"]
}

logging.info(
f"Requesting page {next_page_num} of cases from "
f"{response.meta['division']} on {response.meta['date']}..."
)
yield FormRequest.from_response(
response,
meta=prev_meta | {"result_page_num": next_page_num},
meta=prev_meta
| {"result_page_num": next_page_num, "priority": -next_page_num * 100},
formxpath="//form[@id='ctl01']",
formdata=next_page_form_data,
callback=self.parse_results_page,
dont_click=True,
priority=-next_page_num * 100,
)

def _failing_responses(self, response):
Expand Down

0 comments on commit 0dc2b64

Please sign in to comment.