Skip to content

Commit

Permalink
Merge 2
Browse files Browse the repository at this point in the history
  • Loading branch information
judtinzhang committed Mar 7, 2024
2 parents 147ab97 + 0860326 commit 757fc15
Showing 1 changed file with 38 additions and 38 deletions.
76 changes: 38 additions & 38 deletions backend/penndata/management/commands/get_penn_today_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.firefox.options import Options

from penndata.models import Event

Expand All @@ -28,21 +29,8 @@ def handle(self, *args, **kwargs):
# past_events.delete()

# Scrapes Penn Today
try:
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))

driver.get(PENN_TODAY_WEBSITE)
events_list = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.ID, "events-list"))
)

html_content = events_list.get_attribute("innerHTML")
driver.quit()
except ConnectionError:
print("Connection Error to webdriver")
return None

soup = BeautifulSoup(html_content, "html.parser")
if not (soup := self.connect_and_parse_html(PENN_TODAY_WEBSITE, EC.presence_of_element_located((By.ID, "events-list")))):
return

event_articles = soup.find_all("article", class_="tease")

Expand Down Expand Up @@ -76,11 +64,15 @@ def handle(self, *args, **kwargs):
if start_date.month < current_month:
# If scraped month is before current month, increment year
start_date = start_date.replace(year=current_year + 1)
if start_time_str == ALL_DAY:
print(start_date_str)
if ALL_DAY in start_time_str.lower():
start_time = datetime.time(0, 0)
else:
start_time = datetime.datetime.strptime(start_time_str, "%I:%M%p").time()
start_date = datetime.datetime.combine(start_date, start_time)

if start_date > now + datetime.timedelta(days=31):
continue

event_url = urljoin(PENN_TODAY_WEBSITE, article.find("a", class_="tease__link")["href"])

Expand All @@ -98,47 +90,55 @@ def handle(self, *args, **kwargs):
end_of_day = datetime.time(23, 59, 59)
if end_date_elem: # end date but no end time
end_date_str = end_date_elem.text.strip().split(" ")[-1]
end_date = datetime.combine(
end_date = datetime.datetime.combine(
datetime.datetime.strptime(end_date_str, "%m/%d/%Y"), end_of_day
)

else: # no end date or end time
end_date = datetime.combine(start_date, end_of_day)
end_date = datetime.datetime.combine(start_date, end_of_day)

Event.objects.update_or_create(
name=name,
defaults={
"event_type": "",
"event_type": "Penn Today",
"image_url": "",
"start": start_date,
"end": end_date,
"start": timezone.make_aware(start_date),
"end": timezone.make_aware(end_date),
"location": location,
"website": event_url,
"description": description,
"email": "",
},
)

self.stdout.write("Uploaded Events!")
self.stdout.write("Uploaded Penn Today Events!")

def connect_and_parse_html(self, event_url, condition):
    """
    Load a page in headless Firefox and parse the awaited element's inner HTML.

    Args:
        event_url: URL passed to ``driver.get``.
        condition: callable handed to ``WebDriverWait(...).until`` (for example an
            ``expected_conditions`` locator); the element it returns is the one
            whose ``innerHTML`` gets parsed.

    Returns:
        A ``BeautifulSoup`` tree built with ``html.parser``, or ``None`` when a
        ``ConnectionError`` is raised while driving the browser.

    Any other exception (including a wait timeout) propagates to the caller,
    but the browser is always shut down first.
    """
    driver = None
    try:
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Firefox(
            service=FirefoxService(GeckoDriverManager().install()),
            options=options,
        )

        driver.get(event_url)
        # Wait up to 10 seconds for the condition to yield an element.
        element = WebDriverWait(driver, 10).until(condition)
        html_content = element.get_attribute("innerHTML")
    except ConnectionError:
        print("Connection Error to webdriver")
        return None
    finally:
        # Quit on every path so a timeout or navigation error cannot leave
        # an orphaned browser process behind.
        if driver is not None:
            driver.quit()

    return BeautifulSoup(html_content, "html.parser")

def get_end_time(self, event_url):
    """
    Scrape an event's detail page and return its end-time string.

    The page is fetched through ``self.connect_and_parse_html``, waiting for the
    element with class ``event__topper-content``. The text of the
    ``<p class="event__meta event__time">`` element is stripped, has its periods
    removed, and is split on ``" - "``; the second piece is the end time.

    Args:
        event_url: URL of the individual event page.

    Returns:
        The text after the ``" - "`` separator, or ``None`` when the page could
        not be loaded, the time element is missing, the text is empty, the text
        contains ``ALL_DAY``, or there is no ``" - "`` separator.
    """
    end_time_soup = self.connect_and_parse_html(
        event_url,
        EC.presence_of_element_located((By.CLASS_NAME, "event__topper-content")),
    )
    # connect_and_parse_html returns None on a connection failure.
    if end_time_soup is None:
        return None

    time_elem = end_time_soup.find("p", class_="event__meta event__time")
    # find() returns None when the page has no time paragraph.
    if time_elem is None:
        return None

    end_time_range_str = time_elem.text.strip().replace(".", "")
    # NOTE(review): ALL_DAY is defined outside this view; it is compared against
    # lowercased text, so it is presumably a lowercase marker - confirm.
    if not end_time_range_str or ALL_DAY in end_time_range_str.lower():
        return None  # No end time if the event is all day

    times = end_time_range_str.split(" - ")
    if len(times) <= 1:
        return None  # Only a start time was listed

    return times[1]

0 comments on commit 757fc15

Please sign in to comment.