Commit 431f05c

Merge pull request #750 from alonisser/master

resolve broken future events scraper

alonisser authored Nov 10, 2016
2 parents 1ef1a0e + bd8db98 commit 431f05c
Showing 4 changed files with 97 additions and 81 deletions.
6 changes: 5 additions & 1 deletion events/views.py
@@ -11,8 +11,10 @@

from models import Event


class EventDetailView(DetailView):
model = Event

def get_context_data(self, *args, **kwargs):
context = super(EventDetailView, self).get_context_data(**kwargs)
obj = context['object']
@@ -24,9 +26,10 @@ def get_context_data(self, *args, **kwargs):
for i in obj.who.all():
if i.mk:
creators.append(i.mk)
context['creators']=creators
context['creators'] = creators
return context


class MoreUpcomingEventsView(GetMoreView):
"""Get partially rendered member actions content for AJAX calls to 'More'"""

@@ -36,6 +39,7 @@ class MoreUpcomingEventsView(GetMoreView):
def get_queryset(self):
return Event.objects.get_upcoming()


def icalendar(request, summary_length=50, future_only=True):
"""
return a single icalendar file, default to future_only.
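Aside: the MoreUpcomingEventsView hunk above calls Event.objects.get_upcoming(), which implies a custom manager on Event. A minimal sketch of what such a manager might look like — the filter and ordering here are assumptions for illustration, not the project's exact code:

import datetime
from django.db import models

class EventManager(models.Manager):
    def get_upcoming(self):
        # Hypothetical implementation: events that have not started yet,
        # soonest first. The real manager may filter differently.
        return self.filter(when__gte=datetime.datetime.now()).order_by('when')

class Event(models.Model):
    when = models.DateTimeField()  # the `when` field also appears later in this diff
    objects = EventManager()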
@@ -1,129 +1,139 @@
# encoding: utf-8

import urllib,urllib2,re,datetime,traceback,sys,os,subprocess
import urllib, urllib2, re, datetime, traceback, sys, os, subprocess
from BeautifulSoup import BeautifulSoup
from django.conf import settings
from committees.models import Committee, CommitteeMeeting
from simple.management.utils import antiword
import logging
from knesset.utils import send_chat_notification

URL="http://www.knesset.gov.il/plenum/heb/plenum_queue.aspx"
ROBOTS_URL="http://www.knesset.gov.il/robots.txt"
FULL_URL="http://www.knesset.gov.il/plenum/heb/display_full.asp"
FILE_BASE_URL="http://www.knesset.gov.il/plenum/heb/"
WORDS_OF_THE_KNESSET=u"דברי הכנסת"
WORDS_OF_THE_KNESSET_FULL=u"כל הפרוטוקול"
DISCUSSIONS_ON_DATE=u"הדיונים בתאריך"
URL = "http://www.knesset.gov.il/plenum/heb/plenum_queue.aspx"
ROBOTS_URL = "http://www.knesset.gov.il/robots.txt"
FULL_URL = "http://www.knesset.gov.il/plenum/heb/display_full.asp"
FILE_BASE_URL = "http://www.knesset.gov.il/plenum/heb/"
WORDS_OF_THE_KNESSET = u"דברי הכנסת"
WORDS_OF_THE_KNESSET_FULL = u"כל הפרוטוקול"
DISCUSSIONS_ON_DATE = u"הדיונים בתאריך"

logger = logging.getLogger('open-knesset')


def _get_committees_index_page(full):
if full:
url=FULL_URL
encoding='iso_8859_8'
url = FULL_URL
encoding = 'iso_8859_8'
else:
url=URL
url = URL
# encoding='utf8'
# the encoding of this page used to be utf-8 but looks like they reverted back to iso-8859-8
encoding='iso_8859_8'
logger.info('getting index page html from '+url)
encoding = 'iso_8859_8'
logger.info('getting index page html from %s' % url)
try:
return unicode(urllib2.urlopen(url).read(), encoding)
except:
logger.error('could not fetch committees_index_page, exception: '+traceback.format_exc())
logger.exception(u'could not fetch committees_index_page for url %s' % url)
send_chat_notification(__name__, "could not fetch committees index page", {'url': url})
return ''
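The change above is worth calling out: logger.exception replaces the old logger.error(... + traceback.format_exc()) because, called inside an except block, it logs at ERROR level and appends the active traceback automatically. A self-contained sketch of the pattern:

import logging
import urllib2

logger = logging.getLogger('open-knesset')

def fetch_page(url, encoding='iso_8859_8'):
    try:
        return unicode(urllib2.urlopen(url).read(), encoding)
    except Exception:
        # No traceback.format_exc() needed: exception() picks up the
        # current traceback from the except context by itself.
        logger.exception(u'could not fetch %s' % url)
        return ''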


def _copy(url, to, recopy=False):
# logger.debug("copying from "+url+" to "+to)
d=os.path.dirname(to)
d = os.path.dirname(to)
if not os.path.exists(d):
os.makedirs(d)
if not os.path.exists(to) or recopy:
urllib.urlretrieve(url, to+".tmp")
os.rename(to+'.tmp', to)
urllib.urlretrieve(url, to + ".tmp")
os.rename(to + '.tmp', to)
else:
logger.debug('already downloaded')
logger.debug(u'already downloaded url %s' % url)
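The .tmp-then-rename dance in _copy is a standard atomic-write pattern: os.rename is atomic on a single filesystem, so a crash mid-download never leaves a truncated file under the final name. Restated on its own:

import os
import urllib

def download_atomically(url, dest):
    d = os.path.dirname(dest)
    if not os.path.exists(d):
        os.makedirs(d)
    # Fetch into a sibling temp file, then atomically swap it into place.
    urllib.urlretrieve(url, dest + '.tmp')
    os.rename(dest + '.tmp', dest)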


def _antiword(filename):
try:
return antiword(filename, logger)
except:
logger.error('antiword failure '+traceback.format_exc())
logger.exception(u'antiword failure with file: %s' % filename)
return ''
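antiword here is the project's wrapper (simple.management.utils.antiword) around the antiword CLI, which converts Word .doc files to text or XML. A rough equivalent, assuming the wrapper shells out with DocBook output (`-x db`) — an assumption inferred from the xmlData naming downstream, not confirmed by this diff:

import subprocess

def doc_to_xml(filename):
    # `antiword -x db FILE` emits DocBook XML on stdout; check_output
    # raises CalledProcessError on a non-zero exit, which a caller can
    # log with logger.exception as above.
    return subprocess.check_output(['antiword', '-x', 'db', filename])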


def _urlAlreadyDownloaded(url):
plenum=Committee.objects.filter(type='plenum')[0]
if CommitteeMeeting.objects.filter(committee=plenum,src_url=url).count()>0:
plenum = Committee.objects.filter(type='plenum')[0]
if CommitteeMeeting.objects.filter(committee=plenum, src_url=url).count() > 0:
return True
else:
return False
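One further cleanup this hunk stops short of: QuerySet.exists() expresses the same check more idiomatically than count() > 0 and lets the database stop at the first matching row. A sketch using the same models:

from committees.models import Committee, CommitteeMeeting

def url_already_downloaded(url):
    plenum = Committee.objects.filter(type='plenum')[0]
    # exists() issues a cheap EXISTS/LIMIT 1 query instead of a full COUNT.
    return CommitteeMeeting.objects.filter(committee=plenum, src_url=url).exists()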


def _updateDb(xmlData, url, year, mon, day):
logger.debug('update db %s, %s, %s, %s, %s'%(len(xmlData), url, year, mon, day))
plenum=Committee.objects.filter(type='plenum')[0]
cms=CommitteeMeeting.objects.filter(committee=plenum,src_url=url)
if cms.count()>0:
meeting=cms[0]
logger.debug('update db %s, %s, %s, %s, %s' % (len(xmlData), url, year, mon, day))
plenum = Committee.objects.filter(type='plenum')[0]
cms = CommitteeMeeting.objects.filter(committee=plenum, src_url=url)
if cms.count() > 0:
meeting = cms[0]
else:
meeting=CommitteeMeeting(
meeting = CommitteeMeeting(
committee=plenum,
date=datetime.datetime(int(year),int(mon),int(day)),
date=datetime.datetime(int(year), int(mon), int(day)),
src_url=url,
topics=u'ישיבת מליאה מתאריך '+day+'/'+mon+'/'+year,
date_string=''+day+'/'+mon+'/'+year
topics=u'ישיבת מליאה מתאריך ' + day + '/' + mon + '/' + year,
date_string='' + day + '/' + mon + '/' + year
)
meeting.protocol_text=xmlData
meeting.protocol_text = xmlData
meeting.save()
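The filter/count branch in _updateDb could also be written with get_or_create, since protocol_text is assigned either way; a sketch with the same fields (the defaults mapping is the only new piece):

import datetime
from committees.models import Committee, CommitteeMeeting

def update_db(xml_data, url, year, mon, day):
    plenum = Committee.objects.filter(type='plenum')[0]
    meeting, _created = CommitteeMeeting.objects.get_or_create(
        committee=plenum, src_url=url,
        defaults={
            'date': datetime.datetime(int(year), int(mon), int(day)),
            'topics': u'ישיבת מליאה מתאריך %s/%s/%s' % (day, mon, year),
            'date_string': '%s/%s/%s' % (day, mon, year),
        })
    meeting.protocol_text = xml_data
    meeting.save()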

def _downloadLatest(full,redownload):
html=_get_committees_index_page(full)
soup=BeautifulSoup(html)

def _downloadLatest(full, redownload):
html = _get_committees_index_page(full)
soup = BeautifulSoup(html)
if full:
words_of_the_knesset=WORDS_OF_THE_KNESSET_FULL
words_of_the_knesset = WORDS_OF_THE_KNESSET_FULL
else:
words_of_the_knesset=WORDS_OF_THE_KNESSET
aelts=soup('a',text=words_of_the_knesset)
words_of_the_knesset = WORDS_OF_THE_KNESSET
aelts = soup('a', text=words_of_the_knesset)
for aelt in aelts:
selt=aelt.findPrevious('span',text=re.compile(DISCUSSIONS_ON_DATE))
selt = aelt.findPrevious('span', text=re.compile(DISCUSSIONS_ON_DATE))
href = aelt.parent.get('href')
if href.startswith('http'):
url = href
else:
url=FILE_BASE_URL+href
filename=re.search(r"[^/]*$",url).group()
url = FILE_BASE_URL + href
filename = re.search(r"[^/]*$", url).group()
logger.debug(filename)
m=re.search(r"\((.*)/(.*)/(.*)\)",selt)
m = re.search(r"\((.*)/(.*)/(.*)\)", selt)
if m is None:
selt=selt.findNext()
m=re.search(r"\((.*)/(.*)/(.*)\)",unicode(selt))
selt = selt.findNext()
m = re.search(r"\((.*)/(.*)/(.*)\)", unicode(selt))
if m is not None:
day=m.group(1)
mon=m.group(2)
year=m.group(3)
url=url.replace('/heb/..','')
day = m.group(1)
mon = m.group(2)
year = m.group(3)
url = url.replace('/heb/..', '')
logger.debug(url)
if not redownload and _urlAlreadyDownloaded(url):
logger.debug('url already downloaded')
else:
DATA_ROOT = getattr(settings, 'DATA_ROOT')
_copy(url.replace('/heb/..',''), DATA_ROOT+'plenum_protocols/'+year+'_'+mon+'_'+day+'_'+filename, recopy=redownload)
xmlData=_antiword(DATA_ROOT+'plenum_protocols/'+year+'_'+mon+'_'+day+'_'+filename)
os.remove(DATA_ROOT+'plenum_protocols/'+year+'_'+mon+'_'+day+'_'+filename)
_copy(url.replace('/heb/..', ''),
DATA_ROOT + 'plenum_protocols/' + year + '_' + mon + '_' + day + '_' + filename,
recopy=redownload)
xmlData = _antiword(DATA_ROOT + 'plenum_protocols/' + year + '_' + mon + '_' + day + '_' + filename)
os.remove(DATA_ROOT + 'plenum_protocols/' + year + '_' + mon + '_' + day + '_' + filename)
if xmlData != '':
_updateDb(xmlData,url,year,mon,day)
_updateDb(xmlData, url, year, mon, day)
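The date extraction above leans on a permissive regex applied to text near the DISCUSSIONS_ON_DATE span; a quick worked example of the three capture groups:

import re

# Sample input shaped like the scraped span text, e.g. u'... (10/11/2016)'.
m = re.search(r"\((.*)/(.*)/(.*)\)", u"(10/11/2016)")
day, mon, year = m.group(1), m.group(2), m.group(3)
assert (day, mon, year) == (u'10', u'11', u'2016')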


def Download(redownload, _logger):
global logger
logger = _logger
_downloadLatest(False,redownload)
_downloadLatest(True,redownload)
_downloadLatest(False, redownload)
_downloadLatest(True, redownload)


def download_for_existing_meeting(meeting):
DATA_ROOT = getattr(settings, 'DATA_ROOT')
_copy(meeting.src_url, DATA_ROOT+'plenum_protocols/tmp')
xmlData = _antiword(DATA_ROOT+'plenum_protocols/tmp')
os.remove(DATA_ROOT+'plenum_protocols/tmp')
meeting.protocol_text=xmlData
_copy(meeting.src_url, DATA_ROOT + 'plenum_protocols/tmp')
xmlData = _antiword(DATA_ROOT + 'plenum_protocols/tmp')
os.remove(DATA_ROOT + 'plenum_protocols/tmp')
meeting.protocol_text = xmlData
meeting.save()
1 change: 1 addition & 0 deletions simple/constants.py
@@ -23,3 +23,4 @@
GOVT_INFO_PAGE = r"http://www.knesset.gov.il/mk/heb/MKIndex_Current.asp?view=4"
KNESSET_INFO_PAGE = r"http://www.knesset.gov.il/mk/heb/MKIndex_Current.asp?view=7"
MK_HTML_INFO_PAGE = r"http://www.knesset.gov.il/mk/heb/mk_print.asp?mk_individual_id_t="
KNESSET_COMMITTEES_AGENDA_PAGE = 'http://knesset.gov.il/agenda/heb/CommitteesByDate.asp'
47 changes: 24 additions & 23 deletions simple/management/commands/parse_future_committee_meetings.py
@@ -15,11 +15,13 @@
from events.models import Event

# NB: All dates scraped from the knesset site are assumed to be in timezone Israel.
from simple.constants import KNESSET_COMMITTEES_AGENDA_PAGE

isr_tz = zoneinfo.gettz('Israel')
utc_tz = zoneinfo.gettz('UTC')

logger = logging.getLogger("open-knesset.parse_future_committee_meetings")
spamWriter = csv.writer(open('eggs.csv', 'wb'))
# spamWriter = csv.writer(open('eggs.csv', 'wb'))

ParsedResult = namedtuple('ParseResult',
'name, year, month, day, hour, minute, '
@@ -34,7 +36,7 @@ class Command(BaseCommand):
def parse_future_committee_meetings(self):
retval = []

url = 'http://knesset.gov.il/agenda/heb/CommitteesByDate.asp'
url = KNESSET_COMMITTEES_AGENDA_PAGE

data = urllib2.urlopen(url).read()

@@ -90,53 +92,52 @@ def parse_meeting_data(meeting_data):
end_minute=parsed.minute,
end_guessed=False)
retval[-1] = new_last

retval.append(parsed)

# since this is now a two pass, kinda, do the logging after.
for p in retval:
spamWriter.writerow([p.name.encode('utf8'), p.year, p.month,
p.day, p.hour, p.minute, p.end_hour,
p.end_minute, p.end_guessed,
p.title.encode('utf8')])

return retval

def update_future_committee_meetings_db(self, r):
for p in r:
def update_future_committee_meetings_db(self, raw_future_commitee_meetings):
if not raw_future_commitee_meetings or not len(raw_future_commitee_meetings):
logger.info('No future committee meetings found!')
for meeting in raw_future_commitee_meetings:
try:
committee = Committee.objects.get(name=p.name)
committee = Committee.objects.get(name=meeting.name)
when_over = datetime.datetime(
year=p.year, month=p.month, day=p.day, hour=p.end_hour,
minute=p.end_minute, second=0, tzinfo=isr_tz).astimezone(utc_tz)
year=meeting.year, month=meeting.month, day=meeting.day, hour=meeting.end_hour,
minute=meeting.end_minute, second=0, tzinfo=isr_tz).astimezone(utc_tz)
when = datetime.datetime(
year=p.year, month=p.month, day=p.day, hour=p.hour,
minute=p.minute, second=0, tzinfo=isr_tz).astimezone(utc_tz)
year=meeting.year, month=meeting.month, day=meeting.day, hour=meeting.hour,
minute=meeting.minute, second=0, tzinfo=isr_tz).astimezone(utc_tz)
ev, created = Event.objects.get_or_create(when=when,
when_over=when_over,
when_over_guessed=p.end_guessed,
when_over_guessed=meeting.end_guessed,
where=unicode(committee),
what=p.title,
what=meeting.title,
which_pk=committee.id,
which_type=self.committee_ct,
)
logger.debug("new event at %s - %s%s: %s" % (ev.when, ev.when_over,
'' if not ev.when_over_guessed else '(guess)',
ev.what))
except Committee.DoesNotExist:
logger.debug("couldn't find committee %s" % p.name)
logger.error("couldn't find committee %s" % meeting.name)
try:
ev, created = Event.objects.get_or_create(
when=datetime.datetime(year=p.year, month=p.month,
day=p.day, hour=p.hour,
minute=p.minute, second=0),
what=p.title)
when=datetime.datetime(year=meeting.year, month=meeting.month,
day=meeting.day, hour=meeting.hour,
minute=meeting.minute, second=0),
what=meeting.title)
except Event.MultipleObjectsReturned:
created = False
if created:
logger.debug("created %s" % ev)

def handle(self, *args, **options):
logger.debug('Events objects count before update: %d' % Event.objects.count())
r = self.parse_future_committee_meetings()
raw_future_commitee_meetings = self.parse_future_committee_meetings()
# logger.debug(r)
self.update_future_committee_meetings_db(r)
self.update_future_committee_meetings_db(raw_future_commitee_meetings)
logger.debug('Events objects count after update: %d' % Event.objects.count())
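Usage note: since this lives under simple/management/commands/, it is a standard Django management command, so the invocation follows from the file name (run from the project root with settings configured):

python manage.py parse_future_committee_meetings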
