diff --git a/deploy/crontab.txt b/deploy/crontab.txt index 4981b70d..ce7de61f 100644 --- a/deploy/crontab.txt +++ b/deploy/crontab.txt @@ -13,8 +13,10 @@ 30 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py okscrape lobbyists --dblog 2>&1 | /usr/bin/logger -t open_knesset #20 05 * * 1,3,5 /oknesset_data/oknesset/Open-Knesset/manage.py update_sitemap 2>&1 | /usr/bin/logger -t open_knesset 26 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py scrape_votes 2>&1 | /usr/bin/logger -t open_knesset +30 16 * * * /oknesset_data/oknesset/Open-Knesset/manage.py rescrape_missing_data_votes 2>&1 | /usr/bin/logger -t open_knesset 43 04 * * * /oknesset_data/oknesset/Open-Knesset/manage.py update_links_from_kikar 2>&1 | /usr/bin/logger -t open_knesset 53 03 * * * /oknesset_data/oknesset/Open-Knesset/manage.py okscrape events PersonsEventsScraper --dblog 2>&1 | /usr/bin/logger -t open_knesset + # email handling 02 01,05,09,13,17,21 * * * /oknesset_data/oknesset/Open-Knesset/manage.py send_mail 2>&1 | /usr/bin/logger -t open_knesset 15 17,21 * * * /oknesset_data/oknesset/Open-Knesset/manage.py retry_deferred 2>&1 | /usr/bin/logger -t open_knesset diff --git a/knesset/browser_cases.py b/knesset/browser_cases.py index 96470e6c..4257f575 100644 --- a/knesset/browser_cases.py +++ b/knesset/browser_cases.py @@ -6,13 +6,17 @@ # also, they must use the @on_platforms decorator. This decorator can run the test case several times - for different browser and platforms. 
@on_platforms() -class MyTestCase(BrowserTestCase): +class MainSiteBrowserTestCase(BrowserTestCase): """ Simple demo test case - just makes sure the tidbit carousel appears on the homepage """ def testHomepage(self): # inside the tests you can use self.drive which will have a ready selenium driver to use - self.driver.get(self.live_server_url+'/') + self.driver.get(self.live_server_url+'/main') # Until we return old page # most functions throw an exception if they don't find what their looking for, so you don't have to assert self.driver.find_element_by_id('tidbitCarousel') + + def testHelpPageDisplayFacebookUpdates(self): + self.driver.get(self.live_server_url + '/help') # Until we return old page + self.driver.find_element_by_id('kikar-facebook-updates-ul') diff --git a/knesset/browser_test_runner.py b/knesset/browser_test_runner.py index eb328fb2..a716999b 100644 --- a/knesset/browser_test_runner.py +++ b/knesset/browser_test_runner.py @@ -13,9 +13,9 @@ sauce_accesskey = '' sauce_platforms = [ - {"platform": "Mac OS X 10.9", "browserName": "chrome", "version": "35"}, + {"platform": "MacOS El Capitan 10.11", "browserName": "chrome", "version": "52"}, {"platform": "Windows 8.1", "browserName": "internet explorer", "version": "11"}, - {"platform": "Linux", "browserName": "firefox", "version": "29"} + {"platform": "Linux", "browserName": "firefox", "version": "44"} ] diff --git a/laws/admin.py b/laws/admin.py index b180a0cf..1d757b5c 100644 --- a/laws/admin.py +++ b/laws/admin.py @@ -1,3 +1,4 @@ +from django.db.models import Q from import_export.admin import ImportExportModelAdmin from models import Vote, Law, PrivateProposal, KnessetProposal, GovProposal, Bill, GovLegislationCommitteeDecision @@ -6,6 +7,40 @@ from django.contrib import admin +class MissingDataVotesFilter(admin.SimpleListFilter): + # Human-readable title which will be displayed in the + # right admin sidebar just above the filter options. 
+ title = _('Missing data votes') + + # Parameter for the filter that will be used in the URL query. + parameter_name = 'is_missing_data_vote' + + def lookups(self, request, model_admin): + """ + Returns a list of tuples. The first element in each + tuple is the coded value for the option that will + appear in the URL query. The second element is the + human-readable name for the option that will appear + in the right sidebar. + """ + return ( + ('is_missing_data_vote', _('Vote has missing data')), + ) + + def queryset(self, request, queryset): + """ + Returns the filtered queryset based on the value + provided in the query string and retrievable via + `self.value()`. + """ + # Compare the requested value + # to decide how to filter the queryset. + if self.value() == 'is_missing_data_vote': + return queryset.filter(Q(votes_count=0) | Q(votes_count=None)) + else: + return queryset + + class VoteAdmin(ImportExportModelAdmin): # filter_horizontal = ('voted_for','voted_against','voted_abstain','didnt_vote') list_display = ( @@ -13,6 +48,7 @@ class VoteAdmin(ImportExportModelAdmin): 'abstain_votes_count') search_fields = ('title', 'summary', 'full_text', 'id', 'src_id') + list_filter = (MissingDataVotesFilter, ) def update_vote(self, request, queryset): vote_count = queryset.count() @@ -25,7 +61,9 @@ def update_vote(self, request, queryset): def recreate_vote(self, request, queryset): recreated_votes = ScrapeVotesCommand().recreate_objects(queryset.values_list('pk', flat=True)) - self.message_user(request, "successfully recreated {0} votes".format(len(recreated_votes), ', '.join([str(v.pk) for v in recreated_votes]))) + recreated_vote_ids_string = ', '.join([str(v.pk) for v in recreated_votes]) + self.message_user(request, "successfully recreated {0} votes: {1}".format(len(recreated_votes), + recreated_vote_ids_string)) recreate_vote.short_description = "recreate vote by deleting and then getting fresh data from knesset api" @@ -36,7 +74,7 @@ def recreate_vote(self, 
request, queryset): class LawAdmin(ImportExportModelAdmin): - search_fields = ('title', ) + search_fields = ('title',) list_display = ('title', 'merged_into') @@ -60,7 +98,7 @@ class KnessetProposalAdmin(admin.ModelAdmin): class GovProposalAdmin(admin.ModelAdmin): search_fields = ('title', 'booklet_number') list_display = ('bill', 'booklet_number', 'knesset_id', 'date') - list_filter = ('knesset_id', ) + list_filter = ('knesset_id',) admin.site.register(GovProposal, GovProposalAdmin) diff --git a/laws/enums.py b/laws/enums.py index f8690d24..98588466 100644 --- a/laws/enums.py +++ b/laws/enums.py @@ -1,3 +1,4 @@ +# encoding: utf-8 from django.utils.translation import ugettext_lazy as _ from knesset.enums import Enum @@ -18,9 +19,13 @@ class BillStages(Enum): FAILED_APPROVAL = u'-6' +VOTE_TYPES = {'law-approve': u'אישור החוק', 'second-call': u'קריאה שנייה', 'demurrer': u'הסתייגות', + 'no-confidence': u'הצעת אי-אמון', 'pass-to-committee': u'להעביר את ', + 'continuation': u'להחיל דין רציפות'} + VOTE_ACTION_TYPE_CHOICES = ( (u'for', _('For')), (u'against', _('Against')), (u'abstain', _('Abstain')), (u'no-vote', _('No Vote')), -) \ No newline at end of file +) diff --git a/laws/helpers.py b/laws/helpers.py new file mode 100644 index 00000000..161fba48 --- /dev/null +++ b/laws/helpers.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -* +from laws.enums import VOTE_TYPES + + +def resolve_vote_type_by_title(title): + if type(title) == str: + transform_func = str.decode + else: # its already unicode, do nothing + transform_func = lambda x, y: x + for vtype, vtype_prefix in VOTE_TYPES.items(): + if transform_func(title, 'utf8').startswith(vtype_prefix): + return vtype + return '' + + +class MissingVotePartyException(Exception): + pass + + +def party_at_or_error(member, vote_date, vote_id): + party = member.party_at(vote_date) + if party: + return party + else: + raise MissingVotePartyException( + 'could not find which party member %s belonged to during vote %s' % (member.pk, 
vote_id)) \ No newline at end of file diff --git a/laws/management/commands/rescrape_missing_data_votes.py b/laws/management/commands/rescrape_missing_data_votes.py new file mode 100644 index 00000000..7bad7fc7 --- /dev/null +++ b/laws/management/commands/rescrape_missing_data_votes.py @@ -0,0 +1,35 @@ +# encoding: utf-8 +from __future__ import print_function +from laws.management.commands.scrape_votes import Command as ScrapeVotesCommand +from django.core.management.base import BaseCommand +from optparse import make_option + +from django.db.models import Q + +from laws.models import Vote + +import logging + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = "Rescrape data for votes missing actual voting data" + + option_list = BaseCommand.option_list + ( + make_option( + '-n', action='store_true', dest="dryrun", default=False, + help='Dry run, changes nothing in the db, just display results' + ), + ) + + def handle(self, *args, **options): + votes_to_update = Vote.objects.filter(Q(votes_count=0) | Q(votes_count=None)) + logger.info('Found %s votes with missing data' % votes_to_update.count()) + if options['dryrun']: + logger.info("Not updating the db, dry run was specified") + return + + votes_ids = votes_to_update.values_list('pk', flat=True) + ScrapeVotesCommand().recreate_objects(votes_ids) + logger.info(u'Completed re scraping votes %s' % votes_ids) diff --git a/laws/management/commands/scrape_votes.py b/laws/management/commands/scrape_votes.py index b5d1a35e..78300c4f 100644 --- a/laws/management/commands/scrape_votes.py +++ b/laws/management/commands/scrape_votes.py @@ -1,7 +1,11 @@ # encoding: utf-8 +from logging import getLogger + +from django.db import transaction from knesset_data.dataservice.votes import Vote as DataserviceVote from knesset_data.html_scrapers.votes import HtmlVote from laws.models import Vote, VoteAction +from simple.constants import KNESSET_VOTE_PAGE from simple.scrapers import hebrew_strftime from 
simple.scrapers.management import BaseKnessetDataserviceCollectionCommand from mks.models import Member @@ -9,6 +13,8 @@ from links.models import Link from django.contrib.contenttypes.models import ContentType +logger = getLogger(__name__) + class VoteScraperException(Exception): def __init__(self, *args, **kwargs): @@ -27,42 +33,42 @@ class Command(BaseKnessetDataserviceCollectionCommand): 'time': 'datetime', 'meeting_number': "session_num", 'vote_number': 'nbr_in_sess', - 'src_url': lambda vote: "http://www.knesset.gov.il/vote/heb/Vote_Res_Map.asp?vote_id_t=%s" % vote.id + 'src_url': lambda vote: KNESSET_VOTE_PAGE % vote.id } VALIDATE_FIELDS_TO_AUTOFIX = ['title', 'src_url'] help = "Scrape votes data from the knesset" + @transaction.atomic def _update_or_create_vote(self, dataservice_vote, oknesset_vote=None): vote_kwargs = self._get_dataservice_model_kwargs(dataservice_vote) if oknesset_vote: - [setattr(oknesset_vote, k, v) for k, v in vote_kwargs.iteritems()] + [setattr(oknesset_vote, k, v) for k, v in vote_kwargs.items()] oknesset_vote.save() else: oknesset_vote = Vote.objects.create(**vote_kwargs) self._add_vote_actions(dataservice_vote, oknesset_vote) oknesset_vote.update_vote_properties() SyncdataCommand().find_synced_protocol(oknesset_vote) - Link.objects.create( + + Link.objects.get_or_create( title=u'ההצבעה באתר הכנסת', - url='http://www.knesset.gov.il/vote/heb/Vote_Res_Map.asp?vote_id_t=%s' % oknesset_vote.src_id, + url=KNESSET_VOTE_PAGE % oknesset_vote.src_id, content_type=ContentType.objects.get_for_model(oknesset_vote), object_pk=str(oknesset_vote.id) ) return oknesset_vote - # if v.full_text_url != None: - # l = Link(title=u'מסמך הצעת החוק באתר הכנסת', url=v.full_text_url, content_type=ContentType.objects.get_for_model(v), object_pk=str(v.id)) - # l.save() def _add_vote_actions(self, dataservice_vote, oknesset_vote): for member_id, vote_result_code in HtmlVote.get_from_vote_id(dataservice_vote.id).member_votes: member_qs = 
Member.objects.filter(pk=member_id) if member_qs.exists(): member = member_qs.first() - vote_type = self._resolve_vote_type(vote_result_code) + vote_type = self._resolve_vote_type(vote_result_code) # TODO: Move to static helper + member_party_at_time = member.party_at(oknesset_vote.time.date()) vote_action, created = VoteAction.objects.get_or_create(vote=oknesset_vote, member=member, defaults={'type': vote_type, - 'party': member.current_party}) + 'party': member_party_at_time}) if created: vote_action.save() else: @@ -76,7 +82,10 @@ def _get_existing_object(self, dataservice_object): return Vote.objects.get(src_id=dataservice_object.id) def _create_new_object(self, dataservice_vote): - return self._update_or_create_vote(dataservice_vote) + try: + return self._update_or_create_vote(dataservice_vote) + except VoteScraperException: + logger.exception('Vote scraping exception for %s' % dataservice_vote) def _resolve_vote_type(cls, vote_result_code): return { @@ -89,13 +98,24 @@ def _resolve_vote_type(cls, vote_result_code): def recreate_objects(self, vote_ids): recreated_votes = [] for vote_id in vote_ids: - oknesset_vote = Vote.objects.get(id=int(vote_id)) - vote_src_id = oknesset_vote.src_id - dataservice_vote = self.DATASERVICE_CLASS.get(vote_src_id) - VoteAction.objects.filter(vote=oknesset_vote).delete() - Link.objects.filter(content_type=ContentType.objects.get_for_model(oknesset_vote), - object_pk=oknesset_vote.id).delete() - recreated_votes.append(self._update_or_create_vote(dataservice_vote, oknesset_vote)) + logger.info('Attempting rescraping for vote id %s' % vote_id) + try: + try: + oknesset_vote = Vote.objects.get(id=int(vote_id)) + except Vote.DoesNotExist: + raise VoteScraperException('Vote to recreate does not exist %s' % vote_id) + vote_src_id = oknesset_vote.src_id + try: + dataservice_vote = self.DATASERVICE_CLASS.get(vote_src_id) + except Exception: + raise VoteScraperException('Failure to fetch knesset data dto for vote id %s' % vote_id) + 
VoteAction.objects.filter(vote=oknesset_vote).delete() + Link.objects.filter(content_type=ContentType.objects.get_for_model(oknesset_vote), + object_pk=oknesset_vote.id).delete() + recreated_votes.append(self._update_or_create_vote(dataservice_vote, oknesset_vote)) + logger.info('Success rescraping for vote id %s' % vote_id) + except VoteScraperException: + logger.exception('Vote scraper exception for vote %s' % vote_id) return recreated_votes def _get_validate_first_object_title(self, dataservice_object): diff --git a/laws/models/vote.py b/laws/models/vote.py index df3e6b9b..c45655be 100644 --- a/laws/models/vote.py +++ b/laws/models/vote.py @@ -9,6 +9,8 @@ from tagging.models import TaggedItem, Tag from laws import constants +from laws.enums import VOTE_TYPES +from laws.helpers import resolve_vote_type_by_title from laws.models.bill import Bill from laws.models.vote_action import VoteAction from laws.vote_choices import TYPE_CHOICES @@ -23,15 +25,12 @@ class VoteManager(models.Manager): # TODO: add i18n to the types so we'd have # {'law-approve': _('approve law'), ... 
- VOTE_TYPES = {'law-approve': u'אישור החוק', 'second-call': u'קריאה שנייה', 'demurrer': u'הסתייגות', - 'no-confidence': u'הצעת אי-אמון', 'pass-to-committee': u'להעביר את ', - 'continuation': u'להחיל דין רציפות'} def filter_and_order(self, *args, **kwargs): qs = self.all() filter_kwargs = {} if kwargs.get('vtype') and kwargs['vtype'] != 'all': - filter_kwargs['title__startswith'] = self.VOTE_TYPES[kwargs['vtype']] + filter_kwargs['title__startswith'] = VOTE_TYPES[kwargs['vtype']] if filter_kwargs: qs = qs.filter(**filter_kwargs) @@ -139,16 +138,6 @@ def against_coalition_votes(self): def against_own_bill_votes(self): return self.votes.filter(voteaction__against_own_bill=True) - def _vote_type(self): - if type(self.title) == str: - f = str.decode - else: # its already unicode, do nothing - f = lambda x, y: x - for vtype, vtype_prefix in VoteManager.VOTE_TYPES.iteritems(): - if f(self.title, 'utf8').startswith(vtype_prefix): - return vtype - return '' - def short_summary(self): if self.summary is None: return '' @@ -298,12 +287,12 @@ def party_at_or_error(member, vote_date): self.abstain_votes_count = VoteAction.objects.filter(vote=self, type='abstain').count() self.controversy = min(self.for_votes_count or 0, self.against_votes_count or 0) - self.vote_type = self._vote_type() + self.vote_type = resolve_vote_type_by_title(self.title) self.save() def redownload_votes_page(self): from simple.management.commands.syncdata import Command as SyncdataCommand - (page, vote_src_url) = SyncdataCommand().read_votes_page(self.src_id) + page, vote_src_url = SyncdataCommand().read_votes_page(self.src_id) return page def update_from_knesset_data(self): @@ -340,14 +329,13 @@ def reparse_members_from_votes_page(self, page=None): results = syncdata.read_member_votes(page, return_ids=True) for (voter_id, voter_party, vote) in results: try: - m = Member.objects.get(pk=int(voter_id)) - except: - exceptionType, exceptionValue, exceptionTraceback = sys.exc_info() - logger.error("%svoter_id 
= %s", - ''.join(traceback.format_exception(exceptionType, exceptionValue, exceptionTraceback)), - str(voter_id)) + member = Member.objects.get(pk=int(voter_id)) + except Exception: + + logger.exception("reparse vote member exception for vote %s member %s" % (self.pk, voter_id)) continue - va, created = VoteAction.objects.get_or_create(vote=self, member=m, - defaults={'type': vote, 'party': m.current_party}) + + va, created = VoteAction.objects.get_or_create(vote=self, member=member, + defaults={'type': vote, 'party': member.current_party}) if created: va.save() diff --git a/mks/admin.py b/mks/admin.py index 8d8c20d0..70dee8f4 100644 --- a/mks/admin.py +++ b/mks/admin.py @@ -126,7 +126,10 @@ class CorrelationAdmin(admin.ModelAdmin): class MembershipAdmin(ImportExportModelAdmin): + list_select_related = True ordering = ('member__name',) + list_display = ('member', 'party', 'start_date', 'end_date') + list_filter = ('party', ) admin.site.register(Membership, MembershipAdmin) @@ -148,7 +151,7 @@ class AwardAdmin(ImportExportModelAdmin): class KnessetAdmin(admin.ModelAdmin): - pass + list_display = ('number', 'start_date', 'end_date') admin.site.register(Knesset, KnessetAdmin) diff --git a/mks/managers.py b/mks/managers.py index 6795a146..5a8ea491 100644 --- a/mks/managers.py +++ b/mks/managers.py @@ -31,9 +31,9 @@ def get_knesset_by_date(self, a_date): return self.get_queryset().get(start_date__lte=a_date, end_date__gt=a_date) -class BetterManager(models.Manager): +class NameAwareManager(models.Manager): def __init__(self): - super(BetterManager, self).__init__() + super(NameAwareManager, self).__init__() self._names = [] def find(self, name): @@ -56,7 +56,11 @@ def find(self, name): return ret -class PartyManager(BetterManager): +class MemberManager(NameAwareManager): + pass + + +class PartyManager(NameAwareManager): def parties_during_range(self, ranges=None): from agendas.models import Agenda filters_folded = Agenda.generateSummaryFilters(ranges, 'start_date', 
'end_date') @@ -68,23 +72,23 @@ def __init__(self): super(CurrentKnessetPartyManager, self).__init__() self._current = None - def get_query_set(self): + def get_queryset(self): # caching won't help here, as the query set will be re-run on each # request, and we may need to further run queries down the road from mks.models import Knesset - qs = super(CurrentKnessetPartyManager, self).get_query_set() + qs = super(CurrentKnessetPartyManager, self).get_queryset() qs = qs.filter(knesset=Knesset.objects.current_knesset()) return qs @property def current_parties(self): if self._current is None: - self._current = list(self.get_query_set()) + self._current = list(self.get_queryset()) return self._current -class CurrentKnessetMembersManager(models.Manager): +class CurrentKnessetMembersManager(MemberManager): """ Adds the ability to filter on current knesset """ diff --git a/mks/models.py b/mks/models.py index e7e08d1d..72b5a64e 100644 --- a/mks/models.py +++ b/mks/models.py @@ -20,8 +20,8 @@ from links.models import Link from mks.managers import ( - BetterManager, PartyManager, KnessetManager, CurrentKnessetMembersManager, - CurrentKnessetPartyManager, MembershipManager, CurrentKnessetActiveMembersManager) + PartyManager, KnessetManager, CurrentKnessetMembersManager, + CurrentKnessetPartyManager, MembershipManager, CurrentKnessetActiveMembersManager, MemberManager) GENDER_CHOICES = ( (u'M', _('Male')), @@ -248,7 +248,7 @@ class Member(models.Model): backlinks_enabled = models.BooleanField(default=True) - objects = BetterManager() + objects = MemberManager() current_knesset = CurrentKnessetMembersManager() current_members = CurrentKnessetActiveMembersManager() @@ -574,12 +574,6 @@ def awards(self): def convictions(self): return self.awards_and_convictions.filter(award_type__valence__lt=0) - # @property - # def committees(self): - # """Committee list (splitted by comma)""" - # - # return [x.strip() for x in self.committees.split(',')] - class WeeklyPresence(models.Model): 
member = models.ForeignKey('Member') diff --git a/simple/constants.py b/simple/constants.py index 0b743ecd..456e38a2 100644 --- a/simple/constants.py +++ b/simple/constants.py @@ -1,11 +1,6 @@ # -*- coding: utf-8 -* SPECIAL_COMMITTEES_NAMES = [u"הוועדה המשותפת לנושא סביבה ובריאות", ] -SECOND_AND_THIRD_READING_LAWS_URL = 'http://www.knesset.gov.il/privatelaw/plaw_display.asp?LawTp=2' -PRIVATE_LAWS_URL = r"http://www.knesset.gov.il/privatelaw/Plaw_display.asp?lawtp=1" -KNESSET_LAWS_URL = r"http://www.knesset.gov.il/laws/heb/template.asp?Type=3" -GOV_LAWS_URL = r"http://www.knesset.gov.il/laws/heb/template.asp?Type=4" - # some party names appear in the knesset website in several forms. # this dictionary is used to transform them to canonical form. CANONICAL_PARTY_ALIASES = {'עבודה': 'העבודה', @@ -20,7 +15,17 @@ 'יחד (ישראל חברתית דמוקרטית) והבחירה הדמוקרטית': 'מרצ-יחד והבחירה הדמוקרטית', 'יחד (ישראל חברתית דמוקרטית) והבחירה הדמוקרטית': 'מרצ-יחד והבחירה הדמוקרטית', } + +SECOND_AND_THIRD_READING_LAWS_URL = 'http://www.knesset.gov.il/privatelaw/plaw_display.asp?LawTp=2' +PRIVATE_LAWS_URL = r"http://www.knesset.gov.il/privatelaw/Plaw_display.asp?lawtp=1" +KNESSET_LAWS_URL = r"http://www.knesset.gov.il/laws/heb/template.asp?Type=3" +GOV_LAWS_URL = r"http://www.knesset.gov.il/laws/heb/template.asp?Type=4" GOVT_INFO_PAGE = r"http://www.knesset.gov.il/mk/heb/MKIndex_Current.asp?view=4" KNESSET_INFO_PAGE = r"http://www.knesset.gov.il/mk/heb/MKIndex_Current.asp?view=7" MK_HTML_INFO_PAGE = r"http://www.knesset.gov.il/mk/heb/mk_print.asp?mk_individual_id_t=" KNESSET_COMMITTEES_AGENDA_PAGE = 'http://main.knesset.gov.il/Activity/committees/Pages/AllCommitteesAgenda.aspx' +KNESSET_VOTE_PAGE = 'http://www.knesset.gov.il/vote/heb/Vote_Res_Map.asp?vote_id_t=%s' +KNESSET_PROTOCOL_SEARCH_PAGE = "http://online.knesset.gov.il/eprotocol/PUBLIC/SearchPEOnline.aspx" +KNESSET_SYNCED_PROTOCOL_PAGE = 'http://online.knesset.gov.il/eprotocol/PLAYER/PEPlayer.aspx?ProtocolID=%s' + 
+KNESSET_PRESENT_MKS_PAGE = 'http://www.knesset.gov.il/presence/heb/PresentList.aspx' diff --git a/simple/management/commands/syncdata.py b/simple/management/commands/syncdata.py index feeae2fe..d197a356 100644 --- a/simple/management/commands/syncdata.py +++ b/simple/management/commands/syncdata.py @@ -1,11 +1,11 @@ # -*- coding: utf-8 -*- -import cookielib + import datetime import gzip import logging import os import re -import socket + import time import traceback import urllib @@ -28,11 +28,12 @@ KnessetProposal, GovProposal, GovLegislationCommitteeDecision) from links.models import Link from mks.models import Member, Party, Membership, WeeklyPresence, Knesset -from mks.utils import get_all_mk_names + from persons.models import Person, PersonAlias -from simple.constants import SPECIAL_COMMITTEES_NAMES, SECOND_AND_THIRD_READING_LAWS_URL, CANONICAL_PARTY_ALIASES -from simple.management.utils import antiword +from simple.constants import SPECIAL_COMMITTEES_NAMES, SECOND_AND_THIRD_READING_LAWS_URL, CANONICAL_PARTY_ALIASES, \ + KNESSET_PROTOCOL_SEARCH_PAGE, KNESSET_SYNCED_PROTOCOL_PAGE, KNESSET_PRESENT_MKS_PAGE + from simple.parsers import mk_roles_parser from simple.parsers import parse_laws from simple.parsers import parse_remote @@ -74,8 +75,6 @@ class Command(NoArgsDbLogCommand): help="download and parse presence"), make_option('--update', action='store_true', dest='update', help="online update of data."), - make_option('--committees', action='store_true', dest='committees', - help="online update of committees data."), make_option('--update-run-only', action='store', dest='update-run-only', help="only run update for the provided functions. 
Should contain comma-seperated list of functions to run.") ) @@ -100,7 +99,6 @@ def _handle_noargs(self, **options): update = options.get('update', False) laws = options.get('laws', False) presence = options.get('presence', False) - committees = options.get('committees', False) if all_options: download = True @@ -108,7 +106,7 @@ def _handle_noargs(self, **options): process = True dump_to_file = True - selected_options = [all_options, download, load, process, dump_to_file, update, laws, committees] + selected_options = [all_options, download, load, process, dump_to_file, update, laws] if not any(selected_options): logger.error( "no arguments found. doing nothing. \ntry -h for help.\n--all to run the full syncdata flow.\n--update for an online dynamic update.") @@ -152,7 +150,6 @@ def _handle_noargs(self, **options): update_run_only = None for func in ['update_laws_data', 'update_presence', - # 'get_protocols', - handled by the new okscraper 'parse_laws', 'find_proposals_in_other_data', 'merge_duplicate_laws', @@ -174,10 +171,6 @@ def _handle_noargs(self, **options): logger.exception("Caught Exception in syncdata update phase %s", func) logger.info('finished update') - if committees: - self.get_protocols() - logger.info('finished committees update') - def read_laws_page(self, index): url = '%s' % SECOND_AND_THIRD_READING_LAWS_URL @@ -267,46 +260,6 @@ def update_laws_data(self): self.get_approved_bill_text_for_vote(vote) logger.debug("finished updating laws data") - def update_vote_from_page(self, vote_id, vote_src_url, page): - (vote_label, vote_meeting_num, vote_num, date) = self.get_vote_data(page) - logger.debug("downloaded data with vote id %d" % vote_id) - vote_time_string = date.replace(' ', ' ') - for i in self.heb_months: - if i in vote_time_string: - month = self.heb_months.index(i) + 1 - day = re.search("""(\d\d?)""", vote_time_string).group(1) - year = re.search("""(\d\d\d\d)""", vote_time_string).group(1) - vote_hm = 
datetime.datetime.strptime(vote_time_string.split(' ')[-1], "%H:%M") - vote_time = datetime.datetime(int(year), int(month), int(day), vote_hm.hour, vote_hm.minute) - # vote_label_for_search = self.get_search_string(vote_label) - - try: - v = Vote.objects.get(src_id=vote_id) - created = False - except Vote.DoesNotExist: - v = Vote(title=vote_label, time_string=vote_time_string, importance=1, src_id=vote_id, time=vote_time) - try: - vote_meeting_num = int(vote_meeting_num) - v.meeting_number = vote_meeting_num - except: - pass - try: - vote_num = int(vote_num) - v.vote_number = vote_num - except: - pass - v.src_url = vote_src_url - v.save() - if v.full_text_url != None: - l = Link(title=u'מסמך הצעת החוק באתר הכנסת', url=v.full_text_url, - content_type=ContentType.objects.get_for_model(v), object_pk=str(v.id)) - l.save() - - v.reparse_members_from_votes_page(page) - v.update_vote_properties() - v = Vote.objects.get(src_id=vote_id) - self.find_synced_protocol(v) - def get_votes_data(self): # TODO: is this ever used? self.update_last_downloaded_vote_id() @@ -377,7 +330,7 @@ def get_members_data(self, max_mk_id=1000, min_mk_id=1): 'דואר אלקטרוני', 'מצב משפחתי', 'מספר ילדים', 'תאריך לידה', 'שנת לידה', 'מקום לידה', 'תאריך פטירה', 'שנת עלייה', - 'כנסת 18', 'כנסת 19'] + 'כנסת 18', 'כנסת 19', 'כנסת 20'] # note that hebrew strings order is right-to-left # so output file order is id, name, img_link, phone, ... @@ -431,18 +384,21 @@ def update_last_downloaded_vote_id(self): f.close() def update_mks_is_current(self): - """Set is_current=True if and only if mk is currently serving. + """ + Set is_current=True if and only if mk is currently serving. This is done by looking at the presence page in the knesset website. 
""" - URL = 'http://www.knesset.gov.il/presence/heb/PresentList.aspx' - x = urllib2.urlopen(URL).read() - m = re.search('lbHowManyMKs2(.*)lbHowManyMKs', x, re.DOTALL) - mks = re.findall('mk_individual_id_t=(\d+)', m.group()) - logger.debug('found %d current mks' % len(mks)) - updated = Member.objects.filter(id__in=mks).update(is_current=True) - logger.debug('updated %d mks to is_current=True' % updated) - updated = Member.objects.exclude(id__in=mks).update(is_current=False) - logger.debug('updated %d mks to is_current=False' % updated) + + page = urllib2.urlopen(KNESSET_PRESENT_MKS_PAGE).read() + mk_current_area = re.search('lbHowManyMKs2(.*)lbHowManyMKs', page, re.DOTALL) + mks_ids = re.findall('mk_individual_id_t=(\d+)', mk_current_area.group()) + logger.info('found %d current mks' % len(mks_ids)) + if not len(mks_ids): + logger.error('No current mks!') + updated = Member.objects.filter(id__in=mks_ids).update(is_current=True) + logger.info('updated %d mks to is_current=True' % updated) + updated = Member.objects.exclude(id__in=mks_ids).update(is_current=False) + logger.info('updated %d mks to is_current=False' % updated) def update_members_from_file(self): logger.debug('update_members_from_file') @@ -803,298 +759,50 @@ def get_vote_data(self, page): return \ name, meeting_num, vote_num, date - def find_synced_protocol(self, v): + def find_synced_protocol(self, vote): + search_text = '' try: - search_text = '' - url = "http://online.knesset.gov.il/eprotocol/PUBLIC/SearchPEOnline.aspx" - to_day = from_day = str(v.time.day) - to_month = from_month = str(v.time.month) - to_year = from_year = str(v.time.year) - m = re.search(' - (.*),?', v.title) + + to_day = from_day = str(vote.time.day) + to_month = from_month = str(vote.time.month) + to_year = from_year = str(vote.time.year) + m = re.search(' - (.*),?', vote.title) if not m: - logger.debug("couldn't create search string for vote\nvote.id=%s\nvote.title=%s\n", str(v.id), v.title) + logger.debug(u"couldn't create 
search string for vote\nvote.id=%s\nvote.title=%s\n", str(vote.id), + vote.title) return search_text = urllib2.quote(m.group(1).replace('(', '').replace(')', '').replace('`', '').encode('utf8')) # I'm really sorry for the next line, but I really had no choice: params = '__EVENTARGUMENT=&__EVENTTARGET=&__LASTFOCUS=&__PREVIOUSPAGE=bEfxzzDx0cPgMul_87gMIa3L4OOi0E21r4EnHaLHKQAsWXdde-10pzxRGZZaJFCK0&__SCROLLPOSITIONX=0&__SCROLLPOSITIONY=0&__VIEWSTATE=%2FwEPDwUKMjA3MTAzNTc1NA8WCB4VU0VTU0lPTl9SQU5ET01fTlVNQkVSAswEHhFPTkxZX0RBVEVTX1NFQVJDSGgeEFBSRVZJRVdfRFRfQ0FDSEUy5AQAAQAAAP%2F%2F%2F%2F8BAAAAAAAAAAQBAAAA7AFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5EaWN0aW9uYXJ5YDJbW1N5c3RlbS5JbnQzMiwgbXNjb3JsaWIsIFZlcnNpb249Mi4wLjAuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1iNzdhNWM1NjE5MzRlMDg5XSxbU3lzdGVtLkRhdGEuRGF0YVRhYmxlLCBTeXN0ZW0uRGF0YSwgVmVyc2lvbj0yLjAuMC4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPWI3N2E1YzU2MTkzNGUwODldXQMAAAAHVmVyc2lvbghDb21wYXJlcghIYXNoU2l6ZQADAAiRAVN5c3RlbS5Db2xsZWN0aW9ucy5HZW5lcmljLkdlbmVyaWNFcXVhbGl0eUNvbXBhcmVyYDFbW1N5c3RlbS5JbnQzMiwgbXNjb3JsaWIsIFZlcnNpb249Mi4wLjAuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1iNzdhNWM1NjE5MzRlMDg5XV0IAAAAAAkCAAAAAAAAAAQCAAAAkQFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5HZW5lcmljRXF1YWxpdHlDb21wYXJlcmAxW1tTeXN0ZW0uSW50MzIsIG1zY29ybGliLCBWZXJzaW9uPTIuMC4wLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49Yjc3YTVjNTYxOTM0ZTA4OV1dAAAAAAseFEFQUFJOQ19DT1VOVEVSX0NBQ0hFMtgEAAEAAAD%2F%2F%2F%2F%2FAQAAAAAAAAAEAQAAAOABU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuRGljdGlvbmFyeWAyW1tTeXN0ZW0uSW50MzIsIG1zY29ybGliLCBWZXJzaW9uPTIuMC4wLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49Yjc3YTVjNTYxOTM0ZTA4OV0sW1N5c3RlbS5JbnQzMiwgbXNjb3JsaWIsIFZlcnNpb249Mi4wLjAuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj1iNzdhNWM1NjE5MzRlMDg5XV0DAAAAB1ZlcnNpb24IQ29tcGFyZXIISGFzaFNpemUAAwAIkQFTeXN0ZW0uQ29sbGVjdGlvbnMuR2VuZXJpYy5HZW5lcmljRXF1YWxpdHlDb21wYXJlcmAxW1tTeXN0ZW0uSW50MzIsIG1zY29ybGliLCBWZXJzaW9uPTIuMC4wLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49Yjc3YTVjNTYxOTM0ZTA
4OV1dCAAAAAAJAgAAAAAAAAAEAgAAAJEBU3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuR2VuZXJpY0VxdWFsaXR5Q29tcGFyZXJgMVtbU3lzdGVtLkludDMyLCBtc2NvcmxpYiwgVmVyc2lvbj0yLjAuMC4wLCBDdWx0dXJlPW5ldXRyYWwsIFB1YmxpY0tleVRva2VuPWI3N2E1YzU2MTkzNGUwODldXQAAAAALFgJmD2QWAgIDD2QWAgIDD2QWCgIDDw8WAh4EVGV4dAX%2BBiBTRUxFQ1QgICAgIHRNZXRhRGF0YS5pSXRlbUlELCB0TWV0YURhdGEuaVRvcklELCB0TWV0YURhdGEuaUl0ZW1UeXBlLCB0TWV0YURhdGEuaVBhcmVudCwgdE1ldGFEYXRhLmlJdGVtUmF3SWQsIHRNZXRhRGF0YS5zVGl0bGUsICAgICAgICAgICAgICB0TWV0YURhdGEuc1RleHQsIHRNZXRhRGF0YS5pUGFnZSwgIHRNZXRhRGF0YS5pV29yZENvdW50ZXIsIHRNZXRhRGF0YS5pQnVsa051bSwgdE1ldGFEYXRhLmlFbGVtZW50SW5lZHhlciAgRlJPTSAgICAgICB0RGlzY3Vzc2lvbnMgSU5ORVIgSk9JTiAgICAgICAgICAgICB0VG9yaW0gT04gdERpc2N1c3Npb25zLmlEaXNjSUQgPSB0VG9yaW0uaURpc2NJRCBJTk5FUiBKT0lOICAgICAgICAgICAgIHRNZXRhRGF0YSBPTiB0VG9yaW0uaVRvciA9IHRNZXRhRGF0YS5pVG9ySUQgIFdIRVJFICB0VG9yaW0uYkhhc0ZpbmFsRG9jPTAgQU5EICAoQ09OVEFJTlMoc1RleHQsIE4nIteQ15nXqdeV16gg15TXl9eV16cg15TXptei16og15fXldenINeT15XXkyDXkdefINeS15XXqNeZ15XXnyDXqteZ16fXldefINeU16rXqSIi16IgMjAxMCIgICcpIE9SIENPTlRBSU5TKHNUaXRsZSwgTici15DXmdep15XXqCDXlNeX15XXpyDXlNem16LXqiDXl9eV16cg15PXldeTINeR158g15LXldeo15nXldefINeq15nXp9eV158g15TXqtepIiLXoiAyMDEwIiAgJykpIEFORCAgREFURURJRkYoREFZLCAnMi8yMi8yMDEwJyAsIHREaXNjdXNzaW9ucy5kRGF0ZSk%2BPTAgQU5EICBEQVRFRElGRihEQVksIHREaXNjdXNzaW9ucy5kRGF0ZSwgJzIvMjIvMjAxMCcpPj0wIEFORCAgdERpc2N1c3Npb25zLmlLbmVzc2V0IElOICgxOCkgQU5EICB0RGlzY3Vzc2lvbnMuaURpc2NUeXBlID0gMSBPUkRFUiBCWSBbaVRvcklEXSBERVNDLCBbaUVsZW1lbnRJbmVkeGVyXWRkAgUPDxYCHwRlZGQCBw9kFhYCAQ8PFgIfBAUi15fXmdek15XXqSDXkSLXk9eR16jXmSDXlNeb16DXodeqImRkAgMPD2QWAh4Jb25rZXlkb3duBcgBaWYgKChldmVudC53aGljaCAmJiBldmVudC53aGljaCA9PSAxMykgfHwgKGV2ZW50LmtleUNvZGUgJiYgZXZlbnQua2V5Q29kZSA9PSAxMykpICAgICB7ZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQoJ2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXJfYnRuU2VhcmNoJykuY2xpY2soKTtyZXR1cm4gZmFsc2U7fSAgICAgZWxzZSByZXR1cm4gdHJ1ZTtkAgcPDxYCHhRDdHJsRm9jdXNBZnRlclNlbGVjdAUpY3RsMDBfQ29udGVudFBsYWNlSG9sZGVyV3JhcHBlcl9idG5TZWFyY2hkFgQCAw8PZBYEHgZvbmJsdXIFSEhpZGVBQ1BvcHVsYXRlX2N
0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXJfc3JjaERvdmVyX3dzQXV0b0NvbXBsZXRlMR4Hb25rZXl1cAVbcmV0dXJuIEF1dG9Db21wbGV0ZUNoZWNrRGVsZXRlKGV2ZW50LCAnY3RsMDBfQ29udGVudFBsYWNlSG9sZGVyV3JhcHBlcl9zcmNoRG92ZXJfaGRuVmFsdWUnKWQCBQ8WBh4RT25DbGllbnRQb3B1bGF0ZWQFVkF1dG9Db21wbGV0ZV9DbGllbnRQb3B1bGF0ZWRfY3RsMDBfQ29udGVudFBsYWNlSG9sZGVyV3JhcHBlcl9zcmNoRG92ZXJfd3NBdXRvQ29tcGxldGUxHhRPbkNsaWVudEl0ZW1TZWxlY3RlZAVUd3NBdXRvQ29tcGxldGVfanNfc2VsZWN0ZWRfY3RsMDBfQ29udGVudFBsYWNlSG9sZGVyV3JhcHBlcl9zcmNoRG92ZXJfd3NBdXRvQ29tcGxldGUxHhJPbkNsaWVudFBvcHVsYXRpbmcFSFNob3dBQ1BvcHVsYXRlX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXJfc3JjaERvdmVyX3dzQXV0b0NvbXBsZXRlMWQCCQ8PFgIfBgUpY3RsMDBfQ29udGVudFBsYWNlSG9sZGVyV3JhcHBlcl9idG5TZWFyY2hkFgQCAw8PZBYEHwcFSkhpZGVBQ1BvcHVsYXRlX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXJfc3JjaE1hbmFnZXJfd3NBdXRvQ29tcGxldGUxHwgFXXJldHVybiBBdXRvQ29tcGxldGVDaGVja0RlbGV0ZShldmVudCwgJ2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXJfc3JjaE1hbmFnZXJfaGRuVmFsdWUnKWQCBQ8WBh8JBVhBdXRvQ29tcGxldGVfQ2xpZW50UG9wdWxhdGVkX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXJfc3JjaE1hbmFnZXJfd3NBdXRvQ29tcGxldGUxHwoFVndzQXV0b0NvbXBsZXRlX2pzX3NlbGVjdGVkX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXJfc3JjaE1hbmFnZXJfd3NBdXRvQ29tcGxldGUxHwsFSlNob3dBQ1BvcHVsYXRlX2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXJfc3JjaE1hbmFnZXJfd3NBdXRvQ29tcGxldGUxZAIND2QWBAIBDxBkEBUGFdeh15XXkteZINeT15nXldeg15nXnQzXqdeQ15nXnNeq15QP15TXptei16og15fXldenFteU16bXoteqINeQ15kg15DXnteV158a15TXptei15Qg15zXodeT16gg15TXmdeV150j15TXptei15Qg15zXodeT16gg15nXldedINeb15XXnNec16oVBgEwATEBMgEzATQCMTUUKwMGZ2dnZ2dnZGQCCQ8PZBYCHwUFyAFpZiAoKGV2ZW50LndoaWNoICYmIGV2ZW50LndoaWNoID09IDEzKSB8fCAoZXZlbnQua2V5Q29kZSAmJiBldmVudC5rZXlDb2RlID09IDEzKSkgICAgIHtkb2N1bWVudC5nZXRFbGVtZW50QnlJZCgnY3RsMDBfQ29udGVudFBsYWNlSG9sZGVyV3JhcHBlcl9idG5TZWFyY2gnKS5jbGljaygpO3JldHVybiBmYWxzZTt9ICAgICBlbHNlIHJldHVybiB0cnVlO2QCDw8PFgIeBERhdGUGAABgHGmBzAhkFgJmD2QWAmYPZBYCAgEPZBYEZg9kFgpmD2QWAgIBDw8WAh8EBQQyMDEwFgIfCAVVcmV0dXJuIERhdGVQaWNrZXJEZWxldGUoZXZlbnQsICdjdGwwMF9Db250ZW50UGxhY2VIb2xkZXJXcmFwcGV
yX3NyY2hEYXRlc1BlcmlvZEZyb20nKWQCAg9kFgICAQ8PFgIfBAUBMhYCHwgFVXJldHVybiBEYXRlUGlja2VyRGVsZXRlKGV2ZW50LCAnY3RsMDBfQ29udGVudFBsYWNlSG9sZGVyV3JhcHBlcl9zcmNoRGF0ZXNQZXJpb2RGcm9tJylkAgQPZBYCAgEPDxYCHwQFAjIyFgIfCAVVcmV0dXJuIERhdGVQaWNrZXJEZWxldGUoZXZlbnQsICdjdGwwMF9Db250ZW50UGxhY2VIb2xkZXJXcmFwcGVyX3NyY2hEYXRlc1BlcmlvZEZyb20nKWQCBg9kFgICAQ8WAh8EBQbXqdeg15lkAgcPZBYCAgEPDxYCHwQFCjIyLzAyLzIwMTAWBB8IBQ92YWxpZERhdGUodGhpcykfBwVJaXNEYXRlKHRoaXMsJ2N0bDAwX0NvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXJfc3JjaERhdGVzUGVyaW9kRnJvbV9sYmxNc2cnKWQCAQ9kFgJmD2QWAmYPDxYCHwQFFteXJyDXkdeQ15PXqCDXlNeq16ki16JkZAIRDw8WAh8MBgAAYBxpgcwIZBYCZg9kFgJmD2QWAgIBD2QWBGYPZBYKZg9kFgICAQ8PFgIfBAUEMjAxMBYCHwgFU3JldHVybiBEYXRlUGlja2VyRGVsZXRlKGV2ZW50LCAnY3RsMDBfQ29udGVudFBsYWNlSG9sZGVyV3JhcHBlcl9zcmNoRGF0ZXNQZXJpb2RUbycpZAICD2QWAgIBDw8WAh8EBQEyFgIfCAVTcmV0dXJuIERhdGVQaWNrZXJEZWxldGUoZXZlbnQsICdjdGwwMF9Db250ZW50UGxhY2VIb2xkZXJXcmFwcGVyX3NyY2hEYXRlc1BlcmlvZFRvJylkAgQPZBYCAgEPDxYCHwQFAjIyFgIfCAVTcmV0dXJuIERhdGVQaWNrZXJEZWxldGUoZXZlbnQsICdjdGwwMF9Db250ZW50UGxhY2VIb2xkZXJXcmFwcGVyX3NyY2hEYXRlc1BlcmlvZFRvJylkAgYPZBYCAgEPFgIfBAUG16nXoNeZZAIHD2QWAgIBDw8WAh8EBQoyMi8wMi8yMDEwFgQfCAUPdmFsaWREYXRlKHRoaXMpHwcFR2lzRGF0ZSh0aGlzLCdjdGwwMF9Db250ZW50UGxhY2VIb2xkZXJXcmFwcGVyX3NyY2hEYXRlc1BlcmlvZFRvX2xibE1zZycpZAIBD2QWAmYPZBYCZg8PFgIfBAUW15cnINeR15DXk9eoINeU16rXqSLXomRkAhUPEA8WAh4LXyFEYXRhQm91bmRnZBAVARDXlNeb16DXodeqINeUIDE4FQECMTgUKwMBZ2RkAhkPDxYCHgtQb3N0QmFja1VybAUlL2Vwcm90b2NvbC9QVUJMSUMvU2VhcmNoUEVPbmxpbmUuYXNweGRkAhsPDxYCHw4FJS9lcHJvdG9jb2wvUFVCTElDL1NlYXJjaFBFT25saW5lLmFzcHhkZAIdDw8WBB8EBTfXnNeQINeg157XpteQ15Ug16rXldem15DXldeqINec15fXmdek15XXqSDXlNee15HXlden16kuHgdWaXNpYmxlaGRkAgkPZBYGAgEPDxYCHwQFYSDXnteZ15zXlFzXmdedOiA8Yj7XkDwvYj4sICAgICAgICDXkdeY15XXldeXINeq15DXqNeZ15vXmdedOiA8Yj7Xni0yMi8wMi8yMDEwINei15MtMjIvMDIvMjAxMDwvYj5kZAIDDw8WAh8EBQEwZGQCBw8PFgIfBGVkZAILDw8WBh8EBRzXl9eW15XXqCDXnNee16HXmiDXl9eZ16TXldepHw4FJS9lcHJvdG9jb2wvUFVCTElDL1NlYXJjaFBFT25saW5lLmFzcHgfD2hkZBgBBR5fX0NvbnRyb2xzUmVxdWlyZVBvc3RCYWNrS2V5X18WCQU4Y3RsMDAkQ29udGV
udFBsYWNlSG9sZGVyV3JhcHBlciRzcmNoQ0tfaW50ZXJydXB0X3NwZWFrZXIFMWN0bDAwJENvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXIkcmRvU2VhcmNoQnlOdW1iZXIFMWN0bDAwJENvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXIkcmRvU2VhcmNoQnlOdW1iZXIFL2N0bDAwJENvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXIkcmRvU2VhcmNoQnlUZXh0BTFjdGwwMCRDb250ZW50UGxhY2VIb2xkZXJXcmFwcGVyJHNyY2hfcmRvX1llc2hpdml0BTFjdGwwMCRDb250ZW50UGxhY2VIb2xkZXJXcmFwcGVyJHNyY2hfcmRvX1llc2hpdml0BS5jdGwwMCRDb250ZW50UGxhY2VIb2xkZXJXcmFwcGVyJHNyY2hfcmRvX1RvcmltBTxjdGwwMCRDb250ZW50UGxhY2VIb2xkZXJXcmFwcGVyJHNyY2hEYXRlc1BlcmlvZEZyb20kYnRuUG9wVXAFOmN0bDAwJENvbnRlbnRQbGFjZUhvbGRlcldyYXBwZXIkc3JjaERhdGVzUGVyaW9kVG8kYnRuUG9wVXCpRkP1sigDyMUEQRUVvHjI2IVBFw%3D%3D&ctl00%24ContentPlaceHolderWrapper%24STATUS=srch_rdo_Torim&ctl00%24ContentPlaceHolderWrapper%24SearchSubjectRDO=rdoSearchByText&ctl00%24ContentPlaceHolderWrapper%24btnSearch=%D7%97%D7%A4%D7%A9&ctl00%24ContentPlaceHolderWrapper%24srchDatesPeriodFrom%24txtDate=' + from_day + '%2F' + from_month + '%2F' + from_year + '&ctl00%24ContentPlaceHolderWrapper%24srchDatesPeriodFrom%24txtDay=' + from_day + '&ctl00%24ContentPlaceHolderWrapper%24srchDatesPeriodFrom%24txtMonth=' + from_month + '&ctl00%24ContentPlaceHolderWrapper%24srchDatesPeriodFrom%24txtYear=' + from_year + '&ctl00%24ContentPlaceHolderWrapper%24srchDatesPeriodTo%24txtDate=' + to_day + '%2F' + to_month + '%2F' + to_year + '&ctl00%24ContentPlaceHolderWrapper%24srchDatesPeriodTo%24txtDay=' + to_day + '&ctl00%24ContentPlaceHolderWrapper%24srchDatesPeriodTo%24txtMonth=' + to_month + '&ctl00%24ContentPlaceHolderWrapper%24srchDatesPeriodTo%24txtYear=' + to_year + '&ctl00%24ContentPlaceHolderWrapper%24srchDover%24hdnValue=&ctl00%24ContentPlaceHolderWrapper%24srchDover%24myTextBox=&ctl00%24ContentPlaceHolderWrapper%24srchExcludeFreeText=&ctl00%24ContentPlaceHolderWrapper%24srchFreeText=' + search_text + 
'&ctl00%24ContentPlaceHolderWrapper%24srchKnesset=18&ctl00%24ContentPlaceHolderWrapper%24srchManager%24hdnValue=&ctl00%24ContentPlaceHolderWrapper%24srchManager%24myTextBox=&ctl00%24ContentPlaceHolderWrapper%24srchSubject=&ctl00%24ContentPlaceHolderWrapper%24srchSubjectType=0&ctl00%24ContentPlaceHolderWrapper%24srch_SubjectNumber=&hiddenInputToUpdateATBuffer_CommonToolkitScripts=1' - page = urllib2.urlopen(url, params).read() + page = urllib2.urlopen(KNESSET_PROTOCOL_SEARCH_PAGE, params).read() m = re.search('ProtEOnlineLoad\((.*), \'false\'\);', page) if not m: - logger.debug("couldn't find vote in synched protocol\nvote.id=%s\nvote.title=%s\nsearch_text=%s", - str(v.id), v.title, search_text) + logger.debug(u"couldn't find vote in synced protocol\nvote.id=%s\nvote.title=%s\nsearch_text=%s", + str(vote.id), vote.title, search_text) return - l = Link(title=u'פרוטוקול מסונכרן (וידאו וטקסט) של הישיבה', - url='http://online.knesset.gov.il/eprotocol/PLAYER/PEPlayer.aspx?ProtocolID=%s' % m.group(1), - content_type=ContentType.objects.get_for_model(v), object_pk=str(v.id)) - l.save() + + Link.objects.get_or_create(title=u'פרוטוקול מסונכרן (וידאו וטקסט) של הישיבה', + url=KNESSET_SYNCED_PROTOCOL_PAGE % m.group( + 1), + content_type=ContentType.objects.get_for_model(vote), object_pk=str(vote.id)) + except Exception: - logger.exception(u'Exception in find synced protocol: search_text=' + search_text.encode( - 'utf8') + u'\nvote.title=' + v.title.encode('utf8')) + logger.exception(u'Exception in find synced protocol: vote id: %s search_text= %s' % (vote.pk, search_text)) - def check_vote_mentioned_in_cm(self, v, cm): - m = v.title[v.title.find(' - ') + 2:] + def check_vote_mentioned_in_cm(self, vote, cm): + m = vote.title[vote.title.find(' - ') + 2:] v_search_text = self.get_search_string(m.encode('utf8')) cm_search_text = self.get_search_string(cm.protocol_text.encode('utf8')).replace('\n', '') if cm_search_text.find(v_search_text) >= 0: - cm.votes_mentioned.add(v) + 
cm.votes_mentioned.add(vote) def find_votes_in_cms(self): for cm in CommitteeMeeting.objects.all(): for v in Vote.objects.all(): self.check_vote_mentioned_in_cm(v, cm) - def get_protocols_page(self, page, page_num): - logger.debug('get_protocols_page. page_num=%d' % page_num) - FILES_BASE_URL = "http://www.knesset.gov.il/protocols/" - res = [] - max_linked_page = max([int(r) for r in re.findall("'Page\$(\d*)", page)]) - last_page = False - if max_linked_page < page_num: - last_page = True - - # trim the page to the results part - start = page.find(r'id="gvProtocol"') - end = page.find(r'javascript:__doPostBack') - page = page[start:end] - date_text = '' - comittee = '' - subject = '' - # find interesting parts - matches = re.findall(r' 0) or (link.find(r'rtf') > 0): - re_res = re.search(r"'\.\./([^']*)'", link) - if re_res: - html_url = FILES_BASE_URL + re_res.group(1) - else: - html_url = re.search(r"'([^']*)'", link).group(1) - res.append([date_text, comittee, subject, - html_url]) # this is the last info we need, so add data to results - date_text = '' - comittee = '' - subject = '' - return (last_page, res) - - def get_protocols(self, max_page=10): - logger.debug('get_protocols. max_page=%d' % max_page) - SEARCH_URL = "http://www.knesset.gov.il/protocols/heb/protocol_search.aspx" - cj = cookielib.LWPCookieJar() - opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) - committees_aliases = [] - for c in Committee.objects.all(): - if c.aliases: - committees_aliases += map(lambda x: (c, x), c.aliases.split(",")) - - urllib2.install_opener(opener) - - # get the search page to extract legal "viewstate" and "event validation" strings. 
need to pass them so the search will work - page = urllib2.urlopen(SEARCH_URL).read().decode('windows-1255').encode('utf-8') - - event_validation = urllib2.quote(re.search(r'id="__EVENTVALIDATION" value="([^"]*)"', page).group(1)).replace( - '/', '%2F') - view_state = urllib2.quote(re.search(r'id="__VIEWSTATE" value="([^"]*)"', page).group(1)).replace('/', '%2F') - - # define date range - params = "__EVENTTARGET=DtFrom&__EVENTARGUMENT=&__LASTFOCUS=&__VIEWSTATE=%s&ComId=-1&knesset_id=-1&DtFrom=24%%2F02%%2F2009&DtTo=&subj=&__EVENTVALIDATION=%s" % ( - view_state, event_validation) - page = urllib2.urlopen(SEARCH_URL, params).read().decode('windows-1255').encode('utf-8') - event_validation = urllib2.quote(re.search(r'id="__EVENTVALIDATION" value="([^"]*)"', page).group(1)).replace( - '/', '%2F') - view_state = urllib2.quote(re.search(r'id="__VIEWSTATE" value="([^"]*)"', page).group(1)).replace('/', '%2F') - - # hit the search - params = "btnSearch=%%E7%%E9%%F4%%E5%%F9&__EVENTTARGET=&__EVENTARGUMENT=&__LASTFOCUS=&__VIEWSTATE=%s&ComId=-1&knesset_id=-1&DtFrom=24%%2F02%%2F2009&DtTo=&subj=&__EVENTVALIDATION=%s" % ( - view_state, event_validation) - page = urllib2.urlopen(SEARCH_URL, params).read().decode('windows-1255').encode('utf-8') - event_validation = urllib2.quote(re.search(r'id="__EVENTVALIDATION" value="([^"]*)"', page).group(1)).replace( - '/', '%2F') - view_state = urllib2.quote(re.search(r'id="__VIEWSTATE" value="([^"]*)"', page).group(1)).replace('/', '%2F') - page_num = 1 - (last_page, page_res) = self.get_protocols_page(page, page_num) - res = page_res[:] - - mks, mk_names = get_all_mk_names() - while (not last_page) and (page_num < max_page): - page_num += 1 - params = "__EVENTTARGET=gvProtocol&__EVENTARGUMENT=Page%%24%d&__LASTFOCUS=&__VIEWSTATE=%s&ComId=-1&knesset_id=-1&DtFrom=24%%2F02%%2F2009&DtTo=&subj=&__EVENTVALIDATION=%s" % ( - page_num, view_state, event_validation) - page = urllib2.urlopen(SEARCH_URL, 
params).read().decode('windows-1255').encode('utf-8') - # update EV and VS - re_res = re.search(r'id="__EVENTVALIDATION" value="([^"]*)"', page) - if re_res: - event_validation = urllib2.quote(re_res.group(1)).replace('/', '%2F') - else: - logger.warning('skipping page %s' % page_num) - continue - view_state = urllib2.quote(re.search(r'id="__VIEWSTATE" value="([^"]*)"', page).group(1)).replace('/', - '%2F') - # parse the page - (last_page, page_res) = self.get_protocols_page(page, page_num) - res.extend(page_res) - - logger.debug('res contains %d entries' % len(res)) - - default_timeout = socket.getdefaulttimeout() - socket.setdefaulttimeout(10) - num_exceptions = 0 - for (date_string, com, topic, link) in res: - if num_exceptions > 15: - logger.error('too many exception in get_protocols') - break - cm = None - try: - (c, created) = Committee.objects.get_or_create(name=com) - if created: - c.save() - r = re.search("(\d\d)/(\d\d)/(\d\d\d\d)", date_string) - d = datetime.date(int(r.group(3)), int(r.group(2)), int(r.group(1))) - if CommitteeMeeting.objects.filter(committee=c, date=d, topics=topic, date_string=date_string).count(): - cm = CommitteeMeeting.objects.filter(committee=c, date=d, topics=topic, date_string=date_string)[0] - logger.debug('cm %d already exists' % cm.id) - elif CommitteeMeeting.objects.filter(src_url=link).count(): - cm = CommitteeMeeting.objects.get(src_url=link) - logger.debug('cm %d is being updated' % cm.id) - if date_string != cm.date_string: - cm.date_string = date_string - logger.debug('updated date_string') - if d != cm.date: - cm.date = d - logger.debug('updated date') - if topic != cm.topics: - cm.topics = topic - logger.debug('updated topics') - if link != cm.src_url: - cm.src_url = link - logger.debug('updated src_url') - else: - cm = CommitteeMeeting.objects.create(committee=c, date=d, topics=topic, date_string=date_string, - src_url=link) - logger.debug('cm %d created' % cm.id) - except Exception: - # WTF exceptions counting - 
num_exceptions += 1 - logger.exception('Get protocols exceptions') - if cm is not None: - # TODO: remove all the try except - # currently, code is very fragile and causes a lot of exceptions which prevent later stages from running - updated_protocol = False - try: - if not cm.protocol_text: - cm.protocol_text = self.get_committee_protocol_text(link) - # check if the protocol is from the wrong commitee - for i in committees_aliases: - if i[1] in cm.protocol_text[:300]: - cm.committee = i[0] - break - updated_protocol = True - except Exception: - num_exceptions += 1 - logger.exception('Get protocols exceptions') - - try: - cm.save() - except Exception: - num_exceptions += 1 - logger.exception('Get protocols exceptions') - - try: - if updated_protocol: - cm.create_protocol_parts() - except Exception: - num_exceptions += 1 - logger.exception('Get protocols exceptions') - - try: - cm.find_attending_members(mks, mk_names) - except Exception: - num_exceptions += 1 - logger.exception('Get protocols exceptions') - - try: - self.get_bg_material(cm) - except Exception: - num_exceptions += 1 - logger.exception('Get protocols exceptions') - socket.setdefaulttimeout(default_timeout) - - def get_committee_protocol_text(self, url): - logger.debug('get_committee_protocol_text. url=%s' % url) - if url.find('html') >= 0: - url = url.replace('html', 'rtf') - file_str = StringIO() - count = 0 - flag = True - while count < 10 and flag: - try: - file_str.write(urllib2.urlopen(url).read()) - flag = False - except Exception: - count += 1 - if flag: - logger.error("can't open url %s. 
tried %d times" % (url, count)) - - if url.find(".rtf") >= 0: - return self.handle_rtf_protocol(file_str) - if url.find(".doc") >= 0: - return self.handle_doc_protocol(file_str) - - def handle_doc_protocol(self, file_str): - directory = os.path.join(DATA_ROOT, 'comm_p') - if not os.path.exists(directory): os.makedirs(directory) - fname = os.path.join(directory, 'comm_p.doc') - f = open(fname, 'wb') - file_str.seek(0) - f.write(file_str.read()) - f.close() - x = antiword(fname) - return re.sub('[\n ]{2,}', '\n\n', re.sub('<.*?>', '', x)) - - def handle_rtf_protocol(self, file_str): - try: - doc = Rtf15Reader.read(file_str) - except Exception: - return '' - text = [] - attended_list = False - for paragraph in doc.content: - for sentence in paragraph.content: - if 'bold' in sentence.properties and attended_list: - attended_list = False - text.append('') - if 'מוזמנים'.decode('utf8') in sentence.content[0] and 'bold' in sentence.properties: - attended_list = True - text.append(sentence.content[0]) - all_text = '\n'.join(text) - return re.sub(r'\n:\n', r':\n', all_text) - - def get_bg_material(self, cm): - links = cm.get_bg_material() - if links: - for i in links: - l = Link.objects.create(url=i.get('url', ''), title=i.get('title', ''), content_object=cm) - logger.debug('committee meeting link %d created' % l.id) - def get_approved_bill_text(self, url): """Retrieve the RTL file in the given url, assume approved bill file format, and return the text from the file. 
diff --git a/simple/scrapers/management.py b/simple/scrapers/management.py index 34409fa0..21cdb112 100644 --- a/simple/scrapers/management.py +++ b/simple/scrapers/management.py @@ -27,10 +27,17 @@ def _get_existing_object(self, dataservice_object): raise NotImplementedError() def _create_new_object(self, dataservice_object): - # this will run after get_existing_object, so you can assume there is no existing object in DB - # it should create the object in DB using the data in dataservice_object - # return value is the created DB object - # this function must always return a DB object which was created - if there is an error - raise an Exception + """ + this will run after get_existing_object, so you can assume there is no existing object in DB + it should create the object in DB using the data in dataservice_object + return value is the created DB object + this function must always return a DB object which was created - if there is an error - raise an Exception + Args: + dataservice_object: + + Returns: + + """ raise NotImplementedError() def recreate_objects(self, object_ids): @@ -98,8 +105,8 @@ class BaseKnessetDataserviceCollectionCommand(BaseKnessetDataserviceCommand): def _handle_page(self, page_num): for dataservice_object in self.DATASERVICE_CLASS.get_page(page_num=page_num): if not self._has_existing_object(dataservice_object): - object = self._create_new_object(dataservice_object) - self._log_debug(u'created new object %s: %s' % (object.pk, object)) + oknesset_obj = self._create_new_object(dataservice_object) + self._log_debug(u'created new object %s: %s' % (oknesset_obj.pk, oknesset_obj)) if self._max_items > 0: self._num_items += 1 if self._num_items == self._max_items: @@ -237,7 +244,7 @@ def _handle_pagerange(self, options): break def _handle_noargs(self, **options): - if (options['recreate'] != ''): + if options['recreate'] != '': self._handle_recreate(options) elif options.get('createsrcid'): self._handle_createsrcid(options)