diff --git a/modules/pariskiwi/__init__.py b/modules/pariskiwi/__init__.py new file mode 100644 index 00000000..f51a94b6 --- /dev/null +++ b/modules/pariskiwi/__init__.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2013 Vincent A +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see <http://www.gnu.org/licenses/>. + + +from .backend import ParisKiwiBackend + + +__all__ = ['ParisKiwiBackend'] diff --git a/modules/pariskiwi/backend.py b/modules/pariskiwi/backend.py new file mode 100644 index 00000000..4082d40b --- /dev/null +++ b/modules/pariskiwi/backend.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2013 Vincent A +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see <http://www.gnu.org/licenses/>.
# modules/pariskiwi/backend.py -- module body (the diff header and license
# comment of this file come right before this point).


from weboob.tools.backend import BaseBackend
from weboob.capabilities.calendar import ICapCalendarEvent, BaseCalendarEvent, CATEGORIES, TRANSP, STATUS
from datetime import datetime, time

from .browser import ParisKiwiBrowser


__all__ = ['ParisKiwiBackend']


class ParisKiwiBackend(BaseBackend, ICapCalendarEvent):
    """Calendar backend exposing ParisKiwi agenda pages as concert events.

    Event data is read from ``self.browser`` as plain dicts; this class
    filters them by date and converts them to ``BaseCalendarEvent`` objects.
    """

    NAME = 'pariskiwi'
    DESCRIPTION = u'ParisKiwi website'
    MAINTAINER = u'Vincent A'
    EMAIL = 'dev@indigo.re'
    LICENSE = 'AGPLv3+'
    VERSION = '0.h'

    BROWSER = ParisKiwiBrowser

    ASSOCIATED_CATEGORIES = [CATEGORIES.CONCERT]

    def search_events(self, query):
        """Return the events inside the date range carried by *query*.

        Nothing is returned (implicit ``None``) when
        ``has_matching_categories(query)`` is false.
        """
        if self.has_matching_categories(query):
            return self.list_events(query.start_date, query.end_date or None)

    def list_events(self, date_from, date_to=None):
        """Yield every event whose date lies between *date_from* and *date_to*.

        Each listed entry that passes the date filter is fetched individually
        through ``get_event``; entries for which no event could be built are
        skipped.
        """
        for d in self.browser.list_events_all():
            if self.matches_date(d, date_from, date_to):
                event = self.get_event(d['id'])
                if event is not None:
                    yield event

    def get_event(self, _id):
        """Return the event identified by *_id*, or ``None`` if the browser yields nothing."""
        d = self.browser.get_event(_id)
        if not d:
            return None
        return self._make_event(d)

    def _make_event(self, d):
        """Build a ``BaseCalendarEvent`` from the dict *d* produced by the event page."""
        event = BaseCalendarEvent(d['id'])
        event.city = u'Paris'
        event.url = d['url']
        event.start_date = d['datetime']
        # No end time is extracted from the page: the event ends with its day.
        event.end_date = datetime.combine(d['datetime'].date(), time.max)
        event.summary = d['summary']
        event.category = CATEGORIES.CONCERT
        event.description = d['description']
        event.status = STATUS.CONFIRMED
        event.transp = TRANSP.OPAQUE
        # 'price' and 'address' are only present when the page text matched.
        if 'price' in d:
            event.price = d['price']
        if 'address' in d:
            event.location = d['address']
        return event

    def _make_false_event(self):
        """Build a cancelled placeholder event with id '0' dated at the Unix epoch."""
        event = BaseCalendarEvent('0')
        event.start_date = event.end_date = datetime.utcfromtimestamp(0)
        event.summary = u'NON EXISTING EVENT'
        event.status = STATUS.CANCELLED
        event.category = CATEGORIES.CONCERT
        event.transp = TRANSP.OPAQUE
        return event

    def matches_date(self, d, date_from, date_to):
        """Tell whether ``d['date']`` lies within [*date_from*, *date_to*].

        A falsy bound is treated as "no limit" on that side.
        """
        if date_from and d['date'] < date_from:
            return False
        # The upper bound must be checked against date_to (it used to be
        # compared with date_from, which rejected every later event).
        if date_to and d['date'] > date_to:
            return False
        return True


# diff --git a/modules/pariskiwi/browser.py b/modules/pariskiwi/browser.py
# new file mode 100644, index 00000000..e9d4dfe8

# -*- coding: utf-8 -*-

# Copyright(C) 2013 Vincent A
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.


from weboob.tools.browser import BaseBrowser

from .pages import PageList, PageList2, PageEvent


__all__ = ['ParisKiwiBrowser']


class ParisKiwiBrowser(BaseBrowser):
    """Browser for the ParisKiwi mediawiki agenda."""

    PROTOCOL = 'http'
    DOMAIN = 'pariskiwi.org'
    ENCODING = 'utf-8'

    # URL pattern -> page class handling the response.
    PAGES = {
        'http://pariskiwi.org/~parislagrise/mediawiki/index.php/Agenda': PageList,
        'http://pariskiwi.org/~parislagrise/mediawiki/index.php/Agenda/Detruire_Ennui_Paris/.+': PageEvent,
        r'http://pariskiwi.org/~parislagrise/mediawiki/api.php\?action=query&list=allpages.*': PageList2,
    }

    def __init__(self, *a, **kw):
        """Create the browser, always forcing the 'raw' parser.

        NOTE(review): presumably 'raw' hands the undecoded response body to
        the pages, which parse it themselves (JSON / lxml) -- confirm.
        """
        kw['parser'] = 'raw'
        BaseBrowser.__init__(self, *a, **kw)

    def list_events_all(self):
        """Query the mediawiki API for all agenda pages and return the listed events."""
        self.location('http://pariskiwi.org/~parislagrise/mediawiki/api.php?action=query&list=allpages&apprefix=Agenda%2FDetruire_Ennui_Paris&aplimit=500&format=json')
        assert self.is_on_page(PageList2)
        return self.page.list_events()

    def get_event(self, _id):
        """Open the agenda page named *_id* and return what its ``get_event`` extracts."""
        self.location('http://pariskiwi.org/~parislagrise/mediawiki/index.php/Agenda/Detruire_Ennui_Paris/%s' % _id)
        assert self.is_on_page(PageEvent)
        return self.page.get_event()


# diff --git a/modules/pariskiwi/favicon.png b/modules/pariskiwi/favicon.png
# new file mode 100644, index 00000000..937a7006
# Binary files /dev/null and b/modules/pariskiwi/favicon.png differ

# diff --git a/modules/pariskiwi/pages.py b/modules/pariskiwi/pages.py
# new file mode 100644, index 00000000..ded700ec

# -*- coding: utf-8 -*-

# Copyright(C) 2013 Vincent A
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
# modules/pariskiwi/pages.py -- module body (the diff header and license
# comment of this file come right before this point).


from weboob.tools.browser import BasePage
from datetime import datetime, time
import json
import lxml.html
import re


__all__ = ['PageList', 'PageEvent', 'PageList2']


# Start time written as "20h" or "20h30".
_TIME_RE = re.compile(r'\b(\d\d?)h(\d\d)?\b')
# Price such as "5 euros", "7,50 euro" or "10€". Spelled without the
# Python-2-only ``ur''`` prefix; the pattern value is unchanged
# (\u20ac is the euro sign).
_PRICE_RE = re.compile(u'\\b(\\d+([,.]\\d+)?)\\s*(euros\\b|euro\\b|\u20ac)')
# Address fragments, tried in this order on every paragraph.
_ADDRESS_RES = (
    re.compile(r'\d+[\s,]+(rue|boulevard|avenue)\s+.+', re.I),
    re.compile(r'\b(75|92|93|94|78|77|95|91)\d\d\d\b.*'),
    re.compile(r'\b(m.tro|rer)\b.*', re.I),
    re.compile(r'@\s+\w+(\s+[^.]+.*)?'),  # refuse '@foo' or '@ foo . plop'
)


def date_from_id(_id):
    """Parse the ``MM-DD-YYYY`` date that prefixes an event id.

    Everything from the first underscore on is ignored. Raises ``ValueError``
    when the prefix is not a date in that format.
    """
    textdate = _id.split('_')[0]
    return datetime.strptime(textdate, '%m-%d-%Y')


def id_from_path(title):
    """Return the last '/'-separated component of *title*, spaces turned into underscores."""
    return title.replace(' ', '_').split('/')[-1]


def combine(dt, t):
    """Return a datetime with the date of *dt* and the hour/minute of *t* (seconds dropped)."""
    return datetime(dt.year, dt.month, dt.day, t.hour, t.minute)


def _cut(text, match):
    """Return *text* with the span covered by *match* removed."""
    return text[:match.start(0)] + text[match.end(0):]


def _find_time(text):
    """Return ``(match, time)`` for the first valid "NNhMM" token in *text*, else ``(None, None)``.

    Tokens whose hour or minute is out of range (e.g. "24h" in "24h/24") are
    skipped instead of letting ``time()`` raise ``ValueError``.
    """
    for match in _TIME_RE.finditer(text):
        hour = int(match.group(1))
        minute = int(match.group(2) or '0')
        if hour < 24 and minute < 60:
            return match, time(hour, minute)
    return None, None


class PageList(BasePage):
    """Agenda index page; event listing is not implemented for it."""

    def get_events(self):
        raise NotImplementedError()


class PageList2(BasePage):
    """JSON answer of the mediawiki ``allpages`` API query."""

    def list_events(self):
        """Return the listed events as dicts, sorted by date then id."""
        return sorted(self.unsorted_list(), key=lambda d: (d['date'], d['id']))

    def unsorted_list(self):
        """Yield one ``{'id', 'date'}`` dict per page of the API answer, in answer order."""
        # TODO paginate when there are >500 events
        for jpage in json.loads(self.document)['query']['allpages']:
            d = {}
            d['id'] = id_from_path(jpage['title'])
            d['date'] = date_from_id(d['id'])
            yield d


class PageEvent(BasePage):
    """Wiki page describing a single event."""

    def get_event(self):
        """Extract the event described by this page.

        Returns a dict with 'id', 'date', 'datetime', 'url', 'summary' and
        'description', plus 'hour', 'price' and 'address' when the text
        contains them. Returns ``None`` when the page has no ``bodyContent``
        div or when no summary text could be found.
        """
        d = {}
        d['id'] = id_from_path(self.url)
        d['date'] = date_from_id(d['id'])
        # Start of the day until a start time is found in the text below.
        d['datetime'] = d['date']
        d['url'] = self.url

        html = lxml.html.fromstring(self.document)
        for div in html.iter('div'):
            if div.get('id') == 'bodyContent':
                break
        else:
            # No content container: do not fall back on an unrelated div.
            return None

        # Keep direct children that are real elements (comments and
        # processing instructions have a callable tag), carry no id and are
        # not footers.
        tags = [t for t in div if not callable(t.tag) and not t.get('id') and 'footer' not in t.get('class', '')]
        parts = [t.text_content().strip().replace('\n', ' ') for t in tags]
        description = '\n'.join(parts)
        summary = description.split('\n', 1)[0]

        # Kept from the original implementation; nothing in this module reads it.
        self.div = div
        if not summary:
            return None

        d['summary'] = summary
        d['description'] = description

        # Every recognised token is cut out of its paragraph so that later
        # patterns do not match it again.
        for n, p in enumerate(parts):
            match, hour = _find_time(p)
            if match:
                d['hour'] = hour
                d['datetime'] = combine(d['date'], hour)
                parts[n] = _cut(p, match)
                break

        for n, p in enumerate(parts):
            match = _PRICE_RE.search(p)
            if match:
                d['price'] = float(match.group(1).replace(',', '.'))
                parts[n] = _cut(p, match)
                break

        address = []
        for n, p in enumerate(parts):
            for regex in _ADDRESS_RES:
                match = regex.search(p)
                if match:
                    address.append(match.group(0))
                    p = parts[n] = _cut(p, match)

        if address:
            d['address'] = ' '.join(address)

        return d


# diff --git a/modules/pariskiwi/test.py b/modules/pariskiwi/test.py
# new file mode 100644, index 00000000..2f050572

# -*- coding: utf-8 -*-

# Copyright(C) 2013 Vincent A
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
# modules/pariskiwi/test.py -- module body (the diff header and license
# comment of this file come right before this point).


from weboob.tools.test import BackendTest
from datetime import datetime


class ParisKiwiTest(BackendTest):
    """Tests for the 'pariskiwi' backend, run against ``self.backend``."""

    BACKEND = 'pariskiwi'

    def test_pariskiwi_event(self):
        """A known event page yields a fully populated event."""
        event = self.backend.get_event('11-9-2013_-Event_2')
        assert event
        assert event.location
        assert event.price
        assert event.summary
        assert event.url == 'http://pariskiwi.org/~parislagrise/mediawiki/index.php/Agenda/Detruire_Ennui_Paris/11-9-2013_-Event_2'
        assert event.start_date == datetime(2013, 11, 9, 20, 30)

    def test_pariskiwi_list(self):
        """Listing from now on yields an event that does not start before now."""
        # Take the reference instant once: comparing against a second, later
        # now() would test a different bound than the one used for filtering.
        now = datetime.now()
        it = self.backend.list_events(now)
        # Built-in next() instead of the Python-2-only iterator .next() method.
        ev = next(it)
        assert ev is not None
        assert ev.start_date >= now