From a5a548987282d5f7a430c852bf2b20eaf866af32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Revol?= Date: Fri, 31 Aug 2012 00:04:27 +0200 Subject: [PATCH] Add module for Europarl videos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently support committees and other events. TODO: support plenary sessions TODO: latest and search Signed-off-by: François Revol Signed-off-by: Romain Bignon --- modules/europarl/__init__.py | 3 + modules/europarl/backend.py | 82 ++++++++++++++++++ modules/europarl/browser.py | 57 ++++++++++++ modules/europarl/favicon.png | Bin 0 -> 526 bytes modules/europarl/favicon_europarl.xcf | Bin 0 -> 2340 bytes modules/europarl/pages.py | 120 ++++++++++++++++++++++++++ modules/europarl/test.py | 42 +++++++++ modules/europarl/video.py | 50 +++++++++++ 8 files changed, 354 insertions(+) create mode 100644 modules/europarl/__init__.py create mode 100644 modules/europarl/backend.py create mode 100644 modules/europarl/browser.py create mode 100644 modules/europarl/favicon.png create mode 100644 modules/europarl/favicon_europarl.xcf create mode 100644 modules/europarl/pages.py create mode 100644 modules/europarl/test.py create mode 100644 modules/europarl/video.py diff --git a/modules/europarl/__init__.py b/modules/europarl/__init__.py new file mode 100644 index 00000000..0994ffed --- /dev/null +++ b/modules/europarl/__init__.py @@ -0,0 +1,3 @@ +from .backend import EuroparlBackend + +__all__ = ['EuroparlBackend'] diff --git a/modules/europarl/backend.py b/modules/europarl/backend.py new file mode 100644 index 00000000..971c7d6c --- /dev/null +++ b/modules/europarl/backend.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010-2011 Romain Bignon +# Copyright(C) 2012 François Revol +# +# This file is part of weboob. 
class EuroparlBackend(BaseBackend, ICapVideo, ICapCollection):
    """Backend giving access to European Parliament (Europarl) videos.

    Single videos are fetched through EuroparlBrowser.  Collection browsing
    knows two levels only: the root and a ``latest`` sub-collection.
    """

    NAME = 'europarl'
    MAINTAINER = u'François Revol'
    EMAIL = 'revol@free.fr'
    VERSION = '0.d'
    DESCRIPTION = 'Europarl parliamentary video streaming website'
    LICENSE = 'AGPLv3+'
    BROWSER = EuroparlBrowser

    # Sort keys indexed by the ``sortby`` argument of the disabled search below.
    SORTBY = ['relevance', 'rating', 'views', 'time']

    def get_video(self, _id):
        """Return whatever the browser builds for the video named ``_id``."""
        with self.browser:
            return self.browser.get_video(_id)

    # TODO: searching is not implemented yet.
    # def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, max_results=None):
    #     with self.browser:
    #         return self.browser.search_videos(pattern, self.SORTBY[sortby])

    def fill_video(self, video, fields):
        """Complete ``video`` with the requested ``fields`` and return it.

        Unless ``fields`` is exactly ``['thumbnail']`` the video is reloaded
        from its page.  The thumbnail data is downloaded only when
        'thumbnail' was requested and the video has a thumbnail.
        """
        needs_page = fields != ['thumbnail']
        if needs_page:
            # more than the thumbnail alone was asked for: refresh every field
            with self.browser:
                video = self.browser.get_video(EuroparlVideo.id2url(video.id), video)

        wants_thumbnail = 'thumbnail' in fields
        if wants_thumbnail and video.thumbnail:
            with self.browser:
                video.thumbnail.data = self.browser.readurl(video.thumbnail.url)

        return video

    def iter_resources(self, objs, split_path):
        """Yield the resources found at ``split_path``.

        Nothing is yielded unless BaseVideo is among ``objs``.  The root
        level yields the ``latest`` collection; the ``latest`` path yields
        the videos returned by the browser.
        """
        if BaseVideo not in objs:
            return

        collection = self.get_collection(objs, split_path)
        if collection.path_level == 0:
            yield self.get_collection(objs, [u'latest'])
        if collection.split_path == [u'latest']:
            # NOTE(review): EuroparlBrowser only carries latest_videos() as
            # commented-out code, so this call looks like it fails with
            # AttributeError unless BaseBrowser provides it -- confirm.
            for video in self.browser.latest_videos():
                yield video

    def validate_collection(self, objs, collection):
        """Accept the root level and the ``latest`` video collection.

        The ``latest`` collection gets its title set.  Any other path
        raises CollectionNotFound.
        """
        if collection.path_level == 0:
            return

        is_latest = BaseVideo in objs and collection.split_path == [u'latest']
        if not is_latest:
            raise CollectionNotFound(collection.split_path)
        collection.title = u'Latest Europarl videos'

    # Associates the video class with the method that fills its fields.
    OBJECTS = {EuroparlVideo: fill_video}
class EuroparlBrowser(BaseBrowser):
    """Browser for the European Parliament "ep-live" video pages.

    Only committee and other-event video pages are mapped to a page class;
    plenary pages are still to be done.
    """

    DOMAIN = 'europarl.europa.eu'
    ENCODING = None

    # Both groups have to be named, a bare ``(?P...)`` is not a valid
    # regular expression.  VideoPage reads the event identifier back through
    # ``group_dict['id']``.  ``lang`` captures the language path segment
    # ('en' in the URLs built by EuroparlVideo.id2url); that name is an
    # assumption -- TODO confirm against upstream.
    PAGES = {
        r'http://[w\.]*europarl\.europa\.eu/ep-live/(?P<lang>\w+)/committees/video\?.*event=(?P<id>[^&]+).*': VideoPage,
        r'http://[w\.]*europarl\.europa\.eu/ep-live/(?P<lang>\w+)/other-events/video\?.*event=(?P<id>[^&]+).*': VideoPage,
        # TODO: plenaries
        # r'http://[w\.]*europarl\.europa\.eu/ep-live/(?P<lang>\w+)/plenary/video\?.*date=(?P<id>[^&]+).*': VideoPage,
        # r'http://[w\.]*europarl\.europa\.eu/ep-live/(?P<lang>\w+)/plenary/video\?.*debate=(?P<id>[^&]+).*': VideoPage,
    }

    @id2url(EuroparlVideo.id2url)
    def get_video(self, url, video=None):
        """Open ``url`` and return the video described by the page reached.

        ``video``, when given, is handed to the page's ``get_video`` so that
        it is completed instead of a new object being created.
        """
        self.location(url)
        return self.page.get_video(video)

    # TODO: search and latest videos are not implemented yet.
    # def search_videos(self, pattern, sortby):
    #     return None
    #     self.location(self.buildurl('http://europarltv.europa.eu/en/search%s' % sortby, query=pattern.encode('utf-8')))
    #     assert self.is_on_page(IndexPage)
    #     return self.page.iter_videos()

    # def latest_videos(self):
    #     self.home()
    #     assert self.is_on_page(IndexPage)
    #     return self.page.iter_videos()
zcma);O>P@U5QWL%@XxUmQIxWhfT1h_G9=P8AbTRi28d)4WMLy$ps67}0DlybPVAMg z;DwhtKym}`bBXM7kj^;yUa^Ov7zIcR)%E&yRZUkvjYgw%av8lH45MK>P6~{kH{k|W zEHX8*+Sl*HpJm`3EW}FKmmVu_m-q?(-iWg8!E7=~rVG{uZEk;amGl?MMf6j;cpZKF z-HY$Fvt^;_q(4g9qpS2n9MQ$X@qF=aoJ8~4IK6mq=Xf|8%*L~;=-6EQYV@ifbvs=f zw}3}}>j9~+Zb{AeqyGj5bM}~@^Yinsx@TwSPvR$!wdY6i_a&&=g@b$EboY0M z{U6i8Y}&b;j#w4~O^gaAA^*Z^9xJ}?F|M$;hVQK5&(`qWHGFRke*sqAd{hB?o<{yl zl%EAE_+UH#oSoR zx(L|_myOzK(Rrhu>Fg^a@A3)^sDE^J$bnzpfx1K*C!LqQZ!*yKB=v%BR>9gg7M;0I zV$j(NTQQvzoh?4c=UXvpNA^~S1cmT=!wD{G()%uE8`~EVthO2}v?uh8nuh{S3bq7xEH{YdBgh86 zg*8`+dhSy*!tHr4wP$^(-b1Zwq@D`YiEP8lpFmddrA!Pgb8(OX(Q!WVSjzX5~j0R{j7 literal 0 HcmV?d00001 diff --git a/modules/europarl/pages.py b/modules/europarl/pages.py new file mode 100644 index 00000000..19eaf335 --- /dev/null +++ b/modules/europarl/pages.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010-2011 Romain Bignon +# Copyright(C) 2012 François Revol +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . 
class VideoPage(BasePage):
    """Page showing one Europarl video (committee or other event)."""

    # 'DD-MM-YYYY HH:MM', as found in the <meta name="available"> content.
    _AVAILABLE_RE = re.compile(r'(\d\d)-(\d\d)-(\d\d\d\d)\s*(\d\d):(\d\d)')
    # 'HH:MM / HH:MM - DD-MM-YYYY', as found in span.ep_date.
    _TIMESLOT_RE = re.compile(r'(\d\d):(\d\d)\s*\/\s*(\d\d):(\d\d)\s*-\s*(\d\d)-(\d\d)-(\d\d\d\d)')

    def get_video(self, video=None):
        """Build, or complete, the video described by this page.

        :param video: existing video to fill in; when None a new
            EuroparlVideo is created from ``group_dict['id']``.
        :returns: the video, whose still-empty fields have been passed
            through ``set_empty_fields(NotAvailable)``.
        """
        if video is None:
            video = EuroparlVideo(self.group_dict['id'])

        # unicode(None) would store the literal text u'None', so a missing
        # value is left for set_empty_fields() to deal with instead.
        title = self.get_title()
        if title is not None:
            video.title = unicode(title)
        url = self.get_url()
        if url is not None:
            video.url = unicode(url)

        self.set_details(video)

        video.set_empty_fields(NotAvailable)
        return video

    def get_url(self):
        """Return the ``value`` of the ``input#codeUrl`` element, or None."""
        # TODO: plenaries can be downloaded as mp4...
        obj = self.parser.select(self.document.getroot(), 'input#codeUrl', 1)
        if obj is None:
            return None
        return obj.attrib['value']

    def get_title(self):
        """Return the page title followed by the subtitle acronym and theme.

        The text comes from ``h1#player_subjectTitle``, falling back to
        ``<title>``; None is returned when neither element exists.
        """
        root = self.document.getroot()
        headings = self.parser.select(root, 'h1#player_subjectTitle')
        if len(headings) < 1:
            headings = self.parser.select(root, 'title')
        if len(headings) < 1:
            return None

        # .text is None for an element without leading text
        parts = [(headings[0].text or '').strip()]

        subtitles = self.parser.select(root, 'span.ep_subtitle')
        if len(subtitles) >= 1:
            for span in self.parser.select(subtitles[0], 'span.ep_acronym, span.ep_theme'):
                if span.text_content():
                    parts.append(span.text_content().strip())

        return ' '.join(parts)

    def set_details(self, v):
        """Set author, date and duration of ``v`` from the page.

        :raises BrokenPageError: when a date or time slot found on the page
            does not have the expected layout.
        """
        v.author = u'European Parliament'
        root = self.document.getroot()

        obj = self.parser.select(root, 'meta[name=available]', 1)
        if obj is not None:
            value = obj.attrib['content']
            m = self._AVAILABLE_RE.match(value)
            if not m:
                raise BrokenPageError('Unable to parse datetime: %r' % value)
            day, month, year, hour, minute = [int(g) for g in m.groups()]
            v.date = datetime.datetime(year=year, month=month, day=day,
                                       hour=hour, minute=minute)

        obj = self.parser.select(root, 'span.ep_subtitle', 1)
        if obj is not None:
            span = self.parser.select(obj, 'span.ep_date', 1)
            value = span.text
            # an empty span has no text at all: report it as a broken page
            m = self._TIMESLOT_RE.match(value) if value else None
            if not m:
                raise BrokenPageError('Unable to parse datetime: %r' % value)
            bhour, bminute, ehour, eminute, day, month, year = [int(g) for g in m.groups()]

            start = datetime.datetime(year=year, month=month, day=day,
                                      hour=bhour, minute=bminute)
            end = datetime.datetime(year=year, month=month, day=day,
                                    hour=ehour, minute=eminute)

            # both times are taken on the same day, as printed on the page
            v.duration = end - start
class EuroparlTest(BackendTest):
    """Tests for the 'europarl' backend.

    Every test case below is still commented out, so this class does not
    define any test method of its own yet.
    """
    # presumably the name BackendTest uses to find the backend -- confirm
    BACKEND = 'europarl'

    # Disabled: the backend's search_videos() is itself commented out.
    # def test_search(self):
    #     l = list(self.backend.search_videos('neelie kroes'))
    #     self.assertTrue(len(l) > 0)
    #     v = l[0]
    #     self.backend.fillobj(v, ('url',))
    #     self.assertTrue(v.url and v.url.startswith('http://'), 'URL for video "%s" not found: %s' % (v.id, v.url))
    #     self.backend.browser.openurl(v.url)

    # Disabled: the browser's latest_videos() is itself commented out.
    # def test_latest(self):
    #     l = list(self.backend.iter_resources([BaseVideo], [u'latest']))
    #     self.assertTrue(len(l) > 0)
    #     v = l[0]
    #     self.backend.fillobj(v, ('url',))
    #     self.assertTrue(v.url and v.url.startswith('http://'), 'URL for video "%s" not found: %s' % (v.id, v.url))
class EuroparlVideo(BaseVideo):
    """Video hosted on the Europarl site; ``ext`` is always ``wmv``."""

    def __init__(self, *args, **kwargs):
        BaseVideo.__init__(self, *args, **kwargs)
        self.ext = u'wmv'

    @classmethod
    def id2url(cls, _id):
        """Return the page URL matching ``_id``, or None when it is unknown.

        The rules are tried in order with ``re.match`` (anchored at the
        start only) and the first one that matches wins.
        """
        url_rules = (
            (r'.*-COMMITTEE-.*',
             u'http://www.europarl.europa.eu/ep-live/en/committees/video?event=%s&format=wmv'),
            (r'.*-SPECIAL-.*',
             u'http://www.europarl.europa.eu/ep-live/en/other-events/video?event=%s&format=wmv'),
            # XXX: not yet supported
            (r'\d\d-\d\d-\d\d\d\d',
             u'http://www.europarl.europa.eu/ep-live/en/plenary/video?date=%s'),
            # XXX: not yet supported
            (r'\d+',
             u'http://www.europarl.europa.eu/ep-live/en/plenary/video?debate=%s'),
        )
        for pattern, template in url_rules:
            if re.match(pattern, _id):
                return template % _id
        return None