From defdfa6af1cf98610ebdd7d8046b4fdb944b628a Mon Sep 17 00:00:00 2001 From: Juke Date: Sat, 19 Feb 2011 19:41:44 +0100 Subject: [PATCH] new backend lefigaro backend lefigaro --- .../backends/inrocks/pages/genericArticle.py | 4 +- weboob/backends/lefigaro/__init__.py | 19 ++++ weboob/backends/lefigaro/backend.py | 100 ++++++++++++++++++ weboob/backends/lefigaro/browser.py | 34 ++++++ weboob/backends/lefigaro/pages/__init__.py | 0 weboob/backends/lefigaro/pages/article.py | 56 ++++++++++ .../backends/lefigaro/pages/genericArticle.py | 72 +++++++++++++ weboob/backends/lefigaro/pages/simple.py | 27 +++++ weboob/backends/lefigaro/tools.py | 31 ++++++ .../minutes20/pages/genericArticle.py | 4 +- 10 files changed, 343 insertions(+), 4 deletions(-) create mode 100644 weboob/backends/lefigaro/__init__.py create mode 100644 weboob/backends/lefigaro/backend.py create mode 100644 weboob/backends/lefigaro/browser.py create mode 100644 weboob/backends/lefigaro/pages/__init__.py create mode 100644 weboob/backends/lefigaro/pages/article.py create mode 100644 weboob/backends/lefigaro/pages/genericArticle.py create mode 100644 weboob/backends/lefigaro/pages/simple.py create mode 100644 weboob/backends/lefigaro/tools.py diff --git a/weboob/backends/inrocks/pages/genericArticle.py b/weboob/backends/inrocks/pages/genericArticle.py index 0bcef723..e2cc4ba1 100644 --- a/weboob/backends/inrocks/pages/genericArticle.py +++ b/weboob/backends/inrocks/pages/genericArticle.py @@ -33,7 +33,7 @@ class Article(object): class GenericNewsPage(BasePage): __element_body = NotImplementedError __article = Article - __element_title_selector = "h1" + element_title_selector = "h1" main_div = NotImplementedError element_body_selector = NotImplementedError element_author_selector = NotImplementedError @@ -50,7 +50,7 @@ class GenericNewsPage(BasePage): def get_title(self): return select( self.main_div, - self.__element_title_selector, + self.element_title_selector, 1).text_content().strip() def 
get_element_body(self): diff --git a/weboob/backends/lefigaro/__init__.py b/weboob/backends/lefigaro/__init__.py new file mode 100644 index 00000000..f9e5c9d2 --- /dev/null +++ b/weboob/backends/lefigaro/__init__.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2011 Julien Hebert +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +from .backend import NewspaperFigaroBackend +__all__ = ['NewspaperFigaroBackend'] diff --git a/weboob/backends/lefigaro/backend.py b/weboob/backends/lefigaro/backend.py new file mode 100644 index 00000000..e48f7f6d --- /dev/null +++ b/weboob/backends/lefigaro/backend.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2011 Julien Hebert +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
"backend for http://www.lefigaro.fr"

# python2.5 compatibility
from __future__ import with_statement

from weboob.capabilities.messages import ICapMessages, Message, Thread
from weboob.tools.backend import BaseBackend
from weboob.tools.newsfeed import Newsfeed
from .tools import url2id
from .browser import NewspaperFigaroBrowser


class NewspaperFigaroBackend(BaseBackend, ICapMessages):
    """Messages backend for the lefigaro.fr news website.

    Every entry of RSS_FEED is exposed as a Thread; the thread's root
    Message carries the article fetched through the browser. Threads that
    were marked as read are remembered under the 'seen' storage key.
    """
    MAINTAINER = 'Julien Hebert'
    EMAIL = 'juke@free.fr'
    VERSION = '0.6'
    LICENSE = 'GPLv3'
    STORAGE = {'seen': {}}
    NAME = 'lefigaro'
    DESCRIPTION = u'Lefigaro French news website'
    BROWSER = NewspaperFigaroBrowser
    RSS_FEED = 'http://rss.lefigaro.fr/lefigaro/laune?format=xml'

    def get_thread(self, _id):
        """Fetch one article and return it as a Thread with a root Message.

        _id may be a Thread instance (it is then filled in place and
        returned) or a thread id, in which case a new Thread is created.
        The root message is flagged IS_UNREAD unless the thread id is
        present in the 'seen' storage.
        """
        if isinstance(_id, Thread):
            thread = _id
            _id = thread.id
        else:
            thread = None

        with self.browser:
            content = self.browser.get_content(_id)

        if thread is None:
            thread = Thread(_id)

        flags = Message.IS_HTML
        if thread.id not in self.storage.get('seen', default={}):
            flags |= Message.IS_UNREAD
        thread.title = content.title
        # Keep a date that is already set on the thread (iter_threads sets
        # it from the feed entry); only fall back to the article's date.
        if not thread.date:
            thread.date = content.date

        thread.root = Message(
            thread=thread,
            id=0,
            title=content.title,
            sender=content.author,
            receivers=None,
            date=thread.date,
            parent=None,
            content=content.body,
            signature='URL: %s' % content.url,
            flags=flags,
            children=[])
        return thread

    def iter_threads(self):
        """Yield one Thread (id, title and date only) per RSS_FEED entry."""
        for article in Newsfeed(self.RSS_FEED, url2id).iter_entries():
            thread = Thread(article.id)
            thread.title = article.title
            thread.date = article.datetime
            yield thread

    def fill_thread(self, thread):
        """Fill *thread* with its article content; see get_thread()."""
        return self.get_thread(thread)

    def iter_unread_messages(self, thread=None):
        """Yield every message whose IS_UNREAD flag is set.

        When *thread* is given (a Thread or a thread id, as accepted by
        get_thread) only that thread is inspected; otherwise all threads of
        the feed are. Previously the argument was overwritten by the loop
        variable and therefore silently ignored.
        """
        if thread is not None:
            candidates = [thread]
        else:
            candidates = self.iter_threads()

        for candidate in candidates:
            # fill_thread() returns the filled Thread, including when it
            # was handed a bare id.
            filled = self.fill_thread(candidate)
            for msg in filled.iter_all_messages():
                if msg.flags & msg.IS_UNREAD:
                    yield msg

    def set_message_read(self, message):
        """Record *message* as read and persist the storage.

        Message ids are kept in a list stored at
        'seen' / <thread id> / 'comments'. An id that is already recorded
        is not appended a second time.
        """
        thread_id = message.thread.id
        seen_ids = self.storage.get('seen', thread_id, 'comments', default=[])
        if message.id not in seen_ids:
            # Build a new list rather than mutating the (possibly shared)
            # default / stored object.
            seen_ids = seen_ids + [message.id]
        self.storage.set('seen', thread_id, 'comments', seen_ids)
        self.storage.save()


# ---- weboob/backends/lefigaro/browser.py (new file, mode 100644) ----
# -*- coding: utf-8 -*-

# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

from .pages.article import ArticlePage
from weboob.tools.browser import BaseBrowser


class NewspaperFigaroBrowser(BaseBrowser):
    """Browser for lefigaro.fr; every URL is handled by ArticlePage."""
    PAGES = {
        '.*': ArticlePage,
    }

    def is_logged(self):
        """Always False: this browser never reports a logged-in state."""
        return False

    def get_content(self, _id):
        """Load *_id* as a URL and return the article built by the page.

        The id is used unchanged as the location to open, and is passed on
        to the page's get_article().
        """
        url = _id
        self.location(url)
        return self.page.get_article(_id)


# ---- weboob/backends/lefigaro/pages/__init__.py (new, empty file) ----

# ---- weboob/backends/lefigaro/pages/article.py (new file, mode 100644) ----
# Module docstring: "ArticlePage object for lefigaro"
# (the patch text says "inrocks", which looks like a copy-paste leftover)
# -*- coding: utf-8 -*-

# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

from weboob.tools.parsers.lxmlparser import select, SelectElementException
from .genericArticle import GenericNewsPage


def try_remove(base_element, selector):
    """Best-effort removal of the element matching *selector*.

    Nothing happens when the selection fails (SelectElementException) or
    when remove() rejects the element (ValueError).
    """
    try:
        unwanted = select(base_element, selector, 1)
        base_element.remove(unwanted)
    except (SelectElementException, ValueError):
        pass


class ArticlePage(GenericNewsPage):
    "ArticlePage object for lefigaro"

    # Selectors whose elements must be present in the body and are removed.
    # The title selector is prepended at call time (it is an attribute).
    _MANDATORY_SELECTORS = ("div.infos", "#toolsbar")
    # Selectors whose elements are removed only if they can be.
    _OPTIONAL_SELECTORS = (
        "div.photo",
        "div.art_bandeau_bottom",
        "div.view",
        "span.auteur_long",
    )

    def on_loaded(self):
        """Bind the document root and the lefigaro-specific selectors."""
        self.main_div = self.document.getroot()
        self.element_author_selector = "div.name>span"
        self.element_body_selector = "#article"

    def get_body(self):
        """Return the article body serialized by the browser's parser.

        The body element is cleaned first: the title, "div.infos" and
        "#toolsbar" elements are removed, several optional decorations are
        removed when present, the selected script element is dropped, the
        first element of class "texte" is unwrapped and the body element
        itself is renamed to a "div".
        """
        body = self.get_element_body()

        # Select everything before removing anything, so a failing
        # selection leaves the tree untouched.
        selectors = (self.element_title_selector,) + self._MANDATORY_SELECTORS
        mandatory = [select(body, selector, 1) for selector in selectors]
        script = select(body, "script", 1)

        for element in mandatory:
            body.remove(element)

        for selector in self._OPTIONAL_SELECTORS:
            try_remove(body, selector)

        script.drop_tree()
        text_wrapper = body.find_class("texte")[0]
        text_wrapper.drop_tag()
        body.tag = "div"
        return self.browser.parser.tostring(body)


# ---- weboob/backends/lefigaro/pages/genericArticle.py (new file, mode 100644) ----
# -*- coding: utf-8 -*-

# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select, SelectElementException


class NoAuthorElement(SelectElementException):
    """Raised by get_element_author() when the author selection fails."""


class Article(object):
    """Plain holder for the fields extracted from one article page."""

    def __init__(self, browser, _id):
        self.id = _id
        self.browser = browser
        # Text fields start empty; GenericNewsPage.get_article() fills
        # them. `date` is left as None here.
        self.url = u''
        self.title = u''
        self.author = u''
        self.body = u''
        self.date = None


class GenericNewsPage(BasePage):
    """Base page extracting title, author and body through selectors.

    Subclasses are expected to provide `main_div` and the body / author
    selectors (the pages in this backend do so in on_loaded()).
    """
    __element_body = NotImplementedError
    __article = Article
    element_title_selector = "h1"
    # Placeholders to be replaced by subclasses.
    main_div = NotImplementedError
    element_body_selector = NotImplementedError
    element_author_selector = NotImplementedError

    def get_element_body(self):
        """Select the body element inside main_div."""
        return select(self.main_div, self.element_body_selector, 1)

    def get_element_author(self):
        """Select the author element; raise NoAuthorElement on failure."""
        try:
            author_element = select(
                self.main_div, self.element_author_selector, 1)
        except SelectElementException:
            raise NoAuthorElement()
        return author_element

    def get_body(self):
        """Return the body element serialized by the browser's parser."""
        body_element = self.get_element_body()
        return self.browser.parser.tostring(body_element)

    def get_author(self):
        """Return the stripped author text, or None without author element."""
        try:
            author_element = self.get_element_author()
        except NoAuthorElement:
            return None
        return author_element.text_content().strip()

    def get_title(self):
        """Return the stripped text of the title element."""
        title_element = select(
            self.main_div, self.element_title_selector, 1)
        return title_element.text_content().strip()

    def get_article(self, id):
        """Build an Article for *id* from the current page.

        The fields are read in a fixed order (author, title, url, body):
        a subclass's get_body() may alter the document, so the body is
        extracted last.
        """
        article = Article(self.browser, id)
        article.author = self.get_author()
        article.title = self.get_title()
        article.url = self.url
        article.body = self.get_body()
        return article


# ---- weboob/backends/lefigaro/pages/simple.py (new file, mode 100644) ----
# Module docstring in the patch: "ArticlePage object for minutes20"
# -*- coding: utf-8 -*-

# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

from .genericArticle import GenericNewsPage


class SimplePage(GenericNewsPage):
    """Generic page variant using only the default extraction methods.

    NOTE(review): the docstrings in the patch mention minutes20 and the
    author selector below looks carried over from that backend -- confirm
    it matches lefigaro pages.
    """

    def on_loaded(self):
        """Bind the document root and the selectors used by this page."""
        self.main_div = self.document.getroot()
        self.element_author_selector = "div.mna-signature"
        self.element_body_selector = "#article"


# ---- weboob/backends/lefigaro/tools.py (new file, mode 100644) ----
# -*- coding: utf-8 -*-

# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.


import re

# Dotted id form "<word>.<digits>.<rest>". Compiled once at import time and
# written as a raw string so that "\w" is a regex class, not a (deprecated)
# string escape. The separators are deliberately left as the original
# unescaped "." so the set of accepted ids is unchanged.
_DOTTED_ID_RE = re.compile(r"(\w+).([0-9]+).(.*$)")

# url2id() returns URLs unchanged, so ids produced by it start with one of
# these prefixes.
_URL_PREFIXES = ("http://", "https://")


def id2url(_id):
    """Return the URL designated by *_id*.

    An id that already is an http(s) URL -- which is what url2id() below
    produces -- is returned unchanged, making id2url() the inverse of
    url2id(). Such ids previously raised ValueError.

    A dotted id "<word>.<digits>.<rest>" is still expanded as before.
    NOTE(review): that expansion targets www.20minutes.fr, which looks like
    a leftover from the minutes20 backend; kept for backward compatibility.

    Raises ValueError when the id has neither form.
    """
    if _id.startswith(_URL_PREFIXES):
        return _id

    match = _DOTTED_ID_RE.match(_id)
    if match is None:
        raise ValueError("id doesn't match")
    return 'http://www.20minutes.fr/%s/%s/%s' % match.groups()


def url2id(url):
    """Return the id of *url*; in this backend the URL itself is the id."""
    return url


# ---- weboob/backends/minutes20/pages/genericArticle.py (modified) ----
# The patch also changes this existing file (index 0bcef723..e2cc4ba1), in
# two hunks (@@ -33,7 +33,7 @@ and @@ -50,7 +50,7 @@) of GenericNewsPage:
#   - class attribute `__element_title_selector = "h1"` is renamed to
#     `element_title_selector = "h1"`;
#   - get_title() reads `self.element_title_selector` instead of
#     `self.__element_title_selector`.