new backend lefigaro

backend lefigaro
2011-02-19 19:41:44 +01:00 · 2011-02-19 19:41:44 +01:00 · defdfa6af1
commit defdfa6af1
parent caa5658c63
10 changed files with 343 additions and 4 deletions
--- a/weboob/backends/inrocks/pages/genericArticle.py
+++ b/weboob/backends/inrocks/pages/genericArticle.py
@ -33,7 +33,7 @@ class Article(object):
 class GenericNewsPage(BasePage):
    __element_body = NotImplementedError
    __article = Article
-    __element_title_selector  = "h1"
+    element_title_selector  = "h1"
    main_div = NotImplementedError
    element_body_selector = NotImplementedError
    element_author_selector = NotImplementedError
@ -50,7 +50,7 @@ class GenericNewsPage(BasePage):
    def get_title(self):
        return select(
            self.main_div,
-            self.__element_title_selector,
+            self.element_title_selector,
            1).text_content().strip()

    def get_element_body(self):
--- a/weboob/backends/lefigaro/init.py
+++ b/weboob/backends/lefigaro/init.py
@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011  Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+from .backend import NewspaperFigaroBackend
+__all__ = ['NewspaperFigaroBackend']
--- a/weboob/backends/lefigaro/backend.py
+++ b/weboob/backends/lefigaro/backend.py
@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011  Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+"backend for http://www.lefigaro.fr"
+
+# python2.5 compatibility
+from __future__ import with_statement
+
+from weboob.capabilities.messages import ICapMessages, Message, Thread
+from weboob.tools.backend import BaseBackend
+from weboob.tools.newsfeed import Newsfeed
+from .tools import url2id
+from .browser import NewspaperFigaroBrowser
+
+class NewspaperFigaroBackend(BaseBackend, ICapMessages):
+    """Messages backend that presents Le Figaro articles as threads.
+
+    Each entry of ``RSS_FEED`` is turned into one thread whose root message
+    carries the article fetched through the browser. Read state is kept in
+    the backend storage under the ``'seen'`` key.
+    """
+    NAME = 'lefigaro'
+    DESCRIPTION = u'Lefigaro French news website'
+    MAINTAINER = 'Julien Hebert'
+    EMAIL = 'juke@free.fr'
+    VERSION = '0.6'
+    LICENSE = 'GPLv3'
+    STORAGE = {'seen': {}}
+    BROWSER = NewspaperFigaroBrowser
+    RSS_FEED = 'http://rss.lefigaro.fr/lefigaro/laune?format=xml'
+
+    def get_thread(self, _id):
+        """Return the thread for one article, with its root message filled.
+
+        ``_id`` is either an existing Thread (completed in place and
+        returned) or an article id, which is handed to the browser as is.
+        """
+        thread = _id if isinstance(_id, Thread) else None
+        if thread is not None:
+            _id = thread.id
+
+        with self.browser:
+            content = self.browser.get_content(_id)
+
+        if not thread:
+            thread = Thread(_id)
+
+        # A thread is unread until its id shows up in the 'seen' storage.
+        seen = self.storage.get('seen', default={})
+        flags = Message.IS_HTML
+        if thread.id not in seen:
+            flags |= Message.IS_UNREAD
+
+        thread.title = content.title
+        # Keep a date that was already set on the thread (e.g. by
+        # iter_threads); fall back to the article's own date otherwise.
+        if not thread.date:
+            thread.date = content.date
+
+        root = Message(
+            thread=thread,
+            id=0,
+            title=content.title,
+            sender=content.author,
+            receivers=None,
+            date=thread.date,
+            parent=None,
+            content=content.body,
+            signature='URL: %s' % content.url,
+            flags=flags,
+            children=[])
+        thread.root = root
+        return thread
+
+    def iter_threads(self):
+        """Yield one Thread (id, title, date only) per entry of the RSS feed."""
+        feed = Newsfeed(self.RSS_FEED, url2id)
+        for entry in feed.iter_entries():
+            thread = Thread(entry.id)
+            thread.title = entry.title
+            thread.date = entry.datetime
+            yield thread
+
+    def fill_thread(self, thread):
+        """Complete ``thread`` by delegating to get_thread()."""
+        return self.get_thread(thread)
+
+    def iter_unread_messages(self, thread=None):
+        """Yield every message flagged IS_UNREAD across all feed threads.
+
+        The ``thread`` argument is not used: every thread of the feed is
+        filled and scanned.
+        """
+        for feed_thread in self.iter_threads():
+            self.fill_thread(feed_thread)
+            for msg in feed_thread.iter_all_messages():
+                if msg.flags & msg.IS_UNREAD:
+                    yield msg
+
+    def set_message_read(self, message):
+        """Append the message id to the stored list for its thread and save."""
+        thread_id = message.thread.id
+        read_ids = self.storage.get(
+            'seen', thread_id, 'comments', default=[])
+        self.storage.set(
+            'seen', thread_id, 'comments', read_ids + [message.id])
+        self.storage.save()
--- a/weboob/backends/lefigaro/browser.py
+++ b/weboob/backends/lefigaro/browser.py
@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011  Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+from .pages.article import ArticlePage
+from weboob.tools.browser import BaseBrowser
+
+
+
+class NewspaperFigaroBrowser(BaseBrowser):
+    """Browser used by the lefigaro backend to fetch article pages."""
+    # The catch-all pattern '.*' is mapped to ArticlePage.
+    PAGES = {'.*': ArticlePage}
+
+    def is_logged(self):
+        """Return False unconditionally."""
+        return False
+
+    def get_content(self, _id):
+        """Open ``_id`` as a URL and return the article built by the page."""
+        self.location(_id)
+        article = self.page.get_article(_id)
+        return article
--- a/weboob/backends/lefigaro/pages/init.py
+++ b/weboob/backends/lefigaro/pages/init.py
--- a/weboob/backends/lefigaro/pages/article.py
+++ b/weboob/backends/lefigaro/pages/article.py
@ -0,0 +1,56 @@
+"ArticlePage object for lefigaro"
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011  Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+from weboob.tools.parsers.lxmlparser import select, SelectElementException
+from .genericArticle import GenericNewsPage
+
+def try_remove(base_element, selector):
+    """Remove the element matching ``selector`` from ``base_element`` if possible.
+
+    Best effort: a SelectElementException or ValueError raised while
+    selecting or removing the element is swallowed and nothing is removed.
+    """
+    try:
+        target = select(base_element, selector, 1)
+        base_element.remove(target)
+    except (SelectElementException, ValueError):
+        pass
+
+class ArticlePage(GenericNewsPage):
+    "ArticlePage object for lefigaro"
+    def on_loaded(self):
+        "Set the search root (whole document) and the author/body selectors."
+        self.main_div = self.document.getroot()
+        self.element_author_selector    = "div.name>span"
+        self.element_body_selector      = "#article"
+
+    def get_body(self):
+        "Return the serialized article body after stripping non-content elements."
+        element_body = self.get_element_body()
+        # These four lookups are not guarded: unlike the try_remove() calls
+        # below, any error raised while selecting or removing them propagates.
+        h1          = select(element_body, self.element_title_selector, 1)
+        div_infos   = select(element_body, "div.infos", 1)
+        toolsbar    = select(element_body, "#toolsbar", 1)
+        # Selected before anything is removed from the tree.
+        el_script   = select(element_body, "script", 1)
+
+        element_body.remove(h1)
+        element_body.remove(div_infos)
+        element_body.remove(toolsbar)
+
+        # Optional elements: removed when present, ignored otherwise.
+        try_remove(element_body, "div.photo")
+        try_remove(element_body, "div.art_bandeau_bottom")
+        try_remove(element_body, "div.view")
+        try_remove(element_body, "span.auteur_long")
+
+        el_script.drop_tree()
+        # Only the first element with class "texte" is unwrapped; an
+        # IndexError is raised if there is none.
+        element_body.find_class("texte")[0].drop_tag()
+        # The body element is re-tagged as a plain <div> before serializing.
+        element_body.tag = "div"
+        return self.browser.parser.tostring(element_body)
+
+
--- a/weboob/backends/lefigaro/pages/genericArticle.py
+++ b/weboob/backends/lefigaro/pages/genericArticle.py
@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011  Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+from weboob.tools.browser import BasePage
+from weboob.tools.parsers.lxmlparser import select, SelectElementException
+
+class NoAuthorElement(SelectElementException):
+    """Raised when the author element cannot be selected on a page."""
+
+class Article(object):
+    """Plain container for the data scraped from one article page."""
+    def __init__(self, browser, _id):
+        """Store ``browser`` and ``_id``; every other field starts empty."""
+        # Text fields default to empty unicode strings and the date to None;
+        # the code that builds the article fills them in afterwards.
+        self.date = None
+        self.author = u''
+        self.url = u''
+        self.body = u''
+        self.title = u''
+        self.id = _id
+        self.browser = browser
+
+class GenericNewsPage(BasePage):
+    """Base page extracting title, author and body of a news article.
+
+    Subclasses provide ``main_div`` (the element searched) and the body and
+    author selectors; the title selector defaults to ``"h1"``.
+    """
+    __element_body = NotImplementedError
+    __article = Article
+    element_title_selector  = "h1"
+    main_div = NotImplementedError
+    element_body_selector = NotImplementedError
+    element_author_selector = NotImplementedError
+
+    def get_body(self):
+        """Return the body element serialized by the browser's parser."""
+        body_element = self.get_element_body()
+        return self.browser.parser.tostring(body_element)
+
+    def get_author(self):
+        """Return the stripped author text, or None when there is no author element."""
+        try:
+            author_element = self.get_element_author()
+            return author_element.text_content().strip()
+        except NoAuthorElement:
+            return None
+
+    def get_title(self):
+        """Return the stripped text of the title element."""
+        title_element = select(
+            self.main_div, self.element_title_selector, 1)
+        return title_element.text_content().strip()
+
+    def get_element_body(self):
+        """Select the body element inside ``main_div``."""
+        return select(self.main_div, self.element_body_selector, 1)
+
+    def get_element_author(self):
+        """Select the author element; raise NoAuthorElement if selection fails."""
+        try:
+            author_element = select(
+                self.main_div, self.element_author_selector, 1)
+        except SelectElementException:
+            raise NoAuthorElement()
+        return author_element
+
+    def get_article(self, id):
+        """Build an Article for ``id`` from the author, title, url and body of this page."""
+        article = Article(self.browser, id)
+        # Author and title are read before the body: a subclass's get_body()
+        # may remove elements (such as the title) from the tree.
+        article.author = self.get_author()
+        article.title = self.get_title()
+        article.url = self.url
+        article.body = self.get_body()
+
+        return article
--- a/weboob/backends/lefigaro/pages/simple.py
+++ b/weboob/backends/lefigaro/pages/simple.py
@ -0,0 +1,27 @@
+"SimplePage object for lefigaro"
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011  Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+from .genericArticle import GenericNewsPage
+
+class SimplePage(GenericNewsPage):
+    "SimplePage object for lefigaro"
+    def on_loaded(self):
+        "Set the body/author selectors and search the whole document."
+        self.element_body_selector = "#article"
+        self.element_author_selector = "div.mna-signature"
+        self.main_div = self.document.getroot()
+
--- a/weboob/backends/lefigaro/tools.py
+++ b/weboob/backends/lefigaro/tools.py
@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011  Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+
+import re
+def id2url(_id):
+    """Turn a three-part article id into an article URL.
+
+    The id must look like ``<word><sep><digits><sep><rest>``; the three
+    captured parts become the path components of the returned URL.
+
+    NOTE(review): the URL prefix points at www.20minutes.fr although this
+    module lives in the lefigaro backend, whose url2id() returns URLs
+    unchanged. This helper looks carried over from another backend;
+    confirm before relying on it.
+
+    Raises ValueError when ``_id`` does not have the expected shape.
+    """
+    # Raw string: "\w" is not a valid escape in a plain string literal.
+    # The separators stay unescaped dots, exactly as in the original
+    # pattern, so any single character is accepted between the parts.
+    regexp2 = re.compile(r"(\w+).([0-9]+).(.*$)")
+    match = regexp2.match(_id)
+    if not match:
+        raise ValueError("id doesn't match")
+    return 'http://www.20minutes.fr/%s/%s/%s' % match.groups()
+
+def url2id(url):
+    """Return ``url`` unchanged: in this backend the article URL is the id."""
+    return url
--- a/weboob/backends/minutes20/pages/genericArticle.py
+++ b/weboob/backends/minutes20/pages/genericArticle.py
@ -33,7 +33,7 @@ class Article(object):
 class GenericNewsPage(BasePage):
    __element_body = NotImplementedError
    __article = Article
-    __element_title_selector  = "h1"
+    element_title_selector  = "h1"
    main_div = NotImplementedError
    element_body_selector = NotImplementedError
    element_author_selector = NotImplementedError
@ -50,7 +50,7 @@ class GenericNewsPage(BasePage):
    def get_title(self):
        return select(
            self.main_div,
-            self.__element_title_selector,
+            self.element_title_selector,
            1).text_content().strip()

    def get_element_body(self):