inrocks backend

2011-02-16 03:25:18 +01:00 · 2011-02-16 03:25:18 +01:00 · f05d4b2829
commit f05d4b2829
parent 08252358eb
5 changed files with 101 additions and 32 deletions
--- a/weboob/backends/inrocks/backend.py
+++ b/weboob/backends/inrocks/backend.py
@ -14,7 +14,7 @@
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-"backend for http://20minutes.fr"
+"backend for http://www.lesinrocks.com"
 # python2.5 compatibility
 from __future__ import with_statement
@ -76,7 +76,7 @@ class NewspaperInrocksBackend(BaseBackend, ICapMessages):
        return thread
    def iter_threads(self):
-        for article in Newsfeed('http://www.20minutes.fr/rss/20minutes.xml', 
+        for article in Newsfeed('http://www.lesinrocks.com/fileadmin/rss/actus.xml', 
            url2id).iter_entries():
            thread = Thread(article.id)
            thread.title =  article.title
--- a/weboob/backends/inrocks/browser.py
+++ b/weboob/backends/inrocks/browser.py
@ -22,7 +22,9 @@ __all__ = ['NewspaperInrocksBrowser']
 class NewspaperInrocksBrowser(BaseBrowser):
    PAGES = {
-             'http://www.20minutes.fr/article/?.*': ArticlePage,
+             'http://www.lesinrocks.com/actualite/actu-article/t/60121/date/2011-02-15/article/accuse-davoir-participe-a-une-mutinerie-un-detenu-porte-plainte/': ArticlePage,
             '.*': ArticlePage,
            }
@ -30,5 +32,6 @@ class NewspaperInrocksBrowser(BaseBrowser):
        return False
    def get_content(self, _id):
-        self.location(id2url(_id))
+        url = _id
        self.location(url)
        return self.page.article
--- a/weboob/backends/inrocks/pages/article.py
+++ b/weboob/backends/inrocks/pages/article.py
@ -17,27 +17,23 @@
 from weboob.tools.parsers.lxmlparser import select, SelectElementException
-from .minutes20 import Minutes20Page, NoAuthorElement
+from .inrocks import InrocksPage
-class ArticlePage(Minutes20Page):
+def try_remove(base_element, selector):
    def set_body(self):
        self.element_body = select(self.main_div, "div.mna-body", 1)
        element_tools = select(self.element_body, "div.mna-tools", 1)
    try :
-            self.element_body.remove(element_tools)
+        base_element.remove(select(base_element, selector, 1 ))
        except ValueError:
            pass
        try:
            self.element_body.remove(
                select(self.element_body, "div.mna-comment-call", 1))
    except SelectElementException:
        pass
-        except ValueError:
+
-            pass
+class ArticlePage(InrocksPage):
-        try:
+    def set_body(self):
-            self.element_body.remove(self.get_element_author())
+        self.element_body = select(self.main_div, "div.maincol", 1)
-        except NoAuthorElement:
+        try_remove(self.element_body, "div.sidebar")
-            pass
+        details = select(self.element_body, "div.details", 1)
-        except ValueError:
+        try_remove(details, "div.footer")
-            pass
+        header = select(self.element_body, "div.header", 1)
        for selector in ["h1", "div.date", "div.news-single-img", 
                         "div.metas_img"]:
            try_remove(header, selector)
        self.article.body = self.browser.parser.tostring(self.element_body)
--- a/weboob/backends/inrocks/pages/inrocks.py
+++ b/weboob/backends/inrocks/pages/inrocks.py
@ -0,0 +1,68 @@
 # -*- coding: utf-8 -*-
 # Copyright(C) 2011  Julien Hebert
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, version 3 of the License.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 from weboob.tools.browser import BasePage
 from weboob.tools.parsers.lxmlparser import select, SelectElementException
 from weboob.backends.inrocks.tools import url2id
 __all__ = ['InrocksPage', 'Article', 'NoAuthorElement']
 class NoAuthorElement(Exception):
    """Signal that no author element could be selected on the page."""
 class Article(object):
    """Plain container for the data collected about one article.

    Attributes set by the constructor:
      browser -- the object passed in as ``browser``, stored as-is
      id      -- the identifier passed in as ``_id``
      title, body, url, author -- empty unicode strings
      date    -- None
    """

    def __init__(self, browser, _id):
        self.browser, self.id = browser, _id
        # Every textual field starts out as an empty unicode string.
        for text_field in ('title', 'body', 'url', 'author'):
            setattr(self, text_field, u'')
        self.date = None
 class InrocksPage(BasePage):
    """Page that builds an ``Article`` from the loaded document.

    ``on_loaded`` replaces the ``main_div``, ``article`` and
    ``element_author_selector`` placeholders below with per-instance
    values; ``element_body`` is not assigned anywhere in this class.
    """

    # Class-level placeholders (exception classes / the Article class
    # itself), kept exactly as declared.
    main_div = NotImplementedError
    element_body = NotImplementedError
    article = Article
    element_author_selector = ValueError

    def set_author(self):
        """Store the stripped text of the author element on the article.

        The article's author is left untouched when ``get_element_author``
        raises ``NoAuthorElement``.
        """
        try:
            author_element = self.get_element_author()
            self.article.author = author_element.text_content().strip()
        except NoAuthorElement:
            pass

    def get_element_author(self):
        """Return the element matching ``element_author_selector``.

        Raises ``NoAuthorElement`` when the selection raises
        ``SelectElementException``.
        """
        # NOTE(review): the trailing 1 presumably asks ``select`` for exactly
        # one match -- confirm against weboob.tools.parsers.lxmlparser.
        try:
            author_element = select(self.main_div,
                                    self.element_author_selector, 1)
        except SelectElementException:
            raise NoAuthorElement()
        return author_element

    def set_body(self):
        """Serialize the ``div.mna-body`` element into the article body."""
        body_element = select(self.main_div, "div.mna-body", 1)
        self.article.body = self.browser.parser.tostring(body_element)

    def on_loaded(self):
        """Create the article and fill in its title, url, author and body."""
        # NOTE(review): presumably called by the browser framework once the
        # document has been parsed -- confirm against BasePage.
        article_id = url2id(self.url)
        self.article = Article(self.browser, article_id)
        self.main_div = self.document.getroot()
        title_element = select(self.main_div, "h1", 1)
        self.article.title = title_element.text_content()
        self.article.url = self.url
        self.element_author_selector = "div.name>span"
        self.set_author()
        self.set_body()
--- a/weboob/backends/inrocks/tools.py
+++ b/weboob/backends/inrocks/tools.py
@ -18,12 +18,14 @@
 import re
 def id2url(_id):
-    regexp2 = re.compile("(\w+).(\w+).(.*$)")
+    regexp2 = re.compile("(\w+).([0-9]+).(.*$)")
    match = regexp2.match(_id)
    if match:
        return 'http://www.20minutes.fr/%s/%s/%s' % (   match.group(1),
                                                        match.group(2),
                                                        match.group(3))
    else:
        raise ValueError("id doesn't match")
 def url2id(url):
-    regexp = re.compile("http://www.20minutes.fr/(\w+)/([0-9]+)/(.*$)")
+    return url
    match = regexp.match(url)
    return '%s.%d.%s' % (match.group(1), int(match.group(2)), match.group(3))