diff --git a/weboob/backends/inrocks/backend.py b/weboob/backends/inrocks/backend.py index 6d5fa009..1f7816bc 100644 --- a/weboob/backends/inrocks/backend.py +++ b/weboob/backends/inrocks/backend.py @@ -14,7 +14,7 @@ # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -"backend for http://20minutes.fr" +"backend for http://www.lesinrocks.com" # python2.5 compatibility from __future__ import with_statement @@ -76,7 +76,7 @@ class NewspaperInrocksBackend(BaseBackend, ICapMessages): return thread def iter_threads(self): - for article in Newsfeed('http://www.20minutes.fr/rss/20minutes.xml', + for article in Newsfeed('http://www.lesinrocks.com/fileadmin/rss/actus.xml', url2id).iter_entries(): thread = Thread(article.id) thread.title = article.title diff --git a/weboob/backends/inrocks/browser.py b/weboob/backends/inrocks/browser.py index 7ebacf80..b56b003a 100644 --- a/weboob/backends/inrocks/browser.py +++ b/weboob/backends/inrocks/browser.py @@ -22,7 +22,9 @@ __all__ = ['NewspaperInrocksBrowser'] class NewspaperInrocksBrowser(BaseBrowser): PAGES = { - 'http://www.20minutes.fr/article/?.*': ArticlePage, + 'http://www.lesinrocks.com/actualite/actu-article/t/60121/date/2011-02-15/article/accuse-davoir-participe-a-une-mutinerie-un-detenu-porte-plainte/': ArticlePage, + '.*': ArticlePage, + } @@ -30,5 +32,6 @@ class NewspaperInrocksBrowser(BaseBrowser): return False def get_content(self, _id): - self.location(id2url(_id)) + url = _id + self.location(url) return self.page.article diff --git a/weboob/backends/inrocks/pages/article.py b/weboob/backends/inrocks/pages/article.py index 9e850424..98843e1d 100644 --- a/weboob/backends/inrocks/pages/article.py +++ b/weboob/backends/inrocks/pages/article.py @@ -17,27 +17,23 @@ from weboob.tools.parsers.lxmlparser import select, SelectElementException -from .minutes20 import Minutes20Page, NoAuthorElement +from .inrocks import InrocksPage -class ArticlePage(Minutes20Page): +def try_remove(base_element, selector): + try : + base_element.remove(select(base_element, selector, 1 )) + except SelectElementException: + pass + +class ArticlePage(InrocksPage): def set_body(self): - self.element_body = select(self.main_div, "div.mna-body", 1) - element_tools = select(self.element_body, "div.mna-tools", 1) - try : - self.element_body.remove(element_tools) - except ValueError: - pass - try: - self.element_body.remove( - select(self.element_body, "div.mna-comment-call", 1)) - except SelectElementException: - pass - except ValueError: - pass - try: - self.element_body.remove(self.get_element_author()) - except NoAuthorElement: - pass - except ValueError: - pass + self.element_body = select(self.main_div, "div.maincol", 1) + try_remove(self.element_body, "div.sidebar") + details = select(self.element_body, "div.details", 1) + try_remove(details, "div.footer") + header = select(self.element_body, "div.header", 1) + for selector in ["h1", "div.date", "div.news-single-img", + "div.metas_img"]: + try_remove(header, selector) + self.article.body = self.browser.parser.tostring(self.element_body) diff --git a/weboob/backends/inrocks/pages/inrocks.py b/weboob/backends/inrocks/pages/inrocks.py new file mode 100644 index 00000000..71551f79 --- /dev/null +++ b/weboob/backends/inrocks/pages/inrocks.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2011 Julien Hebert +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +from weboob.tools.browser import BasePage +from weboob.tools.parsers.lxmlparser import select, SelectElementException +from weboob.backends.inrocks.tools import url2id +__all__ = ['InrocksPage', 'Article', 'NoAuthorElement'] + +class NoAuthorElement(Exception): + pass + +class Article(object): + def __init__(self, browser, _id): + self.browser = browser + self.id = _id + self.title = u'' + self.body = u'' + self.url = u'' + self.author = u'' + self.date = None + +class InrocksPage(BasePage): + main_div = NotImplementedError + element_body = NotImplementedError + article = Article + element_author_selector = ValueError + + def set_author(self): + try: + self.article.author = self.get_element_author().text_content().strip() + except NoAuthorElement: + pass + + def get_element_author(self): + try : + return select(self.main_div, self.element_author_selector, 1) + except SelectElementException: + raise NoAuthorElement() + + def set_body(self): + self.article.body = self.browser.parser.tostring(select(self.main_div, + "div.mna-body", + 1)) + + + def on_loaded(self): + self.article = Article(self.browser, url2id(self.url) ) + self.main_div = self.document.getroot() + self.article.title = select(self.main_div, "h1", 1).text_content() + self.article.url = self.url + self.element_author_selector = "div.name>span" + self.set_author() + self.set_body() + + diff --git a/weboob/backends/inrocks/tools.py b/weboob/backends/inrocks/tools.py index 10ccfd7c..e0a157a2 100644 --- a/weboob/backends/inrocks/tools.py +++ b/weboob/backends/inrocks/tools.py @@ -18,12 +18,14 @@ import re def id2url(_id): - regexp2 = re.compile("(\w+).(\w+).(.*$)") + regexp2 = re.compile("(\w+).([0-9]+).(.*$)") match = regexp2.match(_id) - return 'http://www.20minutes.fr/%s/%s/%s' % ( match.group(1), - match.group(2), - match.group(3)) + if match: + return 'http://www.20minutes.fr/%s/%s/%s' % ( match.group(1), + match.group(2), + match.group(3)) + else: + raise ValueError("id doesn't match") + def url2id(url): - regexp = re.compile("http://www.20minutes.fr/(\w+)/([0-9]+)/(.*$)") - match = regexp.match(url) - return '%s.%d.%s' % (match.group(1), int(match.group(2)), match.group(3)) + return url