From 2ebe8012bdfb74c7cc16309816afcd06595d851e Mon Sep 17 00:00:00 2001 From: Juke Date: Thu, 17 Feb 2011 04:06:01 +0100 Subject: [PATCH] uniform minutes20 and inrocks --- weboob/backends/inrocks/__init__.py | 2 - weboob/backends/inrocks/backend.py | 16 +++----- weboob/backends/inrocks/browser.py | 3 +- weboob/backends/inrocks/pages/article.py | 11 +++-- weboob/backends/inrocks/pages/inrocks.py | 43 +++++++++++--------- weboob/backends/minutes20/__init__.py | 2 - weboob/backends/minutes20/backend.py | 18 +++----- weboob/backends/minutes20/browser.py | 6 +-- weboob/backends/minutes20/pages/article.py | 34 ++++++---------- weboob/backends/minutes20/pages/minutes20.py | 42 +++++++++---------- 10 files changed, 78 insertions(+), 99 deletions(-) diff --git a/weboob/backends/inrocks/__init__.py b/weboob/backends/inrocks/__init__.py index 8fe34caf..775a3fc9 100644 --- a/weboob/backends/inrocks/__init__.py +++ b/weboob/backends/inrocks/__init__.py @@ -15,7 +15,5 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - from .backend import NewspaperInrocksBackend - __all__ = ['NewspaperInrocksBackendBackend'] diff --git a/weboob/backends/inrocks/backend.py b/weboob/backends/inrocks/backend.py index 1f7816bc..7a9ef487 100644 --- a/weboob/backends/inrocks/backend.py +++ b/weboob/backends/inrocks/backend.py @@ -21,25 +21,22 @@ from __future__ import with_statement from weboob.capabilities.messages import ICapMessages, Message, Thread from weboob.tools.backend import BaseBackend - -from .browser import NewspaperInrocksBrowser from weboob.tools.newsfeed import Newsfeed from .tools import url2id +from .browser import NewspaperInrocksBrowser __all__ = ['NewspaperInrocksBackend'] - - - class NewspaperInrocksBackend(BaseBackend, ICapMessages): - NAME = 'inrocks' MAINTAINER = 'Julien Hebert' EMAIL = 'juke@free.fr' VERSION = '0.6' LICENSE = 'GPLv3' - DESCRIPTION = u'Inrock French news website' STORAGE = {'seen': {}} + NAME = 'inrocks' + DESCRIPTION = u'Inrock French news website' BROWSER = NewspaperInrocksBrowser + RSS_FEED = 'http://www.lesinrocks.com/fileadmin/rss/actus.xml' def get_thread(self, _id): if isinstance(_id, Thread): @@ -54,7 +51,6 @@ class NewspaperInrocksBackend(BaseBackend, ICapMessages): if not thread: thread = Thread(_id) - flags = Message.IS_HTML if not thread.id in self.storage.get('seen', default={}): flags |= Message.IS_UNREAD @@ -71,13 +67,13 @@ class NewspaperInrocksBackend(BaseBackend, ICapMessages): date=thread.date, parent=None, content=content.body, + signature='URL: %s' % content.url, flags=flags, children= []) return thread def iter_threads(self): - for article in Newsfeed('http://www.lesinrocks.com/fileadmin/rss/actus.xml', - url2id).iter_entries(): + for article in Newsfeed(self.RSS_FEED, url2id).iter_entries(): thread = Thread(article.id) thread.title = article.title thread.date = article.datetime diff --git a/weboob/backends/inrocks/browser.py b/weboob/backends/inrocks/browser.py index b56b003a..70915bee 100644 --- a/weboob/backends/inrocks/browser.py +++ b/weboob/backends/inrocks/browser.py @@ -18,8 +18,10 @@ from .pages.article import ArticlePage from weboob.tools.browser import BaseBrowser from .tools import id2url + __all__ = ['NewspaperInrocksBrowser'] + class NewspaperInrocksBrowser(BaseBrowser): PAGES = { 'http://www.lesinrocks.com/actualite/actu-article/t/60121/date/2011-02-15/article/accuse-davoir-participe-a-une-mutinerie-un-detenu-porte-plainte/': ArticlePage, @@ -27,7 +29,6 @@ class NewspaperInrocksBrowser(BaseBrowser): } - def is_logged(self): return False diff --git a/weboob/backends/inrocks/pages/article.py b/weboob/backends/inrocks/pages/article.py index 3140e4c6..5ded0b8f 100644 --- a/weboob/backends/inrocks/pages/article.py +++ b/weboob/backends/inrocks/pages/article.py @@ -1,3 +1,4 @@ +"ArticlePage object for inrocks" # -*- coding: utf-8 -*- # Copyright(C) 2011 Julien Hebert @@ -15,7 +16,6 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - from weboob.tools.parsers.lxmlparser import select, SelectElementException from .inrocks import InrocksPage @@ -26,8 +26,8 @@ def try_remove(base_element, selector): pass class ArticlePage(InrocksPage): - def set_body(self): - self.element_body = select(self.main_div, "div.maincol", 1) + "ArticlePage object for inrocks" + def get_body(self): try_remove(self.element_body, "div.sidebar") details = select(self.element_body, "div.details", 1) try_remove(details, "div.footer") @@ -35,5 +35,8 @@ class ArticlePage(InrocksPage): for selector in ["h1", "div.picture", "div.date", "div.news-single-img", "div.metas_img", "strong"]: try_remove(header, selector) + + return self.browser.parser.tostring(self.element_body) + + - self.article.body = self.browser.parser.tostring(self.element_body) diff --git a/weboob/backends/inrocks/pages/inrocks.py b/weboob/backends/inrocks/pages/inrocks.py index 71551f79..5042ce0f 100644 --- a/weboob/backends/inrocks/pages/inrocks.py +++ b/weboob/backends/inrocks/pages/inrocks.py @@ -37,32 +37,35 @@ class InrocksPage(BasePage): element_body = NotImplementedError article = Article element_author_selector = ValueError + element_title_selector = ValueError + element_body_selector = ValueError - def set_author(self): - try: - self.article.author = self.get_element_author().text_content().strip() - except NoAuthorElement: + def get_body(self): + return self.browser.parser.tostring(self.element_body) + + def get_author(self): + try : + return select(self.main_div, self.element_author_selector, 1).text_content().strip() + except SelectElementException: + #TODO: test nombre d'element en retour pass - def get_element_author(self): - try : - return select(self.main_div, self.element_author_selector, 1) - except SelectElementException: - raise NoAuthorElement() - - def set_body(self): - self.article.body = self.browser.parser.tostring(select(self.main_div, - "div.mna-body", - 1)) - + def get_title(self): + return select(self.main_div, self.element_title_selector, 1).text_content().strip() def on_loaded(self): self.article = Article(self.browser, url2id(self.url) ) self.main_div = self.document.getroot() - self.article.title = select(self.main_div, "h1", 1).text_content() - self.article.url = self.url - self.element_author_selector = "div.name>span" - self.set_author() - self.set_body() + + self.element_author_selector = "div.name>span" + self.element_title_selector = "h1" + self.element_body_selector = "div.maincol" + + self.element_body = select(self.main_div, self.element_body_selector, 1) + + self.article.author = self.get_author() + self.article.title = self.get_title() + self.article.url = self.url + self.article.body = self.get_body() diff --git a/weboob/backends/minutes20/__init__.py b/weboob/backends/minutes20/__init__.py index 91af6812..b1e19911 100644 --- a/weboob/backends/minutes20/__init__.py +++ b/weboob/backends/minutes20/__init__.py @@ -15,7 +15,5 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - from .backend import Newspaper20minutesBackend - __all__ = ['Newspaper20minutesBackend'] diff --git a/weboob/backends/minutes20/backend.py b/weboob/backends/minutes20/backend.py index 6c0cd4c6..e9ae34f5 100644 --- a/weboob/backends/minutes20/backend.py +++ b/weboob/backends/minutes20/backend.py @@ -21,27 +21,22 @@ from __future__ import with_statement from weboob.capabilities.messages import ICapMessages, Message, Thread from weboob.tools.backend import BaseBackend - -from .browser import Newspaper20minutesBrowser from weboob.tools.newsfeed import Newsfeed from .tools import url2id +from .browser import Newspaper20minutesBrowser __all__ = ['Newspaper20minutesBackend'] - - - class Newspaper20minutesBackend(BaseBackend, ICapMessages): - NAME = 'minutes20' MAINTAINER = 'Julien Hebert' EMAIL = 'juke@free.fr' VERSION = '0.6' LICENSE = 'GPLv3' - DESCRIPTION = u'20minutes French news website' - #CONFIG = ValuesDict(Value('login', label='Account ID'), - # Value('password', label='Password', masked=True)) STORAGE = {'seen': {}} + NAME = 'minutes20' + DESCRIPTION = u'20minutes French news website' BROWSER = Newspaper20minutesBrowser + RSS_FEED = 'http://www.20minutes.fr/rss/20minutes.xml' def get_thread(self, _id): if isinstance(_id, Thread): @@ -56,7 +51,6 @@ class Newspaper20minutesBackend(BaseBackend, ICapMessages): if not thread: thread = Thread(_id) - flags = Message.IS_HTML if not thread.id in self.storage.get('seen', default={}): flags |= Message.IS_UNREAD @@ -79,8 +73,7 @@ class Newspaper20minutesBackend(BaseBackend, ICapMessages): return thread def iter_threads(self): - for article in Newsfeed('http://www.20minutes.fr/rss/20minutes.xml', - url2id).iter_entries(): + for article in Newsfeed(self.RSS_FEED, url2id).iter_entries(): thread = Thread(article.id) thread.title = article.title thread.date = article.datetime @@ -96,7 +89,6 @@ class Newspaper20minutesBackend(BaseBackend, ICapMessages): if msg.flags & msg.IS_UNREAD: yield msg - def set_message_read(self, message): self.storage.set( 'seen', diff --git a/weboob/backends/minutes20/browser.py b/weboob/backends/minutes20/browser.py index fd56bd24..f3b9720b 100644 --- a/weboob/backends/minutes20/browser.py +++ b/weboob/backends/minutes20/browser.py @@ -16,9 +16,10 @@ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. from .pages.article import ArticlePage -from .pages.minutes20 import Minutes20Page from weboob.tools.browser import BaseBrowser from .tools import id2url +from .pages.minutes20 import Minutes20Page + __all__ = ['Newspaper20minutesBrowser'] class Newspaper20minutesBrowser(BaseBrowser): @@ -28,7 +29,6 @@ class Newspaper20minutesBrowser(BaseBrowser): 'http://www.20minutes.fr/preums/?.*': Minutes20Page } - def is_logged(self): return False @@ -44,6 +44,4 @@ class Newspaper20minutesBrowser(BaseBrowser): raise ValueError("thread id is empty") else: raise - except AttributeError: - raise ValueError("cant go on url") return self.page.article diff --git a/weboob/backends/minutes20/pages/article.py b/weboob/backends/minutes20/pages/article.py index b4e9a4cc..b56cf127 100644 --- a/weboob/backends/minutes20/pages/article.py +++ b/weboob/backends/minutes20/pages/article.py @@ -1,5 +1,6 @@ "ArticlePage object for minutes20" # -*- coding: utf-8 -*- + # Copyright(C) 2011 Julien Hebert # # This program is free software; you can redistribute it and/or modify @@ -15,30 +16,19 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - from weboob.tools.parsers.lxmlparser import select, SelectElementException from .minutes20 import Minutes20Page, NoAuthorElement +def try_remove(base_element, selector): + try : + base_element.remove(select(base_element, selector, 1 )) + except (SelectElementException, ValueError): + pass + class ArticlePage(Minutes20Page): "ArticlePage object for minutes20" - def set_body(self): - self.element_body = select(self.main_div, "div.mna-body", 1) - element_tools = select(self.element_body, "div.mna-tools", 1) - - try : - self.element_body.remove(element_tools) - except ValueError: - pass - - try: - self.element_body.remove( - select(self.element_body, "div.mna-comment-call", 1)) - except (SelectElementException, ValueError): - pass - - try: - self.element_body.remove(self.get_element_author()) - except (NoAuthorElement, ValueError): - pass - - self.article.body = self.browser.parser.tostring(self.element_body) + def get_body(self): + try_remove(self.element_body, "div.mna-tools") + try_remove(self.element_body, "div.mna-comment-call") + try_remove(self.element_body, self.element_author_selector) + return self.browser.parser.tostring(self.element_body) diff --git a/weboob/backends/minutes20/pages/minutes20.py b/weboob/backends/minutes20/pages/minutes20.py index 6ea9f16f..40065e7b 100644 --- a/weboob/backends/minutes20/pages/minutes20.py +++ b/weboob/backends/minutes20/pages/minutes20.py @@ -29,38 +29,38 @@ class Article(object): self.title = u'' self.body = u'' self.url = u'' - self.author = u'' + self.author = u'' self.date = None class Minutes20Page(BasePage): main_div = NotImplementedError element_body = NotImplementedError article = Article - - def set_author(self): - self.article.author = self.get_element_author().text_content().strip() + element_author_selector = ValueError + element_title_selector = ValueError + element_body_selector = ValueError - def get_element_author(self): - try : - return select(self.main_div, "div.mna-signature", 1) - except SelectElementException: - raise NoAuthorElement() + def get_body(self): + return self.browser.parser.tostring(self.element_body) - def set_body(self): - self.article.body = self.browser.parser.tostring(select(self.main_div, - "div.mna-body", - 1)) + def get_author(self): + return select(self.main_div, self.element_author_selector, 1).text_content().strip() + def get_title(self): + return select(self.main_div, self.element_title_selector, 1).text_content().strip() def on_loaded(self): self.article = Article(self.browser, url2id(self.url) ) self.main_div = self.document.getroot() - self.article.title = select(self.main_div, "h1", 1).text_content() - self.article.url = self.url - try : - self.set_author() - except NoAuthorElement: - pass - self.set_body() - + self.element_author_selector = "div.mna-signature" + self.element_title_selector = "h1" + self.element_body_selector = "div.mna-body" + + self.element_body = select(self.main_div, self.element_body_selector, 1) + + self.article.author = self.get_author() + self.article.title = self.get_title() + self.article.url = self.url + self.article.body = self.get_body() +