From 9dad89bf7f8a994c256ab38edd92206229bbeecf Mon Sep 17 00:00:00 2001 From: Juke Date: Thu, 17 Feb 2011 10:38:57 +0100 Subject: [PATCH] use assessor --- weboob/backends/inrocks/pages/article.py | 4 +- weboob/backends/minutes20/browser.py | 2 +- weboob/backends/minutes20/pages/article.py | 12 +++-- weboob/backends/minutes20/pages/minutes20.py | 57 ++++++++++++-------- 4 files changed, 46 insertions(+), 29 deletions(-) diff --git a/weboob/backends/inrocks/pages/article.py b/weboob/backends/inrocks/pages/article.py index 5ded0b8f..11ba5b72 100644 --- a/weboob/backends/inrocks/pages/article.py +++ b/weboob/backends/inrocks/pages/article.py @@ -32,10 +32,10 @@ class ArticlePage(InrocksPage): details = select(self.element_body, "div.details", 1) try_remove(details, "div.footer") header = select(self.element_body, "div.header", 1) - for selector in ["h1", "div.picture", "div.date", "div.news-single-img", + for selector in ["h1", "div.picture", "div.date", "div.news-single-img", "div.metas_img", "strong"]: try_remove(header, selector) - + return self.browser.parser.tostring(self.element_body) diff --git a/weboob/backends/minutes20/browser.py b/weboob/backends/minutes20/browser.py index f3b9720b..c0d3bac1 100644 --- a/weboob/backends/minutes20/browser.py +++ b/weboob/backends/minutes20/browser.py @@ -44,4 +44,4 @@ class Newspaper20minutesBrowser(BaseBrowser): raise ValueError("thread id is empty") else: raise - return self.page.article + return self.page.get_article() diff --git a/weboob/backends/minutes20/pages/article.py b/weboob/backends/minutes20/pages/article.py index b56cf127..6ba57a45 100644 --- a/weboob/backends/minutes20/pages/article.py +++ b/weboob/backends/minutes20/pages/article.py @@ -28,7 +28,11 @@ def try_remove(base_element, selector): class ArticlePage(Minutes20Page): "ArticlePage object for minutes20" def get_body(self): - try_remove(self.element_body, "div.mna-tools") - try_remove(self.element_body, "div.mna-comment-call") - try_remove(self.element_body, self.element_author_selector) - return self.browser.parser.tostring(self.element_body) + element_body = self.get_element_body() + try_remove(element_body, "div.mna-tools") + try_remove(element_body, "div.mna-comment-call") + try : + element_body.remove(self.get_element_author()) + except NoAuthorElement: + pass + return self.browser.parser.tostring(element_body) diff --git a/weboob/backends/minutes20/pages/minutes20.py b/weboob/backends/minutes20/pages/minutes20.py index 40065e7b..07bb8659 100644 --- a/weboob/backends/minutes20/pages/minutes20.py +++ b/weboob/backends/minutes20/pages/minutes20.py @@ -19,7 +19,7 @@ from weboob.tools.parsers.lxmlparser import select, SelectElementException from weboob.backends.minutes20.tools import url2id __all__ = ['Minutes20Page', 'Article', 'NoAuthorElement'] -class NoAuthorElement(Exception): +class NoAuthorElement(SelectElementException): pass class Article(object): @@ -33,34 +33,47 @@ class Article(object): self.date = None class Minutes20Page(BasePage): - main_div = NotImplementedError - element_body = NotImplementedError - article = Article - element_author_selector = ValueError - element_title_selector = ValueError - element_body_selector = ValueError + __main_div = NotImplementedError + __element_body = NotImplementedError + __article = Article + __element_author_selector = ValueError + __element_title_selector = ValueError + __element_body_selector = ValueError def get_body(self): - return self.browser.parser.tostring(self.element_body) + return self.browser.parser.tostring(self.get_element_body()) def get_author(self): - return select(self.main_div, self.element_author_selector, 1).text_content().strip() + try: + return self.get_element_author().text_content().strip() + except NoAuthorElement: + return None def get_title(self): - return select(self.main_div, self.element_title_selector, 1).text_content().strip() + return select(self.__main_div, self.__element_title_selector, 1).text_content().strip() + + def get_element_body(self): + return select(self.__main_div, self.__element_body_selector, 1) + + def get_element_author(self): + try: + return select(self.__main_div, self.__element_author_selector, 1) + except SelectElementException: + raise NoAuthorElement() + + def get_article(self): + __article = Article(self.browser, url2id(self.url) ) + __article.author = self.get_author() + __article.title = self.get_title() + __article.url = self.url + __article.body = self.get_body() + + return __article def on_loaded(self): - self.article = Article(self.browser, url2id(self.url) ) - self.main_div = self.document.getroot() + self.__main_div = self.document.getroot() - self.element_author_selector = "div.mna-signature" - self.element_title_selector = "h1" - self.element_body_selector = "div.mna-body" - - self.element_body = select(self.main_div, self.element_body_selector, 1) - - self.article.author = self.get_author() - self.article.title = self.get_title() - self.article.url = self.url - self.article.body = self.get_body() + self.__element_author_selector = "div.mna-signature" + self.__element_title_selector = "h1" + self.__element_body_selector = "div.mna-body"