From e181fe4b89ca0ae412a76cb70186245f9dfee8c3 Mon Sep 17 00:00:00 2001
From: Florent
Date: Wed, 25 Apr 2012 13:33:45 +0200
Subject: [PATCH] Fix parsing of cartoon pages (Site changed)

Tested on version 0.b and 0.c
---
NOTE(review): this patch was recovered from a copy in which every line
break and all leading whitespace had been collapsed. The line structure
below follows the hunk headers (-18,13 +18,14 and -39,3 +39,22), but the
indentation, the subject/body split of the commit message, and the license
URL in the first hunk's context are reconstructed -- confirm them against
the upstream commit before applying (or apply with --ignore-whitespace).

 modules/presseurop/browser.py       |  3 ++-
 modules/presseurop/pages/article.py | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/modules/presseurop/browser.py b/modules/presseurop/browser.py
index e292ec3b..1b320fd7 100644
--- a/modules/presseurop/browser.py
+++ b/modules/presseurop/browser.py
@@ -18,13 +18,14 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.
 
-from .pages.article import ArticlePage
+from .pages.article import ArticlePage, CartoonPage
 from weboob.tools.browser import BaseBrowser
 
 
 class NewspaperPresseuropBrowser(BaseBrowser):
     "NewspaperPresseuropBrowser class"
     PAGES = {
+        "http://www.presseurop.eu/.*/cartoon/.*": CartoonPage,
         "http://www.presseurop.eu/.*": ArticlePage,
     }
 
diff --git a/modules/presseurop/pages/article.py b/modules/presseurop/pages/article.py
index 2561ad3b..1d4efb16 100644
--- a/modules/presseurop/pages/article.py
+++ b/modules/presseurop/pages/article.py
@@ -39,3 +39,22 @@ class ArticlePage(GenericNewsPage):
         title = GenericNewsPage.get_title(self)
         title = title.split('|')[0]
         return title
+
+
+class CartoonPage(GenericNewsPage):
+    "CartoonPage object for presseurop"
+
+    def on_loaded(self):
+        self.main_div = self.document.getroot()
+        self.element_title_selector = "title"
+        self.element_author_selector = "div.profilecartoontext>p>a"
+        self.element_body_selector = "div.panel"
+
+    def get_body(self):
+        element_body = self.get_element_body()
+        return self.parser.tostring(element_body)
+
+    def get_title(self):
+        title = GenericNewsPage.get_title(self)
+        title = title.split('|')[0]
+        return title