From 0d11694605ab34d1527edd9556dc9e2b94c6f21a Mon Sep 17 00:00:00 2001
From: Florent
Date: Fri, 11 Jan 2013 15:27:35 +0100
Subject: [PATCH] Fix author selection

---
 modules/lefigaro/browser.py       |  3 ++-
 modules/lefigaro/pages/article.py | 16 +++++++++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/modules/lefigaro/browser.py b/modules/lefigaro/browser.py
index acb4e91c..e64c15e1 100644
--- a/modules/lefigaro/browser.py
+++ b/modules/lefigaro/browser.py
@@ -18,7 +18,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.
 
-from .pages.article import ArticlePage
+from .pages.article import ArticlePage, ActuPage
 from .pages.flashactu import FlashActuPage
 from weboob.tools.browser import BaseBrowser, BasePage
 
@@ -33,6 +33,7 @@ class NewspaperFigaroBrowser(BaseBrowser):
         "http://\w+.lefigaro.fr/flash-.*/(\d{4})/(\d{2})/(\d{2})/(.*$)": FlashActuPage,
         "http://\w+.lefigaro.fr/bd/(\d{4})/(\d{2})/(\d{2})/(.*$)": FlashActuPage,
         "http://\w+.lefigaro.fr/(?!flash-|bd|actualite).+/(\d{4})/(\d{2})/(\d{2})/(.*$)": ArticlePage,
+        "http://\w+.lefigaro.fr/actualite/(\d{4})/(\d{2})/(\d{2})/(.*$)": ActuPage,
         "http://\w+.lefigaro.fr/actualite-.*/(\d{4})/(\d{2})/(\d{2})/(.*$)": ArticlePage,
         "http://\w+.lefigaro.fr/": IndexPage,
     }
diff --git a/modules/lefigaro/pages/article.py b/modules/lefigaro/pages/article.py
index d0ed5f57..d216600f 100644
--- a/modules/lefigaro/pages/article.py
+++ b/modules/lefigaro/pages/article.py
@@ -26,7 +26,7 @@ class ArticlePage(GenericNewsPage):
     def on_loaded(self):
         self.main_div = self.document.getroot()
         self.element_title_selector = "h1"
-        self.element_author_selector = "div.name>span"
+        self.element_author_selector = "span.auteur>a, span.auteur_long>div"
         self.element_body_selector = "#article, div.article"
 
     def get_body(self):
@@ -68,3 +68,17 @@ class ArticlePage(GenericNewsPage):
             txts[0].drop_tag()
         element_body.tag = "div"
         return self.parser.tostring(element_body)
+
+
+class ActuPage(GenericNewsPage):
+    def on_loaded(self):
+        self.main_div = self.document.getroot()
+        self.element_title_selector = "h2"
+        self.element_author_selector = "div.name>span"
+        self.element_body_selector = ".block-text"
+
+    def get_body(self):
+        element_body = self.get_element_body()
+        try_remove_from_selector_list(self.parser, element_body, ['div'])
+        element_body.tag = "div"
+        return self.parser.tostring(element_body)