From abcc4067d29dea253ac6779bb4683f0b09fcf702 Mon Sep 17 00:00:00 2001 From: Florent Date: Tue, 22 Jan 2013 14:33:35 +0100 Subject: [PATCH] All dailys news in only one thread --- modules/presseurop/backend.py | 24 ++++++++++++++++++++---- modules/presseurop/browser.py | 22 +++++++++++++++++++--- modules/presseurop/pages/article.py | 20 +++++++++++++++++++- 3 files changed, 58 insertions(+), 8 deletions(-) diff --git a/modules/presseurop/backend.py b/modules/presseurop/backend.py index aeb27d3f..e927dc49 100644 --- a/modules/presseurop/backend.py +++ b/modules/presseurop/backend.py @@ -47,8 +47,24 @@ class NewspaperPresseuropBackend(GenericNewspaperBackend, ICapMessages): self.RSS_FEED = 'http://www.presseurop.eu/%s/rss.xml' % (self.config['lang'].get()) def iter_threads(self): + daily = [] for article in Newsfeed(self.RSS_FEED, self.RSSID).iter_entries(): - thread = Thread(article.link) - thread.title = article.title - thread.date = article.datetime - yield(thread) + if "/news-brief/" in article.link: + day = self.browser.get_daily_date(article.link) + if day and (day not in daily): + daily.append(day) + id, title, date = self.browser.get_daily_infos(day) + thread = Thread(id) + thread.title = title + thread.date = date + yield(thread) + elif day is None: + thread = Thread(article.link) + thread.title = article.title + thread.date = article.datetime + yield(thread) + else: + thread = Thread(article.link) + thread.title = article.title + thread.date = article.datetime + yield(thread) diff --git a/modules/presseurop/browser.py b/modules/presseurop/browser.py index 93e4118d..7d4cac4a 100644 --- a/modules/presseurop/browser.py +++ b/modules/presseurop/browser.py @@ -18,7 +18,9 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . -from .pages.article import PresseuropPage, CartoonPage, DailyTitlesPage +from datetime import date, datetime, time +from .pages.article import PresseuropPage, CartoonPage, DailySinglePage,\ + DailyTitlesPage from weboob.tools.browser import BaseBrowser from weboob.tools.ordereddict import OrderedDict @@ -26,8 +28,8 @@ from weboob.tools.ordereddict import OrderedDict class NewspaperPresseuropBrowser(BaseBrowser): "NewspaperPresseuropBrowser class" PAGES = OrderedDict(( - ("http://www.presseurop.eu/.*/todays-front-pages/.*", DailyTitlesPage), - ("http://www.presseurop.eu/.*/front-page/.*", DailyTitlesPage), + ("http://www.presseurop.eu/.*/news-brief/.*", DailySinglePage), + ("http://www.presseurop.eu/.*/today/.*", DailyTitlesPage), ("http://www.presseurop.eu/.*/cartoon/.*", CartoonPage), ("http://www.presseurop.eu/.*", PresseuropPage), )) @@ -45,3 +47,17 @@ class NewspaperPresseuropBrowser(BaseBrowser): "return page article content" self.location(_id) return self.page.get_article(_id) + + def get_daily_date(self, _id): + self.location(_id) + return self.page.get_daily_date() + + def get_daily_infos(self, _id): + url = "http://www.presseurop.eu/fr/today/" + _id + self.location(url) + title = self.page.get_title() + article_date = date(*[int(x) + for x in _id.split('-')]) + article_time = time(0, 0, 0) + article_datetime = datetime.combine(article_date, article_time) + return url, title, article_datetime diff --git a/modules/presseurop/pages/article.py b/modules/presseurop/pages/article.py index 6e0178e8..f2e169e3 100644 --- a/modules/presseurop/pages/article.py +++ b/modules/presseurop/pages/article.py @@ -50,7 +50,25 @@ class DailyTitlesPage(PresseuropPage): self.main_div = self.document.getroot() self.element_title_selector = "title" self.element_author_selector = "div[id=content-author]>a" - self.element_body_selector = "ul.articlebody" + self.element_body_selector = "section.main" + + def get_body(self): + element_body = self.get_element_body() + try_drop_tree(self.parser, element_body, "li.button-social") + try_drop_tree(self.parser, element_body, "aside.articlerelated") + try_drop_tree(self.parser, element_body, "div.sharecount") + clean_relativ_urls(element_body, "http://presseurop.eu") + + + +class DailySinglePage(PresseuropPage): + def get_daily_date(self): + ul = self.document.getroot().xpath("//ul[@class='carousel-skin carousel-today']") + if len(ul) > 0: + link = ul[0].xpath('li/a')[0] + date = link.attrib['href'].split('/')[3] + return date + return None class CartoonPage(PresseuropPage):