All daily news in only one thread

This commit is contained in:
Florent 2013-01-22 14:33:35 +01:00
commit abcc4067d2
3 changed files with 58 additions and 8 deletions

View file

@ -47,8 +47,24 @@ class NewspaperPresseuropBackend(GenericNewspaperBackend, ICapMessages):
self.RSS_FEED = 'http://www.presseurop.eu/%s/rss.xml' % (self.config['lang'].get()) self.RSS_FEED = 'http://www.presseurop.eu/%s/rss.xml' % (self.config['lang'].get())
def iter_threads(self): def iter_threads(self):
daily = []
for article in Newsfeed(self.RSS_FEED, self.RSSID).iter_entries(): for article in Newsfeed(self.RSS_FEED, self.RSSID).iter_entries():
thread = Thread(article.link) if "/news-brief/" in article.link:
thread.title = article.title day = self.browser.get_daily_date(article.link)
thread.date = article.datetime if day and (day not in daily):
yield(thread) daily.append(day)
id, title, date = self.browser.get_daily_infos(day)
thread = Thread(id)
thread.title = title
thread.date = date
yield(thread)
elif day is None:
thread = Thread(article.link)
thread.title = article.title
thread.date = article.datetime
yield(thread)
else:
thread = Thread(article.link)
thread.title = article.title
thread.date = article.datetime
yield(thread)

View file

@ -18,7 +18,9 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from .pages.article import PresseuropPage, CartoonPage, DailyTitlesPage from datetime import date, datetime, time
from .pages.article import PresseuropPage, CartoonPage, DailySinglePage,\
DailyTitlesPage
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from weboob.tools.ordereddict import OrderedDict from weboob.tools.ordereddict import OrderedDict
@ -26,8 +28,8 @@ from weboob.tools.ordereddict import OrderedDict
class NewspaperPresseuropBrowser(BaseBrowser): class NewspaperPresseuropBrowser(BaseBrowser):
"NewspaperPresseuropBrowser class" "NewspaperPresseuropBrowser class"
PAGES = OrderedDict(( PAGES = OrderedDict((
("http://www.presseurop.eu/.*/todays-front-pages/.*", DailyTitlesPage), ("http://www.presseurop.eu/.*/news-brief/.*", DailySinglePage),
("http://www.presseurop.eu/.*/front-page/.*", DailyTitlesPage), ("http://www.presseurop.eu/.*/today/.*", DailyTitlesPage),
("http://www.presseurop.eu/.*/cartoon/.*", CartoonPage), ("http://www.presseurop.eu/.*/cartoon/.*", CartoonPage),
("http://www.presseurop.eu/.*", PresseuropPage), ("http://www.presseurop.eu/.*", PresseuropPage),
)) ))
@ -45,3 +47,17 @@ class NewspaperPresseuropBrowser(BaseBrowser):
"return page article content" "return page article content"
self.location(_id) self.location(_id)
return self.page.get_article(_id) return self.page.get_article(_id)
def get_daily_date(self, _id):
    """Open the page located at *_id* and return the daily date it
    advertises (a date string, or None when the page has none)."""
    self.location(_id)
    current_page = self.page
    return current_page.get_daily_date()
def get_daily_infos(self, _id):
    """Return (url, title, datetime) for the daily page of day *_id*.

    *_id* is an ISO-style date string ("YYYY-MM-DD"); the returned
    datetime is that day at midnight. The title is scraped from the
    "today" page, which is fetched as a side effect.
    """
    url = "http://www.presseurop.eu/fr/today/" + _id
    self.location(url)
    title = self.page.get_title()
    # "YYYY-MM-DD" -> midnight datetime of that day.
    year, month, day = (int(part) for part in _id.split('-'))
    return url, title, datetime(year, month, day)

View file

@ -50,7 +50,25 @@ class DailyTitlesPage(PresseuropPage):
self.main_div = self.document.getroot() self.main_div = self.document.getroot()
self.element_title_selector = "title" self.element_title_selector = "title"
self.element_author_selector = "div[id=content-author]>a" self.element_author_selector = "div[id=content-author]>a"
self.element_body_selector = "ul.articlebody" self.element_body_selector = "section.main"
def get_body(self):
    # Strip social/related boilerplate from the page body, then make
    # every relative URL absolute against the site root.
    element_body = self.get_element_body()
    try_drop_tree(self.parser, element_body, "li.button-social")
    try_drop_tree(self.parser, element_body, "aside.articlerelated")
    try_drop_tree(self.parser, element_body, "div.sharecount")
    clean_relativ_urls(element_body, "http://presseurop.eu")
    # NOTE(review): no return statement — this override yields None while
    # it only mutates the body element in place. Presumably the base-class
    # get_body returns the body content; confirm whether a return of
    # element_body (or its serialized form) is missing here.
class DailySinglePage(PresseuropPage):
    """Page of a single daily news brief."""

    def get_daily_date(self):
        """Return the day string taken from the today-carousel link href
        (its fourth '/'-separated segment), or None when the carousel is
        absent from the page."""
        carousels = self.document.getroot().xpath(
            "//ul[@class='carousel-skin carousel-today']")
        if not carousels:
            return None
        first_link = carousels[0].xpath('li/a')[0]
        return first_link.attrib['href'].split('/')[3]
class CartoonPage(PresseuropPage): class CartoonPage(PresseuropPage):