uniform minutes20 and inrocks

This commit is contained in:
Juke 2011-02-17 04:06:01 +01:00 committed by Romain Bignon
commit 2ebe8012bd
10 changed files with 76 additions and 97 deletions

View file

@ -15,7 +15,5 @@
# along with this program; if not, write to the Free Software # along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from .backend import NewspaperInrocksBackend from .backend import NewspaperInrocksBackend
__all__ = ['NewspaperInrocksBackendBackend'] __all__ = ['NewspaperInrocksBackendBackend']

View file

@ -21,25 +21,22 @@ from __future__ import with_statement
from weboob.capabilities.messages import ICapMessages, Message, Thread from weboob.capabilities.messages import ICapMessages, Message, Thread
from weboob.tools.backend import BaseBackend from weboob.tools.backend import BaseBackend
from .browser import NewspaperInrocksBrowser
from weboob.tools.newsfeed import Newsfeed from weboob.tools.newsfeed import Newsfeed
from .tools import url2id from .tools import url2id
from .browser import NewspaperInrocksBrowser
__all__ = ['NewspaperInrocksBackend'] __all__ = ['NewspaperInrocksBackend']
class NewspaperInrocksBackend(BaseBackend, ICapMessages): class NewspaperInrocksBackend(BaseBackend, ICapMessages):
NAME = 'inrocks'
MAINTAINER = 'Julien Hebert' MAINTAINER = 'Julien Hebert'
EMAIL = 'juke@free.fr' EMAIL = 'juke@free.fr'
VERSION = '0.6' VERSION = '0.6'
LICENSE = 'GPLv3' LICENSE = 'GPLv3'
DESCRIPTION = u'Inrock French news website'
STORAGE = {'seen': {}} STORAGE = {'seen': {}}
NAME = 'inrocks'
DESCRIPTION = u'Inrock French news website'
BROWSER = NewspaperInrocksBrowser BROWSER = NewspaperInrocksBrowser
RSS_FEED = 'http://www.lesinrocks.com/fileadmin/rss/actus.xml'
def get_thread(self, _id): def get_thread(self, _id):
if isinstance(_id, Thread): if isinstance(_id, Thread):
@ -54,7 +51,6 @@ class NewspaperInrocksBackend(BaseBackend, ICapMessages):
if not thread: if not thread:
thread = Thread(_id) thread = Thread(_id)
flags = Message.IS_HTML flags = Message.IS_HTML
if not thread.id in self.storage.get('seen', default={}): if not thread.id in self.storage.get('seen', default={}):
flags |= Message.IS_UNREAD flags |= Message.IS_UNREAD
@ -71,13 +67,13 @@ class NewspaperInrocksBackend(BaseBackend, ICapMessages):
date=thread.date, date=thread.date,
parent=None, parent=None,
content=content.body, content=content.body,
signature='URL: %s' % content.url,
flags=flags, flags=flags,
children= []) children= [])
return thread return thread
def iter_threads(self): def iter_threads(self):
for article in Newsfeed('http://www.lesinrocks.com/fileadmin/rss/actus.xml', for article in Newsfeed(self.RSS_FEED, url2id).iter_entries():
url2id).iter_entries():
thread = Thread(article.id) thread = Thread(article.id)
thread.title = article.title thread.title = article.title
thread.date = article.datetime thread.date = article.datetime

View file

@ -18,8 +18,10 @@
from .pages.article import ArticlePage from .pages.article import ArticlePage
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from .tools import id2url from .tools import id2url
__all__ = ['NewspaperInrocksBrowser'] __all__ = ['NewspaperInrocksBrowser']
class NewspaperInrocksBrowser(BaseBrowser): class NewspaperInrocksBrowser(BaseBrowser):
PAGES = { PAGES = {
'http://www.lesinrocks.com/actualite/actu-article/t/60121/date/2011-02-15/article/accuse-davoir-participe-a-une-mutinerie-un-detenu-porte-plainte/': ArticlePage, 'http://www.lesinrocks.com/actualite/actu-article/t/60121/date/2011-02-15/article/accuse-davoir-participe-a-une-mutinerie-un-detenu-porte-plainte/': ArticlePage,
@ -27,7 +29,6 @@ class NewspaperInrocksBrowser(BaseBrowser):
} }
def is_logged(self): def is_logged(self):
return False return False

View file

@ -1,3 +1,4 @@
"ArticlePage object for inrocks"
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert # Copyright(C) 2011 Julien Hebert
@ -15,7 +16,6 @@
# along with this program; if not, write to the Free Software # along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from weboob.tools.parsers.lxmlparser import select, SelectElementException from weboob.tools.parsers.lxmlparser import select, SelectElementException
from .inrocks import InrocksPage from .inrocks import InrocksPage
@ -26,8 +26,8 @@ def try_remove(base_element, selector):
pass pass
class ArticlePage(InrocksPage): class ArticlePage(InrocksPage):
def set_body(self): "ArticlePage object for inrocks"
self.element_body = select(self.main_div, "div.maincol", 1) def get_body(self):
try_remove(self.element_body, "div.sidebar") try_remove(self.element_body, "div.sidebar")
details = select(self.element_body, "div.details", 1) details = select(self.element_body, "div.details", 1)
try_remove(details, "div.footer") try_remove(details, "div.footer")
@ -36,4 +36,7 @@ class ArticlePage(InrocksPage):
"div.metas_img", "strong"]: "div.metas_img", "strong"]:
try_remove(header, selector) try_remove(header, selector)
self.article.body = self.browser.parser.tostring(self.element_body) return self.browser.parser.tostring(self.element_body)

View file

@ -37,32 +37,35 @@ class InrocksPage(BasePage):
element_body = NotImplementedError element_body = NotImplementedError
article = Article article = Article
element_author_selector = ValueError element_author_selector = ValueError
element_title_selector = ValueError
element_body_selector = ValueError
def set_author(self): def get_body(self):
try: return self.browser.parser.tostring(self.element_body)
self.article.author = self.get_element_author().text_content().strip()
except NoAuthorElement: def get_author(self):
try :
return select(self.main_div, self.element_author_selector, 1).text_content().strip()
except SelectElementException:
#TODO: check the number of elements returned
pass pass
def get_element_author(self): def get_title(self):
try : return select(self.main_div, self.element_title_selector, 1).text_content().strip()
return select(self.main_div, self.element_author_selector, 1)
except SelectElementException:
raise NoAuthorElement()
def set_body(self):
self.article.body = self.browser.parser.tostring(select(self.main_div,
"div.mna-body",
1))
def on_loaded(self): def on_loaded(self):
self.article = Article(self.browser, url2id(self.url) ) self.article = Article(self.browser, url2id(self.url) )
self.main_div = self.document.getroot() self.main_div = self.document.getroot()
self.article.title = select(self.main_div, "h1", 1).text_content()
self.article.url = self.url self.element_author_selector = "div.name>span"
self.element_author_selector = "div.name>span" self.element_title_selector = "h1"
self.set_author() self.element_body_selector = "div.maincol"
self.set_body()
self.element_body = select(self.main_div, self.element_body_selector, 1)
self.article.author = self.get_author()
self.article.title = self.get_title()
self.article.url = self.url
self.article.body = self.get_body()

View file

@ -15,7 +15,5 @@
# along with this program; if not, write to the Free Software # along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from .backend import Newspaper20minutesBackend from .backend import Newspaper20minutesBackend
__all__ = ['Newspaper20minutesBackend'] __all__ = ['Newspaper20minutesBackend']

View file

@ -21,27 +21,22 @@ from __future__ import with_statement
from weboob.capabilities.messages import ICapMessages, Message, Thread from weboob.capabilities.messages import ICapMessages, Message, Thread
from weboob.tools.backend import BaseBackend from weboob.tools.backend import BaseBackend
from .browser import Newspaper20minutesBrowser
from weboob.tools.newsfeed import Newsfeed from weboob.tools.newsfeed import Newsfeed
from .tools import url2id from .tools import url2id
from .browser import Newspaper20minutesBrowser
__all__ = ['Newspaper20minutesBackend'] __all__ = ['Newspaper20minutesBackend']
class Newspaper20minutesBackend(BaseBackend, ICapMessages): class Newspaper20minutesBackend(BaseBackend, ICapMessages):
NAME = 'minutes20'
MAINTAINER = 'Julien Hebert' MAINTAINER = 'Julien Hebert'
EMAIL = 'juke@free.fr' EMAIL = 'juke@free.fr'
VERSION = '0.6' VERSION = '0.6'
LICENSE = 'GPLv3' LICENSE = 'GPLv3'
DESCRIPTION = u'20minutes French news website'
#CONFIG = ValuesDict(Value('login', label='Account ID'),
# Value('password', label='Password', masked=True))
STORAGE = {'seen': {}} STORAGE = {'seen': {}}
NAME = 'minutes20'
DESCRIPTION = u'20minutes French news website'
BROWSER = Newspaper20minutesBrowser BROWSER = Newspaper20minutesBrowser
RSS_FEED = 'http://www.20minutes.fr/rss/20minutes.xml'
def get_thread(self, _id): def get_thread(self, _id):
if isinstance(_id, Thread): if isinstance(_id, Thread):
@ -56,7 +51,6 @@ class Newspaper20minutesBackend(BaseBackend, ICapMessages):
if not thread: if not thread:
thread = Thread(_id) thread = Thread(_id)
flags = Message.IS_HTML flags = Message.IS_HTML
if not thread.id in self.storage.get('seen', default={}): if not thread.id in self.storage.get('seen', default={}):
flags |= Message.IS_UNREAD flags |= Message.IS_UNREAD
@ -79,8 +73,7 @@ class Newspaper20minutesBackend(BaseBackend, ICapMessages):
return thread return thread
def iter_threads(self): def iter_threads(self):
for article in Newsfeed('http://www.20minutes.fr/rss/20minutes.xml', for article in Newsfeed(self.RSS_FEED, url2id).iter_entries():
url2id).iter_entries():
thread = Thread(article.id) thread = Thread(article.id)
thread.title = article.title thread.title = article.title
thread.date = article.datetime thread.date = article.datetime
@ -96,7 +89,6 @@ class Newspaper20minutesBackend(BaseBackend, ICapMessages):
if msg.flags & msg.IS_UNREAD: if msg.flags & msg.IS_UNREAD:
yield msg yield msg
def set_message_read(self, message): def set_message_read(self, message):
self.storage.set( self.storage.set(
'seen', 'seen',

View file

@ -16,9 +16,10 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from .pages.article import ArticlePage from .pages.article import ArticlePage
from .pages.minutes20 import Minutes20Page
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from .tools import id2url from .tools import id2url
from .pages.minutes20 import Minutes20Page
__all__ = ['Newspaper20minutesBrowser'] __all__ = ['Newspaper20minutesBrowser']
class Newspaper20minutesBrowser(BaseBrowser): class Newspaper20minutesBrowser(BaseBrowser):
@ -28,7 +29,6 @@ class Newspaper20minutesBrowser(BaseBrowser):
'http://www.20minutes.fr/preums/?.*': Minutes20Page 'http://www.20minutes.fr/preums/?.*': Minutes20Page
} }
def is_logged(self): def is_logged(self):
return False return False
@ -44,6 +44,4 @@ class Newspaper20minutesBrowser(BaseBrowser):
raise ValueError("thread id is empty") raise ValueError("thread id is empty")
else: else:
raise raise
except AttributeError:
raise ValueError("cant go on url")
return self.page.article return self.page.article

View file

@ -1,5 +1,6 @@
"ArticlePage object for minutes20" "ArticlePage object for minutes20"
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert # Copyright(C) 2011 Julien Hebert
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
@ -15,30 +16,19 @@
# along with this program; if not, write to the Free Software # along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from weboob.tools.parsers.lxmlparser import select, SelectElementException from weboob.tools.parsers.lxmlparser import select, SelectElementException
from .minutes20 import Minutes20Page, NoAuthorElement from .minutes20 import Minutes20Page, NoAuthorElement
def try_remove(base_element, selector):
try :
base_element.remove(select(base_element, selector, 1 ))
except (SelectElementException, ValueError):
pass
class ArticlePage(Minutes20Page): class ArticlePage(Minutes20Page):
"ArticlePage object for minutes20" "ArticlePage object for minutes20"
def set_body(self): def get_body(self):
self.element_body = select(self.main_div, "div.mna-body", 1) try_remove(self.element_body, "div.mna-tools")
element_tools = select(self.element_body, "div.mna-tools", 1) try_remove(self.element_body, "div.mna-comment-call")
try_remove(self.element_body, self.element_author_selector)
try : return self.browser.parser.tostring(self.element_body)
self.element_body.remove(element_tools)
except ValueError:
pass
try:
self.element_body.remove(
select(self.element_body, "div.mna-comment-call", 1))
except (SelectElementException, ValueError):
pass
try:
self.element_body.remove(self.get_element_author())
except (NoAuthorElement, ValueError):
pass
self.article.body = self.browser.parser.tostring(self.element_body)

View file

@ -36,31 +36,31 @@ class Minutes20Page(BasePage):
main_div = NotImplementedError main_div = NotImplementedError
element_body = NotImplementedError element_body = NotImplementedError
article = Article article = Article
element_author_selector = ValueError
element_title_selector = ValueError
element_body_selector = ValueError
def set_author(self): def get_body(self):
self.article.author = self.get_element_author().text_content().strip() return self.browser.parser.tostring(self.element_body)
def get_element_author(self): def get_author(self):
try : return select(self.main_div, self.element_author_selector, 1).text_content().strip()
return select(self.main_div, "div.mna-signature", 1)
except SelectElementException:
raise NoAuthorElement()
def set_body(self):
self.article.body = self.browser.parser.tostring(select(self.main_div,
"div.mna-body",
1))
def get_title(self):
return select(self.main_div, self.element_title_selector, 1).text_content().strip()
def on_loaded(self): def on_loaded(self):
self.article = Article(self.browser, url2id(self.url) ) self.article = Article(self.browser, url2id(self.url) )
self.main_div = self.document.getroot() self.main_div = self.document.getroot()
self.article.title = select(self.main_div, "h1", 1).text_content()
self.article.url = self.url
try :
self.set_author()
except NoAuthorElement:
pass
self.set_body()
self.element_author_selector = "div.mna-signature"
self.element_title_selector = "h1"
self.element_body_selector = "div.mna-body"
self.element_body = select(self.main_div, self.element_body_selector, 1)
self.article.author = self.get_author()
self.article.title = self.get_title()
self.article.url = self.url
self.article.body = self.get_body()