new backend

This commit is contained in:
Juke 2011-02-10 23:47:17 +01:00 committed by Romain Bignon
commit 08252358eb
6 changed files with 234 additions and 0 deletions

View file

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from .backend import NewspaperInrocksBackend
# Public API of the package: must name the class imported from .backend,
# otherwise ``from <package> import *`` fails with AttributeError.
__all__ = ['NewspaperInrocksBackend']

View file

@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"backend for http://20minutes.fr"
# python2.5 compatibility
from __future__ import with_statement
from weboob.capabilities.messages import ICapMessages, Message, Thread
from weboob.tools.backend import BaseBackend
from .browser import NewspaperInrocksBrowser
from weboob.tools.newsfeed import Newsfeed
from .tools import url2id
# Names exported by a star-import of this module.
__all__ = ['NewspaperInrocksBackend']
class NewspaperInrocksBackend(BaseBackend, ICapMessages):
    """Messages backend that exposes newspaper articles as threads.

    Every entry of the RSS feed becomes a ``Thread``; filling a thread
    downloads the article and stores it as the thread's single root
    ``Message`` (flagged as HTML).  Read state is persisted in the backend
    storage under the ``'seen'`` key.
    """
    # NOTE(review): the backend is named "inrocks" but the feed URL below
    # points at 20minutes.fr -- confirm this is intended.
    NAME = 'inrocks'
    MAINTAINER = 'Julien Hebert'
    EMAIL = 'juke@free.fr'
    VERSION = '0.6'
    LICENSE = 'GPLv3'
    DESCRIPTION = u'Inrock French news website'
    # Initial layout of the persistent storage.
    STORAGE = {'seen': {}}
    BROWSER = NewspaperInrocksBrowser

    def get_thread(self, _id):
        """Fetch an article and return it as a filled ``Thread``.

        ``_id`` is either a thread identifier or an existing ``Thread``
        object; in the latter case that object is filled and returned.
        """
        thread = None
        if isinstance(_id, Thread):
            thread, _id = _id, _id.id

        with self.browser:
            article = self.browser.get_content(_id)

        if not thread:
            thread = Thread(_id)

        # A thread whose id is absent from the 'seen' storage is unread.
        seen = self.storage.get('seen', default={})
        flags = Message.IS_HTML
        if thread.id not in seen:
            flags |= Message.IS_UNREAD

        thread.title = article.title
        # Keep a date that was already set on the thread (e.g. by the feed).
        if not thread.date:
            thread.date = article.date

        thread.root = Message(
            thread=thread,
            id=0,
            title=article.title,
            sender=article.author,
            receivers=None,
            date=thread.date,
            parent=None,
            content=article.body,
            flags=flags,
            children=[])
        return thread

    def iter_threads(self):
        """Yield one unfilled ``Thread`` (id, title, date) per feed entry."""
        feed = Newsfeed('http://www.20minutes.fr/rss/20minutes.xml', url2id)
        for entry in feed.iter_entries():
            thread = Thread(entry.id)
            thread.title = entry.title
            thread.date = entry.datetime
            yield thread

    def fill_thread(self, thread):
        """Fill ``thread`` with the article content and return it."""
        return self.get_thread(thread)

    def iter_unread_messages(self, thread=None):
        """Yield every message still flagged as unread, across all threads.

        NOTE(review): the ``thread`` argument is not used to restrict the
        iteration; every thread of the feed is always visited.
        """
        for current in self.iter_threads():
            self.fill_thread(current)
            for msg in current.iter_all_messages():
                if msg.flags & msg.IS_UNREAD:
                    yield msg

    def set_message_read(self, message):
        """Record ``message`` as read in the storage and save it."""
        thread_id = message.thread.id
        already_read = self.storage.get(
            'seen', thread_id, 'comments', default=[])
        self.storage.set(
            'seen', thread_id, 'comments', already_read + [message.id])
        self.storage.save()

View file

@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from .pages.article import ArticlePage
from weboob.tools.browser import BaseBrowser
from .tools import id2url
# Names exported by a star-import of this module.
__all__ = ['NewspaperInrocksBrowser']
class NewspaperInrocksBrowser(BaseBrowser):
    """Browser used by the backend to download article pages."""

    # URL pattern -> page class used to handle the matching document.
    PAGES = {
        'http://www.20minutes.fr/article/?.*': ArticlePage,
    }

    def is_logged(self):
        """Always report "not logged in"."""
        return False

    def get_content(self, _id):
        """Open the article identified by ``_id`` and return its content.

        The id is converted to a URL with ``id2url``; the returned value is
        the ``article`` attribute of the resulting page.
        """
        article_url = id2url(_id)
        self.location(article_url)
        return self.page.article

View file

@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from weboob.tools.parsers.lxmlparser import select, SelectElementException
from .minutes20 import Minutes20Page, NoAuthorElement
class ArticlePage(Minutes20Page):
    """Article page: extracts the article body as an HTML string."""

    def set_body(self):
        """Select the body element, strip decorations and store its HTML.

        The tools block, the comment call-out and the author element are
        removed from the body when present; the serialized remainder is
        assigned to ``self.article.body``.
        """
        body = select(self.main_div, "div.mna-body", 1)
        self.element_body = body

        # Looked up outside any ``try``: a selection failure propagates.
        tools_div = select(body, "div.mna-tools", 1)
        try:
            body.remove(tools_div)
        except ValueError:
            pass

        # The comment call-out is optional, so a failed selection is ignored.
        try:
            body.remove(select(body, "div.mna-comment-call", 1))
        except (SelectElementException, ValueError):
            pass

        # Same best-effort removal for the author element.
        try:
            body.remove(self.get_element_author())
        except (NoAuthorElement, ValueError):
            pass

        self.article.body = self.browser.parser.tostring(self.element_body)

View file

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import re
# "<section>.<number>.<rest>": raw string, and the separators are literal dots.
_ID_PATTERN = re.compile(r"(\w+)\.(\w+)\.(.*$)")


def id2url(_id):
    """Convert an article id into its URL.

    An id has the form ``<section>.<number>.<rest>`` (as produced by
    ``url2id``) and maps to
    ``http://www.20minutes.fr/<section>/<number>/<rest>``.

    Raises ValueError when ``_id`` does not have that form.
    """
    match = _ID_PATTERN.match(_id)
    if match is None:
        # Fail with an explicit message rather than an AttributeError on None.
        raise ValueError('invalid article id: %r' % (_id,))
    return 'http://www.20minutes.fr/%s/%s/%s' % match.groups()
# Raw string; the dots of the host name are escaped so they match literally.
_URL_PATTERN = re.compile(r"http://www\.20minutes\.fr/(\w+)/([0-9]+)/(.*$)")


def url2id(url):
    """Convert an article URL into its dotted id.

    ``http://www.20minutes.fr/<section>/<number>/<rest>`` becomes
    ``<section>.<number>.<rest>``; the number is normalized through
    ``int`` (leading zeros are dropped).

    Raises ValueError when ``url`` is not an article URL of that form.
    """
    match = _URL_PATTERN.match(url)
    if match is None:
        # Fail with an explicit message rather than an AttributeError on None.
        raise ValueError('not an article URL: %r' % (url,))
    section, number, rest = match.groups()
    return '%s.%d.%s' % (section, int(number), rest)