From 08252358eb0432d3963ca1e4ff6582f6dd05d500 Mon Sep 17 00:00:00 2001 From: Juke Date: Thu, 10 Feb 2011 23:47:17 +0100 Subject: [PATCH] new backend --- weboob/backends/inrocks/__init__.py | 21 +++++ weboob/backends/inrocks/backend.py | 107 ++++++++++++++++++++++ weboob/backends/inrocks/browser.py | 34 +++++++ weboob/backends/inrocks/pages/__init__.py | 0 weboob/backends/inrocks/pages/article.py | 43 +++++++++ weboob/backends/inrocks/tools.py | 29 ++++++ 6 files changed, 234 insertions(+) create mode 100644 weboob/backends/inrocks/__init__.py create mode 100644 weboob/backends/inrocks/backend.py create mode 100644 weboob/backends/inrocks/browser.py create mode 100644 weboob/backends/inrocks/pages/__init__.py create mode 100644 weboob/backends/inrocks/pages/article.py create mode 100644 weboob/backends/inrocks/tools.py diff --git a/weboob/backends/inrocks/__init__.py b/weboob/backends/inrocks/__init__.py new file mode 100644 index 00000000..8fe34caf --- /dev/null +++ b/weboob/backends/inrocks/__init__.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2011 Julien Hebert +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+
+
+from .backend import NewspaperInrocksBackend
+
+__all__ = ['NewspaperInrocksBackend']  # must name the class imported above, or "import *" fails
diff --git a/weboob/backends/inrocks/backend.py b/weboob/backends/inrocks/backend.py
new file mode 100644
index 00000000..6d5fa009
--- /dev/null
+++ b/weboob/backends/inrocks/backend.py
@@ -0,0 +1,107 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011 Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+"backend for http://20minutes.fr"
+
+# NOTE(review): this module lives under weboob/backends/inrocks and the
+# backend NAME below is 'inrocks', yet the docstring above and the feed URL
+# in iter_threads() point at 20minutes.fr -- confirm which site is intended.
+
+# python2.5 compatibility
+from __future__ import with_statement
+
+from weboob.capabilities.messages import ICapMessages, Message, Thread
+from weboob.tools.backend import BaseBackend
+
+from .browser import NewspaperInrocksBrowser
+from weboob.tools.newsfeed import Newsfeed
+from .tools import url2id
+
+__all__ = ['NewspaperInrocksBackend']
+
+
+
+
+class NewspaperInrocksBackend(BaseBackend, ICapMessages):
+    """Expose the entries of an RSS feed as message threads.
+
+    Each feed entry becomes a Thread (see iter_threads). get_thread() fetches
+    the article through the browser and attaches it to the thread as its
+    root Message.
+    """
+    NAME = 'inrocks'
+    MAINTAINER = 'Julien Hebert'
+    EMAIL = 'juke@free.fr'
+    VERSION = '0.6'
+    LICENSE = 'GPLv3'
+    DESCRIPTION = u'Inrock French news website'
+    # 'seen' is read in get_thread() and written in set_message_read(),
+    # keyed by thread id.
+    STORAGE = {'seen': {}}
+    BROWSER = NewspaperInrocksBrowser
+
+    def get_thread(self, _id):
+        """Fetch an article and return it as a Thread with a root Message.
+
+        _id is either a thread id or a Thread instance. When a Thread is
+        given, its id is used for the lookup and that object is filled in
+        and returned; otherwise a new Thread is created from the id.
+        """
+        if isinstance(_id, Thread):
+            thread = _id
+            _id = thread.id
+        else:
+            thread = None
+
+        with self.browser:
+            content = self.browser.get_content(_id)
+
+        if not thread:
+            thread = Thread(_id)
+
+
+        # The root message is always flagged as HTML; it is also flagged
+        # unread unless the thread id already has an entry under 'seen'.
+        flags = Message.IS_HTML
+        if not thread.id in self.storage.get('seen', default={}):
+            flags |= Message.IS_UNREAD
+        thread.title = content.title
+        # Keep a date that is already set on the thread (iter_threads sets
+        # one from the feed entry).
+        if not thread.date:
+            thread.date = content.date
+
+        thread.root = Message(
+            thread=thread,
+            id=0,
+            title=content.title,
+            sender=content.author,
+            receivers=None,
+            date=thread.date,
+            parent=None,
+            content=content.body,
+            flags=flags,
+            children= [])
+        return thread
+
+    def iter_threads(self):
+        """Yield one Thread per entry of the RSS feed.
+
+        Only id, title and date are set here; no root message is attached.
+        Entry ids are passed through url2id by Newsfeed (presumably to turn
+        the entry URL into the thread id -- confirm against Newsfeed).
+        """
+        for article in Newsfeed('http://www.20minutes.fr/rss/20minutes.xml',
+                                url2id).iter_entries():
+            thread = Thread(article.id)
+            thread.title = article.title
+            thread.date = article.datetime
+            yield(thread)
+
+    def fill_thread(self, thread):
+        """Fill thread by delegating to get_thread(), which accepts a Thread."""
+        return self.get_thread(thread)
+
+    def iter_unread_messages(self, thread=None):
+        """Yield every message flagged IS_UNREAD across all feed threads.
+
+        Every thread from iter_threads() is filled first, so the article of
+        each feed entry is fetched through the browser.
+        """
+        # NOTE(review): the `thread` argument is never read; the loop
+        # variable below rebinds the name. Confirm that ignoring it is
+        # intended.
+        for thread in self.iter_threads():
+            self.fill_thread(thread)
+            for msg in thread.iter_all_messages():
+                if msg.flags & msg.IS_UNREAD:
+                    yield msg
+
+
+    def set_message_read(self, message):
+        """Record message.id under seen/<thread id>/comments and save.
+
+        The id is appended to the list already stored at that path (an empty
+        list when nothing is stored yet), then the storage is saved.
+        """
+        self.storage.set(
+            'seen',
+            message.thread.id,
+            'comments',
+            self.storage.get(
+                'seen',
+                message.thread.id,
+                'comments',
+                default=[]) + [message.id])
+        self.storage.save()
diff --git a/weboob/backends/inrocks/browser.py b/weboob/backends/inrocks/browser.py
new file mode 100644
index 00000000..7ebacf80
--- /dev/null
+++ b/weboob/backends/inrocks/browser.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011 Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+from .pages.article import ArticlePage
+from weboob.tools.browser import BaseBrowser
+from .tools import id2url
+__all__ = ['NewspaperInrocksBrowser']
+
+class NewspaperInrocksBrowser(BaseBrowser):
+    """Browser that loads article pages and hands back the parsed article."""
+    # URL pattern -> page class used for matching locations.
+    # NOTE(review): the pattern targets 20minutes.fr although this is the
+    # inrocks backend -- confirm.
+    PAGES = {
+             'http://www.20minutes.fr/article/?.*': ArticlePage,
+            }
+
+
+    def is_logged(self):
+        """Always report that the browser is not logged in."""
+        return False
+
+    def get_content(self, _id):
+        """Open the URL that id2url() builds from _id and return page.article."""
+        self.location(id2url(_id))
+        return self.page.article
diff --git a/weboob/backends/inrocks/pages/__init__.py b/weboob/backends/inrocks/pages/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/weboob/backends/inrocks/pages/article.py b/weboob/backends/inrocks/pages/article.py
new file mode 100644
index 00000000..9e850424
--- /dev/null
+++ b/weboob/backends/inrocks/pages/article.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011 Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version
3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+
+from weboob.tools.parsers.lxmlparser import select, SelectElementException
+# NOTE(review): this patch's file list adds only __init__.py and article.py
+# under inrocks/pages; no minutes20 module is created there, so confirm that
+# this relative import resolves.
+from .minutes20 import Minutes20Page, NoAuthorElement
+
+class ArticlePage(Minutes20Page):
+    """Article page whose body is the cleaned-up "div.mna-body" element."""
+
+    def set_body(self):
+        """Extract the article body and store it, serialized, in article.body.
+
+        The body is the "div.mna-body" element found under self.main_div,
+        with the tools block, the comment-call block and the author element
+        removed when they can be removed.
+        """
+        self.element_body = select(self.main_div, "div.mna-body", 1)
+        # Unlike the comment-call lookup below, this select() is not guarded:
+        # presumably it raises SelectElementException when "div.mna-tools" is
+        # missing -- confirm against select().
+        element_tools = select(self.element_body, "div.mna-tools", 1)
+        # ValueError from remove() is ignored in every step below
+        # (presumably raised when the element is not a direct child of the
+        # body -- confirm).
+        try :
+            self.element_body.remove(element_tools)
+        except ValueError:
+            pass
+        try:
+            self.element_body.remove(
+                select(self.element_body, "div.mna-comment-call", 1))
+        except SelectElementException:
+            pass
+        except ValueError:
+            pass
+        try:
+            self.element_body.remove(self.get_element_author())
+        except NoAuthorElement:
+            pass
+        except ValueError:
+            pass
+        self.article.body = self.browser.parser.tostring(self.element_body)
diff --git a/weboob/backends/inrocks/tools.py b/weboob/backends/inrocks/tools.py
new file mode 100644
index 00000000..10ccfd7c
--- /dev/null
+++ b/weboob/backends/inrocks/tools.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011 Julien Hebert
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+
+import re
+
+
+# NOTE(review): these helpers belong to the inrocks backend but encode the
+# 20minutes.fr URL scheme -- confirm the intended site.
+
+# Thread ids look like "<section>.<number>.<slug>". The separators are
+# literal dots, so they are escaped; a bare "." would match any character.
+_ID_RE = re.compile(r"(\w+)\.(\w+)\.(.*$)")
+
+# Article URLs look like "http://www.20minutes.fr/<section>/<number>/<slug>".
+_URL_RE = re.compile(r"http://www\.20minutes\.fr/(\w+)/([0-9]+)/(.*$)")
+
+
+def id2url(_id):
+    """Return the article URL for a thread id produced by url2id().
+
+    _id must have the form "<section>.<number>.<slug>".
+
+    Raises ValueError if _id does not have that form.
+    """
+    match = _ID_RE.match(_id)
+    if match is None:
+        raise ValueError('malformed article id: %r' % (_id,))
+    return 'http://www.20minutes.fr/%s/%s/%s' % match.groups()
+
+
+def url2id(url):
+    """Return the thread id "<section>.<number>.<slug>" for an article URL.
+
+    The numeric part goes through int(), so leading zeros are dropped.
+
+    Raises ValueError if url is not of the form
+    "http://www.20minutes.fr/<section>/<number>/<slug>".
+    """
+    match = _URL_RE.match(url)
+    if match is None:
+        raise ValueError('not an article URL: %r' % (url,))
+    section, number, slug = match.groups()
+    return '%s.%d.%s' % (section, int(number), slug)