From 5be1d36bebecd000cbf112349a76f1b9fbb8c0e8 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Tue, 8 Jun 2010 22:18:56 +0200 Subject: [PATCH] new backend 'fourchan' (implements ICapMessages) --- weboob/backends/fourchan/__init__.py | 2 + weboob/backends/fourchan/backend.py | 107 +++++++++++++++++++++ weboob/backends/fourchan/browser.py | 41 ++++++++ weboob/backends/fourchan/pages/__init__.py | 0 weboob/backends/fourchan/pages/board.py | 85 ++++++++++++++++ weboob/capabilities/messages.py | 4 +- 6 files changed, 237 insertions(+), 2 deletions(-) create mode 100644 weboob/backends/fourchan/__init__.py create mode 100644 weboob/backends/fourchan/backend.py create mode 100644 weboob/backends/fourchan/browser.py create mode 100644 weboob/backends/fourchan/pages/__init__.py create mode 100644 weboob/backends/fourchan/pages/board.py diff --git a/weboob/backends/fourchan/__init__.py b/weboob/backends/fourchan/__init__.py new file mode 100644 index 00000000..5abbf6a5 --- /dev/null +++ b/weboob/backends/fourchan/__init__.py @@ -0,0 +1,2 @@ +from .backend import FourChanBackend +from .browser import FourChan diff --git a/weboob/backends/fourchan/backend.py b/weboob/backends/fourchan/backend.py new file mode 100644 index 00000000..4702f16b --- /dev/null +++ b/weboob/backends/fourchan/backend.py @@ -0,0 +1,107 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010 Romain Bignon +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+from logging import warning
+
+from weboob.backend import BaseBackend
+from weboob.capabilities.messages import ICapMessages, Message
+
+from .browser import FourChan
+
+class FourChanBackend(BaseBackend, ICapMessages):
+    """Backend exposing 4chan threads and their comments as messages.
+
+    Thread identifiers have the form ``<board>.<thread id>``. The comment
+    ids already yielded are recorded in the backend storage under
+    ``boards -> <board> -> <thread id>`` so that new messages can be told
+    apart from the ones seen during a previous run.
+    """
+    NAME = 'fourchan'
+    MAINTAINER = 'Romain Bignon'
+    EMAIL = 'romain@peerfuse.org'
+    VERSION = '1.0'
+    LICENSE = 'GPLv3'
+    DESCRIPTION = "4chan website"
+
+    CONFIG = {'boards': BaseBackend.ConfigField(description='Boards'),
+             }
+    STORAGE = {'boards': {}}
+    BROWSER = FourChan
+
+    def iter_messages(self, thread=None):
+        """Iterate over all messages of ``thread`` (or of every configured board)."""
+        return self._iter_messages(thread, False)
+
+    def iter_new_messages(self, thread=None):
+        """Iterate over the not yet seen messages of ``thread`` (or of every configured board)."""
+        return self._iter_messages(thread, True)
+
+    def _iter_messages(self, thread, only_new):
+        """Yield the messages of one thread, or of every configured board.
+
+        ``thread`` is either empty (all boards listed in the ``boards``
+        configuration value are read) or a ``<board>.<thread id>`` string.
+        An identifier that does not have this form is reported with a
+        warning and yields nothing.
+        """
+        if thread:
+            if '.' not in thread:
+                warning('"%s" is not a valid ID' % thread)
+                return
+
+            # maxsplit=1 guarantees exactly two parts, whatever follows.
+            board, thread_id = thread.split('.', 1)
+            try:
+                # Thread ids are integers everywhere else in this backend.
+                thread_id = int(thread_id)
+            except ValueError:
+                warning('"%s" is not a valid ID' % thread)
+                return
+
+            for message in self._iter_messages_of(board, thread_id, only_new):
+                yield message
+        else:
+            # split() without separator ignores repeated or trailing spaces.
+            # Every board is visited, not only the first one.
+            for board in self.config['boards'].split():
+                for message in self._iter_messages_of(board, None, only_new):
+                    yield message
+
+    def _iter_messages_of(self, board, thread_wanted, only_new):
+        """Yield the messages of one thread of ``board``, or of all the
+        threads the browser lists for it when ``thread_wanted`` is empty.
+        """
+        if board not in self.storage.get('boards', default={}):
+            self.storage.set('boards', board, {})
+
+        if thread_wanted:
+            for message in self._iter_thread_messages(board, thread_wanted, only_new):
+                yield message
+        else:
+            for thread in self.browser.get_threads(board):
+                for message in self._iter_thread_messages(board, thread.id, only_new):
+                    yield message
+
+    def _iter_thread_messages(self, board, thread, only_new):
+        """Yield the opening post of a thread, then each of its comments.
+
+        When ``only_new`` is true, only the messages whose id is not yet
+        recorded in the storage are yielded. The storage is saved once the
+        whole thread has been consumed.
+        """
+        thread = self.browser.get_thread(board, thread)
+
+        # A thread is new when the storage has no entry for it yet; the
+        # (empty) list of seen comment ids is created on first sight only,
+        # so an already known thread keeps its history.
+        new = thread.id not in self.storage.get('boards', board, default={})
+        if new:
+            self.storage.set('boards', board, thread.id, [])
+
+        if not only_new or new:
+            yield Message('%s.%s' % (board, thread.id),
+                          0,
+                          thread.filename,
+                          thread.author,
+                          thread.datetime,
+                          content=thread.text,
+                          is_html=True,
+                          is_new=new)
+
+        for comment in thread.comments:
+            seen = self.storage.get('boards', board, thread.id, default=[])
+            new = comment.id not in seen
+            if new:
+                self.storage.set('boards', board, thread.id, seen + [comment.id])
+
+            if not only_new or new:
+                yield Message('%s.%s' % (board, thread.id),
+                              comment.id,
+                              thread.filename,
+                              comment.author,
+                              comment.datetime,
+                              0,
+                              comment.text,
+                              is_html=True,
+                              is_new=new)
+
+        self.storage.save()
+
+    #def post_reply(self, thread_id, reply_id, title, message):
+    #    return self.browser.post_reply(thread_id, reply_id, title, message)
diff --git a/weboob/backends/fourchan/browser.py b/weboob/backends/fourchan/browser.py
new file mode 100644
index 00000000..eea2bfc2
--- /dev/null
+++ b/weboob/backends/fourchan/browser.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010 Romain Bignon
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+from weboob.tools.browser import BaseBrowser
+
+from .pages.board import BoardPage
+
+class FourChan(BaseBrowser):
+    """Browser for the 4chan boards hosted on boards.4chan.org."""
+    DOMAIN = 'boards.4chan.org'
+    PROTOCOL = 'http'
+    # Raw strings: \w and \d are regular expression classes, not string escapes.
+    PAGES = {r'http://boards.4chan.org/\w+/': BoardPage,
+             r'http://boards.4chan.org/\w+/res/\d+': BoardPage,
+            }
+
+    def is_logged(self):
+        """Always true: nothing in this browser performs a login."""
+        return True
+
+    def get_threads(self, board):
+        """Open the index page of ``board`` and return the articles parsed from it."""
+        self.location('http://boards.4chan.org/%s/' % board)
+
+        return self.page.articles
+
+    def get_thread(self, board, id):
+        """Open the page of thread ``id`` on ``board`` and return its single article.
+
+        ``id`` may be an integer or a numeric string.
+        """
+        # %s instead of %d: '%d' raises TypeError when given a string id.
+        self.location('http://boards.4chan.org/%s/res/%s' % (board, id))
+
+        assert len(self.page.articles) == 1
+        return self.page.articles[0]
diff --git a/weboob/backends/fourchan/pages/__init__.py b/weboob/backends/fourchan/pages/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/weboob/backends/fourchan/pages/board.py b/weboob/backends/fourchan/pages/board.py
new file mode 100644
index 00000000..b11adb83
--- /dev/null
+++ b/weboob/backends/fourchan/pages/board.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010 Romain Bignon
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+import re
+from logging import warning
+
+from weboob.tools.browser import BasePage
+
+class Message(object):
+    """A post parsed from a board page: a thread opener or one of its comments."""
+
+    def __init__(self, browser, board, id, filename=u'', url=u''):
+        self.id = id
+        self.browser = browser
+        self.board = board
+        self.filename = filename
+        self.datetime = 0
+        self.url = url
+        self.author = u''
+        self.text = u''
+        self.comments = []
+
+    def add_comment(self, div):
+        """Build a comment from the ``div`` element and append it to ``comments``.
+
+        The comment id is read from the ``id`` attribute of the element, the
+        author from its ``span.commentpostername`` child and the text from
+        its ``blockquote`` child.
+        """
+        comment = Message(self.browser, self.board, int(div.attrib.get('id', '')))
+        comment.author = div.cssselect('span.commentpostername')[0].text
+        comment.text = self.browser.parser.tostring(div.find('blockquote'))
+        self.comments.append(comment)
+
+    def __repr__(self):
+        # The format string has to consume the four values of the tuple; an
+        # empty one raises "not all arguments converted".
+        return '<Message id=%s filename=%s url=%s comments=%d>' % (
+            self.id, self.filename, self.url, len(self.comments))
+
+class BoardPage(BasePage):
+    """Page of a board index or of a single thread.
+
+    After loading, ``board`` holds the board name taken from the URL and
+    ``articles`` the list of parsed threads with their comments.
+    """
+    URL_REGEXP = re.compile(r'http://boards.4chan.org/(\w+)/')
+
+    def on_loaded(self):
+        """Parse the children of the ``delform`` form into ``self.articles``."""
+        self.articles = []
+
+        m = self.URL_REGEXP.match(self.url)
+        if m:
+            self.board = m.group(1)
+        else:
+            warning('Unable to find board')
+            self.board = 'unknown'
+
+        form = None
+        for f in self.document.getroot().cssselect('form'):
+            if f.attrib.get('name', '') == 'delform':
+                form = f
+                break
+
+        if form is None:
+            warning('No delform :(')
+            # Nothing to parse; leave the list of articles empty.
+            return
+
+        article = None
+        for div in form.getchildren():
+            if div.tag == 'span' and div.attrib.get('class', '') == 'filesize':
+                # A "filesize" span opens a new article.
+                link = div.find('a')
+                url = ''
+                if link is not None:
+                    url = link.get('href', '')
+                filename = 'unknown.jpg'
+                span = div.find('span')
+                if span is not None:
+                    filename = span.text
+                article = Message(self.browser, self.board, 0, filename, url)
+                self.articles.append(article)
+            elif article is None:
+                # No article opened yet: there is nothing to attach this to.
+                continue
+            elif (div.tag == 'input' and
+                  div.attrib.get('type') == 'checkbox' and
+                  div.attrib.get('value') == 'delete'):
+                # Only the "delete" checkbox carries the post id in its name.
+                article.id = int(div.attrib.get('name', '0'))
+            elif div.tag == 'blockquote':
+                article.text = self.browser.parser.tostring(div)
+            elif div.tag == 'table':
+                tags = div.cssselect('td.reply')
+                if tags:
+                    article.add_comment(tags[0])
diff --git a/weboob/capabilities/messages.py b/weboob/capabilities/messages.py
index 
bc03e2fa..0f74de67 100644 --- a/weboob/capabilities/messages.py +++ b/weboob/capabilities/messages.py @@ -73,8 +73,8 @@ class Message: return self.id == msg.id and self.thread_id == msg.thread_id def __repr__(self): - result = '' % ( - self.id, self.title, self.date, self.sender) + result = '' % ( + self.thread_id, self.id, self.title, self.date, self.sender) return result.encode('utf-8') class ICapMessages(ICap):