From 2184951c4a2b6106e917e7778acfeeeeed88c6fe Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Wed, 24 Aug 2011 16:25:17 +0200 Subject: [PATCH] new backend 'phpbb' --- weboob/backends/phpbb/__init__.py | 24 +++ weboob/backends/phpbb/backend.py | 186 ++++++++++++++++++++++++ weboob/backends/phpbb/browser.py | 140 ++++++++++++++++++ weboob/backends/phpbb/pages/__init__.py | 0 weboob/backends/phpbb/pages/forum.py | 134 +++++++++++++++++ weboob/backends/phpbb/pages/index.py | 37 +++++ weboob/backends/phpbb/test.py | 32 ++++ weboob/backends/phpbb/tools.py | 59 ++++++++ 8 files changed, 612 insertions(+) create mode 100644 weboob/backends/phpbb/__init__.py create mode 100644 weboob/backends/phpbb/backend.py create mode 100644 weboob/backends/phpbb/browser.py create mode 100644 weboob/backends/phpbb/pages/__init__.py create mode 100644 weboob/backends/phpbb/pages/forum.py create mode 100644 weboob/backends/phpbb/pages/index.py create mode 100644 weboob/backends/phpbb/test.py create mode 100644 weboob/backends/phpbb/tools.py diff --git a/weboob/backends/phpbb/__init__.py b/weboob/backends/phpbb/__init__.py new file mode 100644 index 00000000..28eefc07 --- /dev/null +++ b/weboob/backends/phpbb/__init__.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010-2011 Romain Bignon +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+ + +from .browser import PhpBB +from .backend import PhpBBBackend + +__all__ = ['PhpBB', 'PhpBBBackend'] diff --git a/weboob/backends/phpbb/backend.py b/weboob/backends/phpbb/backend.py new file mode 100644 index 00000000..ef910307 --- /dev/null +++ b/weboob/backends/phpbb/backend.py @@ -0,0 +1,186 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010-2011 Romain Bignon +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see <http://www.gnu.org/licenses/>.
# File: weboob/backends/phpbb/backend.py (Python source recovered from the patch)

from __future__ import with_statement

from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.tools.newsfeed import Newsfeed
from weboob.tools.value import Value, ValueInt, ValueBackendPassword
from weboob.tools.misc import limit
from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread, CantSendMessage

from .browser import PhpBB
from .tools import rssid, url2id


__all__ = ['PhpBBBackend']


class PhpBBBackend(BaseBackend, ICapMessages):
    """
    Backend exposing the forums, topics and posts of a phpBB board as
    threads and messages.

    The id of the newest post marked as read is remembered per topic in the
    'seen' section of the backend storage, keyed by the integer topic id.
    """
    NAME = 'phpbb'
    MAINTAINER = 'Romain Bignon'
    EMAIL = 'romain@weboob.org'
    VERSION = '0.9'
    LICENSE = 'AGPLv3+'
    DESCRIPTION = "phpBB forum"
    CONFIG = BackendConfig(Value('url',                     label='URL of forum', regexp='https?://.*'),
                           Value('username',                label='Username'),
                           ValueBackendPassword('password', label='Password'),
                           ValueInt('thread_unread_messages', label='Limit number of unread messages to retrieve for a thread', default=500)
                          )
    STORAGE = {'seen': {}}
    BROWSER = PhpBB

    def create_default_browser(self):
        """Build the browser from the configured url, username and password."""
        return self.create_browser(self.config['url'].get(),
                                   self.config['username'].get(),
                                   self.config['password'].get())

    #### Helpers ###################################################

    @staticmethod
    def _seen_key(thread_id):
        """
        Return the integer topic id used as key in the 'seen' storage.

        Thread ids reach this class as integers, as 'topic' strings or as
        'topic.post' strings; only the topic part identifies the thread.

        Raises ValueError when the topic part is not numeric.
        """
        return int(('%s' % thread_id).split('.')[0])

    def _last_seen_id(self, thread_id):
        """Return the id of the last post marked as read in a thread, or 0."""
        try:
            return self.storage.get('seen', default={})[self._seen_key(thread_id)]
        except (KeyError, ValueError):
            # Nothing recorded yet, or the id does not designate a topic.
            return 0

    def _post2message(self, thread, post, parent, flags):
        """Convert a post scraped by the browser into a Message of `thread`."""
        return Message(thread=thread,
                       id=post.id,
                       title=post.title,
                       sender=post.author,
                       receivers=None,
                       date=post.date,
                       parent=parent,
                       content=post.content,
                       signature=post.signature,
                       children=[],
                       flags=flags)

    #### ICapMessages ##############################################

    def _iter_threads(self, root_link=None):
        """
        Yield a Thread for every topic found under `root_link`, descending
        recursively into sub-forums.

        With root_link=None the browser starts from its home page.
        """
        with self.browser:
            links = list(self.browser.iter_links(root_link.url if root_link else None))

        for link in links:
            if link.type == link.FORUM:
                # Prefix the forum title with the titles of its ancestors so
                # that topics end up labelled '[forum][sub-forum] topic'.
                link.title = '%s[%s]' % (root_link.title if root_link else '', link.title)
                for thread in self._iter_threads(link):
                    yield thread
            elif link.type == link.TOPIC:
                thread = Thread(url2id(link.url))
                thread.title = ('%s ' % root_link.title if root_link else '') + link.title
                thread.date = link.date
                thread.nb_messages = link.nb_messages
                thread.flags = thread.IS_DISCUSSION
                yield thread

    def iter_threads(self):
        """Iterate over every topic reachable from the forum index."""
        return self._iter_threads()

    def get_thread(self, id):
        """
        Fetch a whole topic as a linear chain of messages.

        `id` may be a Thread object (which is then filled in place), a topic
        URL, or an id of the form 'topic' / 'topic.post'.

        Returns the Thread, or None when no Thread was given and the topic
        yielded no post.
        """
        thread = None
        parent = None

        if isinstance(id, Thread):
            thread = id
            id = thread.id

        # iter_unread_messages() builds threads with integer ids, whereas
        # url2id() and the browser expect strings.
        id = '%s' % id

        thread_id = url2id(id) or id
        last_seen_id = self._last_seen_id(thread_id)

        with self.browser:
            for post in self.browser.iter_posts(id):
                if not thread:
                    thread = Thread(thread_id)
                    thread.title = post.title

                flags = Message.IS_HTML
                if last_seen_id < post.id:
                    flags |= Message.IS_UNREAD

                m = self._post2message(thread, post, parent, flags)

                # Posts are chained one after the other: each message is the
                # only child of the previous one.
                if parent:
                    parent.children = [m]
                else:
                    thread.root = m

                parent = m

        return thread

    def iter_unread_messages(self, thread=None):
        """
        Yield unread messages of every topic listed in the forum root feed,
        newest first within each topic.

        The `thread` argument is accepted for interface compatibility but is
        not used to filter the result.
        """
        with self.browser:
            url = self.browser.get_root_feed_url()
            for article in Newsfeed(url, rssid).iter_entries():
                id = url2id(article.link)
                if id is None:
                    # Feed entry which does not point to a topic.
                    continue

                # The entry id is 'topic' or 'topic.post'; only the topic
                # part identifies the thread.
                thread_id = int(id.split('.')[0])
                topic = Thread(thread_id)
                last_seen_id = self._last_seen_id(thread_id)

                # NOTE(review): `child` is never reassigned, so the linking
                # branch below never runs; kept as in the original pending
                # confirmation of the intended behaviour.
                child = None
                iterator = self.browser.riter_posts(id, last_seen_id)
                max_posts = self.config['thread_unread_messages'].get()
                if max_posts > 0:
                    iterator = limit(iterator, max_posts)
                for post in iterator:
                    message = self._post2message(topic, post, None,
                                                 Message.IS_UNREAD|Message.IS_HTML)
                    if child:
                        message.children.append(child)
                        child.parent = message

                    if post.parent:
                        # Only the id of the parent is known at this point.
                        message.parent = Message(thread=topic,
                                                 id=post.parent)
                    else:
                        topic.root = message
                    yield message

    def set_message_read(self, message):
        """
        Record `message` as read when it is newer than the last post marked
        as read in its thread.

        Raises ValueError when the thread id has no numeric topic part.
        """
        key = self._seen_key(message.thread.id)

        if message.id > self._last_seen_id(key):
            self.storage.set('seen', key, message.id)
            self.storage.save()

    def fill_thread(self, thread, fields):
        """Fill `thread` with all its messages; `fields` is ignored."""
        return self.get_thread(thread)

    #### ICapMessagesReply #########################################
    #def post_message(self, message):
    #    assert message.thread

    #    with self.browser:
    #        return self.browser.post_comment(message.thread.id,
    #                                         message.parent.id,
    #                                         message.title,
    #                                         message.content)

    OBJECTS = {Thread: fill_thread}


# --- Patch metadata and file header carried over from the original span ---
# diff --git a/weboob/backends/phpbb/browser.py b/weboob/backends/phpbb/browser.py
# new file mode 100644
# index 00000000..e145c806
# --- /dev/null
# +++ b/weboob/backends/phpbb/browser.py
# @@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-
#
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
# File: weboob/backends/phpbb/browser.py (Python source recovered from the patch)

import urllib
from urlparse import urlsplit

from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword
from weboob.capabilities.messages import CantSendMessage

from .pages.index import LoginPage
from .pages.forum import ForumPage, TopicPage
from .tools import id2url, url2id


# Browser
class PhpBB(BaseBrowser):
    """
    Browser navigating a phpBB forum located at an arbitrary URL.

    The protocol, domain and base path are derived from the forum URL given
    to the constructor.
    """
    # Raw strings keep the regex escapes intact, and the dots of the script
    # names are escaped so that they only match a literal '.'.
    PAGES = {r'https?://.*/index\.php':                 ForumPage,
             r'https?://.*/viewforum\.php\?f=(\d+)':    ForumPage,
             r'https?://.*/viewtopic\.php\?.*':         TopicPage,
             r'https?://.*/ucp\.php\?mode=login.*':     LoginPage,
            }

    last_board_msg_id = None

    def __init__(self, url, *args, **kwargs):
        """
        url -- address of the forum index; remaining arguments are passed
               unchanged to BaseBrowser.
        """
        self.url = url
        v = urlsplit(url)
        self.PROTOCOL = v.scheme
        self.DOMAIN = v.netloc
        # Everything before the last '/' of the path, i.e. the directory
        # which holds the phpBB scripts.
        self.BASEPATH = v.path[:v.path.rfind('/')]
        BaseBrowser.__init__(self, *args, **kwargs)

    def home(self):
        """Go to the forum index."""
        self.location(self.url)

    def is_logged(self):
        """Tell whether the session is logged in (assumed so before any page is loaded)."""
        return not self.page or self.page.is_logged()

    def login(self):
        """
        Submit the credentials to the phpBB login form.

        Raises BrowserIncorrectPassword, with the error displayed by the
        forum, when the login is refused.
        """
        data = {'login':    'Connexion',
                'username': self.username,
                'password': self.password,
               }
        self.location('%s/ucp.php?mode=login' % self.BASEPATH, urllib.urlencode(data), no_login=True)

        assert self.is_on_page(LoginPage)

        if not self.page.is_logged():
            raise BrowserIncorrectPassword(self.page.get_error_message())

    def get_root_feed_url(self):
        """Return the URL of the feed advertised by the forum index."""
        self.home()
        return self.page.get_feed_url()

    def iter_links(self, url):
        """
        Iterate over the forum and topic links of a forum page.

        url -- page to list, or a false value for the forum index.
        """
        if url:
            self.location(url)
        else:
            self.home()

        assert self.is_on_page(ForumPage)
        return self.page.iter_links()

    def _open_topic(self, id):
        """Load the topic page designated by a full URL or a 'topic[.post]' id."""
        if id.startswith('http'):
            self.location(id)
        else:
            self.location('%s/%s' % (self.BASEPATH, id2url(id)))
        assert self.is_on_page(TopicPage)

    def iter_posts(self, id, stop_id=None):
        """
        Yield the posts of a topic in page order, following the pagination
        forward from the page `id` points to.

        Each post gets its `parent` set to the id of the previous post
        (0 for the first one yielded). Iteration stops, without yielding it,
        when the post whose id is `stop_id` is reached.
        """
        self._open_topic(id)

        parent = 0
        while True:
            for post in self.page.iter_posts():
                if post.id == stop_id:
                    return

                post.parent = parent
                yield post
                parent = post.id

            # '>=' rather than '==' so that an inconsistent page counter
            # cannot send the browser past the last page.
            if self.page.cur_page >= self.page.tot_pages:
                return
            self.location(self.page.next_page_url())

    def riter_posts(self, id, stop_id=None):
        """
        Yield the posts of a topic in reverse page order, following the
        pagination backward from the page `id` points to.

        A post is yielded only once the post preceding it is known, so that
        its `parent` can be set. Iteration stops, without yielding it, when
        the post whose id is `stop_id` is reached.
        """
        self._open_topic(id)

        child = None
        while True:
            for post in self.page.riter_posts():
                if child:
                    child.parent = post.id
                    yield child
                if post.id == stop_id:
                    return
                child = post

            if self.page.cur_page <= 1:
                # First page: the pending post is the first of the topic
                # and keeps its default parent.
                if child:
                    yield child
                return
            self.location(self.page.prev_page_url())

    def get_post(self, id):
        """
        Return the post designated by a URL or a 'topic.post' id, or None
        when it is not on the loaded page.

        When the post is the first of a page other than the first, the
        previous page is loaded to find its parent.
        """
        self._open_topic(id)
        if id.startswith('http'):
            id = url2id(id)

        post = self.page.get_post(int(id.split('.')[-1]))
        if not post:
            return None

        if post.parent == 0 and self.page.cur_page > 1:
            self.location(self.page.prev_page_url())
            post.parent = self.page.get_last_post_id()

        return post


# --- Patch metadata and file header carried over from the original span ---
# diff --git a/weboob/backends/phpbb/pages/__init__.py b/weboob/backends/phpbb/pages/__init__.py
# new file mode 100644
# index 00000000..e69de29b
# diff --git a/weboob/backends/phpbb/pages/forum.py b/weboob/backends/phpbb/pages/forum.py
# new file mode 100644
# index 00000000..1161745c
# --- /dev/null
# +++ b/weboob/backends/phpbb/pages/forum.py
# @@ -0,0 +1,134 @@
# -*- coding: utf-8 -*-
#
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
# File: weboob/backends/phpbb/pages/forum.py (Python source recovered from the patch)

from urlparse import urlsplit, parse_qs
from datetime import datetime

from .index import PhpBBPage
from ..tools import parse_date


__all__ = ['Link', 'ForumPage', 'TopicPage']


class Link(object):
    """A link found on a forum page, pointing to a sub-forum or to a topic."""
    (FORUM,
     TOPIC) = range(2)

    def __init__(self, type, url):
        """
        type -- Link.FORUM or Link.TOPIC
        url  -- target of the link, as found in the page
        """
        self.type = type
        self.url = url
        self.title = u''
        self.date = None
        self.nb_messages = 0


class ForumPage(PhpBBPage):
    """Page listing sub-forums and/or topics."""

    def iter_links(self):
        """Yield a Link for every sub-forum, then for every topic, of the page."""
        root = self.document.getroot()

        for li in self.parser.select(root, 'ul.forums li.row'):
            title = li.cssselect('a.forumtitle')[0]
            link = Link(Link.FORUM, title.attrib['href'])
            link.title = title.text.strip()
            yield link

        for li in self.parser.select(root, 'ul.topics li.row'):
            title = li.cssselect('a.topictitle')[0]
            link = Link(Link.TOPIC, title.attrib['href'])
            link.title = title.text.strip()
            # The date is the text following the last link of the row header.
            link.date = parse_date(li.find('dl').find('dt').findall('a')[-1].tail.strip(u'» \r\n'))
            # it only lists number of answers, so we add 1.
            link.nb_messages = int(li.cssselect('dd.posts')[0].text.strip()) + 1
            yield link


class Post(object):
    """A post scraped from a topic page."""

    def __init__(self, topic, id):
        """
        topic -- id of the topic the post belongs to
        id    -- id of the post (converted to int)
        """
        self.id = int(id)
        self.topic = topic
        self.title = u''
        self.author = u''
        self.date = None
        self.content = u''
        self.signature = u''
        # Id of the previous post in the topic, 0 when unknown or none.
        self.parent = 0


class TopicPage(PhpBBPage):
    """One page of a topic."""

    # Forum titles longer than this are truncated in post titles.
    FORUM_TITLE_MAX_LEN = 20

    def on_loaded(self):
        """
        Extract the page counters, the topic id and the forum title.

        Sets `cur_page`, `tot_pages`, `topic_id` and `forum_title`.
        """
        root = self.document.getroot()

        # The pagination block holds the current page number and the total
        # number of pages in its first two <strong> elements.
        div = root.cssselect('div.pagination')[0]
        strongs = div.cssselect('strong')
        self.cur_page = int(strongs[0].text.strip())
        self.tot_pages = int(strongs[1].text.strip())

        v = urlsplit(self.url)
        args = parse_qs(v.query)
        self.topic_id = int(args['t'][0])

        nav = self.parser.select(root, 'li.icon-home', 1)
        self.forum_title = u''
        # The first link of the breadcrumb is skipped; the title of the last
        # remaining one is kept.
        for a in nav.findall('a')[1:]:
            text = (a.text or u'').strip()
            # Only add the ellipsis when something is actually cut off.
            if len(text) > self.FORUM_TITLE_MAX_LEN:
                text = text[:self.FORUM_TITLE_MAX_LEN] + u'…'
            self.forum_title = '[%s]' % text

    def next_page_url(self):
        """Return the URL of the next page of the topic."""
        return self.document.getroot().cssselect('a.right-box')[0].attrib['href']

    def prev_page_url(self):
        """Return the URL of the previous page of the topic."""
        return self.document.getroot().cssselect('a.left-box')[0].attrib['href']

    def iter_posts(self):
        """Yield the posts of the page in document order."""
        for div in self.parser.select(self.document.getroot(), 'div.post'):
            yield self._get_post(div)

    def riter_posts(self):
        """Yield the posts of the page in reverse document order."""
        for div in reversed(self.parser.select(self.document.getroot(), 'div.post')):
            yield self._get_post(div)

    def get_post(self, id):
        """
        Return the post whose integer id is `id`, with `parent` set to the id
        of the post preceding it on this page (0 if it is the first one).

        Returns None when the post is not on this page.
        """
        wanted = 'p%d' % id
        parent = 0
        for div in self.parser.select(self.document.getroot(), 'div.post'):
            if div.attrib['id'] == wanted:
                post = self._get_post(div)
                post.parent = parent
                return post
            parent = int(div.attrib['id'][1:])
        return None

    def _get_post(self, div):
        """Build a Post from the <div class="post"> element `div`."""
        body = div.cssselect('div.postbody')[0]
        profile = div.cssselect('dl.postprofile')[0]

        # Element ids look like 'p1234'; drop the leading letter.
        id = div.attrib['id'][1:]
        post = Post(self.topic_id, id)
        post.title = '%s %s' % (self.forum_title, body.cssselect('h3 a')[0].text.strip())
        post.author = profile.cssselect('dt a')[-1].text.strip()
        post.date = parse_date(body.cssselect('p.author')[0].find('strong').tail.strip(u'» \n\r'))
        post.content = self.parser.tostring(body.cssselect('div.content')[0])

        signature = body.cssselect('div.signature')
        if signature:
            post.signature = self.parser.tostring(signature[0])
        return post

    def get_last_post_id(self):
        """Return the id of the last post of the page, or 0 if it has none."""
        posts = self.parser.select(self.document.getroot(), 'div.post')
        if not posts:
            return 0
        return int(posts[-1].attrib['id'][1:])


# --- Patch metadata and file header carried over from the original span ---
# diff --git a/weboob/backends/phpbb/pages/index.py b/weboob/backends/phpbb/pages/index.py
# new file mode 100644
# index 00000000..f17e4efb
# --- /dev/null
# +++ b/weboob/backends/phpbb/pages/index.py
# @@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
#
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.


from weboob.tools.browser import BasePage


class PhpBBPage(BasePage):
    """Base class of the phpBB pages."""

    def is_logged(self):
        """Return True when the page shows no 'li.icon-register' element."""
        register_items = self.document.getroot().cssselect('li.icon-register')
        return not register_items

    def get_feed_url(self):
        """Return the href of the last Atom feed link declared by the page."""
        root = self.document.getroot()
        feeds = root.cssselect('link[type="application/atom+xml"]')
        last_feed = feeds[-1]
        return last_feed.attrib['href']


class LoginPage(PhpBBPage):
    """Page displayed by ucp.php?mode=login."""

    def get_error_message(self):
        """Return the text of the 'div.error' elements, joined with ', '."""
        error_divs = self.parser.select(self.document.getroot(), 'div.error')
        return ', '.join([div.text.strip() for div in error_divs if div.text])


# --- Patch metadata and file header carried over from the original span ---
# diff --git a/weboob/backends/phpbb/test.py b/weboob/backends/phpbb/test.py
# new file mode 100644
# index 00000000..b0441033
# --- /dev/null
# +++ b/weboob/backends/phpbb/test.py
# @@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
#
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
# File: weboob/backends/phpbb/test.py (Python source recovered from the patch)

from weboob.tools.test import BackendTest


__all__ = ['PhpBBTest']


class PhpBBTest(BackendTest):
    """Test of the 'phpbb' backend."""
    BACKEND = 'phpbb'

    def test_phpbb(self):
        """Walk through every thread; the test passes if nothing raises."""
        for thread in self.backend.iter_threads():
            pass


# --- Patch metadata and file header carried over from the original span ---
# diff --git a/weboob/backends/phpbb/tools.py b/weboob/backends/phpbb/tools.py
# new file mode 100644
# index 00000000..78ff64cf
# --- /dev/null
# +++ b/weboob/backends/phpbb/tools.py
# @@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
#
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

# File: weboob/backends/phpbb/tools.py (Python source recovered from the patch)

from dateutil.parser import parse as _parse_dt
from urlparse import urlsplit, parse_qs

from weboob.tools.misc import local2utc


# French month abbreviations and their English equivalents, applied in this
# order by parse_date(). Months spelled alike in both languages are omitted.
_FRENCH_MONTHS = ((u'Fév',  'Feb'),
                  (u'Avr',  'Apr'),
                  (u'Mai',  'May'),
                  (u'Juin', 'Jun'),
                  (u'Juil', 'Jul'),
                  (u'Aoû',  'Aug'),
                  (u'Déc',  'Dec'),
                 )


def url2id(url):
    """
    Convert a 'viewtopic.php' URL into an id of the form 'topic' or
    'topic.post'.

    Returns None when the URL does not point to viewtopic.php, or when it
    carries no numeric 't' (and, if present, 'p') parameter.
    """
    v = urlsplit(url)
    pagename = v.path.split('/')[-1]
    if pagename != 'viewtopic.php':
        return None

    args = parse_qs(v.query)
    try:
        s = '%d' % int(args['t'][0])
        if 'p' in args:
            s += '.%d' % int(args['p'][0])
    except (KeyError, ValueError):
        # Topic link without a usable topic/post number: not an id we can
        # represent, so report it like any other unrecognised URL.
        return None
    return s


def id2url(id):
    """
    Convert an id of the form 'topic' or 'topic.post' into the matching
    'viewtopic.php' URL, relative to the forum base path.

    Raises ValueError when the id has more than two dot-separated parts or
    when a part is not numeric.
    """
    v = id.split('.')
    if len(v) == 1:
        return 'viewtopic.php?t=%d' % int(v[0])
    if len(v) == 2:
        return 'viewtopic.php?t=%d&p=%d#p%d' % (int(v[0]),
                                                int(v[1]),
                                                int(v[1]))
    # Returning None here would let callers build a '.../None' URL.
    raise ValueError('Invalid phpBB id: %r' % (id,))


def rssid(id):
    """Return the feed entry id unchanged."""
    return id


def parse_date(s):
    """
    Parse a date displayed by the forum and return it converted with
    local2utc().

    French month abbreviations are replaced by their English equivalents
    before the string is handed to dateutil.
    """
    for french, english in _FRENCH_MONTHS:
        s = s.replace(french, english)
    return local2utc(_parse_dt(s))