support repositories to manage backends (closes #747)

This commit is contained in:
Romain Bignon 2012-01-03 12:10:21 +01:00
commit 14a7a1d362
410 changed files with 1079 additions and 297 deletions

24
modules/phpbb/__init__.py Normal file
View file

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from .browser import PhpBB
from .backend import PhpBBBackend
__all__ = ['PhpBB', 'PhpBBBackend']

199
modules/phpbb/backend.py Normal file
View file

@ -0,0 +1,199 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from __future__ import with_statement
from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.tools.newsfeed import Newsfeed
from weboob.tools.value import Value, ValueInt, ValueBackendPassword
from weboob.tools.misc import limit
from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread, CantSendMessage
from .browser import PhpBB
from .tools import rssid, url2id, id2url, id2topic
__all__ = ['PhpBBBackend']
class PhpBBBackend(BaseBackend, ICapMessages, ICapMessagesPost):
    """Backend exposing a phpBB forum through ICapMessages and ICapMessagesPost.

    Thread ids are dotted strings built by the ``tools`` helpers
    (``FORUM.TOPIC``, optionally followed by ``.POST``).
    """

    NAME = 'phpbb'
    MAINTAINER = 'Romain Bignon'
    EMAIL = 'romain@weboob.org'
    VERSION = '0.a'
    LICENSE = 'AGPLv3+'
    DESCRIPTION = "phpBB forum"
    CONFIG = BackendConfig(
        Value('url', label='URL of forum', regexp='https?://.*'),
        Value('username', label='Username', default=''),
        ValueBackendPassword('password', label='Password', default=''),
        ValueInt('thread_unread_messages',
                 label='Limit number of unread messages to retrieve for a thread',
                 default=500),
    )
    # 'seen' maps a topic id (as returned by id2topic) to the id of the last
    # post marked as read in that topic.
    STORAGE = {'seen': {}}
    BROWSER = PhpBB

    def create_default_browser(self):
        """Create the browser; the password is only read when a username is set."""
        username = self.config['username'].get()
        password = self.config['password'].get() if len(username) > 0 else None
        return self.create_browser(self.config['url'].get(), username, password)

    def _last_seen_id(self, thread_id):
        """Return the stored last-seen post id of *thread_id*'s topic, or 0."""
        try:
            return self.storage.get('seen', default={})[id2topic(thread_id)]
        except KeyError:
            return 0

    #### ICapMessages ##############################################

    def _iter_threads(self, root_link=None):
        """Yield a Thread for every topic reachable from *root_link*.

        Forum links are followed recursively; with no *root_link* the walk
        starts from the browser's home page.
        """
        with self.browser:
            start_url = root_link.url if root_link else None
            # Materialise the links before leaving the ``with`` block.
            links = list(self.browser.iter_links(start_url))

        for link in links:
            if link.type == link.FORUM:
                parent_title = root_link.title if root_link else ''
                link.title = '%s[%s]' % (parent_title, link.title)
                for sub_thread in self._iter_threads(link):
                    yield sub_thread
            elif link.type == link.TOPIC:
                prefix = '%s ' % root_link.title if root_link else ''
                topic = Thread(url2id(link.url))
                topic.title = prefix + link.title
                topic.date = link.date
                topic.nb_messages = link.nb_messages
                topic.flags = topic.IS_DISCUSSION
                yield topic

    def iter_threads(self):
        """Iterate over every thread of the forum, starting at its root."""
        return self._iter_threads()

    def get_thread(self, id):
        """Fetch a thread and all of its posts as a linear chain of messages.

        *id* may be a Thread instance, a topic URL or a dotted thread id.
        Each message is the single child of the previous one. Returns the
        Thread, or None when *id* is not a Thread and no post was iterated.
        """
        thread = None
        if isinstance(id, Thread):
            thread = id
            id = thread.id

        thread_id = url2id(id, nopost=True) or id
        last_seen_id = self._last_seen_id(thread_id)

        previous = None
        with self.browser:
            for post in self.browser.iter_posts(id):
                if not thread:
                    thread = Thread(thread_id)
                    thread.title = post.title

                message = self._post2message(thread, post)
                message.parent = previous
                if last_seen_id < post.id:
                    message.flags |= Message.IS_UNREAD

                if previous:
                    previous.children = [message]
                else:
                    thread.root = message
                previous = message

        return thread

    def _post2message(self, thread, post):
        """Build an HTML Message from *post*; its signature ends with the post URL."""
        post_url = self.browser.absurl(id2url('%s.%s' % (thread.id, post.id)))
        signature = post.signature
        if signature:
            signature += '<br />'
        signature += 'URL: %s' % post_url

        return Message(thread=thread,
                       id=post.id,
                       title=post.title,
                       sender=post.author,
                       receivers=None,
                       date=post.date,
                       parent=None,
                       content=post.content,
                       signature=signature,
                       children=[],
                       flags=Message.IS_HTML)

    def iter_unread_messages(self, thread=None):
        """Yield messages newer than the stored last-seen post of each topic.

        Topics are taken from the entries of the forum's root feed. The
        *thread* argument is not used: the name is rebound for every entry.
        """
        with self.browser:
            feed_url = self.browser.get_root_feed_url()
            for article in Newsfeed(feed_url, rssid).iter_entries():
                id = url2id(article.link)
                thread = None
                last_seen_id = self._last_seen_id(id)

                # NOTE(review): ``child`` is never reassigned below, so the
                # ``if child`` branch cannot run as written — confirm intent.
                child = None
                posts = self.browser.riter_posts(id, last_seen_id)
                max_posts = self.config['thread_unread_messages'].get()
                if max_posts > 0:
                    posts = limit(posts, max_posts)

                for post in posts:
                    if not thread:
                        thread = Thread('%s.%s' % (post.forum_id, post.topic_id))
                    message = self._post2message(thread, post)

                    if child:
                        message.children.append(child)
                        child.parent = message

                    if post.parent:
                        message.parent = Message(thread=thread,
                                                 id=post.parent)
                    else:
                        thread.root = message
                    yield message

    def set_message_read(self, message):
        """Store *message* as the last one seen in its topic if it is newer."""
        thread_id = message.thread.id
        if message.id > self._last_seen_id(thread_id):
            self.storage.set('seen', id2topic(thread_id), message.id)
            self.storage.save()

    def fill_thread(self, thread, fields):
        """Fill *thread* by reloading it entirely; *fields* is not used."""
        return self.get_thread(thread)

    #### ICapMessagesPost ##########################################

    def post_message(self, message):
        """Post *message* to the forum/topic designated by its thread id.

        Raises CantSendMessage when the thread id is not of the form
        "FORUM_ID[.TOPIC_ID]".
        """
        assert message.thread

        forum = 0
        topic = 0
        if message.thread:
            thread_id = message.thread.id
            try:
                if '.' in thread_id:
                    forum, topic = [int(part) for part in thread_id.split('.', 1)]
                else:
                    forum = int(thread_id)
            except ValueError:
                raise CantSendMessage('Thread ID must be in form "FORUM_ID[.TOPIC_ID]".')

        with self.browser:
            return self.browser.post_answer(forum,
                                            topic,
                                            message.title,
                                            message.content)

    OBJECTS = {Thread: fill_thread}

201
modules/phpbb/browser.py Normal file
View file

@ -0,0 +1,201 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
import urllib
from urlparse import urlsplit
from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword
from weboob.capabilities.messages import CantSendMessage
from .pages.index import LoginPage
from .pages.forum import ForumPage, TopicPage, PostingPage
from .tools import id2url, url2id
__all__ = ['PhpBB']
# Browser
class PhpBB(BaseBrowser):
    """Browser for a phpBB board: login, forum/topic navigation and posting.

    ``PROTOCOL``, ``DOMAIN`` and ``BASEPATH`` are derived per instance from
    the board URL given to the constructor.
    """

    # URL patterns mapped to the page class handling them. Raw strings keep
    # the regex escapes (``\?``, ``\d``) from being read as string escapes.
    PAGES = {r'https?://.*/index.php': ForumPage,
             r'https?://.*/': ForumPage,
             r'https?://.*/viewforum.php\?f=(\d+)': ForumPage,
             r'https?://.*/search.php\?.*': ForumPage,
             r'https?://.*/viewtopic.php\?.*': TopicPage,
             r'https?://.*/posting.php\?.*': PostingPage,
             r'https?://.*/ucp.php\?mode=login.*': LoginPage,
            }

    last_board_msg_id = None

    def __init__(self, url, *args, **kwargs):
        """Remember the board *url* and split it into scheme, host and base path."""
        self.url = url
        v = urlsplit(url)
        self.PROTOCOL = v.scheme
        self.DOMAIN = v.netloc
        # Everything before the last '/' of the path.
        self.BASEPATH = v.path[:v.path.rfind('/')]
        BaseBrowser.__init__(self, *args, **kwargs)

    def absurl(self, rel):
        """Return the absolute URL of *rel*, taken relative to BASEPATH."""
        return BaseBrowser.absurl(self, '%s/%s' % (self.BASEPATH, rel))

    def home(self):
        """Go to the board URL given at construction."""
        self.location(self.url)

    def is_logged(self):
        """Return True when no page is loaded yet or the page reports a session."""
        return not self.page or self.page.is_logged()

    def login(self):
        """Submit the credentials to ``ucp.php?mode=login``.

        Raises BrowserIncorrectPassword, carrying the page's error message,
        when the resulting page is still not logged in.
        """
        data = {'login': 'Connexion',
                'username': self.username,
                'password': self.password,
               }
        self.location('%s/ucp.php?mode=login' % self.BASEPATH, urllib.urlencode(data), no_login=True)

        assert self.is_on_page(LoginPage)

        if not self.page.is_logged():
            raise BrowserIncorrectPassword(self.page.get_error_message())

    def get_root_feed_url(self):
        """Return the feed URL advertised by the home page."""
        self.home()
        return self.page.get_feed_url()

    def iter_links(self, url):
        """Return the links of the forum page at *url* (home page if empty)."""
        if url:
            self.location(url)
        else:
            self.home()

        assert self.is_on_page(ForumPage)
        return self.page.iter_links()

    def _open_topic(self, id):
        """Load the topic page designated by *id* (a full URL or a dotted id)."""
        if id.startswith('http'):
            self.location(id)
        else:
            self.location('%s/%s' % (self.BASEPATH, id2url(id)))
        assert self.is_on_page(TopicPage)

    def iter_posts(self, id, stop_id=None):
        """Yield the posts of a topic in page order, following next pages.

        Each post's ``parent`` is set to the id of the post yielded before it
        (0 for the first). Iteration stops before the first post whose id is
        greater than or equal to a truthy *stop_id*.
        """
        self._open_topic(id)

        parent = 0
        while 1:
            for post in self.page.iter_posts():
                if stop_id and post.id >= stop_id:
                    return

                post.parent = parent
                yield post
                parent = post.id

            if self.page.cur_page == self.page.tot_pages:
                return
            self.location(self.page.next_page_url())

    def riter_posts(self, id, stop_id=None):
        """Yield the posts of a topic in reverse order, following previous pages.

        A post is yielded once the post preceding it is known, so that its
        ``parent`` can be set. Iteration ends when a post with an id lower
        than or equal to *stop_id* is reached; that post is not yielded.
        """
        self._open_topic(id)

        child = None
        while 1:
            for post in self.page.riter_posts():
                if child:
                    child.parent = post.id
                    yield child

                # Explicit None check: comparing an int with None is always
                # False on Python 2 and an error on Python 3.
                if stop_id is not None and post.id <= stop_id:
                    return

                child = post

            if self.page.cur_page == 1:
                # First page reached: the pending post has no parent to wait for.
                if child:
                    yield child
                return
            self.location(self.page.prev_page_url())

    def get_post(self, id):
        """Return the post designated by *id* (URL or dotted id), or None.

        When the post is the first one of a page other than the first, its
        parent is looked up as the last post of the previous page.
        """
        self._open_topic(id)
        if id.startswith('http'):
            id = url2id(id)

        post = self.page.get_post(int(id.split('.')[-1]))
        if not post:
            return None

        if post.parent == 0 and self.page.cur_page > 1:
            self.location(self.page.prev_page_url())
            post.parent = self.page.get_last_post_id()

        return post

    def get_forums(self):
        """Return a dict of the (id, name) pairs listed by the home page."""
        self.home()
        return dict(self.page.iter_all_forums())

    def _find_forum_id(self, title):
        """Resolve the forum named in a "[FORUM] SUBJECT" title to its id.

        Raises CantSendMessage when the title is not in that form or when no
        forum has that name (compared case-insensitively).
        """
        forums = self.get_forums()
        forums_prompt = 'Forums list:\n%s' % ('\n'.join(['\t- %s' % f for f in forums.itervalues()]))
        m = re.match(r'\[(.*)\] (.*)', title or '')
        if not m:
            raise CantSendMessage('Please enter a title formatted like that:\n\t"[FORUM] SUBJECT"\n\n%s' % forums_prompt)

        forum_id = None
        wanted = m.group(1).lower()
        for k, v in forums.iteritems():
            if v.lower() == wanted:
                forum_id = k
                break

        if not forum_id:
            raise CantSendMessage('Forum "%s" not found.\n\n%s' % (m.group(1), forums_prompt))
        return forum_id

    def _check_posting_error(self):
        """Raise CantSendMessage if the posting page displays an error."""
        assert self.is_on_page(PostingPage)
        error = self.page.get_error_message()
        if error:
            raise CantSendMessage(u'Unable to send message: %s' % error)

    def post_answer(self, forum_id, topic_id, title, content):
        """Create a new topic (``topic_id == 0``) or reply to an existing one.

        For a new topic without *forum_id*, the forum is looked up from a
        title of the form "[FORUM] SUBJECT".

        Raises CantSendMessage when the forum cannot be resolved or when the
        board reports an error after submitting.
        """
        if topic_id == 0:
            if not forum_id:
                forum_id = self._find_forum_id(title)

            self.location('%s/posting.php?mode=post&f=%d' % (self.BASEPATH, forum_id))
            assert self.is_on_page(PostingPage)

            self.page.post(title, content)
        else:
            # id2url() expects a dotted "FORUM.TOPIC" string; passing the bare
            # integer topic id would fail on ``.split``.
            self.location('%s/%s' % (self.BASEPATH, id2url('%s.%s' % (forum_id, topic_id))))
            assert self.is_on_page(TopicPage)

            self.page.go_reply()
            assert self.is_on_page(PostingPage)

            # Don't send title because it isn't needed in real use case
            # and with monboob title is something like:
            #   Re: [Forum Name] Re: Topic Name
            if title is not None and title.startswith('Re:'):
                title = None
            self.page.post(title, content)

        self._check_posting_error()

BIN
modules/phpbb/favicon.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.4 KiB

View file

View file

@ -0,0 +1,235 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from time import sleep
from urlparse import urlsplit, parse_qs
from weboob.tools.browser import BrokenPageError
from .index import PhpBBPage
from ..tools import parse_date
__all__ = ['Link', 'ForumPage', 'TopicPage', 'PostingPage']
class Link(object):
    """An entry listed on a forum page: either a sub-forum or a topic."""

    # Kinds of link, stored in ``Link.type``.
    FORUM, TOPIC = range(2)

    def __init__(self, type, url):
        """Create a link of kind *type* pointing to *url*; other fields are blank."""
        self.type, self.url = type, url
        self.title = u''
        self.date = None
        self.nb_messages = 0
class ForumPage(PhpBBPage):
    """Page listing sub-forums and/or topics."""

    def iter_links(self):
        """Yield a Link for each listed forum, then for each listed topic.

        Topic links also carry a date, when one of the texts around the
        anchors of the row's ``dl/dt`` parses as a date, and a message count.
        """
        root = self.document.getroot()

        for row in self.parser.select(root, 'ul.forums li.row'):
            anchor = row.cssselect('a.forumtitle')[0]
            forum = Link(Link.FORUM, anchor.attrib['href'])
            forum.title = anchor.text.strip()
            yield forum

        for row in self.parser.select(root, 'ul.topics li.row'):
            anchor = row.cssselect('a.topictitle')[0]
            topic = Link(Link.TOPIC, anchor.attrib['href'])
            topic.title = anchor.text.strip()

            for a in row.find('dl').find('dt').findall('a'):
                # Try the anchor text, then the text following the anchor.
                for candidate in (a.text, a.tail):
                    if candidate is None:
                        continue
                    try:
                        parsed = parse_date(candidate.strip(u'» \r\n'))
                    except ValueError:
                        continue
                    topic.date = parsed
                    break

            # it only lists number of answers, so we add 1.
            answers = row.cssselect('dd.posts')[0].text.strip()
            topic.nb_messages = int(answers) + 1
            yield topic

    def iter_all_forums(self):
        """Yield ``(id, name)`` for each option of the ``select#f`` element.

        Options with a negative value or without text are skipped.
        """
        selector = self.parser.select(self.document.getroot(), 'select#f', 1)
        for option in selector.findall('option'):
            forum_id = int(option.attrib['value'])
            if forum_id >= 0 and option.text:
                yield forum_id, option.text.strip(u'» \xa0\n\r')
class Post(object):
    """Plain record describing one post of a topic."""

    def __init__(self, forum_id, topic_id, id):
        """Create an empty post; *id* is converted to an integer."""
        self.forum_id, self.topic_id = forum_id, topic_id
        self.id = int(id)
        # Text fields start empty and are filled in by the page parser.
        self.title = self.author = self.content = self.signature = u''
        self.date = None
        # Id of the preceding post; 0 when unknown or when there is none.
        self.parent = 0
class TopicPage(PhpBBPage):
    """Page showing the posts of one topic, possibly split over several pages."""

    def on_loaded(self):
        """Extract pagination, topic/forum ids and the forum title prefix.

        Sets ``cur_page``, ``tot_pages``, ``topic_id``, ``forum_id`` (0 when
        the URL has no ``f`` argument) and ``forum_title``.
        """
        root = self.document.getroot()

        # The first two <strong> of the pagination block are read as the
        # current page number and the total number of pages.
        pagination = root.cssselect('div.pagination')[0]
        strongs = pagination.cssselect('strong')
        self.cur_page = int(strongs[0].text.strip())
        self.tot_pages = int(strongs[1].text.strip())

        try:
            url = self.document.xpath('//h2/a')[-1].attrib['href']
        except (IndexError, KeyError, BrokenPageError):
            # xpath() returns a plain list, so a missing title link shows up
            # as IndexError (or KeyError without href); use the page URL.
            url = self.url
        args = parse_qs(urlsplit(url).query)

        self.topic_id = int(args['t'][0])
        self.forum_id = int(args['f'][0]) if 'f' in args else 0

        self.forum_title = u''
        nav = self.parser.select(root, 'li.icon-home')
        if len(nav) > 0:
            text = nav[0].findall('a')[-1].text.strip()
            if len(text) >= 20:
                # NOTE(review): the appended suffix is empty, so this only
                # truncates; an ellipsis may have been intended — confirm.
                text = text[:20] + u''
            self.forum_title = '[%s] ' % text

    def go_reply(self):
        """Follow the link to the posting form."""
        self.browser.follow_link(url_regex=r'posting\.php')

    def next_page_url(self):
        """Return the URL of the next page ('#' on the last page in fallback mode)."""
        root = self.document.getroot()
        try:
            return self.parser.select(root, 'a.right-box', 1).attrib['href']
        except BrokenPageError:
            # No dedicated "next" link: fall back to the pagination anchors.
            a_list = self.parser.select(root, 'div.pagination', 1).findall('a')
            if self.cur_page == self.tot_pages:
                return '#'
            return a_list[-1].attrib['href']

    def prev_page_url(self):
        """Return the URL of the previous page."""
        root = self.document.getroot()
        try:
            return self.parser.select(root, 'a.left-box', 1).attrib['href']
        except BrokenPageError:
            # No dedicated "previous" link: fall back to the pagination
            # anchors, taking the last one on the last page and the one
            # before it otherwise.
            a_list = self.parser.select(root, 'div.pagination', 1).findall('a')
            if self.cur_page == self.tot_pages:
                a = a_list[-1]
            else:
                a = a_list[-2]
            return a.attrib['href']

    def _post_divs(self):
        """Return the ``div.post`` elements of the page, in document order."""
        return self.parser.select(self.document.getroot(), 'div.post')

    def iter_posts(self):
        """Yield the posts of this page in document order."""
        for div in self._post_divs():
            yield self._get_post(div)

    def riter_posts(self):
        """Yield the posts of this page in reverse document order."""
        for div in reversed(self._post_divs()):
            yield self._get_post(div)

    def get_post(self, id):
        """Return the post with integer *id*, or None if it is not on this page.

        The post's ``parent`` is the id of the post preceding it on the page
        (0 for the first one).
        """
        wanted = 'p%d' % id
        parent = 0
        for div in self._post_divs():
            if div.attrib['id'] == wanted:
                post = self._get_post(div)
                post.parent = parent
                return post
            parent = int(div.attrib['id'][1:])
        return None

    def _get_post(self, div):
        """Build a Post from one ``div.post`` element."""
        body = div.cssselect('div.postbody')[0]
        profile = div.cssselect('dl.postprofile')[0]

        # The element id is 'p' followed by the post number.
        post = Post(self.forum_id, self.topic_id, div.attrib['id'][1:])

        # './/' keeps the search inside this post: in lxml a leading '//' is
        # evaluated from the document root even when called on an element.
        title_tags = body.xpath('.//h3/a')
        if len(title_tags) == 0:
            title_tags = self.document.xpath('//h2/a')
        if len(title_tags) == 0:
            title = u''
            self.logger.warning('Unable to parse title')
        else:
            title = title_tags[-1].text.strip()

        post.title = self.forum_title + title

        # The last anchor with text in the profile header wins.
        for a in profile.cssselect('dt a'):
            if a.text:
                post.author = a.text.strip()

        p_tags = body.cssselect('p.author')
        if len(p_tags) == 0:
            # findall() returns a list like cssselect(); find() would return
            # a single element (or None) and break the checks below.
            p_tags = body.findall('p')
        if len(p_tags) == 0:
            post.date = None
            self.logger.warning('Unable to parse datetime')
        else:
            p = p_tags[0]
            # Prefer the text following the <strong> element; otherwise use
            # the paragraph text minus its first four characters.
            strong = p.find('strong')
            text = strong.tail if strong is not None else None
            if not text:
                text = p.text[4:]

            text = text.strip(u'» \n\r')
            try:
                post.date = parse_date(text)
            except ValueError:
                self.logger.warning(u'Unable to parse datetime "%s"' % text)

        post.content = self.parser.tostring(body.cssselect('div.content')[0])

        signature = body.cssselect('div.signature')
        if len(signature) > 0:
            post.signature = self.parser.tostring(signature[0])
        return post

    def get_last_post_id(self):
        """Return the id of the last post of this page, or 0 if there is none."""
        last_id = 0
        for div in self._post_divs():
            last_id = int(div.attrib['id'][1:])
        return last_id
class PostingPage(PhpBBPage):
    """Page holding the form used to create a topic or to reply."""

    def post(self, title, content):
        """Fill in and submit the ``postform`` form.

        The subject is only set when *title* is non-empty.
        """
        def is_post_form(form):
            return form.attrs.get('id', '') == 'postform'

        browser = self.browser
        browser.select_form(predicate=is_post_form)
        browser.set_all_readonly(False)

        if title:
            browser['subject'] = title.encode('utf-8')
        browser['message'] = content.encode('utf-8')

        # phpBB redirects instead of posting when the form is submitted less
        # than two seconds after "lastclick":
        #   if ($cancel || ($current_time - $lastclick < 2 && $submit))
        #   {
        #       /* ... */
        #       redirect($redirect);
        #   }
        # weboob submits faster than that, so pretend the last click happened
        # 10 seconds earlier.
        last_click = int(browser['lastclick'])
        browser['lastclick'] = str(last_click - 10)

        # A similar check exists on the form creation time:
        #   $diff = time() - $creation_time;
        #   // If creation_time and the time() now is zero we can assume it was not a human doing this (the check for if ($diff)...
        #   if ($diff && ($diff <= $timespan || $timespan === -1))
        # The form_token depends on the creation time, so that value cannot
        # be changed; wait one second before posting instead.
        sleep(1)

        browser.submit(name='post')

View file

@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BasePage
class PhpBBPage(BasePage):
    """Base page of the phpBB module, with helpers shared by all its pages."""

    def is_logged(self):
        """Return True when the page has no ``li.icon-register`` element."""
        register_items = self.document.getroot().cssselect('li.icon-register')
        return not register_items

    def get_feed_url(self):
        """Return the href of the last Atom ``<link>`` declared by the page."""
        atom_links = self.document.getroot().cssselect('link[type="application/atom+xml"]')
        last = atom_links[-1]
        return last.attrib['href']

    def get_error_message(self):
        """Return the texts of the page's error elements, joined by ', '.

        The result is an empty string when no error element has text.
        """
        nodes = self.parser.select(self.document.getroot(), 'div.error,p.error')
        return ', '.join([node.text.strip() for node in nodes if node.text])
class LoginPage(PhpBBPage):
    """Login page; it adds nothing to PhpBBPage."""

36
modules/phpbb/test.py Normal file
View file

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.test import BackendTest
__all__ = ['PhpBBTest']
class PhpBBTest(BackendTest):
    """Smoke tests for the phpbb backend: each test only drains an iterator."""

    BACKEND = 'phpbb'

    def testthreads(self):
        """Iterating over all threads must complete without raising."""
        for _thread in self.backend.iter_threads():
            pass

    def test_unread_messages(self):
        """Iterating over unread messages must complete without raising."""
        for _message in self.backend.iter_unread_messages():
            pass

75
modules/phpbb/tools.py Normal file
View file

@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from dateutil.parser import parse as _parse_dt
from urlparse import urlsplit, parse_qs
from weboob.tools.misc import local2utc
def url2id(url, nopost=False):
    """Convert a phpBB URL into a dotted id string.

    ``viewforum.php?f=F`` gives ``'F'``; ``viewtopic.php?f=F&t=T[&p=P]``
    gives ``'F.T'`` or ``'F.T.P'`` (``F`` is ``0`` when absent, and the post
    part is dropped when *nopost* is true). Any other page gives None.
    """
    parts = urlsplit(url)
    query = parse_qs(parts.query)
    page = parts.path.split('/')[-1]

    if page == 'viewforum.php':
        return '%d' % int(query['f'][0])

    if page != 'viewtopic.php':
        return None

    forum = '%d' % int(query['f'][0]) if 'f' in query else '0'
    fields = [forum, '%d' % int(query['t'][0])]
    if 'p' in query and not nopost:
        fields.append('%d' % int(query['p'][0]))
    return '.'.join(fields)
def id2url(id):
    """Convert a dotted id string into a relative phpBB URL.

    One field maps to a forum, two to a topic and three to a post inside a
    topic. Any other number of fields gives None.
    """
    fields = id.split('.')
    count = len(fields)

    if count == 1:
        return 'viewforum.php?f=%d' % int(fields[0])
    if count == 2:
        forum, topic = fields
        return 'viewtopic.php?f=%d&t=%d' % (int(forum), int(topic))
    if count == 3:
        forum, topic, post = [int(field) for field in fields]
        return 'viewtopic.php?f=%d&t=%d&p=%d#p%d' % (forum, topic, post, post)
    return None
def id2topic(id):
    """Return the topic number (second field) of a dotted id, or None if absent."""
    fields = id.split('.')
    if len(fields) < 2:
        return None
    return int(fields[1])
def rssid(id):
    """Return *id* unchanged (identity function)."""
    return id
# French month spellings and the English abbreviation substituted for each,
# applied in this order. u'Ao\xfbt' must come before u'Ao\xfb': the shorter
# form is a prefix of the longer one and would otherwise turn it into 'Augt'.
_MONTH_TRANSLATIONS = (
    (u'F\xe9v', 'Feb'),
    (u'Avr', 'Apr'),
    (u'Mai', 'May'),
    (u'Juin', 'Jun'),
    (u'Juil', 'Jul'),
    (u'Ao\xfbt', 'Aug'),
    (u'Ao\xfb', 'Aug'),
    (u'D\xe9c', 'Dec'),
)


def parse_date(s):
    """Parse a date string that may contain French month names.

    The French spellings listed in ``_MONTH_TRANSLATIONS`` are replaced by
    English abbreviations, then the string is parsed with dateutil and the
    result is passed through ``local2utc``. Parsing errors raised by
    dateutil are not caught here.
    """
    for french, english in _MONTH_TRANSLATIONS:
        s = s.replace(french, english)
    return local2utc(_parse_dt(s))