support repositories to manage backends (closes #747)

This commit is contained in:
Romain Bignon 2012-01-03 12:10:21 +01:00
commit 14a7a1d362
410 changed files with 1079 additions and 297 deletions

View file

@ -0,0 +1,4 @@
# Package entry point: re-export the backend class and its browser so they
# can be imported directly from the ``fourchan`` package.
from .backend import FourChanBackend
from .browser import FourChan
__all__ = ['FourChanBackend', 'FourChan']

127
modules/fourchan/backend.py Normal file
View file

@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from __future__ import with_statement
from weboob.capabilities.messages import ICapMessages, Message, Thread
from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.tools.value import Value
from .browser import FourChan
# Names exported by ``from <this module> import *``.
__all__ = ['FourChanBackend']
class FourChanBackend(BaseBackend, ICapMessages):
    """Messages backend exposing 4chan boards as threads of messages.

    Thread identifiers handled by this backend have the form
    ``'<board>.<thread_id>'``.  Read markers are kept in the backend
    storage under ``boards/<board>/<thread_id>`` as a list of message ids.
    """

    NAME = 'fourchan'
    MAINTAINER = 'Romain Bignon'
    EMAIL = 'romain@weboob.org'
    VERSION = '0.a'
    LICENSE = 'AGPLv3+'
    DESCRIPTION = '4chan website'
    CONFIG = BackendConfig(Value('boards', label='Boards to fetch'))
    STORAGE = {'boards': {}}
    BROWSER = FourChan

    def _splitid(self, id):
        """Split ``'<board>.<thread_id>'`` at the first dot.

        Returns a two-item list ``[board, thread_id]`` (both strings) when
        *id* contains a dot.
        """
        return id.split('.', 1)

    def get_thread(self, id):
        """Fetch a thread and all of its comments.

        :param id: either a ``Thread`` object (which is then filled in
                   place) or a string ``'<board>.<thread_id>'``.
        :returns: the filled ``Thread``, or ``None`` when the identifier
                  contains no dot (a warning is logged in that case).
        """
        thread = None

        if isinstance(id, Thread):
            thread = id
            id = thread.id

        if '.' not in id:
            self.logger.warning('Malformated ID (%s)' % id)
            return

        board, thread_id = self._splitid(id)

        with self.browser:
            _thread = self.browser.get_thread(board, thread_id)

        # Read markers are written by set_message_read() under the *string*
        # thread id taken from the message's thread identifier.  The id
        # carried by the parsed thread is an int (see pages/board.py), so
        # looking it up directly would never match the stored key; use the
        # same string key here.
        flags = 0
        if thread_id not in self.storage.get('boards', board, default={}):
            flags |= Message.IS_UNREAD

        if not thread:
            thread = Thread(id)
        thread.title = _thread.filename
        thread.root = Message(thread=thread,
                              id=0,  # root message
                              title=_thread.filename,
                              sender=_thread.author,
                              receivers=None,
                              date=_thread.datetime,
                              parent=None,
                              content=_thread.text,
                              signature=None,
                              children=[],
                              flags=flags | Message.IS_HTML)

        # Ids of the comments already marked as read for this thread;
        # fetched once instead of once per comment.
        read_ids = self.storage.get('boards', board, thread_id, default=[])

        for comment in _thread.comments:
            flags = 0
            if comment.id not in read_ids:
                flags |= Message.IS_UNREAD

            m = Message(thread=thread,
                        id=comment.id,
                        title=_thread.filename,
                        sender=comment.author,
                        receivers=None,
                        date=comment.datetime,
                        parent=thread.root,
                        content=comment.text,
                        signature=None,
                        children=None,
                        flags=flags | Message.IS_HTML)
            thread.root.children.append(m)

        return thread

    def iter_threads(self):
        """Yield an unfilled ``Thread`` (id and title only) for every
        thread listed on each configured board.

        The ``boards`` configuration value is a whitespace-separated list
        of board names.
        """
        # split() without argument ignores repeated/leading/trailing
        # whitespace and yields nothing for an empty value, so no request
        # is made for an empty board name.
        for board in self.config['boards'].get().split():
            with self.browser:
                threads = self.browser.get_threads(board)
            for thread in threads:
                t = Thread('%s.%s' % (board, thread.id))
                t.title = thread.filename
                yield t

    def iter_unread_messages(self):
        """Yield every message flagged ``IS_UNREAD`` across all threads of
        the configured boards."""
        for thread in self.iter_threads():
            # Fills `thread` in place; the return value is not needed.
            self.fill_thread(thread, 'root')

            for m in thread.iter_all_messages():
                if m.flags & Message.IS_UNREAD:
                    yield m

    def set_message_read(self, message):
        """Record *message* as read in the backend storage and save it."""
        board, thread_id = self._splitid(message.thread.id)
        already_read = self.storage.get('boards', board, thread_id, default=[])
        self.storage.set('boards', board, thread_id, already_read + [message.id])
        self.storage.save()

    def fill_thread(self, thread, fields):
        """Fill *thread* by fetching it entirely; *fields* is ignored."""
        return self.get_thread(thread)

    # Maps object types to the method used to fill them.
    OBJECTS = {Thread: fill_thread}

View file

@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BaseBrowser
from .pages.board import BoardPage
class FourChan(BaseBrowser):
    """Browser for boards.4chan.org board and thread pages."""

    DOMAIN = 'boards.4chan.org'
    # URL patterns -> page class.  Raw strings keep the regex escapes
    # (\w, \d) from being treated as (invalid) string escape sequences;
    # the resulting pattern text is unchanged.
    PAGES = {
        r'http://boards.4chan.org/\w+/': BoardPage,
        r'http://boards.4chan.org/\w+/res/\d+': BoardPage,
    }

    def is_logged(self):
        """Always report the browser as logged in."""
        return True

    def get_threads(self, board):
        """Load the index page of *board* and return its parsed articles."""
        self.location('http://boards.4chan.org/%s/' % board)
        return self.page.articles

    def get_thread(self, board, id):
        """Load one thread page and return its single parsed article.

        :param board: board name used in the URL.
        :param id: thread number, as an integer or a numeric string.
        :raises ValueError: if *id* is a string that is not a valid integer.
        :raises AssertionError: if the page does not contain exactly one
                                article.
        """
        # int() replaces the Python 2-only long(): it yields the same
        # value there (promoting to long automatically when needed) and
        # also exists on Python 3.
        self.location('http://boards.4chan.org/%s/res/%d' % (board, int(id)))
        # A thread page is expected to hold exactly one opening post.
        assert len(self.page.articles) == 1
        return self.page.articles[0]

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.4 KiB

View file

View file

@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
from datetime import datetime
from weboob.tools.browser import BasePage
# Only the page class is exported by ``from <this module> import *``.
__all__ = ['BoardPage']
class Message(object):
    """Plain record for one post scraped from a board page.

    An instance holds the data of a post; the replies attached to it are
    gathered in its ``comments`` list as further ``Message`` objects.
    """

    def __init__(self, browser, board, id, filename=u'', url=u''):
        """Store the given values and start with empty author, text and
        comments.  ``datetime`` is set to the moment of construction."""
        # Values supplied by the caller.
        self.browser, self.board = browser, board
        self.id = id
        self.filename, self.url = filename, url
        # Fields initialised to defaults.
        self.datetime = datetime.now()
        self.author = u''
        self.text = u''
        self.comments = []

    def add_comment(self, div):
        """Create a reply from the element *div* and append it to
        ``comments``.

        The reply id is the integer value of the element's ``id``
        attribute, its author the text of the first
        ``span.commentpostername`` and its text the serialised
        ``blockquote`` child.
        """
        reply_id = int(div.attrib.get('id', ''))
        reply = Message(self.browser, self.board, reply_id)
        poster = div.cssselect('span.commentpostername')[0]
        reply.author = poster.text
        quote = div.find('blockquote')
        reply.text = self.browser.parser.tostring(quote)
        self.comments.append(reply)

    def __repr__(self):
        """Return a short debugging representation."""
        fields = (self.id, self.filename, self.url, len(self.comments))
        return '<Message id=%s filename=%s url=%s comments=%d>' % fields
class BoardPage(BasePage):
    """Page listing one or more threads of a board.

    After loading, ``articles`` holds one ``Message`` per thread found in
    the page's ``delform`` form (with replies in each message's
    ``comments``), and ``board`` holds the board name taken from the URL.
    """

    # Raw string so that \w is a regex escape, not a string escape.
    URL_REGEXP = re.compile(r'http://boards.4chan.org/(\w+)/')

    def on_loaded(self):
        """Parse the page into ``self.articles``.

        The children of the ``delform`` form are scanned in document
        order: a ``span.filesize`` element starts a new article, and the
        elements following it supply that article's id, text and replies.
        If the form is missing, a warning is logged and ``articles`` is
        left empty.
        """
        self.articles = []

        m = self.URL_REGEXP.match(self.url)
        if m:
            self.board = m.group(1)
        else:
            self.logger.warning('Unable to find board')
            self.board = 'unknown'

        form = None
        for f in self.document.getroot().cssselect('form'):
            if f.attrib.get('name', '') == 'delform':
                form = f
                break

        if form is None:
            self.logger.warning('No delform :(')
            # Nothing to parse; without this return the loop below would
            # fail on None.
            return

        article = None
        # Iterating an element yields its direct children (replaces the
        # deprecated getchildren()).
        for div in form:
            if div.tag == 'span' and div.attrib.get('class', '') == 'filesize':
                # Start of a new thread: file link and original file name.
                link = div.find('a')
                url = link.get('href', '') if link is not None else ''
                filename = 'unknown.jpg'
                span = div.find('span')
                if span is not None:
                    filename = span.text
                article = Message(self.browser, self.board, 0, filename, url)
                self.articles.append(article)
                continue

            if article is None:
                # Elements before the first thread marker are ignored.
                continue

            if div.tag == 'input':
                # Only the "delete" checkbox carries the post number in
                # its name; compare the attribute values explicitly so
                # that other inputs do not overwrite the id.
                if (div.attrib.get('type') == 'checkbox'
                        and div.attrib.get('value') == 'delete'):
                    article.id = int(div.attrib.get('name', '0'))
            elif div.tag == 'blockquote':
                article.text = self.parser.tostring(div)
            elif div.tag == 'table':
                # Each reply is wrapped in a table whose td.reply cell
                # holds the comment.
                tags = div.cssselect('td.reply')
                if tags:
                    article.add_comment(tags[0])

43
modules/fourchan/test.py Normal file
View file

@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from logging import debug
from weboob.tools.test import BackendTest
class FourChanTest(BackendTest):
    """Smoke test for the ``fourchan`` backend."""

    BACKEND = 'fourchan'

    def test_new_messages(self):
        """Walk every thread and every unread message, logging the counts.

        No assertion is made on the numbers; the test only checks that
        the iteration completes.
        """
        total = 0
        for thread in self.backend.iter_threads():
            filled = self.backend.fillobj(thread, 'root')
            count = sum(1 for _ in filled.iter_all_messages())
            debug('Count: %s' % count)
            total += count
        debug('Total messages: %s' % total)

        unread = sum(1 for _ in self.backend.iter_unread_messages())
        debug('Unread messages: %s' % unread)