support repositories to manage backends (closes #747)

This commit is contained in:
Romain Bignon 2012-01-03 12:10:21 +01:00
commit 14a7a1d362
410 changed files with 1079 additions and 297 deletions

3
modules/hds/__init__.py Normal file
View file

@ -0,0 +1,3 @@
from .backend import HDSBackend
__all__ = ['HDSBackend']

105
modules/hds/backend.py Normal file
View file

@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from __future__ import with_statement
from weboob.tools.backend import BaseBackend
from weboob.capabilities.messages import ICapMessages, Message, Thread
from .browser import HDSBrowser
__all__ = ['HDSBackend']
class HDSBackend(BaseBackend, ICapMessages):
    """Messages backend exposing histoires-de-sexe.net stories as threads.

    Every story becomes a thread whose single root message carries the story
    body. The IDs of threads already marked as read are kept in the backend
    storage under the ``seen`` key.
    """

    NAME = 'hds'
    MAINTAINER = 'Romain Bignon'
    EMAIL = 'romain@weboob.org'
    VERSION = '0.a'
    LICENSE = 'AGPLv3+'
    DESCRIPTION = "histoires-de-sexe.net french erotic novels"
    STORAGE = {'seen': []}
    BROWSER = HDSBrowser

    # Labels used in message signatures, indexed by the author's ``sex`` value.
    GENDERS = ['<unknown>', 'boy', 'girl', 'transexual']

    #### ICapMessages ##############################################

    def iter_threads(self):
        """Yield one Thread per story listed by the browser.

        Only ``title``, ``date`` and ``nb_messages`` are set here; the root
        message is filled by get_thread().
        """
        with self.browser:
            for story in self.browser.iter_stories():
                thread = Thread(story.id)
                thread.title = story.title
                thread.date = story.date
                thread.nb_messages = 1
                yield thread

    def get_thread(self, id):
        """Fetch a story and return it as a fully filled Thread.

        :param id: either a story ID or an existing Thread object, which is
                   then filled in place and returned.
        :returns: the Thread, or None when the browser returns no story.
        """
        thread = None
        if isinstance(id, Thread):
            thread = id
            id = thread.id

        with self.browser:
            story = self.browser.get_story(id)

        if not story:
            return None

        if not thread:
            thread = Thread(story.id)

        flags = 0
        if thread.id not in self.storage.get('seen', default=[]):
            flags |= Message.IS_UNREAD

        thread.title = story.title
        thread.date = story.date
        thread.root = Message(thread=thread,
                              id=0,
                              title=story.title,
                              sender=story.author.name,
                              receivers=None,
                              date=thread.date,
                              parent=None,
                              content=story.body,
                              children=[],
                              signature='Written by a %s (%s)' % (self.GENDERS[story.author.sex], story.author.email),
                              flags=flags)
        return thread

    def iter_unread_messages(self, thread=None):
        """Yield the root message of every thread not yet marked as read.

        The ``thread`` argument is accepted for interface compatibility but is
        not used: all threads are always scanned.
        """
        for unread in self.iter_threads():
            if unread.id in self.storage.get('seen', default=[]):
                continue
            # get_thread() returns None when the story cannot be fetched; in
            # that case the root message was never filled, so skip the thread
            # instead of yielding an unset root.
            if self.fill_thread(unread, 'root') is None:
                continue
            yield unread.root

    def set_message_read(self, message):
        """Record the thread of ``message`` as seen and save the storage."""
        seen = self.storage.get('seen', default=[])
        thread_id = message.thread.id
        if thread_id in seen:
            # Already recorded: avoid growing the list with duplicates.
            return
        self.storage.set('seen', seen + [thread_id])
        self.storage.save()

    def fill_thread(self, thread, fields):
        """Fill ``thread`` by delegating to get_thread(); ``fields`` is ignored."""
        return self.get_thread(thread)

    OBJECTS = {Thread: fill_thread}

58
modules/hds/browser.py Normal file
View file

@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BaseBrowser
from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage
# Browser
class HDSBrowser(BaseBrowser):
    """Browser for histoires-de-sexe.net.

    PAGES maps URL regular expressions to the page classes that parse them;
    the named groups (``id``, ``name``) are read back by the page classes
    through ``group_dict``.
    """

    ENCODING = 'ISO-8859-1'
    DOMAIN = 'histoires-de-sexe.net'
    # Raw strings are used for the patterns containing a backslash so that
    # '\?' is not treated as an (invalid) string escape sequence.
    PAGES = {'http://histoires-de-sexe.net/': ValidationPage,
             'http://histoires-de-sexe.net/menu.php': HomePage,
             'http://histoires-de-sexe.net/sexe/histoires-par-date.php.*': HistoryPage,
             r'http://histoires-de-sexe.net/sexe.php\?histoire=(?P<id>.+)': StoryPage,
             r'http://histoires-de-sexe.net/fiche.php\?auteur=(?P<name>.+)': AuthorPage,
            }

    def iter_stories(self):
        """Yield the stories of each history page, page after page.

        Iteration stops as soon as the number reported by the page's
        get_numerous() differs from the page number that was requested
        (presumably because the site does not serve a page past the last
        one -- to be confirmed against the site).
        """
        self.location('/sexe/histoires-par-date.php')
        n = 1
        while self.page.get_numerous() == n:
            for story in self.page.iter_stories():
                yield story

            n += 1
            self.location('/sexe/histoires-par-date.php?p=%d' % n)

    def get_story(self, id):
        """Open the page of story ``id`` and return what it parses.

        :param id: story ID; converted with int().
        :returns: the value of StoryPage.get_story() (None when the story is
                  not accessible).
        """
        id = int(id)
        self.location('/sexe.php?histoire=%d' % id)
        assert self.is_on_page(StoryPage)
        return self.page.get_story()

    def get_author(self, name):
        """Open the profile page of author ``name`` and return what it parses.

        NOTE(review): the name is encoded as iso-8859-15 whereas ENCODING is
        ISO-8859-1 -- confirm this difference is intended.
        """
        self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15')))
        assert self.is_on_page(AuthorPage)
        return self.page.get_author()

BIN
modules/hds/favicon.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 KiB

182
modules/hds/pages.py Normal file
View file

@ -0,0 +1,182 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
import re
from weboob.tools.browser import BasePage
# AuthorPage is part of the public page set (the browser imports it), so it
# is exported alongside the other page classes.
__all__ = ['ValidationPage', 'HomePage', 'HistoryPage', 'StoryPage', 'AuthorPage']
class ValidationPage(BasePage):
    """Page class with no parsing logic of its own."""
class HomePage(BasePage):
    """Page class with no parsing logic of its own."""
class Author(object):
    """Plain record describing the author of a story.

    ``sex`` holds one of the class constants below; ``email`` and
    ``description`` stay None until a page parser fills them.
    """

    # range() unpacks identically on Python 2 and, unlike xrange(), also
    # exists on Python 3.
    (UNKNOWN,
     MALE,
     FEMALE,
     TRANSEXUAL) = range(4)

    def __init__(self, name):
        self.name = name
        self.sex = self.UNKNOWN
        self.email = None
        self.description = None
class Story(object):
    """Plain record for a story; only ``id`` is known at construction."""

    def __init__(self, id):
        # Everything except the ID and the (empty) title starts out unset and
        # is filled later by the page parsers.
        self.date = self.category = self.author = self.body = None
        self.title = u''
        self.id = id
class HistoryPage(BasePage):
    """Listing page of stories sorted by date."""

    def get_numerous(self):
        """Return the integer found in the first ``<u><strong>``/``<u><b>`` element.

        The browser compares this value with the page number it requested.
        """
        td = self.parser.select(self.document.getroot(), 'td.t0', 1)
        # NOTE(review): the XPath starts with '//', so it is evaluated against
        # the whole document rather than only below ``td``.
        n = td.xpath('//u/strong|//u/b')[0].text
        return int(n)

    def iter_stories(self):
        """Yield a Story (id, title, author, date) for each listed entry.

        The ``a.t11`` links are consumed in pairs: the first link of a pair
        gives the story ID and title, the second one the author name, and the
        text following the second link holds the date as ``DD-MM-YYYY``.
        Entries that cannot be parsed are logged and skipped.
        """
        links = self.parser.select(self.document.getroot(), 'a.t11')
        story = None
        for link in links:
            if not story:
                # First link of a pair: the story itself.
                m = re.match(r'.*histoire=(\d+)', link.attrib['href'])
                if not m:
                    self.logger.warning('Unable to parse ID "%s"' % link.attrib['href'])
                    continue
                story = Story(int(m.group(1)))
                story.title = link.text.strip()
                continue

            # Second link of a pair: the author, followed by the date.
            story.author = Author(link.text.strip())
            if not link.tail:
                self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name)
                story = None
                continue

            date_text = link.tail.strip().split('\n')[-1].strip()
            m = re.match(r'(\d+)-(\d+)-(\d+)', date_text)
            if not m:
                self.logger.warning('Unable to parse datetime "%s"' % date_text)
                story = None
                continue

            try:
                story.date = datetime.date(int(m.group(3)),
                                           int(m.group(2)),
                                           int(m.group(1)))
            except ValueError:
                # Digits matched but do not form a valid calendar date: treat
                # it like any other unparsable date instead of aborting the
                # whole listing.
                self.logger.warning('Unable to parse datetime "%s"' % date_text)
                story = None
                continue

            yield story
            story = None
class StoryPage(BasePage):
    """Page of a single story (``sexe.php?histoire=<id>``)."""

    def get_story(self):
        """Parse the page into a Story, or return None if it is not accessible.

        The returned Story carries the ``id`` group of the page URL, the
        author (name, sex, email), title, date, category and body.
        """
        root = self.document.getroot()

        p_tags = root.xpath('//body/p')
        if len(p_tags) > 0 and (p_tags[0].text or u'').strip() == \
                u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.":
            return None

        story = Story(self.group_dict['id'])

        meta = self.parser.select(root, 'td.t0', 1)
        story.author = Author(meta.xpath('./a[@class="t3"]')[0].text.strip())

        gender = meta.xpath('./a[@class="t0"]')[0].text
        if not gender:
            # Same handling as AuthorPage: an empty label means unknown.
            story.author.sex = story.author.UNKNOWN
        elif 'homme' in gender:
            story.author.sex = story.author.MALE
        elif 'femme' in gender:
            story.author.sex = story.author.FEMALE
        else:
            story.author.sex = story.author.TRANSEXUAL

        # The address is split into text fragments separated by images that
        # stand for the '@' and '.' characters.
        email_tag = meta.xpath('./span[@class="police1"]')[0]
        email = (email_tag.text or u'').strip()
        for img in email_tag.findall('img'):
            src = img.attrib['src']
            if src.endswith('meyle1.gif'):
                email += '@'
            elif src.endswith('meyle1pouan.gif'):
                email += '.'
            else:
                self.logger.warning('Unable to know what image is %s' % src)
            # An image may have no text after it.
            email += (img.tail or u'').strip()
        story.author.email = email

        title_tag = self.parser.select(root, 'h1', 1)
        story.title = title_tag.text.strip() if title_tag.text else u''

        span = self.parser.select(root, 'span.t4', 1)
        date_text = (span.text or u'').strip().split('\n')[-1].strip()
        m = re.match(r'(\d+)-(\d+)-(\d+)', date_text)
        if m:
            try:
                story.date = datetime.date(int(m.group(3)),
                                           int(m.group(2)),
                                           int(m.group(1)))
            except ValueError:
                # Digits matched but are not a valid calendar date.
                self.logger.warning('Unable to parse datetime "%s"' % date_text)
        else:
            self.logger.warning('Unable to parse datetime "%s"' % date_text)

        story.category = span.find('br').tail.split(':')[1].strip()

        # NOTE(review): only the text attached to the <br> children is
        # collected; text placed before the first <br> (div.text) is not --
        # confirm against the page markup that nothing is lost.
        div = self.parser.select(root, 'div[align=justify]', 1)
        parts = []
        for para in div.findall('br'):
            if para.text is not None:
                parts.append(para.text.strip())
            parts.append('\n')
            if para.tail is not None:
                parts.append(para.tail.strip())
        story.body = u''.join(parts).replace(u'\x92', "'").strip()

        return story
class AuthorPage(BasePage):
    """Profile page of an author (``fiche.php?auteur=<name>``)."""

    def get_author(self):
        """Parse the page into an Author, or return None if it is not accessible.

        The name falls back to the ``name`` group of the page URL when the
        page itself does not provide one.
        """
        root = self.document.getroot()

        notices = root.xpath('//body/div/font/b')
        if len(notices) > 0 and notices[0].text.strip() == \
                u"La fiche de l'auteur n'est plus accessible.":
            return None

        meta = self.parser.select(root, 'td.t0', 1)

        displayed_name = meta.xpath('./span[@class="t3"]')[0].text
        if displayed_name is None:
            displayed_name = self.group_dict['name']
        author = Author(displayed_name.strip())

        gender_label = meta.xpath('./a[@class="t0"]')[0].text
        if not gender_label:
            author.sex = author.UNKNOWN
        else:
            # Anything that is neither 'homme' nor 'femme' is the last category.
            author.sex = author.TRANSEXUAL
            for keyword, sex in (('homme', author.MALE), ('femme', author.FEMALE)):
                if keyword in gender_label:
                    author.sex = sex
                    break

        # The description is the text carried by the <b> and <br> children.
        chunks = []
        for node in meta.getchildren():
            if node.tag in ('b', 'br'):
                if node.text is not None:
                    chunks.append('\n\n%s' % node.text.strip())
                if node.tail is not None:
                    chunks.append('\n%s' % node.tail.strip())
        author.description = u''.join(chunks).replace(u'\x92', "'").strip()

        if author.description.startswith(u'0 récit '):
            self.logger.warning('This author does not have published any story.')

        return author

33
modules/hds/test.py Normal file
View file

@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.test import BackendTest
from weboob.tools.misc import limit
__all__ = ['HDSTest']
class HDSTest(BackendTest):
    """Smoke test for the ``hds`` backend."""

    BACKEND = 'hds'

    def test_new_messages(self):
        """Iterate over ``limit(..., 10)`` of the unread messages, discarding them."""
        unread = self.backend.iter_unread_messages()
        for _ in limit(unread, 10):
            continue