support repositories to manage backends (closes #747)

2012-01-03 12:10:21 +01:00 · 2012-01-03 12:10:21 +01:00 · 14a7a1d362
commit 14a7a1d362
parent ef16a5b726
410 changed files with 1079 additions and 297 deletions
--- a/modules/hds/__init__.py
+++ b/modules/hds/__init__.py
@ -0,0 +1,3 @@
+from .backend import HDSBackend
+
+__all__ = ['HDSBackend']
--- a/modules/hds/backend.py
+++ b/modules/hds/backend.py
@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011  Romain Bignon
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from __future__ import with_statement
+
+from weboob.tools.backend import BaseBackend
+from weboob.capabilities.messages import ICapMessages, Message, Thread
+
+from .browser import HDSBrowser
+
+
+__all__ = ['HDSBackend']
+
+
+class HDSBackend(BaseBackend, ICapMessages):
+    """Messages backend presenting each story of the site as a thread.
+
+    Every thread holds a single root message whose content is the story
+    body. IDs of threads marked as read are kept under the 'seen' storage key.
+    """
+
+    NAME = 'hds'
+    MAINTAINER = 'Romain Bignon'
+    EMAIL = 'romain@weboob.org'
+    VERSION = '0.a'
+    LICENSE = 'AGPLv3+'
+    DESCRIPTION = "histoires-de-sexe.net french erotic novels"
+    STORAGE = {'seen': []}
+    BROWSER = HDSBrowser
+
+    # Labels indexed by the ``sex`` value of a story author; used to build
+    # the message signature in get_thread().
+    GENDERS = ['<unknown>', 'boy', 'girl', 'transexual']
+
+    #### ICapMessages ##############################################
+
+    def iter_threads(self):
+        """Yield one Thread per story listed by the browser.
+
+        Only the ID, title and date are set, and ``nb_messages`` is always 1;
+        the root message is filled later by get_thread()/fill_thread().
+        """
+        with self.browser:
+            for story in self.browser.iter_stories():
+                thread = Thread(story.id)
+                thread.title = story.title
+                thread.date = story.date
+                thread.nb_messages = 1
+                yield thread
+
+    def get_thread(self, id):
+        """Fetch a story and return it as a thread with its root message.
+
+        :param id: a story ID, or an existing Thread object to fill in place
+        :returns: the filled Thread, or None when the browser returns no story
+        """
+        if isinstance(id, Thread):
+            thread = id
+            id = thread.id
+        else:
+            thread = None
+
+        with self.browser:
+            story = self.browser.get_story(id)
+
+        if not story:
+            return None
+
+        if thread is None:
+            thread = Thread(story.id)
+
+        flags = 0
+        if thread.id not in self.storage.get('seen', default=[]):
+            flags |= Message.IS_UNREAD
+
+        thread.title = story.title
+        thread.date = story.date
+        thread.root = Message(thread=thread,
+                              id=0,
+                              title=story.title,
+                              sender=story.author.name,
+                              receivers=None,
+                              date=thread.date,
+                              parent=None,
+                              content=story.body,
+                              children=[],
+                              signature='Written by a %s (%s)' % (self.GENDERS[story.author.sex], story.author.email),
+                              flags=flags)
+
+        return thread
+
+    def iter_unread_messages(self, thread=None):
+        """Yield the root message of every thread not recorded as seen.
+
+        The ``thread`` argument is accepted but not used: all threads are
+        always scanned.
+        """
+        # A distinct loop name avoids clobbering the ``thread`` parameter.
+        for candidate in self.iter_threads():
+            if candidate.id in self.storage.get('seen', default=[]):
+                continue
+            # fill_thread() returns None when the story could not be fetched;
+            # the root message was then never filled, so skip the thread.
+            if self.fill_thread(candidate, 'root') is None:
+                continue
+            yield candidate.root
+
+    def set_message_read(self, message):
+        """Record the thread of ``message`` as seen and save the storage.
+
+        Nothing is written when the thread ID is already recorded, so the
+        'seen' list does not accumulate duplicates.
+        """
+        seen = self.storage.get('seen', default=[])
+        thread_id = message.thread.id
+        if thread_id in seen:
+            return
+        # Build a new list rather than mutating the one returned by get().
+        self.storage.set('seen', seen + [thread_id])
+        self.storage.save()
+
+    def fill_thread(self, thread, fields):
+        """Fill ``thread`` through get_thread(); ``fields`` is ignored.
+
+        :returns: the filled Thread, or None when the story is unavailable
+        """
+        return self.get_thread(thread)
+
+    OBJECTS = {Thread: fill_thread}
--- a/modules/hds/browser.py
+++ b/modules/hds/browser.py
@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011  Romain Bignon
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.tools.browser import BaseBrowser
+
+from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage
+
+# Browser
+class HDSBrowser(BaseBrowser):
+    """Browser for the story listing, story and author pages of the site."""
+
+    ENCODING = 'ISO-8859-1'
+    DOMAIN = 'histoires-de-sexe.net'
+    # URL regex -> page class. Raw strings are used so that '\?' is passed
+    # to the regex engine untouched instead of relying on Python leaving an
+    # unknown string escape alone.
+    PAGES = {r'http://histoires-de-sexe.net/': ValidationPage,
+             r'http://histoires-de-sexe.net/menu.php': HomePage,
+             r'http://histoires-de-sexe.net/sexe/histoires-par-date.php.*': HistoryPage,
+             r'http://histoires-de-sexe.net/sexe.php\?histoire=(?P<id>.+)': StoryPage,
+             r'http://histoires-de-sexe.net/fiche.php\?auteur=(?P<name>.+)': AuthorPage,
+            }
+
+    def iter_stories(self):
+        """Yield the stories of each listing page, starting from the first.
+
+        Iteration stops as soon as the number reported by the current page
+        (``get_numerous()``) differs from the page index that was requested.
+        """
+        self.location('/sexe/histoires-par-date.php')
+        n = 1
+        while self.page.get_numerous() == n:
+            for story in self.page.iter_stories():
+                yield story
+
+            n += 1
+            self.location('/sexe/histoires-par-date.php?p=%d' % n)
+
+    def get_story(self, id):
+        """Open the page of story ``id`` and return what the page parses.
+
+        :param id: story ID; converted with int() before building the URL
+        :returns: the result of ``StoryPage.get_story()``
+        """
+        id = int(id)
+
+        self.location('/sexe.php?histoire=%d' % id)
+        assert self.is_on_page(StoryPage)
+        return self.page.get_story()
+
+    def get_author(self, name):
+        """Open the page of author ``name`` and return what the page parses.
+
+        :param name: author name; encoded as iso-8859-15 for the query string
+        :returns: the result of ``AuthorPage.get_author()``
+        """
+        self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15')))
+
+        assert self.is_on_page(AuthorPage)
+        return self.page.get_author()
--- a/modules/hds/favicon.png
+++ b/modules/hds/favicon.png
--- a/modules/hds/pages.py
+++ b/modules/hds/pages.py
@ -0,0 +1,182 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011  Romain Bignon
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+import datetime
+import re
+
+from weboob.tools.browser import BasePage
+
+
+# AuthorPage is listed too: the browser module imports it from here.
+__all__ = ['ValidationPage', 'HomePage', 'HistoryPage', 'StoryPage', 'AuthorPage']
+
+
+class ValidationPage(BasePage):
+    """Page class without any parsing logic of its own."""
+
+class HomePage(BasePage):
+    """Page class without any parsing logic of its own."""
+
+class Author(object):
+    """Plain container for what is known about a story author."""
+
+    # Possible values of the ``sex`` attribute, numbered 0 to 3 in this order.
+    UNKNOWN, MALE, FEMALE, TRANSEXUAL = xrange(4)
+
+    def __init__(self, name):
+        """Create an author called ``name`` with every other field unset."""
+        self.name = name
+        self.email = None
+        self.description = None
+        self.sex = self.UNKNOWN
+
+class Story(object):
+    """Plain container for a story and its metadata."""
+
+    def __init__(self, id):
+        """Create a story identified by ``id`` with empty metadata."""
+        self.id = id
+        self.title = u''
+        # Everything below is filled in by the page parsers.
+        self.author = None
+        self.category = None
+        self.date = None
+        self.body = None
+
+class HistoryPage(BasePage):
+    """Listing page from which stories (ID, title, author, date) are read."""
+
+    def get_numerous(self):
+        """Return, as an int, the number found in the first <u><strong> or <u><b>.
+
+        The browser compares this value with the index of the listing page it
+        requested.
+        """
+        td = self.parser.select(self.document.getroot(), 'td.t0', 1)
+        # NOTE(review): the '//' paths are evaluated from the document root,
+        # not relative to ``td`` -- confirm this is intended.
+        n = td.xpath('//u/strong|//u/b')[0].text
+        return int(n)
+
+    def iter_stories(self):
+        """Yield a Story for each (title link, author link) pair of 'a.t11' links.
+
+        Links are consumed alternately: the first of a pair gives the story ID
+        and title, the second gives the author, and the text following it
+        gives the date. Pairs that cannot be parsed are logged and skipped.
+        """
+        links = self.parser.select(self.document.getroot(), 'a.t11')
+        story = None
+        for link in links:
+            # A link whose content starts with a child element has text None.
+            text = (link.text or u'').strip()
+            if story is None:
+                m = re.match(r'.*histoire=(\d+)', link.attrib['href'])
+                if not m:
+                    self.logger.warning('Unable to parse ID "%s"' % link.attrib['href'])
+                    continue
+                story = Story(int(m.group(1)))
+                story.title = text
+            else:
+                story.author = Author(text)
+                if not link.tail:
+                    self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name)
+                    story = None
+                    continue
+                # The date is on the last line of the text following the link.
+                date_text = link.tail.strip().split('\n')[-1].strip()
+                m = re.match(r'(\d+)-(\d+)-(\d+)', date_text)
+                if not m:
+                    self.logger.warning('Unable to parse datetime "%s"' % date_text)
+                    story = None
+                    continue
+                # Groups are passed in reverse order: the third is the year.
+                story.date = datetime.date(int(m.group(3)),
+                                           int(m.group(2)),
+                                           int(m.group(1)))
+                yield story
+                story = None
+
+class StoryPage(BasePage):
+    """Page displaying a single story."""
+
+    def get_story(self):
+        """Parse the page into a Story object.
+
+        :returns: the Story, or None when the first paragraph of the page is
+                  the "story not accessible" notice
+        """
+        root = self.document.getroot()
+        p_tags = root.xpath('//body/p')
+        # ``text`` is None when the paragraph starts with a child element.
+        if p_tags and (p_tags[0].text or u'').strip() == \
+                u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.":
+            return None
+
+        # NOTE(review): the ID is kept as captured from the URL (presumably a
+        # string), whereas HistoryPage builds its stories with int IDs.
+        story = Story((self.group_dict['id']))
+        meta = self.parser.select(root, 'td.t0', 1)
+        author = Author(meta.xpath('./a[@class="t3"]')[0].text.strip())
+        story.author = author
+
+        # Same mapping as AuthorPage: a missing gender is UNKNOWN.
+        gender = meta.xpath('./a[@class="t0"]')[0].text
+        if not gender:
+            author.sex = author.UNKNOWN
+        elif 'homme' in gender:
+            author.sex = author.MALE
+        elif 'femme' in gender:
+            author.sex = author.FEMALE
+        else:
+            author.sex = author.TRANSEXUAL
+
+        # The address is split around images standing for '@' and '.'.
+        email_tag = meta.xpath('./span[@class="police1"]')[0]
+        email = (email_tag.text or u'').strip()
+        for img in email_tag.findall('img'):
+            src = img.attrib['src']
+            if src.endswith('meyle1.gif'):
+                email += '@'
+            elif src.endswith('meyle1pouan.gif'):
+                email += '.'
+            else:
+                self.logger.warning('Unable to know what image is %s' % src)
+            if img.tail:
+                email += img.tail.strip()
+        author.email = email
+
+        title_tag = self.parser.select(root, 'h1', 1)
+        story.title = title_tag.text.strip() if title_tag.text else u''
+
+        span = self.parser.select(root, 'span.t4', 1)
+        # The date is on the last line of the text preceding the <br>.
+        date_text = (span.text or u'').strip().split('\n')[-1].strip()
+        m = re.match(r'(\d+)-(\d+)-(\d+)', date_text)
+        if m:
+            # Groups are passed in reverse order: the third is the year.
+            story.date = datetime.date(int(m.group(3)),
+                                       int(m.group(2)),
+                                       int(m.group(1)))
+        else:
+            self.logger.warning('Unable to parse datetime "%s"' % date_text)
+
+        story.category = span.find('br').tail.split(':')[1].strip()
+
+        div = self.parser.select(root, 'div[align=justify]', 1)
+        # NOTE(review): only text attached to the <br> elements is collected;
+        # text located before the first <br> is not -- confirm intended.
+        parts = []
+        for para in div.findall('br'):
+            if para.text is not None:
+                parts.append(para.text.strip())
+            parts.append('\n')
+            if para.tail is not None:
+                parts.append(para.tail.strip())
+        story.body = u''.join(parts).replace(u'\x92', "'").strip()
+        return story
+
+
+class AuthorPage(BasePage):
+    """Page displaying the profile of an author."""
+
+    def get_author(self):
+        """Parse the page into an Author object.
+
+        :returns: the Author, or None when the page shows the
+                  "author profile no longer accessible" notice
+        """
+        root = self.document.getroot()
+        p_tags = root.xpath('//body/div/font/b')
+        # ``text`` is None when the tag starts with a child element.
+        if p_tags and (p_tags[0].text or u'').strip() == \
+                u"La fiche de l'auteur n'est plus accessible.":
+            return None
+
+        meta = self.parser.select(root, 'td.t0', 1)
+        author_name = meta.xpath('./span[@class="t3"]')[0].text
+        if author_name is None:
+            # Fall back on the name captured from the page URL.
+            author_name = self.group_dict['name']
+        author = Author(author_name.strip())
+        gender = meta.xpath('./a[@class="t0"]')[0].text
+        if not gender:
+            author.sex = author.UNKNOWN
+        elif 'homme' in gender:
+            author.sex = author.MALE
+        elif 'femme' in gender:
+            author.sex = author.FEMALE
+        else:
+            author.sex = author.TRANSEXUAL
+
+        # The description is the text held by, or following, the direct
+        # <b> and <br> children. Iterating the element yields its children.
+        author.description = u''
+        for para in meta:
+            if para.tag not in ('b', 'br'):
+                continue
+            if para.text is not None:
+                author.description += '\n\n%s' % para.text.strip()
+            if para.tail is not None:
+                author.description += '\n%s' % para.tail.strip()
+        author.description = author.description.replace(u'\x92', "'").strip()
+
+        if author.description.startswith(u'0 récit '):
+            self.logger.warning('This author does not have published any story.')
+        return author
+
--- a/modules/hds/test.py
+++ b/modules/hds/test.py
@ -0,0 +1,33 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011  Romain Bignon
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.tools.test import BackendTest
+from weboob.tools.misc import limit
+
+
+__all__ = ['HDSTest']
+
+
+class HDSTest(BackendTest):
+    """Test case for the 'hds' backend."""
+
+    BACKEND = 'hds'
+
+    def test_new_messages(self):
+        """Iterate unread messages through ``limit(..., 10)``; passes if nothing raises."""
+        unread = self.backend.iter_unread_messages()
+        for _message in limit(unread, 10):
+            pass