convert hds module to browser2 and fix it

2014-10-11 16:56:32 +02:00 · 2014-10-11 16:56:32 +02:00 · 89e7bbe9ef
commit 89e7bbe9ef
parent 94cea6458b
4 changed files with 114 additions and 163 deletions
--- a/contrib/hds/export.py
+++ b/contrib/hds/export.py
@ -68,7 +68,7 @@ def main(filename):
    sys.stdout.write('Getting stories list from website... ')
    sys.stdout.flush()
    for story in br.iter_stories():
-        if story.id in stored:
+        if int(story.id) in stored:
            break
        to_fetch.add(story.id)
        authors.add(story.author.name)
--- a/modules/hds/browser.py
+++ b/modules/hds/browser.py
@ -18,43 +18,40 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.


-from weboob.deprecated.browser import Browser
+from weboob.browser import PagesBrowser, URL

 from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage

 # Browser


-class HDSBrowser(Browser):
-    ENCODING = 'ISO-8859-1'
-    DOMAIN = 'histoires-de-sexe.net'
-    PAGES = {'http://histoires-de-sexe.net/': ValidationPage,
-             'http://histoires-de-sexe.net/menu.php': HomePage,
-             'http://histoires-de-sexe.net/sexe/histoires-par-date.php.*': HistoryPage,
-             'http://histoires-de-sexe.net/sexe.php\?histoire=(?P<id>.+)': StoryPage,
-             'http://histoires-de-sexe.net/fiche.php\?auteur=(?P<name>.+)': AuthorPage,
-            }
+class HDSBrowser(PagesBrowser):
+    BASEURL = 'http://histoires-de-sexe.net'
+
+    validation_page = URL('^/$', ValidationPage)
+    home = URL(r'/menu.php', HomePage)
+    history = URL(r'/sexe/histoires-par-date.php\?p=(?P<pagenum>\d+)', HistoryPage)
+    story = URL(r'/sexe.php\?histoire=(?P<id>.+)', StoryPage)
+    author = URL(r'/fiche.php\?auteur=(?P<name>.+)', AuthorPage)

    def iter_stories(self):
-        self.location('/sexe/histoires-par-date.php')
        n = 1
+        self.history.go(pagenum=n)
        while self.page.get_numerous() == n:
-            count = 0
-            for count, story in enumerate(self.page.iter_stories()):
+            for story in self.page.iter_stories():
                yield story

            n += 1
-            self.location('/sexe/histoires-par-date.php?p=%d' % n)
+            self.history.go(pagenum=n)

    def get_story(self, id):
-        id = int(id)
+        self.story.go(id=id)

-        self.location('/sexe.php?histoire=%d' % id)
-        assert self.is_on_page(StoryPage)
+        assert self.story.is_here()
        return self.page.get_story()

    def get_author(self, name):
-        self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15')))
+        self.author.go(name=name)

-        assert self.is_on_page(AuthorPage)
+        assert self.author.is_here()
        return self.page.get_author()
--- a/modules/hds/module.py
+++ b/modules/hds/module.py
@ -40,12 +40,11 @@ class HDSModule(Module, CapMessages):
    #### CapMessages ##############################################

    def iter_threads(self):
-        with self.browser:
-            for story in self.browser.iter_stories():
-                thread = Thread(story.id)
-                thread.title = story.title
-                thread.date = story.date
-                yield thread
+        for story in self.browser.iter_stories():
+            thread = Thread(story.id)
+            thread.title = story.title
+            thread.date = story.date
+            yield thread

    GENDERS = ['<unknown>', 'boy', 'girl', 'transexual']

@ -56,8 +55,7 @@ class HDSModule(Module, CapMessages):
        else:
            thread = None

-        with self.browser:
-            story = self.browser.get_story(id)
+        story = self.browser.get_story(id)

        if not story:
            return None
@ -80,7 +78,7 @@ class HDSModule(Module, CapMessages):
                              parent=None,
                              content=story.body,
                              children=[],
-                              signature='Written by a %s (%s)' % (self.GENDERS[story.author.sex], story.author.email),
+                              signature=u'Written by a %s in category %s' % (self.GENDERS[story.author.sex], story.category),
                              flags=flags)

        return thread
--- a/modules/hds/pages.py
+++ b/modules/hds/pages.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright(C) 2011  Romain Bignon
+# Copyright(C) 2014  Romain Bignon
 #
 # This file is part of weboob.
 #
@ -18,17 +18,17 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.


-import datetime
-import re
-
-from weboob.deprecated.browser import Page
+from weboob.browser.pages import HTMLPage
+from weboob.browser.elements import method, ListElement, ItemElement
+from weboob.browser.filters.standard import CleanText, Regexp, Date, Env, Filter
+from weboob.browser.filters.html import XPath, Link


-class ValidationPage(Page):
+class ValidationPage(HTMLPage):
    pass


-class HomePage(Page):
+class HomePage(HTMLPage):
    pass


@ -38,15 +38,23 @@ class Author(object):
     FEMALE,
     TRANSEXUAL) = xrange(4)

-    def __init__(self, name):
+    def __init__(self, name=None):
        self.name = name
        self.sex = self.UNKNOWN
-        self.email = None
        self.description = None

+    class Sex2Enum(Filter):
+        """Filter turning a gender word into one of the Author sex constants."""
+
+        def filter(self, text):
+            # 'homme' and 'femme' have dedicated constants; any other value
+            # is reported as TRANSEXUAL.
+            for word, sex in (('homme', Author.MALE), ('femme', Author.FEMALE)):
+                if text == word:
+                    return sex
+            return Author.TRANSEXUAL
+
+

 class Story(object):
-    def __init__(self, id):
+    def __init__(self, id=None):
        self.id = id
        self.title = u''
        self.date = None
@ -55,129 +63,77 @@ class Story(object):
        self.body = None


-class HistoryPage(Page):
+class HistoryPage(HTMLPage):
+    ENCODING = 'iso-8859-1'
+
    def get_numerous(self):
-        td = self.parser.select(self.document.getroot(), 'td.t0', 1)
-        n = td.xpath('//u/strong|//u/b')[0].text
-        return int(n)
+        return int(CleanText('//div[@align="justify"]/table[1]//td[has-class("t0")]/font/u/strong[1]')(self.doc))

-    def iter_stories(self):
-        links = self.parser.select(self.document.getroot(), 'a.t11')
-        story = None
-        for link in links:
-            if not story:
-                m = re.match('.*histoire=(\d+)', link.attrib['href'])
-                if not m:
-                    self.logger.warning('Unable to parse ID "%s"' % link.attrib['href'])
+    @method
+    class iter_stories(ListElement):
+        """List the stories found on a history page as Story objects."""
+
+        # Each matching span is used as the title element of one story.
+        item_xpath = '//div[@align="justify"]/span[has-class("t4")]'
+
+        class item(ItemElement):
+            klass = Story
+
+            def parse(self, el):
+                # The sibling before the title span is read for the date,
+                # category and author; the sibling after it is read for the
+                # link carrying the story id.
+                self.env['header'] = el.getprevious().xpath('.//span')[0]
+                self.env['body'] = el.getnext().xpath('.//a')
+
+            obj_id = XPath(Env('body')) & Link & Regexp(pattern=r'.*histoire=(\d+)')
+            obj_title = CleanText('.')
+            obj_date = XPath(Env('header')) & CleanText & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
+            # Non-greedy capture: a greedy '(.*)' lets the following '\s*'
+            # match nothing, leaving the whitespace before "Auteur" inside
+            # the category.
+            obj_category = XPath(Env('header')) & CleanText & Regexp(pattern=u'Catégorie :\s*(.*?)\s*Auteur')
+
+            def obj_author(self):
+                # Only the author name is available here: the text of the
+                # first link found in the header.
+                return Author(self.env['header'].xpath('.//a/text()')[0])
+
+class StoryPage(HTMLPage):
+    """Page from which a single Story and its Author are extracted."""
+
+    ENCODING = 'iso-8859-1'
+
+    @method
+    class get_story(ItemElement):
+        klass = Story
+
+        obj_id = Env('id')
+        obj_title = CleanText('//h1')
+        obj_date = CleanText('//span[has-class("t4")]') & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
+        obj_category = CleanText('//a[starts-with(@href, "histoires-cat")]')
+
+        def obj_body(self):
+            """Return the story text assembled from the <br> children of the justified div."""
+            container = self.el.xpath('//div[@align="justify"]')[0]
+            chunks = []
+            for br in container.findall('br'):
+                if br.text is not None:
+                    chunks.append(br.text.strip())
+                # Every <br> contributes one line break, followed by the
+                # text that comes after it.
+                chunks.append('\n')
+                if br.tail is not None:
+                    chunks.append(br.tail.strip())
+            # u'\x92' is replaced by a plain apostrophe.
+            return ''.join(chunks).replace(u'\x92', "'").strip()
+
+
+        class obj_author(ItemElement):
+            klass = Author
+
+            obj_name = CleanText('//a[starts-with(@href, "fiche.php")][2]')
+            obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
+
+
+class AuthorPage(HTMLPage):
+    @method
+    class get_author(ItemElement):
+        klass = Author
+
+        obj_name = CleanText('//span[has-class("t3")]')
+        obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
+
+        def obj_description(self):
+            description = u''
+            for para in self.el.xpath('//td[has-class("t0")]')[0].getchildren():
+                if para.tag not in ('b', 'br'):
                    continue
-                story = Story(int(m.group(1)))
-                story.title = link.text.strip()
-            else:
-                story.author = Author(link.text.strip())
-                if not link.tail:
-                    self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name)
-                    story = None
-                    continue
-                date_text = link.tail.strip().split('\n')[-1].strip()
-                m = re.match('.*, le (\d+)-(\d+)-(\d+)', date_text)
-                if not m:
-                    self.logger.warning('Unable to parse datetime "%s"' % date_text)
-                    story = None
-                    continue
-                story.date = datetime.date(int(m.group(3)),
-                                           int(m.group(2)),
-                                           int(m.group(1)))
-                yield story
-                story = None
-
-
-class StoryPage(Page):
-    def get_story(self):
-        p_tags = self.document.getroot().xpath('//body/p')
-        if len(p_tags) > 0 and p_tags[0].text.strip() == \
-                u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.":
-            return None
-
-        story = Story((self.group_dict['id']))
-        story.body = u''
-        meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
-        story.author = Author(meta.xpath('./a[@class="t3"]')[0].text.strip())
-        gender = meta.xpath('./a[@class="t0"]')[0].text
-        if 'homme' in gender:
-            story.author.sex = story.author.MALE
-        elif 'femme' in gender:
-            story.author.sex = story.author.FEMALE
-        else:
-            story.author.sex = story.author.TRANSEXUAL
-        email_tag = meta.xpath('./span[@class="police1"]')[0]
-        story.author.email = email_tag.text.strip()
-        for img in email_tag.findall('img'):
-            if img.attrib['src'].endswith('meyle1.gif'):
-                story.author.email += '@'
-            elif img.attrib['src'].endswith('meyle1pouan.gif'):
-                story.author.email += '.'
-            else:
-                self.logger.warning('Unable to know what image is %s' % img.attrib['src'])
-            story.author.email += img.tail.strip()
-
-        title_tag = self.parser.select(self.document.getroot(), 'h1', 1)
-        story.title = title_tag.text.strip() if title_tag.text else u''
-
-        span = self.parser.select(self.document.getroot(), 'span.t4', 1)
-        date_text = span.text.strip().split('\n')[-1].strip()
-        m = re.match('(\d+)-(\d+)-(\d+)', date_text)
-        if m:
-            story.date = datetime.date(int(m.group(3)),
-                                       int(m.group(2)),
-                                       int(m.group(1)))
-        else:
-            self.logger.warning('Unable to parse datetime "%s"' % date_text)
-
-        story.category = span.find('br').tail.split(':')[1].strip()
-
-        div = self.parser.select(self.document.getroot(), 'div[align=justify]', 1)
-        for para in div.findall('br'):
-            if para.text is not None:
-                story.body += para.text.strip()
-            story.body += '\n'
-            if para.tail is not None:
-                story.body += para.tail.strip()
-        story.body = story.body.replace(u'\x92', "'").strip()
-        return story
-
-
-class AuthorPage(Page):
-    def get_author(self):
-        p_tags = self.document.getroot().xpath('//body/div/font/b')
-        if len(p_tags) > 0 and p_tags[0].text.strip() == \
-                u"La fiche de l'auteur n'est plus accessible.":
-            return None
-
-        meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
-        author_name = meta.xpath('./span[@class="t3"]')[0].text
-        if author_name is None:
-            author_name = self.group_dict['name']
-        author = Author(author_name.strip())
-        gender = meta.xpath('./a[@class="t0"]')[0].text
-        if not gender:
-            author.sex = author.UNKNOWN
-        elif 'homme' in gender:
-            author.sex = author.MALE
-        elif 'femme' in gender:
-            author.sex = author.FEMALE
-        else:
-            author.sex = author.TRANSEXUAL
-
-        author.description = u''
-        for para in meta.getchildren():
-            if para.tag not in ('b', 'br'):
-                continue
-            if para.text is not None:
-                author.description += '\n\n%s' % para.text.strip()
-            if para.tail is not None:
-                author.description += '\n%s' % para.tail.strip()
-        author.description = author.description.replace(u'\x92', "'").strip()
-
-        if author.description.startswith(u'0 récit '):
-            self.logger.warning('This author does not have published any story.')
-        return author
+                if para.text is not None:
+                    description += '\n\n%s' % para.text.strip()
+                if para.tail is not None:
+                    description += '\n%s' % para.tail.strip()
+            return description.replace(u'\x92', "'").strip()