Several fixes and detection of website errors

2011-08-03 15:16:52 +02:00 · 2011-08-03 15:16:52 +02:00 · 0f589ad941
commit 0f589ad941
parent 08a7b36408
2 changed files with 43 additions and 10 deletions
--- a/weboob/backends/hds/browser.py
+++ b/weboob/backends/hds/browser.py
@ -36,14 +36,11 @@ class HDSBrowser(BaseBrowser):
    def iter_stories(self):
        self.location('/sexe/histoires-par-date.php')
        n = 1
-        while 1:
+        while self.page.get_numerous() == n:
            count = 0
            for count, story in enumerate(self.page.iter_stories()):
                yield story

-            if count < 49:
-                return
-
            n += 1
            self.location('/sexe/histoires-par-date.php?p=%d' % n)

@ -55,7 +52,7 @@ class HDSBrowser(BaseBrowser):
        return self.page.get_story()

    def get_author(self, name):
-        self.location(self.buildurl('/fiche.php', auteur=name))
+        self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15')))

        assert self.is_on_page(AuthorPage)
        return self.page.get_author()
--- a/weboob/backends/hds/pages.py
+++ b/weboob/backends/hds/pages.py
@ -50,10 +50,16 @@ class Story(object):
        self.id = id
        self.title = u''
        self.date = None
+        self.category = None
        self.author = None
        self.body = None

 class HistoryPage(BasePage):
+    def get_numerous(self):
+        td = self.parser.select(self.document.getroot(), 'td.t0', 1)
+        n = td.xpath('//u/strong|//u/b')[0].text
+        return int(n)
+
    def iter_stories(self):
        links = self.parser.select(self.document.getroot(), 'a.t11')
        story = None
@ -67,6 +73,10 @@ class HistoryPage(BasePage):
                story.title = link.text.strip()
            else:
                story.author = Author(link.text.strip())
+                if not link.tail:
+                    self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name)
+                    story = None
+                    continue
                date_text = link.tail.strip().split('\n')[-1].strip()
                m = re.match('(\d+)-(\d+)-(\d+)', date_text)
                if not m:
@ -81,6 +91,11 @@ class HistoryPage(BasePage):

 class StoryPage(BasePage):
    def get_story(self):
+        p_tags = self.document.getroot().xpath('//body/p')
+        if len(p_tags) > 0 and p_tags[0].text.strip() == \
+                u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.":
+            return None
+
        story = Story((self.group_dict['id']))
        story.body = u''
        meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
@ -103,8 +118,11 @@ class StoryPage(BasePage):
                self.logger.warning('Unable to know what image is %s' % img.attrib['src'])
            story.author.email += img.tail.strip()

-        story.title = self.parser.select(self.document.getroot(), 'h1', 1).text.strip()
-        date_text = self.parser.select(self.document.getroot(), 'span.t4', 1).text.strip().split('\n')[-1].strip()
+        title_tag = self.parser.select(self.document.getroot(), 'h1', 1)
+        story.title = title_tag.text.strip() if title_tag.text else u''
+
+        span = self.parser.select(self.document.getroot(), 'span.t4', 1)
+        date_text = span.text.strip().split('\n')[-1].strip()
        m = re.match('(\d+)-(\d+)-(\d+)', date_text)
        if m:
            story.date = datetime.date(int(m.group(3)),
@ -113,6 +131,8 @@ class StoryPage(BasePage):
        else:
            self.logger.warning('Unable to parse datetime "%s"' % date_text)

+        story.category = span.find('br').tail.split(':')[1].strip()
+
        div = self.parser.select(self.document.getroot(), 'div[align=justify]', 1)
        for para in div.findall('br'):
            if para.text is not None:
@ -126,12 +146,25 @@ class StoryPage(BasePage):

 class AuthorPage(BasePage):
    def get_author(self):
+        p_tags = self.document.getroot().xpath('//body/div/font/b')
+        if len(p_tags) > 0 and p_tags[0].text.strip() == \
+                u"La fiche de l'auteur n'est plus accessible.":
+            return None
+
        meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
-        author = Author(meta.xpath('./span[@class="t3"]')[0].text.strip())
-        if 'homme' in meta.xpath('./a[@class="t0"]')[0].text:
+        author_name = meta.xpath('./span[@class="t3"]')[0].text
+        if author_name is None:
+            author_name = self.group_dict['name']
+        author = Author(author_name.strip())
+        gender = meta.xpath('./a[@class="t0"]')[0].text
+        if not gender:
+            author.sex = author.UNKNOWN
+        elif 'homme' in gender:
            author.sex = author.MALE
-        else:
+        elif 'femme' in gender:
            author.sex = author.FEMALE
+        else:
+            author.sex = author.TRANSEXUAL

        author.description = u''
        for para in meta.getchildren():
@ -142,5 +175,8 @@ class AuthorPage(BasePage):
            if para.tail is not None:
                author.description += '\n%s' % para.tail.strip()
        author.description = author.description.replace(u'\x92', "'").strip()
+
+        if author.description.startswith(u'0 récit '):
+            self.logger.warning('This author does not have published any story.')
        return author