From 0f589ad94177cf595d2d59e21b7d27b4dbf19f2c Mon Sep 17 00:00:00 2001
From: Romain Bignon <romain@peerfuse.org>
Date: Wed, 3 Aug 2011 15:16:52 +0200
Subject: [PATCH] several fixes and website error detection

---
 weboob/backends/hds/browser.py |  7 ++----
 weboob/backends/hds/pages.py   | 46 ++++++++++++++++++++++++++++++----
 2 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/weboob/backends/hds/browser.py b/weboob/backends/hds/browser.py
index e93ceeab..5bdb4b10 100644
--- a/weboob/backends/hds/browser.py
+++ b/weboob/backends/hds/browser.py
@@ -36,14 +36,11 @@ class HDSBrowser(BaseBrowser):
     def iter_stories(self):
         self.location('/sexe/histoires-par-date.php')
         n = 1
-        while 1:
+        while self.page.get_numerous() == n:
             count = 0
             for count, story in enumerate(self.page.iter_stories()):
                 yield story
 
-            if count < 49:
-                return
-
             n += 1
             self.location('/sexe/histoires-par-date.php?p=%d' % n)
 
@@ -55,7 +52,7 @@ class HDSBrowser(BaseBrowser):
         return self.page.get_story()
 
     def get_author(self, name):
-        self.location(self.buildurl('/fiche.php', auteur=name))
+        self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15')))
 
         assert self.is_on_page(AuthorPage)
         return self.page.get_author()
diff --git a/weboob/backends/hds/pages.py b/weboob/backends/hds/pages.py
index 532cc405..4c02a206 100644
--- a/weboob/backends/hds/pages.py
+++ b/weboob/backends/hds/pages.py
@@ -50,10 +50,16 @@ class Story(object):
         self.id = id
         self.title = u''
         self.date = None
+        self.category = None
         self.author = None
         self.body = None
 
 class HistoryPage(BasePage):
+    def get_numerous(self):
+        td = self.parser.select(self.document.getroot(), 'td.t0', 1)
+        n = td.xpath('//u/strong|//u/b')[0].text
+        return int(n)
+
     def iter_stories(self):
         links = self.parser.select(self.document.getroot(), 'a.t11')
         story = None
@@ -67,6 +73,10 @@ class HistoryPage(BasePage):
                 story.title = link.text.strip()
             else:
                 story.author = Author(link.text.strip())
+                if not link.tail:
+                    self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name)
+                    story = None
+                    continue
                 date_text = link.tail.strip().split('\n')[-1].strip()
                 m = re.match('(\d+)-(\d+)-(\d+)', date_text)
                 if not m:
@@ -81,6 +91,11 @@ class HistoryPage(BasePage):
 
 class StoryPage(BasePage):
     def get_story(self):
+        p_tags = self.document.getroot().xpath('//body/p')
+        if len(p_tags) > 0 and p_tags[0].text.strip() == \
+                u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.":
+            return None
+
         story = Story((self.group_dict['id']))
         story.body = u''
         meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
@@ -103,8 +118,11 @@ class StoryPage(BasePage):
                 self.logger.warning('Unable to know what image is %s' % img.attrib['src'])
             story.author.email += img.tail.strip()
 
-        story.title = self.parser.select(self.document.getroot(), 'h1', 1).text.strip()
-        date_text = self.parser.select(self.document.getroot(), 'span.t4', 1).text.strip().split('\n')[-1].strip()
+        title_tag = self.parser.select(self.document.getroot(), 'h1', 1)
+        story.title = title_tag.text.strip() if title_tag.text else u''
+
+        span = self.parser.select(self.document.getroot(), 'span.t4', 1)
+        date_text = span.text.strip().split('\n')[-1].strip()
         m = re.match('(\d+)-(\d+)-(\d+)', date_text)
         if m:
             story.date = datetime.date(int(m.group(3)),
@@ -113,6 +131,8 @@ class StoryPage(BasePage):
         else:
             self.logger.warning('Unable to parse datetime "%s"' % date_text)
 
+        story.category = span.find('br').tail.split(':')[1].strip()
+
         div = self.parser.select(self.document.getroot(), 'div[align=justify]', 1)
         for para in div.findall('br'):
             if para.text is not None:
@@ -126,12 +146,25 @@ class StoryPage(BasePage):
 
 class AuthorPage(BasePage):
     def get_author(self):
+        p_tags = self.document.getroot().xpath('//body/div/font/b')
+        if len(p_tags) > 0 and p_tags[0].text.strip() == \
+                u"La fiche de l'auteur n'est plus accessible.":
+            return None
+
         meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
-        author = Author(meta.xpath('./span[@class="t3"]')[0].text.strip())
-        if 'homme' in meta.xpath('./a[@class="t0"]')[0].text:
+        author_name = meta.xpath('./span[@class="t3"]')[0].text
+        if author_name is None:
+            author_name = self.group_dict['name']
+        author = Author(author_name.strip())
+        gender = meta.xpath('./a[@class="t0"]')[0].text
+        if not gender:
+            author.sex = author.UNKNOWN
+        elif 'homme' in gender:
             author.sex = author.MALE
-        else:
+        elif 'femme' in gender:
             author.sex = author.FEMALE
+        else:
+            author.sex = author.TRANSEXUAL
 
         author.description = u''
         for para in meta.getchildren():
@@ -142,5 +175,8 @@ class AuthorPage(BasePage):
             if para.tail is not None:
                 author.description += '\n%s' % para.tail.strip()
         author.description = author.description.replace(u'\x92', "'").strip()
+
+        if author.description.startswith(u'0 récit '):
+            self.logger.warning('This author does not have published any story.')
         return author