From 0f589ad94177cf595d2d59e21b7d27b4dbf19f2c Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Wed, 3 Aug 2011 15:16:52 +0200 Subject: [PATCH] several fixes and website errors detection --- weboob/backends/hds/browser.py | 7 ++---- weboob/backends/hds/pages.py | 46 ++++++++++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/weboob/backends/hds/browser.py b/weboob/backends/hds/browser.py index e93ceeab..5bdb4b10 100644 --- a/weboob/backends/hds/browser.py +++ b/weboob/backends/hds/browser.py @@ -36,14 +36,11 @@ class HDSBrowser(BaseBrowser): def iter_stories(self): self.location('/sexe/histoires-par-date.php') n = 1 - while 1: + while self.page.get_numerous() == n: count = 0 for count, story in enumerate(self.page.iter_stories()): yield story - if count < 49: - return - n += 1 self.location('/sexe/histoires-par-date.php?p=%d' % n) @@ -55,7 +52,7 @@ class HDSBrowser(BaseBrowser): return self.page.get_story() def get_author(self, name): - self.location(self.buildurl('/fiche.php', auteur=name)) + self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15'))) assert self.is_on_page(AuthorPage) return self.page.get_author() diff --git a/weboob/backends/hds/pages.py b/weboob/backends/hds/pages.py index 532cc405..4c02a206 100644 --- a/weboob/backends/hds/pages.py +++ b/weboob/backends/hds/pages.py @@ -50,10 +50,16 @@ class Story(object): self.id = id self.title = u'' self.date = None + self.category = None self.author = None self.body = None class HistoryPage(BasePage): + def get_numerous(self): + td = self.parser.select(self.document.getroot(), 'td.t0', 1) + n = td.xpath('//u/strong|//u/b')[0].text + return int(n) + def iter_stories(self): links = self.parser.select(self.document.getroot(), 'a.t11') story = None @@ -67,6 +73,10 @@ class HistoryPage(BasePage): story.title = link.text.strip() else: story.author = Author(link.text.strip()) + if not link.tail: + self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name) + story = None + continue date_text = link.tail.strip().split('\n')[-1].strip() m = re.match('(\d+)-(\d+)-(\d+)', date_text) if not m: @@ -81,6 +91,11 @@ class HistoryPage(BasePage): class StoryPage(BasePage): def get_story(self): + p_tags = self.document.getroot().xpath('//body/p') + if len(p_tags) > 0 and p_tags[0].text.strip() == \ + u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.": + return None + story = Story((self.group_dict['id'])) story.body = u'' meta = self.parser.select(self.document.getroot(), 'td.t0', 1) @@ -103,8 +118,11 @@ class StoryPage(BasePage): self.logger.warning('Unable to know what image is %s' % img.attrib['src']) story.author.email += img.tail.strip() - story.title = self.parser.select(self.document.getroot(), 'h1', 1).text.strip() - date_text = self.parser.select(self.document.getroot(), 'span.t4', 1).text.strip().split('\n')[-1].strip() + title_tag = self.parser.select(self.document.getroot(), 'h1', 1) + story.title = title_tag.text.strip() if title_tag.text else u'' + + span = self.parser.select(self.document.getroot(), 'span.t4', 1) + date_text = span.text.strip().split('\n')[-1].strip() m = re.match('(\d+)-(\d+)-(\d+)', date_text) if m: story.date = datetime.date(int(m.group(3)), @@ -113,6 +131,8 @@ class StoryPage(BasePage): else: self.logger.warning('Unable to parse datetime "%s"' % date_text) + story.category = span.find('br').tail.split(':')[1].strip() + div = self.parser.select(self.document.getroot(), 'div[align=justify]', 1) for para in div.findall('br'): if para.text is not None: @@ -126,12 +146,25 @@ class StoryPage(BasePage): class AuthorPage(BasePage): def get_author(self): + p_tags = self.document.getroot().xpath('//body/div/font/b') + if len(p_tags) > 0 and p_tags[0].text.strip() == \ + u"La fiche de l'auteur n'est plus accessible.": + return None + meta = self.parser.select(self.document.getroot(), 'td.t0', 1) - author = Author(meta.xpath('./span[@class="t3"]')[0].text.strip()) - if 'homme' in meta.xpath('./a[@class="t0"]')[0].text: + author_name = meta.xpath('./span[@class="t3"]')[0].text + if author_name is None: + author_name = self.group_dict['name'] + author = Author(author_name.strip()) + gender = meta.xpath('./a[@class="t0"]')[0].text + if not gender: + author.sex = author.UNKNOWN + elif 'homme' in gender: author.sex = author.MALE - else: + elif 'femme' in gender: author.sex = author.FEMALE + else: + author.sex = author.TRANSEXUAL author.description = u'' for para in meta.getchildren(): @@ -142,5 +175,8 @@ class AuthorPage(BasePage): if para.tail is not None: author.description += '\n%s' % para.tail.strip() author.description = author.description.replace(u'\x92', "'").strip() + + if author.description.startswith(u'0 récit '): + self.logger.warning('This author does not have published any story.') return author