From 89e7bbe9efd8a1da783a10d164ed07d4f82bfc91 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Sat, 11 Oct 2014 16:56:32 +0200 Subject: [PATCH] convert hds module to browser2 and fix it --- contrib/hds/export.py | 2 +- modules/hds/browser.py | 35 +++---- modules/hds/module.py | 16 ++- modules/hds/pages.py | 222 +++++++++++++++++------------------------ 4 files changed, 113 insertions(+), 162 deletions(-) diff --git a/contrib/hds/export.py b/contrib/hds/export.py index 5e324495..2bc660ad 100755 --- a/contrib/hds/export.py +++ b/contrib/hds/export.py @@ -68,7 +68,7 @@ def main(filename): sys.stdout.write('Getting stories list from website... ') sys.stdout.flush() for story in br.iter_stories(): - if story.id in stored: + if int(story.id) in stored: break to_fetch.add(story.id) authors.add(story.author.name) diff --git a/modules/hds/browser.py b/modules/hds/browser.py index 2bc406c1..a5de3797 100644 --- a/modules/hds/browser.py +++ b/modules/hds/browser.py @@ -18,43 +18,40 @@ # along with weboob. If not, see . -from weboob.deprecated.browser import Browser +from weboob.browser import PagesBrowser, URL from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage # Browser -class HDSBrowser(Browser): - ENCODING = 'ISO-8859-1' - DOMAIN = 'histoires-de-sexe.net' - PAGES = {'http://histoires-de-sexe.net/': ValidationPage, - 'http://histoires-de-sexe.net/menu.php': HomePage, - 'http://histoires-de-sexe.net/sexe/histoires-par-date.php.*': HistoryPage, - 'http://histoires-de-sexe.net/sexe.php\?histoire=(?P.+)': StoryPage, - 'http://histoires-de-sexe.net/fiche.php\?auteur=(?P.+)': AuthorPage, - } +class HDSBrowser(PagesBrowser): + BASEURL = 'http://histoires-de-sexe.net' + + validation_page = URL('^/$', ValidationPage) + home = URL(r'/menu.php', HomePage) + history = URL(r'/sexe/histoires-par-date.php\?p=(?P\d+)', HistoryPage) + story = URL(r'/sexe.php\?histoire=(?P.+)', StoryPage) + author = URL(r'/fiche.php\?auteur=(?P.+)', AuthorPage) def iter_stories(self): - self.location('/sexe/histoires-par-date.php') n = 1 + self.history.go(pagenum=n) while self.page.get_numerous() == n: - count = 0 - for count, story in enumerate(self.page.iter_stories()): + for story in self.page.iter_stories(): yield story n += 1 - self.location('/sexe/histoires-par-date.php?p=%d' % n) + self.history.go(pagenum=n) def get_story(self, id): - id = int(id) + self.story.go(id=id) - self.location('/sexe.php?histoire=%d' % id) - assert self.is_on_page(StoryPage) + assert self.story.is_here() return self.page.get_story() def get_author(self, name): - self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15'))) + self.author.go(name=name) - assert self.is_on_page(AuthorPage) + assert self.author.is_here() return self.page.get_author() diff --git a/modules/hds/module.py b/modules/hds/module.py index aa68235b..96c87d45 100644 --- a/modules/hds/module.py +++ b/modules/hds/module.py @@ -40,12 +40,11 @@ class HDSModule(Module, CapMessages): #### CapMessages ############################################## def iter_threads(self): - with self.browser: - for story in self.browser.iter_stories(): - thread = Thread(story.id) - thread.title = story.title - thread.date = story.date - yield thread + for story in self.browser.iter_stories(): + thread = Thread(story.id) + thread.title = story.title + thread.date = story.date + yield thread GENDERS = ['', 'boy', 'girl', 'transexual'] @@ -56,8 +55,7 @@ class HDSModule(Module, CapMessages): else: thread = None - with self.browser: - story = self.browser.get_story(id) + story = self.browser.get_story(id) if not story: return None @@ -80,7 +78,7 @@ class HDSModule(Module, CapMessages): parent=None, content=story.body, children=[], - signature='Written by a %s (%s)' % (self.GENDERS[story.author.sex], story.author.email), + signature=u'Written by a %s in category %s' % (self.GENDERS[story.author.sex], story.category), flags=flags) return thread diff --git a/modules/hds/pages.py b/modules/hds/pages.py index 5a12538a..9b0e52f0 100644 --- a/modules/hds/pages.py +++ b/modules/hds/pages.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2011 Romain Bignon +# Copyright(C) 2014 Romain Bignon # # This file is part of weboob. # @@ -18,17 +18,17 @@ # along with weboob. If not, see . -import datetime -import re - -from weboob.deprecated.browser import Page +from weboob.browser.pages import HTMLPage +from weboob.browser.elements import method, ListElement, ItemElement +from weboob.browser.filters.standard import CleanText, Regexp, Date, Env, Filter +from weboob.browser.filters.html import XPath, Link -class ValidationPage(Page): +class ValidationPage(HTMLPage): pass -class HomePage(Page): +class HomePage(HTMLPage): pass @@ -38,15 +38,23 @@ class Author(object): FEMALE, TRANSEXUAL) = xrange(4) - def __init__(self, name): + def __init__(self, name=None): self.name = name self.sex = self.UNKNOWN - self.email = None self.description = None + class Sex2Enum(Filter): + def filter(self, text): + if text == 'homme': + return Author.MALE + if text == 'femme': + return Author.FEMALE + return Author.TRANSEXUAL + + class Story(object): - def __init__(self, id): + def __init__(self, id=None): self.id = id self.title = u'' self.date = None @@ -55,129 +63,77 @@ class Story(object): self.body = None -class HistoryPage(Page): +class HistoryPage(HTMLPage): + ENCODING = 'iso-8859-1' + def get_numerous(self): - td = self.parser.select(self.document.getroot(), 'td.t0', 1) - n = td.xpath('//u/strong|//u/b')[0].text - return int(n) + return int(CleanText('//div[@align="justify"]/table[1]//td[has-class("t0")]/font/u/strong[1]')(self.doc)) - def iter_stories(self): - links = self.parser.select(self.document.getroot(), 'a.t11') - story = None - for link in links: - if not story: - m = re.match('.*histoire=(\d+)', link.attrib['href']) - if not m: - self.logger.warning('Unable to parse ID "%s"' % link.attrib['href']) + @method + class iter_stories(ListElement): + item_xpath = '//div[@align="justify"]/span[has-class("t4")]' + + class item(ItemElement): + klass = Story + + def parse(self, el): + self.env['header'] = el.getprevious().xpath('.//span')[0] + self.env['body'] = el.getnext().xpath('.//a') + + obj_id = XPath(Env('body')) & Link & Regexp(pattern=r'.*histoire=(\d+)') + obj_title = CleanText('.') + obj_date = XPath(Env('header')) & CleanText & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date + obj_category = XPath(Env('header')) & CleanText & Regexp(pattern=u'Catégorie :\s*(.*)\s*Auteur') + + def obj_author(self): + return Author(self.env['header'].xpath('.//a/text()')[0]) + +class StoryPage(HTMLPage): + ENCODING = 'iso-8859-1' + + @method + class get_story(ItemElement): + klass = Story + + obj_id = Env('id') + obj_title = CleanText('//h1') + obj_date = CleanText('//span[has-class("t4")]') & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date + obj_category = CleanText('//a[starts-with(@href, "histoires-cat")]') + + def obj_body(self): + div = self.el.xpath('//div[@align="justify"]')[0] + body = '' + for para in div.findall('br'): + if para.text is not None: + body += para.text.strip() + body += '\n' + if para.tail is not None: + body += para.tail.strip() + return body.replace(u'\x92', "'").strip() + + + class obj_author(ItemElement): + klass = Author + + obj_name = CleanText('//a[starts-with(@href, "fiche.php")][2]') + obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum + + +class AuthorPage(HTMLPage): + @method + class get_author(ItemElement): + klass = Author + + obj_name = CleanText('//span[has-class("t3")]') + obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum + + def obj_description(self): + description = u'' + for para in self.el.xpath('//td[has-class("t0")]')[0].getchildren(): + if para.tag not in ('b', 'br'): continue - story = Story(int(m.group(1))) - story.title = link.text.strip() - else: - story.author = Author(link.text.strip()) - if not link.tail: - self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name) - story = None - continue - date_text = link.tail.strip().split('\n')[-1].strip() - m = re.match('.*, le (\d+)-(\d+)-(\d+)', date_text) - if not m: - self.logger.warning('Unable to parse datetime "%s"' % date_text) - story = None - continue - story.date = datetime.date(int(m.group(3)), - int(m.group(2)), - int(m.group(1))) - yield story - story = None - - -class StoryPage(Page): - def get_story(self): - p_tags = self.document.getroot().xpath('//body/p') - if len(p_tags) > 0 and p_tags[0].text.strip() == \ - u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.": - return None - - story = Story((self.group_dict['id'])) - story.body = u'' - meta = self.parser.select(self.document.getroot(), 'td.t0', 1) - story.author = Author(meta.xpath('./a[@class="t3"]')[0].text.strip()) - gender = meta.xpath('./a[@class="t0"]')[0].text - if 'homme' in gender: - story.author.sex = story.author.MALE - elif 'femme' in gender: - story.author.sex = story.author.FEMALE - else: - story.author.sex = story.author.TRANSEXUAL - email_tag = meta.xpath('./span[@class="police1"]')[0] - story.author.email = email_tag.text.strip() - for img in email_tag.findall('img'): - if img.attrib['src'].endswith('meyle1.gif'): - story.author.email += '@' - elif img.attrib['src'].endswith('meyle1pouan.gif'): - story.author.email += '.' - else: - self.logger.warning('Unable to know what image is %s' % img.attrib['src']) - story.author.email += img.tail.strip() - - title_tag = self.parser.select(self.document.getroot(), 'h1', 1) - story.title = title_tag.text.strip() if title_tag.text else u'' - - span = self.parser.select(self.document.getroot(), 'span.t4', 1) - date_text = span.text.strip().split('\n')[-1].strip() - m = re.match('(\d+)-(\d+)-(\d+)', date_text) - if m: - story.date = datetime.date(int(m.group(3)), - int(m.group(2)), - int(m.group(1))) - else: - self.logger.warning('Unable to parse datetime "%s"' % date_text) - - story.category = span.find('br').tail.split(':')[1].strip() - - div = self.parser.select(self.document.getroot(), 'div[align=justify]', 1) - for para in div.findall('br'): - if para.text is not None: - story.body += para.text.strip() - story.body += '\n' - if para.tail is not None: - story.body += para.tail.strip() - story.body = story.body.replace(u'\x92', "'").strip() - return story - - -class AuthorPage(Page): - def get_author(self): - p_tags = self.document.getroot().xpath('//body/div/font/b') - if len(p_tags) > 0 and p_tags[0].text.strip() == \ - u"La fiche de l'auteur n'est plus accessible.": - return None - - meta = self.parser.select(self.document.getroot(), 'td.t0', 1) - author_name = meta.xpath('./span[@class="t3"]')[0].text - if author_name is None: - author_name = self.group_dict['name'] - author = Author(author_name.strip()) - gender = meta.xpath('./a[@class="t0"]')[0].text - if not gender: - author.sex = author.UNKNOWN - elif 'homme' in gender: - author.sex = author.MALE - elif 'femme' in gender: - author.sex = author.FEMALE - else: - author.sex = author.TRANSEXUAL - - author.description = u'' - for para in meta.getchildren(): - if para.tag not in ('b', 'br'): - continue - if para.text is not None: - author.description += '\n\n%s' % para.text.strip() - if para.tail is not None: - author.description += '\n%s' % para.tail.strip() - author.description = author.description.replace(u'\x92', "'").strip() - - if author.description.startswith(u'0 récit '): - self.logger.warning('This author does not have published any story.') - return author + if para.text is not None: + description += '\n\n%s' % para.text.strip() + if para.tail is not None: + description += '\n%s' % para.tail.strip() + return description.replace(u'\x92', "'").strip()