diff --git a/contrib/hds/export.py b/contrib/hds/export.py
index 5e324495..2bc660ad 100755
--- a/contrib/hds/export.py
+++ b/contrib/hds/export.py
@@ -68,7 +68,7 @@ def main(filename):
sys.stdout.write('Getting stories list from website... ')
sys.stdout.flush()
for story in br.iter_stories():
- if story.id in stored:
+ if int(story.id) in stored:
break
to_fetch.add(story.id)
authors.add(story.author.name)
diff --git a/modules/hds/browser.py b/modules/hds/browser.py
index 2bc406c1..a5de3797 100644
--- a/modules/hds/browser.py
+++ b/modules/hds/browser.py
@@ -18,43 +18,40 @@
# along with weboob. If not, see .
-from weboob.deprecated.browser import Browser
+from weboob.browser import PagesBrowser, URL
from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage
# Browser
-class HDSBrowser(Browser):
- ENCODING = 'ISO-8859-1'
- DOMAIN = 'histoires-de-sexe.net'
- PAGES = {'http://histoires-de-sexe.net/': ValidationPage,
- 'http://histoires-de-sexe.net/menu.php': HomePage,
- 'http://histoires-de-sexe.net/sexe/histoires-par-date.php.*': HistoryPage,
- 'http://histoires-de-sexe.net/sexe.php\?histoire=(?P.+)': StoryPage,
- 'http://histoires-de-sexe.net/fiche.php\?auteur=(?P.+)': AuthorPage,
- }
+class HDSBrowser(PagesBrowser):
+ BASEURL = 'http://histoires-de-sexe.net'
+
+ validation_page = URL('^/$', ValidationPage)
+ home = URL(r'/menu.php', HomePage)
+ history = URL(r'/sexe/histoires-par-date.php\?p=(?P\d+)', HistoryPage)
+ story = URL(r'/sexe.php\?histoire=(?P.+)', StoryPage)
+ author = URL(r'/fiche.php\?auteur=(?P.+)', AuthorPage)
def iter_stories(self):
- self.location('/sexe/histoires-par-date.php')
n = 1
+ self.history.go(pagenum=n)
while self.page.get_numerous() == n:
- count = 0
- for count, story in enumerate(self.page.iter_stories()):
+ for story in self.page.iter_stories():
yield story
n += 1
- self.location('/sexe/histoires-par-date.php?p=%d' % n)
+ self.history.go(pagenum=n)
def get_story(self, id):
- id = int(id)
+ self.story.go(id=id)
- self.location('/sexe.php?histoire=%d' % id)
- assert self.is_on_page(StoryPage)
+ assert self.story.is_here()
return self.page.get_story()
def get_author(self, name):
- self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15')))
+ self.author.go(name=name)
- assert self.is_on_page(AuthorPage)
+ assert self.author.is_here()
return self.page.get_author()
diff --git a/modules/hds/module.py b/modules/hds/module.py
index aa68235b..96c87d45 100644
--- a/modules/hds/module.py
+++ b/modules/hds/module.py
@@ -40,12 +40,11 @@ class HDSModule(Module, CapMessages):
#### CapMessages ##############################################
def iter_threads(self):
- with self.browser:
- for story in self.browser.iter_stories():
- thread = Thread(story.id)
- thread.title = story.title
- thread.date = story.date
- yield thread
+ for story in self.browser.iter_stories():
+ thread = Thread(story.id)
+ thread.title = story.title
+ thread.date = story.date
+ yield thread
GENDERS = ['', 'boy', 'girl', 'transexual']
@@ -56,8 +55,7 @@ class HDSModule(Module, CapMessages):
else:
thread = None
- with self.browser:
- story = self.browser.get_story(id)
+ story = self.browser.get_story(id)
if not story:
return None
@@ -80,7 +78,7 @@ class HDSModule(Module, CapMessages):
parent=None,
content=story.body,
children=[],
- signature='Written by a %s (%s)' % (self.GENDERS[story.author.sex], story.author.email),
+ signature=u'Written by a %s in category %s' % (self.GENDERS[story.author.sex], story.category),
flags=flags)
return thread
diff --git a/modules/hds/pages.py b/modules/hds/pages.py
index 5a12538a..9b0e52f0 100644
--- a/modules/hds/pages.py
+++ b/modules/hds/pages.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright(C) 2011 Romain Bignon
+# Copyright(C) 2014 Romain Bignon
#
# This file is part of weboob.
#
@@ -18,17 +18,17 @@
# along with weboob. If not, see .
-import datetime
-import re
-
-from weboob.deprecated.browser import Page
+from weboob.browser.pages import HTMLPage
+from weboob.browser.elements import method, ListElement, ItemElement
+from weboob.browser.filters.standard import CleanText, Regexp, Date, Env, Filter
+from weboob.browser.filters.html import XPath, Link
-class ValidationPage(Page):
+class ValidationPage(HTMLPage):
pass
-class HomePage(Page):
+class HomePage(HTMLPage):
pass
@@ -38,15 +38,23 @@ class Author(object):
FEMALE,
TRANSEXUAL) = xrange(4)
- def __init__(self, name):
+ def __init__(self, name=None):
self.name = name
self.sex = self.UNKNOWN
- self.email = None
self.description = None
+ class Sex2Enum(Filter):
+ def filter(self, text):
+ if text == 'homme':
+ return Author.MALE
+ if text == 'femme':
+ return Author.FEMALE
+ return Author.TRANSEXUAL
+
+
class Story(object):
- def __init__(self, id):
+ def __init__(self, id=None):
self.id = id
self.title = u''
self.date = None
@@ -55,129 +63,77 @@ class Story(object):
self.body = None
-class HistoryPage(Page):
+class HistoryPage(HTMLPage):
+ ENCODING = 'iso-8859-1'
+
def get_numerous(self):
- td = self.parser.select(self.document.getroot(), 'td.t0', 1)
- n = td.xpath('//u/strong|//u/b')[0].text
- return int(n)
+ return int(CleanText('//div[@align="justify"]/table[1]//td[has-class("t0")]/font/u/strong[1]')(self.doc))
- def iter_stories(self):
- links = self.parser.select(self.document.getroot(), 'a.t11')
- story = None
- for link in links:
- if not story:
- m = re.match('.*histoire=(\d+)', link.attrib['href'])
- if not m:
- self.logger.warning('Unable to parse ID "%s"' % link.attrib['href'])
+ @method
+ class iter_stories(ListElement):
+ item_xpath = '//div[@align="justify"]/span[has-class("t4")]'
+
+ class item(ItemElement):
+ klass = Story
+
+ def parse(self, el):
+ self.env['header'] = el.getprevious().xpath('.//span')[0]
+ self.env['body'] = el.getnext().xpath('.//a')
+
+ obj_id = XPath(Env('body')) & Link & Regexp(pattern=r'.*histoire=(\d+)')
+ obj_title = CleanText('.')
+ obj_date = XPath(Env('header')) & CleanText & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
+ obj_category = XPath(Env('header')) & CleanText & Regexp(pattern=u'Catégorie :\s*(.*)\s*Auteur')
+
+ def obj_author(self):
+ return Author(self.env['header'].xpath('.//a/text()')[0])
+
+class StoryPage(HTMLPage):
+ ENCODING = 'iso-8859-1'
+
+ @method
+ class get_story(ItemElement):
+ klass = Story
+
+ obj_id = Env('id')
+ obj_title = CleanText('//h1')
+ obj_date = CleanText('//span[has-class("t4")]') & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
+ obj_category = CleanText('//a[starts-with(@href, "histoires-cat")]')
+
+ def obj_body(self):
+ div = self.el.xpath('//div[@align="justify"]')[0]
+ body = ''
+ for para in div.findall('br'):
+ if para.text is not None:
+ body += para.text.strip()
+ body += '\n'
+ if para.tail is not None:
+ body += para.tail.strip()
+ return body.replace(u'\x92', "'").strip()
+
+
+ class obj_author(ItemElement):
+ klass = Author
+
+ obj_name = CleanText('//a[starts-with(@href, "fiche.php")][2]')
+ obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
+
+
+class AuthorPage(HTMLPage):
+ @method
+ class get_author(ItemElement):
+ klass = Author
+
+ obj_name = CleanText('//span[has-class("t3")]')
+ obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
+
+ def obj_description(self):
+ description = u''
+ for para in self.el.xpath('//td[has-class("t0")]')[0].getchildren():
+ if para.tag not in ('b', 'br'):
continue
- story = Story(int(m.group(1)))
- story.title = link.text.strip()
- else:
- story.author = Author(link.text.strip())
- if not link.tail:
- self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name)
- story = None
- continue
- date_text = link.tail.strip().split('\n')[-1].strip()
- m = re.match('.*, le (\d+)-(\d+)-(\d+)', date_text)
- if not m:
- self.logger.warning('Unable to parse datetime "%s"' % date_text)
- story = None
- continue
- story.date = datetime.date(int(m.group(3)),
- int(m.group(2)),
- int(m.group(1)))
- yield story
- story = None
-
-
-class StoryPage(Page):
- def get_story(self):
- p_tags = self.document.getroot().xpath('//body/p')
- if len(p_tags) > 0 and p_tags[0].text.strip() == \
- u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.":
- return None
-
- story = Story((self.group_dict['id']))
- story.body = u''
- meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
- story.author = Author(meta.xpath('./a[@class="t3"]')[0].text.strip())
- gender = meta.xpath('./a[@class="t0"]')[0].text
- if 'homme' in gender:
- story.author.sex = story.author.MALE
- elif 'femme' in gender:
- story.author.sex = story.author.FEMALE
- else:
- story.author.sex = story.author.TRANSEXUAL
- email_tag = meta.xpath('./span[@class="police1"]')[0]
- story.author.email = email_tag.text.strip()
- for img in email_tag.findall('img'):
- if img.attrib['src'].endswith('meyle1.gif'):
- story.author.email += '@'
- elif img.attrib['src'].endswith('meyle1pouan.gif'):
- story.author.email += '.'
- else:
- self.logger.warning('Unable to know what image is %s' % img.attrib['src'])
- story.author.email += img.tail.strip()
-
- title_tag = self.parser.select(self.document.getroot(), 'h1', 1)
- story.title = title_tag.text.strip() if title_tag.text else u''
-
- span = self.parser.select(self.document.getroot(), 'span.t4', 1)
- date_text = span.text.strip().split('\n')[-1].strip()
- m = re.match('(\d+)-(\d+)-(\d+)', date_text)
- if m:
- story.date = datetime.date(int(m.group(3)),
- int(m.group(2)),
- int(m.group(1)))
- else:
- self.logger.warning('Unable to parse datetime "%s"' % date_text)
-
- story.category = span.find('br').tail.split(':')[1].strip()
-
- div = self.parser.select(self.document.getroot(), 'div[align=justify]', 1)
- for para in div.findall('br'):
- if para.text is not None:
- story.body += para.text.strip()
- story.body += '\n'
- if para.tail is not None:
- story.body += para.tail.strip()
- story.body = story.body.replace(u'\x92', "'").strip()
- return story
-
-
-class AuthorPage(Page):
- def get_author(self):
- p_tags = self.document.getroot().xpath('//body/div/font/b')
- if len(p_tags) > 0 and p_tags[0].text.strip() == \
- u"La fiche de l'auteur n'est plus accessible.":
- return None
-
- meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
- author_name = meta.xpath('./span[@class="t3"]')[0].text
- if author_name is None:
- author_name = self.group_dict['name']
- author = Author(author_name.strip())
- gender = meta.xpath('./a[@class="t0"]')[0].text
- if not gender:
- author.sex = author.UNKNOWN
- elif 'homme' in gender:
- author.sex = author.MALE
- elif 'femme' in gender:
- author.sex = author.FEMALE
- else:
- author.sex = author.TRANSEXUAL
-
- author.description = u''
- for para in meta.getchildren():
- if para.tag not in ('b', 'br'):
- continue
- if para.text is not None:
- author.description += '\n\n%s' % para.text.strip()
- if para.tail is not None:
- author.description += '\n%s' % para.tail.strip()
- author.description = author.description.replace(u'\x92', "'").strip()
-
- if author.description.startswith(u'0 récit '):
- self.logger.warning('This author does not have published any story.')
- return author
+ if para.text is not None:
+ description += '\n\n%s' % para.text.strip()
+ if para.tail is not None:
+ description += '\n%s' % para.tail.strip()
+ return description.replace(u'\x92', "'").strip()