convert hds module to browser2 and fix it

This commit is contained in:
Romain Bignon 2014-10-11 16:56:32 +02:00
commit 89e7bbe9ef
4 changed files with 114 additions and 163 deletions

View file

@ -68,7 +68,7 @@ def main(filename):
sys.stdout.write('Getting stories list from website... ')
sys.stdout.flush()
for story in br.iter_stories():
if story.id in stored:
if int(story.id) in stored:
break
to_fetch.add(story.id)
authors.add(story.author.name)

View file

@ -18,43 +18,40 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser import Browser
from weboob.browser import PagesBrowser, URL
from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage
# Browser
class HDSBrowser(Browser):
ENCODING = 'ISO-8859-1'
DOMAIN = 'histoires-de-sexe.net'
PAGES = {'http://histoires-de-sexe.net/': ValidationPage,
'http://histoires-de-sexe.net/menu.php': HomePage,
'http://histoires-de-sexe.net/sexe/histoires-par-date.php.*': HistoryPage,
'http://histoires-de-sexe.net/sexe.php\?histoire=(?P<id>.+)': StoryPage,
'http://histoires-de-sexe.net/fiche.php\?auteur=(?P<name>.+)': AuthorPage,
}
class HDSBrowser(PagesBrowser):
BASEURL = 'http://histoires-de-sexe.net'
validation_page = URL('^/$', ValidationPage)
home = URL(r'/menu.php', HomePage)
history = URL(r'/sexe/histoires-par-date.php\?p=(?P<pagenum>\d+)', HistoryPage)
story = URL(r'/sexe.php\?histoire=(?P<id>.+)', StoryPage)
author = URL(r'/fiche.php\?auteur=(?P<name>.+)', AuthorPage)
def iter_stories(self):
self.location('/sexe/histoires-par-date.php')
n = 1
self.history.go(pagenum=n)
while self.page.get_numerous() == n:
count = 0
for count, story in enumerate(self.page.iter_stories()):
for story in self.page.iter_stories():
yield story
n += 1
self.location('/sexe/histoires-par-date.php?p=%d' % n)
self.history.go(pagenum=n)
def get_story(self, id):
id = int(id)
self.story.go(id=id)
self.location('/sexe.php?histoire=%d' % id)
assert self.is_on_page(StoryPage)
assert self.story.is_here()
return self.page.get_story()
def get_author(self, name):
self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15')))
self.author.go(name=name)
assert self.is_on_page(AuthorPage)
assert self.author.is_here()
return self.page.get_author()

View file

@ -40,12 +40,11 @@ class HDSModule(Module, CapMessages):
#### CapMessages ##############################################
def iter_threads(self):
with self.browser:
for story in self.browser.iter_stories():
thread = Thread(story.id)
thread.title = story.title
thread.date = story.date
yield thread
for story in self.browser.iter_stories():
thread = Thread(story.id)
thread.title = story.title
thread.date = story.date
yield thread
GENDERS = ['<unknown>', 'boy', 'girl', 'transexual']
@ -56,8 +55,7 @@ class HDSModule(Module, CapMessages):
else:
thread = None
with self.browser:
story = self.browser.get_story(id)
story = self.browser.get_story(id)
if not story:
return None
@ -80,7 +78,7 @@ class HDSModule(Module, CapMessages):
parent=None,
content=story.body,
children=[],
signature='Written by a %s (%s)' % (self.GENDERS[story.author.sex], story.author.email),
signature=u'Written by a %s in category %s' % (self.GENDERS[story.author.sex], story.category),
flags=flags)
return thread

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Romain Bignon
# Copyright(C) 2014 Romain Bignon
#
# This file is part of weboob.
#
@ -18,17 +18,17 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
import re
from weboob.deprecated.browser import Page
from weboob.browser.pages import HTMLPage
from weboob.browser.elements import method, ListElement, ItemElement
from weboob.browser.filters.standard import CleanText, Regexp, Date, Env, Filter
from weboob.browser.filters.html import XPath, Link
class ValidationPage(Page):
class ValidationPage(HTMLPage):
pass
class HomePage(Page):
class HomePage(HTMLPage):
pass
@ -38,15 +38,23 @@ class Author(object):
FEMALE,
TRANSEXUAL) = xrange(4)
def __init__(self, name):
def __init__(self, name=None):
self.name = name
self.sex = self.UNKNOWN
self.email = None
self.description = None
class Sex2Enum(Filter):
def filter(self, text):
if text == 'homme':
return Author.MALE
if text == 'femme':
return Author.FEMALE
return Author.TRANSEXUAL
class Story(object):
def __init__(self, id):
def __init__(self, id=None):
self.id = id
self.title = u''
self.date = None
@ -55,129 +63,77 @@ class Story(object):
self.body = None
class HistoryPage(Page):
class HistoryPage(HTMLPage):
ENCODING = 'iso-8859-1'
def get_numerous(self):
td = self.parser.select(self.document.getroot(), 'td.t0', 1)
n = td.xpath('//u/strong|//u/b')[0].text
return int(n)
return int(CleanText('//div[@align="justify"]/table[1]//td[has-class("t0")]/font/u/strong[1]')(self.doc))
def iter_stories(self):
links = self.parser.select(self.document.getroot(), 'a.t11')
story = None
for link in links:
if not story:
m = re.match('.*histoire=(\d+)', link.attrib['href'])
if not m:
self.logger.warning('Unable to parse ID "%s"' % link.attrib['href'])
@method
class iter_stories(ListElement):
item_xpath = '//div[@align="justify"]/span[has-class("t4")]'
class item(ItemElement):
klass = Story
def parse(self, el):
self.env['header'] = el.getprevious().xpath('.//span')[0]
self.env['body'] = el.getnext().xpath('.//a')
obj_id = XPath(Env('body')) & Link & Regexp(pattern=r'.*histoire=(\d+)')
obj_title = CleanText('.')
obj_date = XPath(Env('header')) & CleanText & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
obj_category = XPath(Env('header')) & CleanText & Regexp(pattern=u'Catégorie :\s*(.*)\s*Auteur')
def obj_author(self):
return Author(self.env['header'].xpath('.//a/text()')[0])
class StoryPage(HTMLPage):
ENCODING = 'iso-8859-1'
@method
class get_story(ItemElement):
klass = Story
obj_id = Env('id')
obj_title = CleanText('//h1')
obj_date = CleanText('//span[has-class("t4")]') & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
obj_category = CleanText('//a[starts-with(@href, "histoires-cat")]')
def obj_body(self):
div = self.el.xpath('//div[@align="justify"]')[0]
body = ''
for para in div.findall('br'):
if para.text is not None:
body += para.text.strip()
body += '\n'
if para.tail is not None:
body += para.tail.strip()
return body.replace(u'\x92', "'").strip()
class obj_author(ItemElement):
klass = Author
obj_name = CleanText('//a[starts-with(@href, "fiche.php")][2]')
obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
class AuthorPage(HTMLPage):
@method
class get_author(ItemElement):
klass = Author
obj_name = CleanText('//span[has-class("t3")]')
obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
def obj_description(self):
description = u''
for para in self.el.xpath('//td[has-class("t0")]')[0].getchildren():
if para.tag not in ('b', 'br'):
continue
story = Story(int(m.group(1)))
story.title = link.text.strip()
else:
story.author = Author(link.text.strip())
if not link.tail:
self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name)
story = None
continue
date_text = link.tail.strip().split('\n')[-1].strip()
m = re.match('.*, le (\d+)-(\d+)-(\d+)', date_text)
if not m:
self.logger.warning('Unable to parse datetime "%s"' % date_text)
story = None
continue
story.date = datetime.date(int(m.group(3)),
int(m.group(2)),
int(m.group(1)))
yield story
story = None
class StoryPage(Page):
def get_story(self):
p_tags = self.document.getroot().xpath('//body/p')
if len(p_tags) > 0 and p_tags[0].text.strip() == \
u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.":
return None
story = Story((self.group_dict['id']))
story.body = u''
meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
story.author = Author(meta.xpath('./a[@class="t3"]')[0].text.strip())
gender = meta.xpath('./a[@class="t0"]')[0].text
if 'homme' in gender:
story.author.sex = story.author.MALE
elif 'femme' in gender:
story.author.sex = story.author.FEMALE
else:
story.author.sex = story.author.TRANSEXUAL
email_tag = meta.xpath('./span[@class="police1"]')[0]
story.author.email = email_tag.text.strip()
for img in email_tag.findall('img'):
if img.attrib['src'].endswith('meyle1.gif'):
story.author.email += '@'
elif img.attrib['src'].endswith('meyle1pouan.gif'):
story.author.email += '.'
else:
self.logger.warning('Unable to know what image is %s' % img.attrib['src'])
story.author.email += img.tail.strip()
title_tag = self.parser.select(self.document.getroot(), 'h1', 1)
story.title = title_tag.text.strip() if title_tag.text else u''
span = self.parser.select(self.document.getroot(), 'span.t4', 1)
date_text = span.text.strip().split('\n')[-1].strip()
m = re.match('(\d+)-(\d+)-(\d+)', date_text)
if m:
story.date = datetime.date(int(m.group(3)),
int(m.group(2)),
int(m.group(1)))
else:
self.logger.warning('Unable to parse datetime "%s"' % date_text)
story.category = span.find('br').tail.split(':')[1].strip()
div = self.parser.select(self.document.getroot(), 'div[align=justify]', 1)
for para in div.findall('br'):
if para.text is not None:
story.body += para.text.strip()
story.body += '\n'
if para.tail is not None:
story.body += para.tail.strip()
story.body = story.body.replace(u'\x92', "'").strip()
return story
class AuthorPage(Page):
def get_author(self):
p_tags = self.document.getroot().xpath('//body/div/font/b')
if len(p_tags) > 0 and p_tags[0].text.strip() == \
u"La fiche de l'auteur n'est plus accessible.":
return None
meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
author_name = meta.xpath('./span[@class="t3"]')[0].text
if author_name is None:
author_name = self.group_dict['name']
author = Author(author_name.strip())
gender = meta.xpath('./a[@class="t0"]')[0].text
if not gender:
author.sex = author.UNKNOWN
elif 'homme' in gender:
author.sex = author.MALE
elif 'femme' in gender:
author.sex = author.FEMALE
else:
author.sex = author.TRANSEXUAL
author.description = u''
for para in meta.getchildren():
if para.tag not in ('b', 'br'):
continue
if para.text is not None:
author.description += '\n\n%s' % para.text.strip()
if para.tail is not None:
author.description += '\n%s' % para.tail.strip()
author.description = author.description.replace(u'\x92', "'").strip()
if author.description.startswith(u'0 récit '):
self.logger.warning('This author does not have published any story.')
return author
if para.text is not None:
description += '\n\n%s' % para.text.strip()
if para.tail is not None:
description += '\n%s' % para.tail.strip()
return description.replace(u'\x92', "'").strip()