convert hds module to browser2 and fix it

This commit is contained in:
Romain Bignon 2014-10-11 16:56:32 +02:00
commit 89e7bbe9ef
4 changed files with 114 additions and 163 deletions

View file

@ -68,7 +68,7 @@ def main(filename):
sys.stdout.write('Getting stories list from website... ') sys.stdout.write('Getting stories list from website... ')
sys.stdout.flush() sys.stdout.flush()
for story in br.iter_stories(): for story in br.iter_stories():
if story.id in stored: if int(story.id) in stored:
break break
to_fetch.add(story.id) to_fetch.add(story.id)
authors.add(story.author.name) authors.add(story.author.name)

View file

@ -18,43 +18,40 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser import Browser from weboob.browser import PagesBrowser, URL
from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage
# Browser # Browser
class HDSBrowser(Browser): class HDSBrowser(PagesBrowser):
ENCODING = 'ISO-8859-1' BASEURL = 'http://histoires-de-sexe.net'
DOMAIN = 'histoires-de-sexe.net'
PAGES = {'http://histoires-de-sexe.net/': ValidationPage, validation_page = URL('^/$', ValidationPage)
'http://histoires-de-sexe.net/menu.php': HomePage, home = URL(r'/menu.php', HomePage)
'http://histoires-de-sexe.net/sexe/histoires-par-date.php.*': HistoryPage, history = URL(r'/sexe/histoires-par-date.php\?p=(?P<pagenum>\d+)', HistoryPage)
'http://histoires-de-sexe.net/sexe.php\?histoire=(?P<id>.+)': StoryPage, story = URL(r'/sexe.php\?histoire=(?P<id>.+)', StoryPage)
'http://histoires-de-sexe.net/fiche.php\?auteur=(?P<name>.+)': AuthorPage, author = URL(r'/fiche.php\?auteur=(?P<name>.+)', AuthorPage)
}
def iter_stories(self): def iter_stories(self):
self.location('/sexe/histoires-par-date.php')
n = 1 n = 1
self.history.go(pagenum=n)
while self.page.get_numerous() == n: while self.page.get_numerous() == n:
count = 0 for story in self.page.iter_stories():
for count, story in enumerate(self.page.iter_stories()):
yield story yield story
n += 1 n += 1
self.location('/sexe/histoires-par-date.php?p=%d' % n) self.history.go(pagenum=n)
def get_story(self, id): def get_story(self, id):
id = int(id) self.story.go(id=id)
self.location('/sexe.php?histoire=%d' % id) assert self.story.is_here()
assert self.is_on_page(StoryPage)
return self.page.get_story() return self.page.get_story()
def get_author(self, name): def get_author(self, name):
self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15'))) self.author.go(name=name)
assert self.is_on_page(AuthorPage) assert self.author.is_here()
return self.page.get_author() return self.page.get_author()

View file

@ -40,12 +40,11 @@ class HDSModule(Module, CapMessages):
#### CapMessages ############################################## #### CapMessages ##############################################
def iter_threads(self): def iter_threads(self):
with self.browser: for story in self.browser.iter_stories():
for story in self.browser.iter_stories(): thread = Thread(story.id)
thread = Thread(story.id) thread.title = story.title
thread.title = story.title thread.date = story.date
thread.date = story.date yield thread
yield thread
GENDERS = ['<unknown>', 'boy', 'girl', 'transexual'] GENDERS = ['<unknown>', 'boy', 'girl', 'transexual']
@ -56,8 +55,7 @@ class HDSModule(Module, CapMessages):
else: else:
thread = None thread = None
with self.browser: story = self.browser.get_story(id)
story = self.browser.get_story(id)
if not story: if not story:
return None return None
@ -80,7 +78,7 @@ class HDSModule(Module, CapMessages):
parent=None, parent=None,
content=story.body, content=story.body,
children=[], children=[],
signature='Written by a %s (%s)' % (self.GENDERS[story.author.sex], story.author.email), signature=u'Written by a %s in category %s' % (self.GENDERS[story.author.sex], story.category),
flags=flags) flags=flags)
return thread return thread

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2011 Romain Bignon # Copyright(C) 2014 Romain Bignon
# #
# This file is part of weboob. # This file is part of weboob.
# #
@ -18,17 +18,17 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime from weboob.browser.pages import HTMLPage
import re from weboob.browser.elements import method, ListElement, ItemElement
from weboob.browser.filters.standard import CleanText, Regexp, Date, Env, Filter
from weboob.deprecated.browser import Page from weboob.browser.filters.html import XPath, Link
class ValidationPage(Page): class ValidationPage(HTMLPage):
pass pass
class HomePage(Page): class HomePage(HTMLPage):
pass pass
@ -38,15 +38,23 @@ class Author(object):
FEMALE, FEMALE,
TRANSEXUAL) = xrange(4) TRANSEXUAL) = xrange(4)
def __init__(self, name): def __init__(self, name=None):
self.name = name self.name = name
self.sex = self.UNKNOWN self.sex = self.UNKNOWN
self.email = None
self.description = None self.description = None
class Sex2Enum(Filter):
    """Filter turning the site's French gender word into an ``Author`` sex constant.

    ``'homme'`` gives ``Author.MALE`` and ``'femme'`` gives ``Author.FEMALE``;
    any other value gives ``Author.TRANSEXUAL``.
    """
    # NOTE(review): other code in this file refers to this filter as
    # ``Author.Sex2Enum``, so it is presumably nested inside ``Author`` --
    # indentation is not visible here, confirm before moving it.

    def filter(self, text):
        """Return the ``Author`` sex constant corresponding to *text*."""
        # Built at call time: ``Author`` may not be fully defined yet when
        # this class body is evaluated.
        known = {
            'homme': Author.MALE,
            'femme': Author.FEMALE,
        }
        return known.get(text, Author.TRANSEXUAL)
class Story(object): class Story(object):
def __init__(self, id): def __init__(self, id=None):
self.id = id self.id = id
self.title = u'' self.title = u''
self.date = None self.date = None
@ -55,129 +63,77 @@ class Story(object):
self.body = None self.body = None
class HistoryPage(Page): class HistoryPage(HTMLPage):
ENCODING = 'iso-8859-1'
def get_numerous(self): def get_numerous(self):
td = self.parser.select(self.document.getroot(), 'td.t0', 1) return int(CleanText('//div[@align="justify"]/table[1]//td[has-class("t0")]/font/u/strong[1]')(self.doc))
n = td.xpath('//u/strong|//u/b')[0].text
return int(n)
def iter_stories(self): @method
links = self.parser.select(self.document.getroot(), 'a.t11') class iter_stories(ListElement):
story = None item_xpath = '//div[@align="justify"]/span[has-class("t4")]'
for link in links:
if not story: class item(ItemElement):
m = re.match('.*histoire=(\d+)', link.attrib['href']) klass = Story
if not m:
self.logger.warning('Unable to parse ID "%s"' % link.attrib['href']) def parse(self, el):
self.env['header'] = el.getprevious().xpath('.//span')[0]
self.env['body'] = el.getnext().xpath('.//a')
obj_id = XPath(Env('body')) & Link & Regexp(pattern=r'.*histoire=(\d+)')
obj_title = CleanText('.')
obj_date = XPath(Env('header')) & CleanText & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
obj_category = XPath(Env('header')) & CleanText & Regexp(pattern=u'Catégorie :\s*(.*)\s*Auteur')
def obj_author(self):
return Author(self.env['header'].xpath('.//a/text()')[0])
class StoryPage(HTMLPage):
    """Page of a single story; ``get_story`` extracts it as a ``Story``."""

    ENCODING = 'iso-8859-1'

    @method
    class get_story(ItemElement):
        """Item element filling a ``Story`` from the document."""

        klass = Story

        # The id is not scraped from the document; it is read from the
        # element environment under the key 'id'.
        obj_id = Env('id')
        obj_title = CleanText('//h1')
        # The page shows the date as "le DD-MM-YYYY"; the template reorders
        # the captured groups to YYYY-MM-DD before handing them to Date.
        obj_date = (
            CleanText('//span[has-class("t4")]')
            & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1')
            & Date
        )
        obj_category = CleanText('//a[starts-with(@href, "histoires-cat")]')

        def obj_body(self):
            """Return the story text, one line per ``<br>`` of the justified div."""
            container = self.el.xpath('//div[@align="justify"]')[0]
            pieces = []
            for br in container.findall('br'):
                if br.text is not None:
                    pieces.append(br.text.strip())
                # Every <br> contributes a newline, followed by the text
                # that comes after it (its tail).
                pieces.append('\n')
                if br.tail is not None:
                    pieces.append(br.tail.strip())
            joined = ''.join(pieces)
            # u'\x92' is replaced by a plain ASCII apostrophe.
            return joined.replace(u'\x92', "'").strip()

        class obj_author(ItemElement):
            """Nested item element filling the story's ``Author``."""

            klass = Author

            obj_name = CleanText('//a[starts-with(@href, "fiche.php")][2]')
            obj_sex = (
                CleanText('//td[has-class("t0")]')
                & Regexp(pattern=r"Auteur (\w+)")
                & Author.Sex2Enum
            )
class AuthorPage(HTMLPage):
@method
class get_author(ItemElement):
klass = Author
obj_name = CleanText('//span[has-class("t3")]')
obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
def obj_description(self):
description = u''
for para in self.el.xpath('//td[has-class("t0")]')[0].getchildren():
if para.tag not in ('b', 'br'):
continue continue
story = Story(int(m.group(1))) if para.text is not None:
story.title = link.text.strip() description += '\n\n%s' % para.text.strip()
else: if para.tail is not None:
story.author = Author(link.text.strip()) description += '\n%s' % para.tail.strip()
if not link.tail: return description.replace(u'\x92', "'").strip()
self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name)
story = None
continue
date_text = link.tail.strip().split('\n')[-1].strip()
m = re.match('.*, le (\d+)-(\d+)-(\d+)', date_text)
if not m:
self.logger.warning('Unable to parse datetime "%s"' % date_text)
story = None
continue
story.date = datetime.date(int(m.group(3)),
int(m.group(2)),
int(m.group(1)))
yield story
story = None
class StoryPage(Page):
    """Page of a single story (deprecated-browser implementation)."""

    def get_story(self):
        """Parse the document into a ``Story``.

        Returns ``None`` when the page only carries the site's "story not
        accessible" notice; otherwise returns a ``Story`` with its author,
        title, date, category and body filled in.
        """
        root = self.document.getroot()

        p_tags = root.xpath('//body/p')
        # An element's .text is None when it has no leading text, so fall
        # back to an empty string before stripping.
        if len(p_tags) > 0 and (p_tags[0].text or u'').strip() == \
                u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.":
            return None

        story = Story((self.group_dict['id']))
        story.body = u''

        meta = self.parser.select(root, 'td.t0', 1)
        story.author = Author(meta.xpath('./a[@class="t3"]')[0].text.strip())

        gender = meta.xpath('./a[@class="t0"]')[0].text
        # Same mapping as AuthorPage.get_author, including the empty case:
        # without this guard ``'homme' in None`` raises TypeError.
        if not gender:
            story.author.sex = story.author.UNKNOWN
        elif 'homme' in gender:
            story.author.sex = story.author.MALE
        elif 'femme' in gender:
            story.author.sex = story.author.FEMALE
        else:
            story.author.sex = story.author.TRANSEXUAL

        # The e-mail address is split around small images: one image stands
        # for '@', another for '.', and the text after each image is the
        # next chunk of the address.
        email_tag = meta.xpath('./span[@class="police1"]')[0]
        story.author.email = (email_tag.text or u'').strip()
        for img in email_tag.findall('img'):
            if img.attrib['src'].endswith('meyle1.gif'):
                story.author.email += '@'
            elif img.attrib['src'].endswith('meyle1pouan.gif'):
                story.author.email += '.'
            else:
                self.logger.warning('Unable to know what image is %s' % img.attrib['src'])
            # .tail is None when nothing follows the image.
            story.author.email += (img.tail or u'').strip()

        title_tag = self.parser.select(root, 'h1', 1)
        story.title = title_tag.text.strip() if title_tag.text else u''

        span = self.parser.select(root, 'span.t4', 1)
        # The date is on the last line of the span's leading text, written
        # as DD-MM-YYYY.
        date_text = (span.text or u'').strip().split('\n')[-1].strip()
        m = re.match(r'(\d+)-(\d+)-(\d+)', date_text)
        if m:
            story.date = datetime.date(int(m.group(3)),
                                       int(m.group(2)),
                                       int(m.group(1)))
        else:
            self.logger.warning('Unable to parse datetime "%s"' % date_text)

        # The category is the part after ':' in the text following the
        # span's <br>.
        story.category = span.find('br').tail.split(':')[1].strip()

        div = self.parser.select(root, 'div[align=justify]', 1)
        for para in div.findall('br'):
            if para.text is not None:
                story.body += para.text.strip()
            story.body += '\n'
            if para.tail is not None:
                story.body += para.tail.strip()
        # u'\x92' is replaced by a plain ASCII apostrophe.
        story.body = story.body.replace(u'\x92', "'").strip()

        return story
class AuthorPage(Page):
    """Page of an author's profile (deprecated-browser implementation)."""

    def get_author(self):
        """Parse the document into an ``Author``.

        Returns ``None`` when the page only carries the site's "author
        profile no longer accessible" notice; otherwise returns an
        ``Author`` with name, sex and description filled in.
        """
        p_tags = self.document.getroot().xpath('//body/div/font/b')
        # An element's .text is None when it has no leading text, so fall
        # back to an empty string before stripping.
        if len(p_tags) > 0 and (p_tags[0].text or u'').strip() == \
                u"La fiche de l'auteur n'est plus accessible.":
            return None

        meta = self.parser.select(self.document.getroot(), 'td.t0', 1)

        author_name = meta.xpath('./span[@class="t3"]')[0].text
        if author_name is None:
            # No name in the document: use the one captured from the URL.
            author_name = self.group_dict['name']
        author = Author(author_name.strip())

        gender = meta.xpath('./a[@class="t0"]')[0].text
        if not gender:
            author.sex = author.UNKNOWN
        elif 'homme' in gender:
            author.sex = author.MALE
        elif 'femme' in gender:
            author.sex = author.FEMALE
        else:
            author.sex = author.TRANSEXUAL

        # The description is assembled from the <b> and <br> children of
        # the cell: element text starts a new paragraph, tail text a new
        # line.
        chunks = [u'']
        for para in meta.getchildren():
            if para.tag not in ('b', 'br'):
                continue
            if para.text is not None:
                chunks.append('\n\n%s' % para.text.strip())
            if para.tail is not None:
                chunks.append('\n%s' % para.tail.strip())
        # u'\x92' is replaced by a plain ASCII apostrophe.
        author.description = u''.join(chunks).replace(u'\x92', "'").strip()

        if author.description.startswith(u'0 récit '):
            self.logger.warning('This author does not have published any story.')

        return author