convert hds module to browser2 and fix it
This commit is contained in:
parent
94cea6458b
commit
89e7bbe9ef
4 changed files with 114 additions and 163 deletions
|
|
@ -68,7 +68,7 @@ def main(filename):
|
||||||
sys.stdout.write('Getting stories list from website... ')
|
sys.stdout.write('Getting stories list from website... ')
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
for story in br.iter_stories():
|
for story in br.iter_stories():
|
||||||
if story.id in stored:
|
if int(story.id) in stored:
|
||||||
break
|
break
|
||||||
to_fetch.add(story.id)
|
to_fetch.add(story.id)
|
||||||
authors.add(story.author.name)
|
authors.add(story.author.name)
|
||||||
|
|
|
||||||
|
|
@ -18,43 +18,40 @@
|
||||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
|
||||||
from weboob.deprecated.browser import Browser
|
from weboob.browser import PagesBrowser, URL
|
||||||
|
|
||||||
from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage
|
from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage
|
||||||
|
|
||||||
# Browser
|
# Browser
|
||||||
|
|
||||||
|
|
||||||
class HDSBrowser(Browser):
|
class HDSBrowser(PagesBrowser):
|
||||||
ENCODING = 'ISO-8859-1'
|
BASEURL = 'http://histoires-de-sexe.net'
|
||||||
DOMAIN = 'histoires-de-sexe.net'
|
|
||||||
PAGES = {'http://histoires-de-sexe.net/': ValidationPage,
|
validation_page = URL('^/$', ValidationPage)
|
||||||
'http://histoires-de-sexe.net/menu.php': HomePage,
|
home = URL(r'/menu.php', HomePage)
|
||||||
'http://histoires-de-sexe.net/sexe/histoires-par-date.php.*': HistoryPage,
|
history = URL(r'/sexe/histoires-par-date.php\?p=(?P<pagenum>\d+)', HistoryPage)
|
||||||
'http://histoires-de-sexe.net/sexe.php\?histoire=(?P<id>.+)': StoryPage,
|
story = URL(r'/sexe.php\?histoire=(?P<id>.+)', StoryPage)
|
||||||
'http://histoires-de-sexe.net/fiche.php\?auteur=(?P<name>.+)': AuthorPage,
|
author = URL(r'/fiche.php\?auteur=(?P<name>.+)', AuthorPage)
|
||||||
}
|
|
||||||
|
|
||||||
def iter_stories(self):
|
def iter_stories(self):
|
||||||
self.location('/sexe/histoires-par-date.php')
|
|
||||||
n = 1
|
n = 1
|
||||||
|
self.history.go(pagenum=n)
|
||||||
while self.page.get_numerous() == n:
|
while self.page.get_numerous() == n:
|
||||||
count = 0
|
for story in self.page.iter_stories():
|
||||||
for count, story in enumerate(self.page.iter_stories()):
|
|
||||||
yield story
|
yield story
|
||||||
|
|
||||||
n += 1
|
n += 1
|
||||||
self.location('/sexe/histoires-par-date.php?p=%d' % n)
|
self.history.go(pagenum=n)
|
||||||
|
|
||||||
def get_story(self, id):
|
def get_story(self, id):
|
||||||
id = int(id)
|
self.story.go(id=id)
|
||||||
|
|
||||||
self.location('/sexe.php?histoire=%d' % id)
|
assert self.story.is_here()
|
||||||
assert self.is_on_page(StoryPage)
|
|
||||||
return self.page.get_story()
|
return self.page.get_story()
|
||||||
|
|
||||||
def get_author(self, name):
|
def get_author(self, name):
|
||||||
self.location(self.buildurl('/fiche.php', auteur=name.encode('iso-8859-15')))
|
self.author.go(name=name)
|
||||||
|
|
||||||
assert self.is_on_page(AuthorPage)
|
assert self.author.is_here()
|
||||||
return self.page.get_author()
|
return self.page.get_author()
|
||||||
|
|
|
||||||
|
|
@ -40,12 +40,11 @@ class HDSModule(Module, CapMessages):
|
||||||
#### CapMessages ##############################################
|
#### CapMessages ##############################################
|
||||||
|
|
||||||
def iter_threads(self):
|
def iter_threads(self):
|
||||||
with self.browser:
|
for story in self.browser.iter_stories():
|
||||||
for story in self.browser.iter_stories():
|
thread = Thread(story.id)
|
||||||
thread = Thread(story.id)
|
thread.title = story.title
|
||||||
thread.title = story.title
|
thread.date = story.date
|
||||||
thread.date = story.date
|
yield thread
|
||||||
yield thread
|
|
||||||
|
|
||||||
GENDERS = ['<unknown>', 'boy', 'girl', 'transexual']
|
GENDERS = ['<unknown>', 'boy', 'girl', 'transexual']
|
||||||
|
|
||||||
|
|
@ -56,8 +55,7 @@ class HDSModule(Module, CapMessages):
|
||||||
else:
|
else:
|
||||||
thread = None
|
thread = None
|
||||||
|
|
||||||
with self.browser:
|
story = self.browser.get_story(id)
|
||||||
story = self.browser.get_story(id)
|
|
||||||
|
|
||||||
if not story:
|
if not story:
|
||||||
return None
|
return None
|
||||||
|
|
@ -80,7 +78,7 @@ class HDSModule(Module, CapMessages):
|
||||||
parent=None,
|
parent=None,
|
||||||
content=story.body,
|
content=story.body,
|
||||||
children=[],
|
children=[],
|
||||||
signature='Written by a %s (%s)' % (self.GENDERS[story.author.sex], story.author.email),
|
signature=u'Written by a %s in category %s' % (self.GENDERS[story.author.sex], story.category),
|
||||||
flags=flags)
|
flags=flags)
|
||||||
|
|
||||||
return thread
|
return thread
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Copyright(C) 2011 Romain Bignon
|
# Copyright(C) 2014 Romain Bignon
|
||||||
#
|
#
|
||||||
# This file is part of weboob.
|
# This file is part of weboob.
|
||||||
#
|
#
|
||||||
|
|
@ -18,17 +18,17 @@
|
||||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
|
||||||
import datetime
|
from weboob.browser.pages import HTMLPage
|
||||||
import re
|
from weboob.browser.elements import method, ListElement, ItemElement
|
||||||
|
from weboob.browser.filters.standard import CleanText, Regexp, Date, Env, Filter
|
||||||
from weboob.deprecated.browser import Page
|
from weboob.browser.filters.html import XPath, Link
|
||||||
|
|
||||||
|
|
||||||
class ValidationPage(Page):
|
class ValidationPage(HTMLPage):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class HomePage(Page):
|
class HomePage(HTMLPage):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -38,15 +38,23 @@ class Author(object):
|
||||||
FEMALE,
|
FEMALE,
|
||||||
TRANSEXUAL) = xrange(4)
|
TRANSEXUAL) = xrange(4)
|
||||||
|
|
||||||
def __init__(self, name):
|
def __init__(self, name=None):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.sex = self.UNKNOWN
|
self.sex = self.UNKNOWN
|
||||||
self.email = None
|
|
||||||
self.description = None
|
self.description = None
|
||||||
|
|
||||||
|
class Sex2Enum(Filter):
|
||||||
|
def filter(self, text):
|
||||||
|
if text == 'homme':
|
||||||
|
return Author.MALE
|
||||||
|
if text == 'femme':
|
||||||
|
return Author.FEMALE
|
||||||
|
return Author.TRANSEXUAL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Story(object):
|
class Story(object):
|
||||||
def __init__(self, id):
|
def __init__(self, id=None):
|
||||||
self.id = id
|
self.id = id
|
||||||
self.title = u''
|
self.title = u''
|
||||||
self.date = None
|
self.date = None
|
||||||
|
|
@ -55,129 +63,77 @@ class Story(object):
|
||||||
self.body = None
|
self.body = None
|
||||||
|
|
||||||
|
|
||||||
class HistoryPage(Page):
|
class HistoryPage(HTMLPage):
|
||||||
|
ENCODING = 'iso-8859-1'
|
||||||
|
|
||||||
def get_numerous(self):
|
def get_numerous(self):
|
||||||
td = self.parser.select(self.document.getroot(), 'td.t0', 1)
|
return int(CleanText('//div[@align="justify"]/table[1]//td[has-class("t0")]/font/u/strong[1]')(self.doc))
|
||||||
n = td.xpath('//u/strong|//u/b')[0].text
|
|
||||||
return int(n)
|
|
||||||
|
|
||||||
def iter_stories(self):
|
@method
|
||||||
links = self.parser.select(self.document.getroot(), 'a.t11')
|
class iter_stories(ListElement):
|
||||||
story = None
|
item_xpath = '//div[@align="justify"]/span[has-class("t4")]'
|
||||||
for link in links:
|
|
||||||
if not story:
|
class item(ItemElement):
|
||||||
m = re.match('.*histoire=(\d+)', link.attrib['href'])
|
klass = Story
|
||||||
if not m:
|
|
||||||
self.logger.warning('Unable to parse ID "%s"' % link.attrib['href'])
|
def parse(self, el):
|
||||||
|
self.env['header'] = el.getprevious().xpath('.//span')[0]
|
||||||
|
self.env['body'] = el.getnext().xpath('.//a')
|
||||||
|
|
||||||
|
obj_id = XPath(Env('body')) & Link & Regexp(pattern=r'.*histoire=(\d+)')
|
||||||
|
obj_title = CleanText('.')
|
||||||
|
obj_date = XPath(Env('header')) & CleanText & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
|
||||||
|
obj_category = XPath(Env('header')) & CleanText & Regexp(pattern=u'Catégorie :\s*(.*)\s*Auteur')
|
||||||
|
|
||||||
|
def obj_author(self):
|
||||||
|
return Author(self.env['header'].xpath('.//a/text()')[0])
|
||||||
|
|
||||||
|
class StoryPage(HTMLPage):
|
||||||
|
ENCODING = 'iso-8859-1'
|
||||||
|
|
||||||
|
@method
|
||||||
|
class get_story(ItemElement):
|
||||||
|
klass = Story
|
||||||
|
|
||||||
|
obj_id = Env('id')
|
||||||
|
obj_title = CleanText('//h1')
|
||||||
|
obj_date = CleanText('//span[has-class("t4")]') & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
|
||||||
|
obj_category = CleanText('//a[starts-with(@href, "histoires-cat")]')
|
||||||
|
|
||||||
|
def obj_body(self):
|
||||||
|
div = self.el.xpath('//div[@align="justify"]')[0]
|
||||||
|
body = ''
|
||||||
|
for para in div.findall('br'):
|
||||||
|
if para.text is not None:
|
||||||
|
body += para.text.strip()
|
||||||
|
body += '\n'
|
||||||
|
if para.tail is not None:
|
||||||
|
body += para.tail.strip()
|
||||||
|
return body.replace(u'\x92', "'").strip()
|
||||||
|
|
||||||
|
|
||||||
|
class obj_author(ItemElement):
|
||||||
|
klass = Author
|
||||||
|
|
||||||
|
obj_name = CleanText('//a[starts-with(@href, "fiche.php")][2]')
|
||||||
|
obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
|
||||||
|
|
||||||
|
|
||||||
|
class AuthorPage(HTMLPage):
|
||||||
|
@method
|
||||||
|
class get_author(ItemElement):
|
||||||
|
klass = Author
|
||||||
|
|
||||||
|
obj_name = CleanText('//span[has-class("t3")]')
|
||||||
|
obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
|
||||||
|
|
||||||
|
def obj_description(self):
|
||||||
|
description = u''
|
||||||
|
for para in self.el.xpath('//td[has-class("t0")]')[0].getchildren():
|
||||||
|
if para.tag not in ('b', 'br'):
|
||||||
continue
|
continue
|
||||||
story = Story(int(m.group(1)))
|
if para.text is not None:
|
||||||
story.title = link.text.strip()
|
description += '\n\n%s' % para.text.strip()
|
||||||
else:
|
if para.tail is not None:
|
||||||
story.author = Author(link.text.strip())
|
description += '\n%s' % para.tail.strip()
|
||||||
if not link.tail:
|
return description.replace(u'\x92', "'").strip()
|
||||||
self.logger.warning('There is probably a mistake in the name of %s, skipping...' % story.author.name)
|
|
||||||
story = None
|
|
||||||
continue
|
|
||||||
date_text = link.tail.strip().split('\n')[-1].strip()
|
|
||||||
m = re.match('.*, le (\d+)-(\d+)-(\d+)', date_text)
|
|
||||||
if not m:
|
|
||||||
self.logger.warning('Unable to parse datetime "%s"' % date_text)
|
|
||||||
story = None
|
|
||||||
continue
|
|
||||||
story.date = datetime.date(int(m.group(3)),
|
|
||||||
int(m.group(2)),
|
|
||||||
int(m.group(1)))
|
|
||||||
yield story
|
|
||||||
story = None
|
|
||||||
|
|
||||||
|
|
||||||
class StoryPage(Page):
|
|
||||||
def get_story(self):
|
|
||||||
p_tags = self.document.getroot().xpath('//body/p')
|
|
||||||
if len(p_tags) > 0 and p_tags[0].text.strip() == \
|
|
||||||
u"Le r\xe9cit que vous demandez n'est pas accessible actuellement.":
|
|
||||||
return None
|
|
||||||
|
|
||||||
story = Story((self.group_dict['id']))
|
|
||||||
story.body = u''
|
|
||||||
meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
|
|
||||||
story.author = Author(meta.xpath('./a[@class="t3"]')[0].text.strip())
|
|
||||||
gender = meta.xpath('./a[@class="t0"]')[0].text
|
|
||||||
if 'homme' in gender:
|
|
||||||
story.author.sex = story.author.MALE
|
|
||||||
elif 'femme' in gender:
|
|
||||||
story.author.sex = story.author.FEMALE
|
|
||||||
else:
|
|
||||||
story.author.sex = story.author.TRANSEXUAL
|
|
||||||
email_tag = meta.xpath('./span[@class="police1"]')[0]
|
|
||||||
story.author.email = email_tag.text.strip()
|
|
||||||
for img in email_tag.findall('img'):
|
|
||||||
if img.attrib['src'].endswith('meyle1.gif'):
|
|
||||||
story.author.email += '@'
|
|
||||||
elif img.attrib['src'].endswith('meyle1pouan.gif'):
|
|
||||||
story.author.email += '.'
|
|
||||||
else:
|
|
||||||
self.logger.warning('Unable to know what image is %s' % img.attrib['src'])
|
|
||||||
story.author.email += img.tail.strip()
|
|
||||||
|
|
||||||
title_tag = self.parser.select(self.document.getroot(), 'h1', 1)
|
|
||||||
story.title = title_tag.text.strip() if title_tag.text else u''
|
|
||||||
|
|
||||||
span = self.parser.select(self.document.getroot(), 'span.t4', 1)
|
|
||||||
date_text = span.text.strip().split('\n')[-1].strip()
|
|
||||||
m = re.match('(\d+)-(\d+)-(\d+)', date_text)
|
|
||||||
if m:
|
|
||||||
story.date = datetime.date(int(m.group(3)),
|
|
||||||
int(m.group(2)),
|
|
||||||
int(m.group(1)))
|
|
||||||
else:
|
|
||||||
self.logger.warning('Unable to parse datetime "%s"' % date_text)
|
|
||||||
|
|
||||||
story.category = span.find('br').tail.split(':')[1].strip()
|
|
||||||
|
|
||||||
div = self.parser.select(self.document.getroot(), 'div[align=justify]', 1)
|
|
||||||
for para in div.findall('br'):
|
|
||||||
if para.text is not None:
|
|
||||||
story.body += para.text.strip()
|
|
||||||
story.body += '\n'
|
|
||||||
if para.tail is not None:
|
|
||||||
story.body += para.tail.strip()
|
|
||||||
story.body = story.body.replace(u'\x92', "'").strip()
|
|
||||||
return story
|
|
||||||
|
|
||||||
|
|
||||||
class AuthorPage(Page):
|
|
||||||
def get_author(self):
|
|
||||||
p_tags = self.document.getroot().xpath('//body/div/font/b')
|
|
||||||
if len(p_tags) > 0 and p_tags[0].text.strip() == \
|
|
||||||
u"La fiche de l'auteur n'est plus accessible.":
|
|
||||||
return None
|
|
||||||
|
|
||||||
meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
|
|
||||||
author_name = meta.xpath('./span[@class="t3"]')[0].text
|
|
||||||
if author_name is None:
|
|
||||||
author_name = self.group_dict['name']
|
|
||||||
author = Author(author_name.strip())
|
|
||||||
gender = meta.xpath('./a[@class="t0"]')[0].text
|
|
||||||
if not gender:
|
|
||||||
author.sex = author.UNKNOWN
|
|
||||||
elif 'homme' in gender:
|
|
||||||
author.sex = author.MALE
|
|
||||||
elif 'femme' in gender:
|
|
||||||
author.sex = author.FEMALE
|
|
||||||
else:
|
|
||||||
author.sex = author.TRANSEXUAL
|
|
||||||
|
|
||||||
author.description = u''
|
|
||||||
for para in meta.getchildren():
|
|
||||||
if para.tag not in ('b', 'br'):
|
|
||||||
continue
|
|
||||||
if para.text is not None:
|
|
||||||
author.description += '\n\n%s' % para.text.strip()
|
|
||||||
if para.tail is not None:
|
|
||||||
author.description += '\n%s' % para.tail.strip()
|
|
||||||
author.description = author.description.replace(u'\x92', "'").strip()
|
|
||||||
|
|
||||||
if author.description.startswith(u'0 récit '):
|
|
||||||
self.logger.warning('This author does not have published any story.')
|
|
||||||
return author
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue