support author information

This commit is contained in:
Romain Bignon 2011-08-03 09:54:02 +02:00
commit 08a7b36408
3 changed files with 73 additions and 10 deletions

View file

@ -50,6 +50,8 @@ class HDSBackend(BaseBackend, ICapMessages):
thread.nb_messages = 1 thread.nb_messages = 1
yield thread yield thread
# Display labels for an author's sex, looked up by index
# (used as self.GENDERS[story.author.sex] when building the signature).
GENDERS = ['<unknown>', 'boy', 'girl', 'transexual']
def get_thread(self, id): def get_thread(self, id):
if isinstance(id, Thread): if isinstance(id, Thread):
thread = id thread = id
@ -75,12 +77,13 @@ class HDSBackend(BaseBackend, ICapMessages):
thread.root = Message(thread=thread, thread.root = Message(thread=thread,
id=0, id=0,
title=story.title, title=story.title,
sender=story.author or u'', sender=story.author.name,
receivers=None, receivers=None,
date=thread.date, date=thread.date,
parent=None, parent=None,
content=story.body, content=story.body,
children=[], children=[],
signature='Written by a %s (%s)' % (self.GENDERS[story.author.sex], story.author.email),
flags=flags) flags=flags)
return thread return thread

View file

@ -20,7 +20,7 @@
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from .pages import ValidationPage, HomePage, HistoryPage, StoryPage from .pages import ValidationPage, HomePage, HistoryPage, StoryPage, AuthorPage
# Browser # Browser
class HDSBrowser(BaseBrowser): class HDSBrowser(BaseBrowser):
@ -30,6 +30,7 @@ class HDSBrowser(BaseBrowser):
'http://histoires-de-sexe.net/menu.php': HomePage, 'http://histoires-de-sexe.net/menu.php': HomePage,
'http://histoires-de-sexe.net/sexe/histoires-par-date.php.*': HistoryPage, 'http://histoires-de-sexe.net/sexe/histoires-par-date.php.*': HistoryPage,
'http://histoires-de-sexe.net/sexe.php\?histoire=(?P<id>.+)': StoryPage, 'http://histoires-de-sexe.net/sexe.php\?histoire=(?P<id>.+)': StoryPage,
'http://histoires-de-sexe.net/fiche.php\?auteur=(?P<name>.+)': AuthorPage,
} }
def iter_stories(self): def iter_stories(self):
@ -52,3 +53,9 @@ class HDSBrowser(BaseBrowser):
self.location('/sexe.php?histoire=%d' % id) self.location('/sexe.php?histoire=%d' % id)
assert self.is_on_page(StoryPage) assert self.is_on_page(StoryPage)
return self.page.get_story() return self.page.get_story()
def get_author(self, name):
    """Open the profile page of the author called *name* and parse it.

    Returns the result of the landed page's ``get_author()``.
    """
    profile_url = self.buildurl('/fiche.php', auteur=name)
    self.location(profile_url)

    # Same sanity check as the other page getters of this browser.
    assert self.is_on_page(AuthorPage)

    current_page = self.page
    return current_page.get_author()

View file

@ -33,6 +33,18 @@ class ValidationPage(BasePage):
class HomePage(BasePage): class HomePage(BasePage):
pass pass
class Author(object):
    """Author of a story.

    Only the name is known at construction time; ``sex``, ``email`` and
    ``description`` start with placeholder values and are meant to be
    assigned afterwards by the code that parses the pages.
    """

    # Possible values for ``sex``, numbered 0 to 3 in this order.
    # ``range`` is used instead of ``xrange`` so the class body also
    # evaluates on Python 3; the unpacked values are identical.
    (UNKNOWN,
     MALE,
     FEMALE,
     TRANSEXUAL) = range(4)

    def __init__(self, name):
        """Create an author called *name* with no other detail known yet."""
        self.name = name
        self.sex = self.UNKNOWN
        self.email = None
        self.description = None
class Story(object): class Story(object):
def __init__(self, id): def __init__(self, id):
self.id = id self.id = id
@ -54,14 +66,14 @@ class HistoryPage(BasePage):
story = Story(int(m.group(1))) story = Story(int(m.group(1)))
story.title = link.text.strip() story.title = link.text.strip()
else: else:
story.author = link.text.strip() story.author = Author(link.text.strip())
date_text = link.tail.strip().split('\n')[-1].strip() date_text = link.tail.strip().split('\n')[-1].strip()
m = re.match('(\d+)-(\d+)-(\d+)', date_text) m = re.match('(\d+)-(\d+)-(\d+)', date_text)
if not m: if not m:
self.logger.warning('Unable to parse datetime "%s"' % date_text) self.logger.warning('Unable to parse datetime "%s"' % date_text)
story = None story = None
continue continue
story.date = datetime.datetime(int(m.group(3)), story.date = datetime.date(int(m.group(3)),
int(m.group(2)), int(m.group(2)),
int(m.group(1))) int(m.group(1)))
yield story yield story
@ -71,12 +83,31 @@ class StoryPage(BasePage):
def get_story(self): def get_story(self):
story = Story((self.group_dict['id'])) story = Story((self.group_dict['id']))
story.body = u'' story.body = u''
story.author = self.parser.select(self.document.getroot(), 'a.t3', 1).text.strip() meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
story.author = Author(meta.xpath('./a[@class="t3"]')[0].text.strip())
gender = meta.xpath('./a[@class="t0"]')[0].text
if 'homme' in gender:
story.author.sex = story.author.MALE
elif 'femme' in gender:
story.author.sex = story.author.FEMALE
else:
story.author.sex = story.author.TRANSEXUAL
email_tag = meta.xpath('./span[@class="police1"]')[0]
story.author.email = email_tag.text.strip()
for img in email_tag.findall('img'):
if img.attrib['src'].endswith('meyle1.gif'):
story.author.email += '@'
elif img.attrib['src'].endswith('meyle1pouan.gif'):
story.author.email += '.'
else:
self.logger.warning('Unable to know what image is %s' % img.attrib['src'])
story.author.email += img.tail.strip()
story.title = self.parser.select(self.document.getroot(), 'h1', 1).text.strip() story.title = self.parser.select(self.document.getroot(), 'h1', 1).text.strip()
date_text = self.parser.select(self.document.getroot(), 'span.t4', 1).text.strip().split('\n')[-1].strip() date_text = self.parser.select(self.document.getroot(), 'span.t4', 1).text.strip().split('\n')[-1].strip()
m = re.match('(\d+)-(\d+)-(\d+)', date_text) m = re.match('(\d+)-(\d+)-(\d+)', date_text)
if m: if m:
story.date = datetime.datetime(int(m.group(3)), story.date = datetime.date(int(m.group(3)),
int(m.group(2)), int(m.group(2)),
int(m.group(1))) int(m.group(1)))
else: else:
@ -91,3 +122,25 @@ class StoryPage(BasePage):
story.body += para.tail.strip() story.body += para.tail.strip()
story.body = story.body.replace(u'\x92', "'").strip() story.body = story.body.replace(u'\x92', "'").strip()
return story return story
class AuthorPage(BasePage):
    """Profile page of a single author."""

    def get_author(self):
        """Parse the page and return the Author it describes.

        The name, the sex and a free-text description are read from the
        ``td.t0`` cell. The email is not parsed here and keeps the value
        set by the Author constructor.
        """
        meta = self.parser.select(self.document.getroot(), 'td.t0', 1)
        author = Author(meta.xpath('./span[@class="t3"]')[0].text.strip())

        # Classify the gender link the same way StoryPage.get_story does,
        # so an author who is neither "homme" nor "femme" is not silently
        # reported as FEMALE.
        gender = meta.xpath('./a[@class="t0"]')[0].text
        if 'homme' in gender:
            author.sex = author.MALE
        elif 'femme' in gender:
            author.sex = author.FEMALE
        else:
            author.sex = author.TRANSEXUAL

        # The description is assembled from the <b> and <br> children of
        # the cell: the element's own text opens a new paragraph, and the
        # text following the element (its tail) goes on a new line.
        author.description = u''
        for para in meta.getchildren():
            if para.tag not in ('b', 'br'):
                continue
            if para.text is not None:
                author.description += '\n\n%s' % para.text.strip()
            if para.tail is not None:
                author.description += '\n%s' % para.tail.strip()

        # Replace the u'\x92' character with a plain apostrophe.
        author.description = author.description.replace(u'\x92', "'").strip()
        return author