139 lines
4.5 KiB
Python
139 lines
4.5 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright(C) 2014 Romain Bignon
|
|
#
|
|
# This file is part of weboob.
|
|
#
|
|
# weboob is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU Affero General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# weboob is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU Affero General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Affero General Public License
|
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
from weboob.browser.pages import HTMLPage
|
|
from weboob.browser.elements import method, ListElement, ItemElement
|
|
from weboob.browser.filters.standard import CleanText, Regexp, Date, Env, Filter
|
|
from weboob.browser.filters.html import XPath, Link
|
|
|
|
|
|
class ValidationPage(HTMLPage):
|
|
pass
|
|
|
|
|
|
class HomePage(HTMLPage):
|
|
pass
|
|
|
|
|
|
class Author(object):
|
|
(UNKNOWN,
|
|
MALE,
|
|
FEMALE,
|
|
TRANSEXUAL) = xrange(4)
|
|
|
|
def __init__(self, name=None):
|
|
self.name = name
|
|
self.sex = self.UNKNOWN
|
|
self.description = None
|
|
|
|
class Sex2Enum(Filter):
|
|
def filter(self, text):
|
|
if text == 'homme':
|
|
return Author.MALE
|
|
if text == 'femme':
|
|
return Author.FEMALE
|
|
return Author.TRANSEXUAL
|
|
|
|
|
|
|
|
class Story(object):
|
|
def __init__(self, id=None):
|
|
self.id = id
|
|
self.title = u''
|
|
self.date = None
|
|
self.category = None
|
|
self.author = None
|
|
self.body = None
|
|
|
|
|
|
class HistoryPage(HTMLPage):
|
|
ENCODING = 'iso-8859-1'
|
|
|
|
def get_numerous(self):
|
|
return int(CleanText('//div[@align="justify"]/table[1]//td[has-class("t0")]/font/u/strong[1]')(self.doc))
|
|
|
|
@method
|
|
class iter_stories(ListElement):
|
|
item_xpath = '//div[@align="justify"]/span[has-class("t4")]'
|
|
|
|
class item(ItemElement):
|
|
klass = Story
|
|
|
|
def parse(self, el):
|
|
self.env['header'] = el.getprevious().xpath('.//span')[0]
|
|
self.env['body'] = el.getnext().xpath('.//a')
|
|
|
|
obj_id = XPath(Env('body')) & Link & Regexp(pattern=r'.*histoire=(\d+)')
|
|
obj_title = CleanText('.')
|
|
obj_date = XPath(Env('header')) & CleanText & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
|
|
obj_category = XPath(Env('header')) & CleanText & Regexp(pattern=u'Catégorie :\s*(.*)\s*Auteur')
|
|
|
|
def obj_author(self):
|
|
return Author(self.env['header'].xpath('.//a/text()')[0])
|
|
|
|
class StoryPage(HTMLPage):
|
|
ENCODING = 'iso-8859-1'
|
|
|
|
@method
|
|
class get_story(ItemElement):
|
|
klass = Story
|
|
|
|
obj_id = Env('id')
|
|
obj_title = CleanText('//h1')
|
|
obj_date = CleanText('//span[has-class("t4")]') & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date
|
|
obj_category = CleanText('//a[starts-with(@href, "histoires-cat")]')
|
|
|
|
def obj_body(self):
|
|
div = self.el.xpath('//div[@align="justify"]')[0]
|
|
body = ''
|
|
for para in div.findall('br'):
|
|
if para.text is not None:
|
|
body += para.text.strip()
|
|
body += '\n'
|
|
if para.tail is not None:
|
|
body += para.tail.strip()
|
|
return body.replace(u'\x92', "'").strip()
|
|
|
|
|
|
class obj_author(ItemElement):
|
|
klass = Author
|
|
|
|
obj_name = CleanText('//a[starts-with(@href, "fiche.php")][2]')
|
|
obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
|
|
|
|
|
|
class AuthorPage(HTMLPage):
|
|
@method
|
|
class get_author(ItemElement):
|
|
klass = Author
|
|
|
|
obj_name = CleanText('//span[has-class("t3")]')
|
|
obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum
|
|
|
|
def obj_description(self):
|
|
description = u''
|
|
for para in self.el.xpath('//td[has-class("t0")]')[0].getchildren():
|
|
if para.tag not in ('b', 'br'):
|
|
continue
|
|
if para.text is not None:
|
|
description += '\n\n%s' % para.text.strip()
|
|
if para.tail is not None:
|
|
description += '\n%s' % para.tail.strip()
|
|
return description.replace(u'\x92', "'").strip()
|