# -*- coding: utf-8 -*- # Copyright(C) 2014 Romain Bignon # # This file is part of weboob. # # weboob is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # weboob is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . from weboob.browser.pages import HTMLPage from weboob.browser.elements import method, ListElement, ItemElement from weboob.browser.filters.standard import CleanText, Regexp, Date, Env, Filter from weboob.browser.filters.html import XPath, Link class ValidationPage(HTMLPage): pass class HomePage(HTMLPage): pass class Author(object): (UNKNOWN, MALE, FEMALE, TRANSEXUAL) = xrange(4) def __init__(self, name=None): self.name = name self.sex = self.UNKNOWN self.description = None class Sex2Enum(Filter): def filter(self, text): if text == 'homme': return Author.MALE if text == 'femme': return Author.FEMALE return Author.TRANSEXUAL class Story(object): def __init__(self, id=None): self.id = id self.title = u'' self.date = None self.category = None self.author = None self.body = None class HistoryPage(HTMLPage): ENCODING = 'iso-8859-1' def get_numerous(self): return int(CleanText('//div[@align="justify"]/table[1]//td[has-class("t0")]/font/u/strong[1]')(self.doc)) @method class iter_stories(ListElement): item_xpath = '//div[@align="justify"]/span[has-class("t4")]' class item(ItemElement): klass = Story def parse(self, el): self.env['header'] = el.getprevious().xpath('.//span')[0] self.env['body'] = el.getnext().xpath('.//a') obj_id = XPath(Env('body')) & Link & Regexp(pattern=r'.*histoire=(\d+)') obj_title = CleanText('.') obj_date = XPath(Env('header')) & CleanText & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date obj_category = XPath(Env('header')) & CleanText & Regexp(pattern=u'Catégorie :\s*(.*)\s*Auteur') def obj_author(self): return Author(self.env['header'].xpath('.//a/text()')[0]) class StoryPage(HTMLPage): ENCODING = 'iso-8859-1' @method class get_story(ItemElement): klass = Story obj_id = Env('id') obj_title = CleanText('//h1') obj_date = CleanText('//span[has-class("t4")]') & Regexp(pattern=r'le (\d+)-(\d+)-(\d+)', template=r'\3-\2-\1') & Date obj_category = CleanText('//a[starts-with(@href, "histoires-cat")]') def obj_body(self): div = self.el.xpath('//div[@align="justify"]')[0] body = '' for para in div.findall('br'): if para.text is not None: body += para.text.strip() body += '\n' if para.tail is not None: body += para.tail.strip() return body.replace(u'\x92', "'").strip() class obj_author(ItemElement): klass = Author obj_name = CleanText('//a[starts-with(@href, "fiche.php")][2]') obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum class AuthorPage(HTMLPage): @method class get_author(ItemElement): klass = Author obj_name = CleanText('//span[has-class("t3")]') obj_sex = CleanText('//td[has-class("t0")]') & Regexp(pattern=r"Auteur (\w+)") & Author.Sex2Enum def obj_description(self): description = u'' for para in self.el.xpath('//td[has-class("t0")]')[0].getchildren(): if para.tag not in ('b', 'br'): continue if para.text is not None: description += '\n\n%s' % para.text.strip() if para.tail is not None: description += '\n%s' % para.tail.strip() return description.replace(u'\x92', "'").strip()