diff --git a/modules/regionsjob/backend.py b/modules/regionsjob/backend.py index 8f24ebc1..c4ff23f7 100644 --- a/modules/regionsjob/backend.py +++ b/modules/regionsjob/backend.py @@ -19,13 +19,11 @@ from weboob.tools.backend import BaseBackend, BackendConfig -from weboob.capabilities.job import ICapJob +from weboob.capabilities.job import ICapJob, BaseJobAdvert from .browser import RegionsjobBrowser from weboob.tools.ordereddict import OrderedDict from weboob.tools.value import Value -from .job import RegionsJobAdvert - __all__ = ['RegionsjobBackend'] @@ -153,21 +151,19 @@ class RegionsjobBackend(BaseBackend, ICapJob): return self.create_browser(self.config['website'].get()) def search_job(self, pattern=''): - with self.browser: - return self.browser.search_job(pattern=pattern) + return self.browser.search_job(pattern=pattern) def advanced_search_job(self): - return self.browser.advanced_search_job(metier=self.config['metier'].get(), - fonction=int(self.config['fonction'].get()), - secteur=int(self.config['secteur'].get()), - contract=int(self.config['contract'].get()), - experience=int(self.config['experience'].get())) + return self.browser.search_job(pattern=self.config['metier'].get(), + fonction=int(self.config['fonction'].get()), + secteur=int(self.config['secteur'].get()), + contract=int(self.config['contract'].get()), + experience=int(self.config['experience'].get())) def get_job_advert(self, _id, advert=None): - with self.browser: - return self.browser.get_job_advert(_id, advert) + return self.browser.get_job_advert(_id, advert) def fill_obj(self, advert, fields): - self.get_job_advert(advert.id, advert) + return self.get_job_advert(advert.id, advert) - OBJECTS = {RegionsJobAdvert: fill_obj} + OBJECTS = {BaseJobAdvert: fill_obj} diff --git a/modules/regionsjob/browser.py b/modules/regionsjob/browser.py index f4fdc4d8..68c52162 100644 --- a/modules/regionsjob/browser.py +++ b/modules/regionsjob/browser.py @@ -18,54 +18,31 @@ # along with weboob. 
If not, see <http://www.gnu.org/licenses/>. import urllib -from weboob.tools.browser import BaseBrowser -from weboob.tools.browser.decorators import id2url +from weboob.tools.browser2 import PagesBrowser, URL from .pages import SearchPage, AdvertPage -from .job import RegionsJobAdvert - __all__ = ['RegionsjobBrowser'] -class RegionsjobBrowser(BaseBrowser): - PROTOCOL = 'http' - ENCODING = 'utf-8' +class RegionsjobBrowser(PagesBrowser): - PAGES = { - '%s://(.*?)/offre_emploi/index.aspx\?v=___(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_' % (PROTOCOL): SearchPage, - '%s://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation' % (PROTOCOL): AdvertPage, - } + advert_page = URL('/offre_emploi/detailoffre.aspx\?numoffre=(?P<_id>.*)&de=consultation', AdvertPage) + search_page = URL('/offre_emploi/index.aspx\?v=___0_(?P<fonction>.*)_(?P<experience>.*)_0_(?P<contract>.*)_0_0_(?P<secteur>.*)_0_(?P<metier>.*)_', SearchPage) def __init__(self, website, *args, **kwargs): - self.DOMAIN = website - BaseBrowser.__init__(self, *args, **kwargs) + self.BASEURL = 'http://%s' % website + PagesBrowser.__init__(self, *args, **kwargs) - def search_job(self, pattern=''): - self.location('%s://%s/offre_emploi/index.aspx?v=___0_0_0_0_0_0_0_0_0_%s_' - % (self.PROTOCOL, self.DOMAIN, urllib.quote_plus(pattern.encode(self.ENCODING)))) - assert self.is_on_page(SearchPage) - return self.page.iter_job_adverts(self.DOMAIN) + def search_job(self, pattern='', fonction=0, secteur=0, contract=0, experience=0): + return self.search_page.go(fonction=fonction, + experience=experience, + contract=contract, + secteur=secteur, + metier=urllib.quote_plus(pattern.encode('utf-8')) + ).iter_job_adverts(domain=self.BASEURL) - def advanced_search_job(self, metier, fonction, secteur, contract, experience): - self.location('%s://%s/offre_emploi/index.aspx?v=___%s_%s_%s_%s_%s_%s_%s_%s_%s_%s_' - % (self.PROTOCOL, - self.DOMAIN, - '0', - fonction, - experience, - '0', - contract, - '0', - '0', - secteur, - '0', - urllib.quote_plus(metier.encode(self.ENCODING)))) - 
assert self.is_on_page(SearchPage) - return self.page.iter_job_adverts(self.DOMAIN) - - @id2url(RegionsJobAdvert.id2url) - def get_job_advert(self, url, advert): - self.location(url) - assert self.is_on_page(AdvertPage) - return self.page.get_job_advert(url, advert) + def get_job_advert(self, _id, advert): + splitted_id = _id.split('#') + self.BASEURL = splitted_id[0] + return self.advert_page.go(_id=splitted_id[1]).get_job_advert(obj=advert) diff --git a/modules/regionsjob/job.py b/modules/regionsjob/job.py deleted file mode 100644 index 743592af..00000000 --- a/modules/regionsjob/job.py +++ /dev/null @@ -1,29 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright(C) 2013 Bezleputh -# -# This file is part of weboob. -# -# weboob is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# weboob is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with weboob. If not, see <http://www.gnu.org/licenses/>. -from weboob.capabilities.job import BaseJobAdvert - - -class RegionsJobAdvert(BaseJobAdvert): - @classmethod - def id2url(cls, _id): - splitted_id = _id.split('|') - return 'http://%s/offre_emploi/detailoffre.aspx?numoffre=%s&de=consultation' \ - % (splitted_id[0], splitted_id[1]) - diff --git a/modules/regionsjob/pages.py b/modules/regionsjob/pages.py index 6add5623..b0ee3199 100644 --- a/modules/regionsjob/pages.py +++ b/modules/regionsjob/pages.py @@ -17,100 +17,59 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see <http://www.gnu.org/licenses/>. 
-from weboob.tools.misc import html2text -from weboob.tools.browser import BasePage -from .job import RegionsJobAdvert -from datetime import datetime, date -import re +from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement +from weboob.tools.browser2.filters import Link, CleanText, Regexp, Format, Env, DateGuesser, CleanHTML, DateTime +from weboob.tools.date import LinearDateGuesser +from weboob.capabilities.job import BaseJobAdvert __all__ = ['SearchPage'] -class SearchPage(BasePage): - def iter_job_adverts(self, website): - re_id = re.compile('(.*?)numoffre=(.*?)&de=consultation', re.DOTALL) - lis = self.document.getroot().xpath('//div[@id="liste_offres"]/ul/li') - for li in lis: - a = self.parser.select(li, 'div/span[@class="offres_poste"]/a', 1, method='xpath') - _id = u'%s|%s' % (website, re_id.search(a.attrib['href']).group(2)) - advert = RegionsJobAdvert(_id) - advert.title = u'%s' % a.text +class SearchPage(HTMLPage): + @method + class iter_job_adverts(ListElement): + item_xpath = '//div[@id="liste_offres"]/ul/li' - society_name = self.parser.select(li, 'div/span[@class="offres_entreprise"]/span/a', - method='xpath') - if len(society_name) > 0: - advert.society_name = u'%s' % society_name[0].text + class item(ItemElement): + klass = BaseJobAdvert - advert.place = u'%s' % self.parser.select(li, 'div/span[@class="offres_ville"]/span/span/span', - 1, method='xpath').text.strip() - _date = u'%s' % self.parser.select(li, 'div/span[@class="offres_date"]', - 1, method='xpath').text_content() - year = date.today().year - splitted_date = _date.split('/') - advert.publication_date = datetime(year, int(splitted_date[1]), int(splitted_date[0])) - advert.contract_type = u'%s' % self.parser.select(li, 'div/span[@class="offres_poste"]/span', - 1, method='xpath').text - yield advert + obj_id = Format(u'%s#%s', + Env('domain'), + Regexp(Link('div/span[@class="offres_poste"]/a'), '.*?numoffre=(.*?)&de=consultation')) + obj_title = 
CleanText('div/span[@class="offres_poste"]/a') + obj_society_name = CleanText('div/span[@class="offres_entreprise"]/span/a') + obj_place = CleanText('div/span[@class="offres_ville"]/span/span/span') + obj_contract_type = CleanText('div/span[@class="offres_poste"]/span') + obj_publication_date = DateGuesser(CleanText('div/span[@class="offres_date"]'), LinearDateGuesser()) -class AdvertPage(BasePage): - def get_job_advert(self, url, advert): - re_id = re.compile('http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation', re.DOTALL) - if advert is None: - _id = u'%s|%s' % (re_id.search(url).group(1), re_id.search(url).group(2)) - advert = RegionsJobAdvert(_id) +class AdvertPage(HTMLPage): + @method + class get_job_advert(ItemElement): + klass = BaseJobAdvert - advert.url = u'%s' % url + def parse(self, el): + if self.obj.id: + advert = self.obj + advert.url = self.page.url + advert.description = Format(u'%s\r\n%s', + CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'), + CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))(el) + advert.pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')(el) + raise SkipItem() - div = self.document.getroot().xpath('//div[@id="annonce"]')[0] + self.env['url'] = self.page.url - advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text + obj_description = Format(u'%s%s', + CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'), + CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]')) - content = self.parser.select(div, 'p', method='xpath') - - next_is_date = False - next_is_pay = False - description = '' - - for p in content: - if next_is_date: - m = re.match('(\d{2})\s(\d{2})\s(\d{4})', date) - if m: - dd = int(m.group(1)) - mm = int(m.group(2)) - yyyy = int(m.group(3)) - advert.publication_date = datetime.date(yyyy, mm, dd) - next_is_date = False - - elif next_is_pay: - 
advert.pay = html2text(self.parser.tostring(p)) - next_is_pay = False - - elif 'class' in p.attrib: - if p.attrib['class'] == 'contrat_loc': - _p = self.parser.select(div, 'p[@class="contrat_loc"]', 1, method='xpath') - content_p = _p.text_content().strip().split('\r\n') - for el in content_p: - splitted_el = el.split(':') - if len(splitted_el) == 2: - if splitted_el[0] == 'Entreprise': - advert.society_name = splitted_el[1] - elif splitted_el[0] == 'Contrat': - advert.contract_type = splitted_el[1] - elif splitted_el[0] == 'Localisation': - advert.place = splitted_el[1] - - elif p.attrib['class'] == 'date_ref': - next_is_date = True - - elif p.attrib['class'] == 'rubrique_annonce' and p.text == 'Salaire': - next_is_pay = True - - else: - description = description + html2text(self.parser.tostring(p)) - else: - description = description + html2text(self.parser.tostring(p)) - - advert.description = u'%s' % description - - return advert + obj_id = Env('_id') + obj_url = Env('url') + obj_publication_date = DateTime(Regexp(CleanText('//div[@id="annonce"]/p[@class="date_ref"]'), + '(\d{2}/\d{2}/\d{4})')) + obj_title = CleanText('//div[@id="annonce"]/h1') + obj_society_name = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[1]') + obj_contract_type = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[2]') + obj_place = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[3]') + obj_pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')