[regionsjob] adapt to browser2

This commit is contained in:
Bezleputh 2014-04-08 00:08:39 +02:00 committed by Florent
commit ec07532a63
4 changed files with 81 additions and 178 deletions

View file

@ -19,13 +19,11 @@
from weboob.tools.backend import BaseBackend, BackendConfig from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.capabilities.job import ICapJob from weboob.capabilities.job import ICapJob, BaseJobAdvert
from .browser import RegionsjobBrowser from .browser import RegionsjobBrowser
from weboob.tools.ordereddict import OrderedDict from weboob.tools.ordereddict import OrderedDict
from weboob.tools.value import Value from weboob.tools.value import Value
from .job import RegionsJobAdvert
__all__ = ['RegionsjobBackend'] __all__ = ['RegionsjobBackend']
@ -153,21 +151,19 @@ class RegionsjobBackend(BaseBackend, ICapJob):
return self.create_browser(self.config['website'].get()) return self.create_browser(self.config['website'].get())
def search_job(self, pattern=''): def search_job(self, pattern=''):
with self.browser: return self.browser.search_job(pattern=pattern)
return self.browser.search_job(pattern=pattern)
def advanced_search_job(self): def advanced_search_job(self):
return self.browser.advanced_search_job(metier=self.config['metier'].get(), return self.browser.search_job(pattern=self.config['metier'].get(),
fonction=int(self.config['fonction'].get()), fonction=int(self.config['fonction'].get()),
secteur=int(self.config['secteur'].get()), secteur=int(self.config['secteur'].get()),
contract=int(self.config['contract'].get()), contract=int(self.config['contract'].get()),
experience=int(self.config['experience'].get())) experience=int(self.config['experience'].get()))
def get_job_advert(self, _id, advert=None): def get_job_advert(self, _id, advert=None):
with self.browser: return self.browser.get_job_advert(_id, advert)
return self.browser.get_job_advert(_id, advert)
def fill_obj(self, advert, fields): def fill_obj(self, advert, fields):
self.get_job_advert(advert.id, advert) return self.get_job_advert(advert.id, advert)
OBJECTS = {RegionsJobAdvert: fill_obj} OBJECTS = {BaseJobAdvert: fill_obj}

View file

@ -18,54 +18,31 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import urllib import urllib
from weboob.tools.browser import BaseBrowser from weboob.tools.browser2 import PagesBrowser, URL
from weboob.tools.browser.decorators import id2url
from .pages import SearchPage, AdvertPage from .pages import SearchPage, AdvertPage
from .job import RegionsJobAdvert
__all__ = ['RegionsjobBrowser'] __all__ = ['RegionsjobBrowser']
class RegionsjobBrowser(BaseBrowser): class RegionsjobBrowser(PagesBrowser):
PROTOCOL = 'http'
ENCODING = 'utf-8'
PAGES = { advert_page = URL('/offre_emploi/detailoffre.aspx\?numoffre=(?P<_id>.*)&de=consultation', AdvertPage)
'%s://(.*?)/offre_emploi/index.aspx\?v=___(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_' % (PROTOCOL): SearchPage, search_page = URL('/offre_emploi/index.aspx\?v=___0_(?P<fonction>.*)_(?P<experience>.*)_0_(?P<contract>.*)_0_0_(?P<secteur>.*)_0_(?P<metier>.*)_', SearchPage)
'%s://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation' % (PROTOCOL): AdvertPage,
}
def __init__(self, website, *args, **kwargs): def __init__(self, website, *args, **kwargs):
self.DOMAIN = website self.BASEURL = 'http://%s' % website
BaseBrowser.__init__(self, *args, **kwargs) PagesBrowser.__init__(self, *args, **kwargs)
def search_job(self, pattern=''): def search_job(self, pattern='', fonction=0, secteur=0, contract=0, experience=0):
self.location('%s://%s/offre_emploi/index.aspx?v=___0_0_0_0_0_0_0_0_0_%s_' return self.search_page.go(fonction=fonction,
% (self.PROTOCOL, self.DOMAIN, urllib.quote_plus(pattern.encode(self.ENCODING)))) experience=experience,
assert self.is_on_page(SearchPage) contract=contract,
return self.page.iter_job_adverts(self.DOMAIN) secteur=secteur,
metier=urllib.quote_plus(pattern.encode('utf-8'))
).iter_job_adverts(domain=self.BASEURL)
def advanced_search_job(self, metier, fonction, secteur, contract, experience): def get_job_advert(self, _id, advert):
self.location('%s://%s/offre_emploi/index.aspx?v=___%s_%s_%s_%s_%s_%s_%s_%s_%s_%s_' splitted_id = _id.split('#')
% (self.PROTOCOL, self.BASEURL = splitted_id[0]
self.DOMAIN, return self.advert_page.go(_id=splitted_id[1]).get_job_advert(obj=advert)
'0',
fonction,
experience,
'0',
contract,
'0',
'0',
secteur,
'0',
urllib.quote_plus(metier.encode(self.ENCODING))))
assert self.is_on_page(SearchPage)
return self.page.iter_job_adverts(self.DOMAIN)
@id2url(RegionsJobAdvert.id2url)
def get_job_advert(self, url, advert):
self.location(url)
assert self.is_on_page(AdvertPage)
return self.page.get_job_advert(url, advert)

View file

@ -1,29 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.job import BaseJobAdvert
class RegionsJobAdvert(BaseJobAdvert):
    """BaseJobAdvert subclass that can turn its id into a detail-page URL."""

    @classmethod
    def id2url(cls, _id):
        """Build the advert detail URL from a ``<domain>|<offer number>`` id.

        The first ``|``-separated field is used as the host and the second
        as the ``numoffre`` query value; any further fields are ignored.
        Raises IndexError when the id contains no ``|`` separator.
        """
        parts = _id.split('|')
        # Index explicitly (rather than unpack) so extra fields stay ignored.
        domain, offer_number = parts[0], parts[1]
        url_template = 'http://%s/offre_emploi/detailoffre.aspx?numoffre=%s&de=consultation'
        return url_template % (domain, offer_number)

View file

@ -17,100 +17,59 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.misc import html2text from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement
from weboob.tools.browser import BasePage from weboob.tools.browser2.filters import Link, CleanText, Regexp, Format, Env, DateGuesser, CleanHTML, DateTime
from .job import RegionsJobAdvert from weboob.tools.date import LinearDateGuesser
from datetime import datetime, date from weboob.capabilities.job import BaseJobAdvert
import re
__all__ = ['SearchPage'] __all__ = ['SearchPage']
class SearchPage(BasePage): class SearchPage(HTMLPage):
def iter_job_adverts(self, website): @method
re_id = re.compile('(.*?)numoffre=(.*?)&de=consultation', re.DOTALL) class iter_job_adverts(ListElement):
lis = self.document.getroot().xpath('//div[@id="liste_offres"]/ul/li') item_xpath = '//div[@id="liste_offres"]/ul/li'
for li in lis:
a = self.parser.select(li, 'div/span[@class="offres_poste"]/a', 1, method='xpath')
_id = u'%s|%s' % (website, re_id.search(a.attrib['href']).group(2))
advert = RegionsJobAdvert(_id)
advert.title = u'%s' % a.text
society_name = self.parser.select(li, 'div/span[@class="offres_entreprise"]/span/a', class item(ItemElement):
method='xpath') klass = BaseJobAdvert
if len(society_name) > 0:
advert.society_name = u'%s' % society_name[0].text
advert.place = u'%s' % self.parser.select(li, 'div/span[@class="offres_ville"]/span/span/span', obj_id = Format(u'%s#%s',
1, method='xpath').text.strip() Env('domain'),
_date = u'%s' % self.parser.select(li, 'div/span[@class="offres_date"]', Regexp(Link('div/span[@class="offres_poste"]/a'), '.*?numoffre=(.*?)&de=consultation'))
1, method='xpath').text_content() obj_title = CleanText('div/span[@class="offres_poste"]/a')
year = date.today().year obj_society_name = CleanText('div/span[@class="offres_entreprise"]/span/a')
splitted_date = _date.split('/') obj_place = CleanText('div/span[@class="offres_ville"]/span/span/span')
advert.publication_date = datetime(year, int(splitted_date[1]), int(splitted_date[0])) obj_contract_type = CleanText('div/span[@class="offres_poste"]/span')
advert.contract_type = u'%s' % self.parser.select(li, 'div/span[@class="offres_poste"]/span', obj_publication_date = DateGuesser(CleanText('div/span[@class="offres_date"]'), LinearDateGuesser())
1, method='xpath').text
yield advert
class AdvertPage(BasePage): class AdvertPage(HTMLPage):
def get_job_advert(self, url, advert): @method
re_id = re.compile('http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation', re.DOTALL) class get_job_advert(ItemElement):
if advert is None: klass = BaseJobAdvert
_id = u'%s|%s' % (re_id.search(url).group(1), re_id.search(url).group(2))
advert = RegionsJobAdvert(_id)
advert.url = u'%s' % url def parse(self, el):
if self.obj.id:
advert = self.obj
advert.url = self.page.url
advert.description = Format(u'%s\r\n%s',
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'),
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))(el)
advert.pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')(el)
raise SkipItem()
div = self.document.getroot().xpath('//div[@id="annonce"]')[0] self.env['url'] = self.page.url
advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text obj_description = Format(u'%s%s',
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'),
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))
content = self.parser.select(div, 'p', method='xpath') obj_id = Env('_id')
obj_url = Env('url')
next_is_date = False obj_publication_date = DateTime(Regexp(CleanText('//div[@id="annonce"]/p[@class="date_ref"]'),
next_is_pay = False '(\d{2}/\d{2}/\d{4})'))
description = '' obj_title = CleanText('//div[@id="annonce"]/h1')
obj_society_name = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[1]')
for p in content: obj_contract_type = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[2]')
if next_is_date: obj_place = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[3]')
m = re.match('(\d{2})\s(\d{2})\s(\d{4})', date) obj_pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')
if m:
dd = int(m.group(1))
mm = int(m.group(2))
yyyy = int(m.group(3))
advert.publication_date = datetime.date(yyyy, mm, dd)
next_is_date = False
elif next_is_pay:
advert.pay = html2text(self.parser.tostring(p))
next_is_pay = False
elif 'class' in p.attrib:
if p.attrib['class'] == 'contrat_loc':
_p = self.parser.select(div, 'p[@class="contrat_loc"]', 1, method='xpath')
content_p = _p.text_content().strip().split('\r\n')
for el in content_p:
splitted_el = el.split(':')
if len(splitted_el) == 2:
if splitted_el[0] == 'Entreprise':
advert.society_name = splitted_el[1]
elif splitted_el[0] == 'Contrat':
advert.contract_type = splitted_el[1]
elif splitted_el[0] == 'Localisation':
advert.place = splitted_el[1]
elif p.attrib['class'] == 'date_ref':
next_is_date = True
elif p.attrib['class'] == 'rubrique_annonce' and p.text == 'Salaire':
next_is_pay = True
else:
description = description + html2text(self.parser.tostring(p))
else:
description = description + html2text(self.parser.tostring(p))
advert.description = u'%s' % description
return advert