[regionsjob] adapt to browser2

This commit is contained in:
Bezleputh 2014-04-08 00:08:39 +02:00 committed by Florent
commit ec07532a63
4 changed files with 81 additions and 178 deletions

View file

@@ -19,13 +19,11 @@
from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.capabilities.job import ICapJob
from weboob.capabilities.job import ICapJob, BaseJobAdvert
from .browser import RegionsjobBrowser
from weboob.tools.ordereddict import OrderedDict
from weboob.tools.value import Value
from .job import RegionsJobAdvert
__all__ = ['RegionsjobBackend']
@@ -153,21 +151,19 @@ class RegionsjobBackend(BaseBackend, ICapJob):
return self.create_browser(self.config['website'].get())
def search_job(self, pattern=''):
with self.browser:
return self.browser.search_job(pattern=pattern)
return self.browser.search_job(pattern=pattern)
def advanced_search_job(self):
return self.browser.advanced_search_job(metier=self.config['metier'].get(),
fonction=int(self.config['fonction'].get()),
secteur=int(self.config['secteur'].get()),
contract=int(self.config['contract'].get()),
experience=int(self.config['experience'].get()))
return self.browser.search_job(pattern=self.config['metier'].get(),
fonction=int(self.config['fonction'].get()),
secteur=int(self.config['secteur'].get()),
contract=int(self.config['contract'].get()),
experience=int(self.config['experience'].get()))
def get_job_advert(self, _id, advert=None):
with self.browser:
return self.browser.get_job_advert(_id, advert)
return self.browser.get_job_advert(_id, advert)
def fill_obj(self, advert, fields):
self.get_job_advert(advert.id, advert)
return self.get_job_advert(advert.id, advert)
OBJECTS = {RegionsJobAdvert: fill_obj}
OBJECTS = {BaseJobAdvert: fill_obj}

View file

@@ -18,54 +18,31 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import urllib
from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url
from weboob.tools.browser2 import PagesBrowser, URL
from .pages import SearchPage, AdvertPage
from .job import RegionsJobAdvert
__all__ = ['RegionsjobBrowser']
class RegionsjobBrowser(BaseBrowser):
PROTOCOL = 'http'
ENCODING = 'utf-8'
class RegionsjobBrowser(PagesBrowser):
PAGES = {
'%s://(.*?)/offre_emploi/index.aspx\?v=___(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_' % (PROTOCOL): SearchPage,
'%s://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation' % (PROTOCOL): AdvertPage,
}
advert_page = URL('/offre_emploi/detailoffre.aspx\?numoffre=(?P<_id>.*)&de=consultation', AdvertPage)
search_page = URL('/offre_emploi/index.aspx\?v=___0_(?P<fonction>.*)_(?P<experience>.*)_0_(?P<contract>.*)_0_0_(?P<secteur>.*)_0_(?P<metier>.*)_', SearchPage)
def __init__(self, website, *args, **kwargs):
self.DOMAIN = website
BaseBrowser.__init__(self, *args, **kwargs)
self.BASEURL = 'http://%s' % website
PagesBrowser.__init__(self, *args, **kwargs)
def search_job(self, pattern=''):
self.location('%s://%s/offre_emploi/index.aspx?v=___0_0_0_0_0_0_0_0_0_%s_'
% (self.PROTOCOL, self.DOMAIN, urllib.quote_plus(pattern.encode(self.ENCODING))))
assert self.is_on_page(SearchPage)
return self.page.iter_job_adverts(self.DOMAIN)
def search_job(self, pattern='', fonction=0, secteur=0, contract=0, experience=0):
return self.search_page.go(fonction=fonction,
experience=experience,
contract=contract,
secteur=secteur,
metier=urllib.quote_plus(pattern.encode('utf-8'))
).iter_job_adverts(domain=self.BASEURL)
def advanced_search_job(self, metier, fonction, secteur, contract, experience):
self.location('%s://%s/offre_emploi/index.aspx?v=___%s_%s_%s_%s_%s_%s_%s_%s_%s_%s_'
% (self.PROTOCOL,
self.DOMAIN,
'0',
fonction,
experience,
'0',
contract,
'0',
'0',
secteur,
'0',
urllib.quote_plus(metier.encode(self.ENCODING))))
assert self.is_on_page(SearchPage)
return self.page.iter_job_adverts(self.DOMAIN)
@id2url(RegionsJobAdvert.id2url)
def get_job_advert(self, url, advert):
self.location(url)
assert self.is_on_page(AdvertPage)
return self.page.get_job_advert(url, advert)
def get_job_advert(self, _id, advert):
splitted_id = _id.split('#')
self.BASEURL = splitted_id[0]
return self.advert_page.go(_id=splitted_id[1]).get_job_advert(obj=advert)

View file

@@ -1,29 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.job import BaseJobAdvert
class RegionsJobAdvert(BaseJobAdvert):
@classmethod
def id2url(cls, _id):
splitted_id = _id.split('|')
return 'http://%s/offre_emploi/detailoffre.aspx?numoffre=%s&de=consultation' \
% (splitted_id[0], splitted_id[1])

View file

@@ -17,100 +17,59 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.misc import html2text
from weboob.tools.browser import BasePage
from .job import RegionsJobAdvert
from datetime import datetime, date
import re
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement
from weboob.tools.browser2.filters import Link, CleanText, Regexp, Format, Env, DateGuesser, CleanHTML, DateTime
from weboob.tools.date import LinearDateGuesser
from weboob.capabilities.job import BaseJobAdvert
__all__ = ['SearchPage']
class SearchPage(BasePage):
def iter_job_adverts(self, website):
re_id = re.compile('(.*?)numoffre=(.*?)&de=consultation', re.DOTALL)
lis = self.document.getroot().xpath('//div[@id="liste_offres"]/ul/li')
for li in lis:
a = self.parser.select(li, 'div/span[@class="offres_poste"]/a', 1, method='xpath')
_id = u'%s|%s' % (website, re_id.search(a.attrib['href']).group(2))
advert = RegionsJobAdvert(_id)
advert.title = u'%s' % a.text
class SearchPage(HTMLPage):
@method
class iter_job_adverts(ListElement):
item_xpath = '//div[@id="liste_offres"]/ul/li'
society_name = self.parser.select(li, 'div/span[@class="offres_entreprise"]/span/a',
method='xpath')
if len(society_name) > 0:
advert.society_name = u'%s' % society_name[0].text
class item(ItemElement):
klass = BaseJobAdvert
advert.place = u'%s' % self.parser.select(li, 'div/span[@class="offres_ville"]/span/span/span',
1, method='xpath').text.strip()
_date = u'%s' % self.parser.select(li, 'div/span[@class="offres_date"]',
1, method='xpath').text_content()
year = date.today().year
splitted_date = _date.split('/')
advert.publication_date = datetime(year, int(splitted_date[1]), int(splitted_date[0]))
advert.contract_type = u'%s' % self.parser.select(li, 'div/span[@class="offres_poste"]/span',
1, method='xpath').text
yield advert
obj_id = Format(u'%s#%s',
Env('domain'),
Regexp(Link('div/span[@class="offres_poste"]/a'), '.*?numoffre=(.*?)&de=consultation'))
obj_title = CleanText('div/span[@class="offres_poste"]/a')
obj_society_name = CleanText('div/span[@class="offres_entreprise"]/span/a')
obj_place = CleanText('div/span[@class="offres_ville"]/span/span/span')
obj_contract_type = CleanText('div/span[@class="offres_poste"]/span')
obj_publication_date = DateGuesser(CleanText('div/span[@class="offres_date"]'), LinearDateGuesser())
class AdvertPage(BasePage):
def get_job_advert(self, url, advert):
re_id = re.compile('http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation', re.DOTALL)
if advert is None:
_id = u'%s|%s' % (re_id.search(url).group(1), re_id.search(url).group(2))
advert = RegionsJobAdvert(_id)
class AdvertPage(HTMLPage):
@method
class get_job_advert(ItemElement):
klass = BaseJobAdvert
advert.url = u'%s' % url
def parse(self, el):
if self.obj.id:
advert = self.obj
advert.url = self.page.url
advert.description = Format(u'%s\r\n%s',
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'),
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))(el)
advert.pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')(el)
raise SkipItem()
div = self.document.getroot().xpath('//div[@id="annonce"]')[0]
self.env['url'] = self.page.url
advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
obj_description = Format(u'%s%s',
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'),
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))
content = self.parser.select(div, 'p', method='xpath')
next_is_date = False
next_is_pay = False
description = ''
for p in content:
if next_is_date:
m = re.match('(\d{2})\s(\d{2})\s(\d{4})', date)
if m:
dd = int(m.group(1))
mm = int(m.group(2))
yyyy = int(m.group(3))
advert.publication_date = datetime.date(yyyy, mm, dd)
next_is_date = False
elif next_is_pay:
advert.pay = html2text(self.parser.tostring(p))
next_is_pay = False
elif 'class' in p.attrib:
if p.attrib['class'] == 'contrat_loc':
_p = self.parser.select(div, 'p[@class="contrat_loc"]', 1, method='xpath')
content_p = _p.text_content().strip().split('\r\n')
for el in content_p:
splitted_el = el.split(':')
if len(splitted_el) == 2:
if splitted_el[0] == 'Entreprise':
advert.society_name = splitted_el[1]
elif splitted_el[0] == 'Contrat':
advert.contract_type = splitted_el[1]
elif splitted_el[0] == 'Localisation':
advert.place = splitted_el[1]
elif p.attrib['class'] == 'date_ref':
next_is_date = True
elif p.attrib['class'] == 'rubrique_annonce' and p.text == 'Salaire':
next_is_pay = True
else:
description = description + html2text(self.parser.tostring(p))
else:
description = description + html2text(self.parser.tostring(p))
advert.description = u'%s' % description
return advert
obj_id = Env('_id')
obj_url = Env('url')
obj_publication_date = DateTime(Regexp(CleanText('//div[@id="annonce"]/p[@class="date_ref"]'),
'(\d{2}/\d{2}/\d{4})'))
obj_title = CleanText('//div[@id="annonce"]/h1')
obj_society_name = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[1]')
obj_contract_type = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[2]')
obj_place = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[3]')
obj_pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')