[regionsjob] adapt to browser2
This commit is contained in:
parent
34a7481895
commit
ec07532a63
4 changed files with 81 additions and 178 deletions
|
|
@ -19,13 +19,11 @@
|
|||
|
||||
|
||||
from weboob.tools.backend import BaseBackend, BackendConfig
|
||||
from weboob.capabilities.job import ICapJob
|
||||
from weboob.capabilities.job import ICapJob, BaseJobAdvert
|
||||
from .browser import RegionsjobBrowser
|
||||
from weboob.tools.ordereddict import OrderedDict
|
||||
from weboob.tools.value import Value
|
||||
|
||||
from .job import RegionsJobAdvert
|
||||
|
||||
|
||||
__all__ = ['RegionsjobBackend']
|
||||
|
||||
|
|
@ -153,21 +151,19 @@ class RegionsjobBackend(BaseBackend, ICapJob):
|
|||
return self.create_browser(self.config['website'].get())
|
||||
|
||||
def search_job(self, pattern=''):
|
||||
with self.browser:
|
||||
return self.browser.search_job(pattern=pattern)
|
||||
return self.browser.search_job(pattern=pattern)
|
||||
|
||||
def advanced_search_job(self):
|
||||
return self.browser.advanced_search_job(metier=self.config['metier'].get(),
|
||||
fonction=int(self.config['fonction'].get()),
|
||||
secteur=int(self.config['secteur'].get()),
|
||||
contract=int(self.config['contract'].get()),
|
||||
experience=int(self.config['experience'].get()))
|
||||
return self.browser.search_job(pattern=self.config['metier'].get(),
|
||||
fonction=int(self.config['fonction'].get()),
|
||||
secteur=int(self.config['secteur'].get()),
|
||||
contract=int(self.config['contract'].get()),
|
||||
experience=int(self.config['experience'].get()))
|
||||
|
||||
def get_job_advert(self, _id, advert=None):
|
||||
with self.browser:
|
||||
return self.browser.get_job_advert(_id, advert)
|
||||
return self.browser.get_job_advert(_id, advert)
|
||||
|
||||
def fill_obj(self, advert, fields):
|
||||
self.get_job_advert(advert.id, advert)
|
||||
return self.get_job_advert(advert.id, advert)
|
||||
|
||||
OBJECTS = {RegionsJobAdvert: fill_obj}
|
||||
OBJECTS = {BaseJobAdvert: fill_obj}
|
||||
|
|
|
|||
|
|
@ -18,54 +18,31 @@
|
|||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
import urllib
|
||||
|
||||
from weboob.tools.browser import BaseBrowser
|
||||
from weboob.tools.browser.decorators import id2url
|
||||
from weboob.tools.browser2 import PagesBrowser, URL
|
||||
|
||||
from .pages import SearchPage, AdvertPage
|
||||
from .job import RegionsJobAdvert
|
||||
|
||||
|
||||
__all__ = ['RegionsjobBrowser']
|
||||
|
||||
|
||||
class RegionsjobBrowser(BaseBrowser):
|
||||
PROTOCOL = 'http'
|
||||
ENCODING = 'utf-8'
|
||||
class RegionsjobBrowser(PagesBrowser):
|
||||
|
||||
PAGES = {
|
||||
'%s://(.*?)/offre_emploi/index.aspx\?v=___(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_' % (PROTOCOL): SearchPage,
|
||||
'%s://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation' % (PROTOCOL): AdvertPage,
|
||||
}
|
||||
advert_page = URL('/offre_emploi/detailoffre.aspx\?numoffre=(?P<_id>.*)&de=consultation', AdvertPage)
|
||||
search_page = URL('/offre_emploi/index.aspx\?v=___0_(?P<fonction>.*)_(?P<experience>.*)_0_(?P<contract>.*)_0_0_(?P<secteur>.*)_0_(?P<metier>.*)_', SearchPage)
|
||||
|
||||
def __init__(self, website, *args, **kwargs):
|
||||
self.DOMAIN = website
|
||||
BaseBrowser.__init__(self, *args, **kwargs)
|
||||
self.BASEURL = 'http://%s' % website
|
||||
PagesBrowser.__init__(self, *args, **kwargs)
|
||||
|
||||
def search_job(self, pattern=''):
|
||||
self.location('%s://%s/offre_emploi/index.aspx?v=___0_0_0_0_0_0_0_0_0_%s_'
|
||||
% (self.PROTOCOL, self.DOMAIN, urllib.quote_plus(pattern.encode(self.ENCODING))))
|
||||
assert self.is_on_page(SearchPage)
|
||||
return self.page.iter_job_adverts(self.DOMAIN)
|
||||
def search_job(self, pattern='', fonction=0, secteur=0, contract=0, experience=0):
|
||||
return self.search_page.go(fonction=fonction,
|
||||
experience=experience,
|
||||
contract=contract,
|
||||
secteur=secteur,
|
||||
metier=urllib.quote_plus(pattern.encode('utf-8'))
|
||||
).iter_job_adverts(domain=self.BASEURL)
|
||||
|
||||
def advanced_search_job(self, metier, fonction, secteur, contract, experience):
|
||||
self.location('%s://%s/offre_emploi/index.aspx?v=___%s_%s_%s_%s_%s_%s_%s_%s_%s_%s_'
|
||||
% (self.PROTOCOL,
|
||||
self.DOMAIN,
|
||||
'0',
|
||||
fonction,
|
||||
experience,
|
||||
'0',
|
||||
contract,
|
||||
'0',
|
||||
'0',
|
||||
secteur,
|
||||
'0',
|
||||
urllib.quote_plus(metier.encode(self.ENCODING))))
|
||||
assert self.is_on_page(SearchPage)
|
||||
return self.page.iter_job_adverts(self.DOMAIN)
|
||||
|
||||
@id2url(RegionsJobAdvert.id2url)
|
||||
def get_job_advert(self, url, advert):
|
||||
self.location(url)
|
||||
assert self.is_on_page(AdvertPage)
|
||||
return self.page.get_job_advert(url, advert)
|
||||
def get_job_advert(self, _id, advert):
|
||||
splitted_id = _id.split('#')
|
||||
self.BASEURL = splitted_id[0]
|
||||
return self.advert_page.go(_id=splitted_id[1]).get_job_advert(obj=advert)
|
||||
|
|
|
|||
|
|
@ -1,29 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright(C) 2013 Bezleputh
|
||||
#
|
||||
# This file is part of weboob.
|
||||
#
|
||||
# weboob is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# weboob is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from weboob.capabilities.job import BaseJobAdvert
|
||||
|
||||
|
||||
class RegionsJobAdvert(BaseJobAdvert):
|
||||
@classmethod
|
||||
def id2url(cls, _id):
|
||||
splitted_id = _id.split('|')
|
||||
return 'http://%s/offre_emploi/detailoffre.aspx?numoffre=%s&de=consultation' \
|
||||
% (splitted_id[0], splitted_id[1])
|
||||
|
||||
|
|
@ -17,100 +17,59 @@
|
|||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from weboob.tools.misc import html2text
|
||||
from weboob.tools.browser import BasePage
|
||||
from .job import RegionsJobAdvert
|
||||
from datetime import datetime, date
|
||||
import re
|
||||
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement
|
||||
from weboob.tools.browser2.filters import Link, CleanText, Regexp, Format, Env, DateGuesser, CleanHTML, DateTime
|
||||
from weboob.tools.date import LinearDateGuesser
|
||||
from weboob.capabilities.job import BaseJobAdvert
|
||||
|
||||
__all__ = ['SearchPage']
|
||||
|
||||
|
||||
class SearchPage(BasePage):
|
||||
def iter_job_adverts(self, website):
|
||||
re_id = re.compile('(.*?)numoffre=(.*?)&de=consultation', re.DOTALL)
|
||||
lis = self.document.getroot().xpath('//div[@id="liste_offres"]/ul/li')
|
||||
for li in lis:
|
||||
a = self.parser.select(li, 'div/span[@class="offres_poste"]/a', 1, method='xpath')
|
||||
_id = u'%s|%s' % (website, re_id.search(a.attrib['href']).group(2))
|
||||
advert = RegionsJobAdvert(_id)
|
||||
advert.title = u'%s' % a.text
|
||||
class SearchPage(HTMLPage):
|
||||
@method
|
||||
class iter_job_adverts(ListElement):
|
||||
item_xpath = '//div[@id="liste_offres"]/ul/li'
|
||||
|
||||
society_name = self.parser.select(li, 'div/span[@class="offres_entreprise"]/span/a',
|
||||
method='xpath')
|
||||
if len(society_name) > 0:
|
||||
advert.society_name = u'%s' % society_name[0].text
|
||||
class item(ItemElement):
|
||||
klass = BaseJobAdvert
|
||||
|
||||
advert.place = u'%s' % self.parser.select(li, 'div/span[@class="offres_ville"]/span/span/span',
|
||||
1, method='xpath').text.strip()
|
||||
_date = u'%s' % self.parser.select(li, 'div/span[@class="offres_date"]',
|
||||
1, method='xpath').text_content()
|
||||
year = date.today().year
|
||||
splitted_date = _date.split('/')
|
||||
advert.publication_date = datetime(year, int(splitted_date[1]), int(splitted_date[0]))
|
||||
advert.contract_type = u'%s' % self.parser.select(li, 'div/span[@class="offres_poste"]/span',
|
||||
1, method='xpath').text
|
||||
yield advert
|
||||
obj_id = Format(u'%s#%s',
|
||||
Env('domain'),
|
||||
Regexp(Link('div/span[@class="offres_poste"]/a'), '.*?numoffre=(.*?)&de=consultation'))
|
||||
obj_title = CleanText('div/span[@class="offres_poste"]/a')
|
||||
obj_society_name = CleanText('div/span[@class="offres_entreprise"]/span/a')
|
||||
obj_place = CleanText('div/span[@class="offres_ville"]/span/span/span')
|
||||
obj_contract_type = CleanText('div/span[@class="offres_poste"]/span')
|
||||
obj_publication_date = DateGuesser(CleanText('div/span[@class="offres_date"]'), LinearDateGuesser())
|
||||
|
||||
|
||||
class AdvertPage(BasePage):
|
||||
def get_job_advert(self, url, advert):
|
||||
re_id = re.compile('http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation', re.DOTALL)
|
||||
if advert is None:
|
||||
_id = u'%s|%s' % (re_id.search(url).group(1), re_id.search(url).group(2))
|
||||
advert = RegionsJobAdvert(_id)
|
||||
class AdvertPage(HTMLPage):
|
||||
@method
|
||||
class get_job_advert(ItemElement):
|
||||
klass = BaseJobAdvert
|
||||
|
||||
advert.url = u'%s' % url
|
||||
def parse(self, el):
|
||||
if self.obj.id:
|
||||
advert = self.obj
|
||||
advert.url = self.page.url
|
||||
advert.description = Format(u'%s\r\n%s',
|
||||
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'),
|
||||
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))(el)
|
||||
advert.pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')(el)
|
||||
raise SkipItem()
|
||||
|
||||
div = self.document.getroot().xpath('//div[@id="annonce"]')[0]
|
||||
self.env['url'] = self.page.url
|
||||
|
||||
advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
|
||||
obj_description = Format(u'%s%s',
|
||||
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'),
|
||||
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))
|
||||
|
||||
content = self.parser.select(div, 'p', method='xpath')
|
||||
|
||||
next_is_date = False
|
||||
next_is_pay = False
|
||||
description = ''
|
||||
|
||||
for p in content:
|
||||
if next_is_date:
|
||||
m = re.match('(\d{2})\s(\d{2})\s(\d{4})', date)
|
||||
if m:
|
||||
dd = int(m.group(1))
|
||||
mm = int(m.group(2))
|
||||
yyyy = int(m.group(3))
|
||||
advert.publication_date = datetime.date(yyyy, mm, dd)
|
||||
next_is_date = False
|
||||
|
||||
elif next_is_pay:
|
||||
advert.pay = html2text(self.parser.tostring(p))
|
||||
next_is_pay = False
|
||||
|
||||
elif 'class' in p.attrib:
|
||||
if p.attrib['class'] == 'contrat_loc':
|
||||
_p = self.parser.select(div, 'p[@class="contrat_loc"]', 1, method='xpath')
|
||||
content_p = _p.text_content().strip().split('\r\n')
|
||||
for el in content_p:
|
||||
splitted_el = el.split(':')
|
||||
if len(splitted_el) == 2:
|
||||
if splitted_el[0] == 'Entreprise':
|
||||
advert.society_name = splitted_el[1]
|
||||
elif splitted_el[0] == 'Contrat':
|
||||
advert.contract_type = splitted_el[1]
|
||||
elif splitted_el[0] == 'Localisation':
|
||||
advert.place = splitted_el[1]
|
||||
|
||||
elif p.attrib['class'] == 'date_ref':
|
||||
next_is_date = True
|
||||
|
||||
elif p.attrib['class'] == 'rubrique_annonce' and p.text == 'Salaire':
|
||||
next_is_pay = True
|
||||
|
||||
else:
|
||||
description = description + html2text(self.parser.tostring(p))
|
||||
else:
|
||||
description = description + html2text(self.parser.tostring(p))
|
||||
|
||||
advert.description = u'%s' % description
|
||||
|
||||
return advert
|
||||
obj_id = Env('_id')
|
||||
obj_url = Env('url')
|
||||
obj_publication_date = DateTime(Regexp(CleanText('//div[@id="annonce"]/p[@class="date_ref"]'),
|
||||
'(\d{2}/\d{2}/\d{4})'))
|
||||
obj_title = CleanText('//div[@id="annonce"]/h1')
|
||||
obj_society_name = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[1]')
|
||||
obj_contract_type = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[2]')
|
||||
obj_place = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[3]')
|
||||
obj_pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue