[popolemploi] fix search and upgrade to browser2

This commit is contained in:
Bezleputh 2014-12-15 15:15:53 +01:00 committed by Florent
commit 72f4e6c224
4 changed files with 67 additions and 149 deletions

View file

@ -17,55 +17,41 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser.decorators import id2url
from weboob.deprecated.browser import Browser
import urllib
from .pages import SearchPage, AdvertPage from .pages import SearchPage, AdvertPage
from .job import PopolemploiJobAdvert from weboob.browser import PagesBrowser, URL
from urllib import quote_plus, quote
__all__ = ['PopolemploiBrowser'] __all__ = ['PopolemploiBrowser']
class PopolemploiBrowser(Browser): class PopolemploiBrowser(PagesBrowser):
PROTOCOL = 'http'
DOMAIN = 'http://www.pole-emploi.fr/accueil/'
ENCODING = None
PAGES = { BASEURL = 'http://candidat.pole-emploi.fr'
'https?://candidat.pole-emploi.fr/candidat/rechercheoffres/resultats(.*?)': SearchPage,
'https?://candidat.pole-emploi.fr/candidat/rechercheoffres/detail/(?P<id>.+)': AdvertPage, advert = URL('candidat/rechercheoffres/detail/(?P<id>.*)', AdvertPage)
} search = URL('candidat/rechercheoffres/resultats/(?P<search>.*?)',
'http://offre.pole-emploi.fr/resultat\?offresPartenaires=true&libMetier=(?P<pattern>.*?)', SearchPage)
def search_job(self, pattern=None): def search_job(self, pattern=None):
self.location('http://offre.pole-emploi.fr/resultat?offresPartenaires=true&libMetier=%s' return self.search.go(pattern=quote_plus(pattern)).iter_job_adverts()
% pattern.replace(' ', '+'))
assert self.is_on_page(SearchPage)
return self.page.iter_job_adverts()
def advanced_search_job(self, metier='', place=None, contrat=None, salary=None, def advanced_search_job(self, metier='', place=None, contrat=None, salary=None,
qualification=None, limit_date=None, domain=None): qualification=None, limit_date=None, domain=None):
splitted_place = place.split('|') splitted_place = place.split('|')
params = 'A_%s_%s_%s__%s_P_%s_%s_%s_______INDIFFERENT______________%s' % (urllib.quote(metier.encode('utf-8')).replace('%', '$00'), search = 'A_%s_%s_%s__%s_P_%s_%s_%s_______INDIFFERENT______________%s___' % (quote(metier.encode('utf-8')).replace('%', '$00'),
splitted_place[1], splitted_place[1],
splitted_place[2], splitted_place[2],
contrat, contrat,
domain, domain,
salary, salary,
qualification, qualification,
limit_date limit_date
) )
self.location('http://candidat.pole-emploi.fr/candidat/rechercheoffres/resultats/%s' % params) return self.search.go(search=search).iter_job_adverts()
assert self.is_on_page(SearchPage) def get_job_advert(self, id, advert):
return self.page.iter_job_adverts() return self.advert.go(id=id).get_job_advert(obj=advert)
@id2url(PopolemploiJobAdvert.id2url)
def get_job_advert(self, url, advert):
self.location(url)
assert self.is_on_page(AdvertPage)
return self.page.get_job_advert(url, advert)

View file

@ -1,26 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.job import BaseJobAdvert
class PopolemploiJobAdvert(BaseJobAdvert):
    """Job advert that can map an advert id to its detail-page URL."""

    @classmethod
    def id2url(cls, _id):
        """Return the detail-page URL built from the advert id *_id*.

        The id is substituted into the URL template with ``%``-formatting,
        so it is inserted as-is (no escaping is applied here).
        """
        detail_url = 'http://candidat.pole-emploi.fr/candidat/rechercheoffres/detail/%s'
        return detail_url % _id

View file

@ -16,7 +16,7 @@
# #
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.job import BaseJobAdvert
from weboob.tools.backend import Module, BackendConfig from weboob.tools.backend import Module, BackendConfig
from weboob.capabilities.job import CapJob from weboob.capabilities.job import CapJob
@ -24,7 +24,6 @@ from weboob.tools.value import Value
from weboob.tools.ordereddict import OrderedDict from weboob.tools.ordereddict import OrderedDict
from .browser import PopolemploiBrowser from .browser import PopolemploiBrowser
from .job import PopolemploiJobAdvert
__all__ = ['PopolemploiModule'] __all__ = ['PopolemploiModule']
@ -331,8 +330,7 @@ class PopolemploiModule(Module, CapJob):
Value('domain', label=u'Domain', choices=domain_choices, default='')) Value('domain', label=u'Domain', choices=domain_choices, default=''))
def search_job(self, pattern=None): def search_job(self, pattern=None):
with self.browser: return self.browser.search_job(pattern=pattern)
return self.browser.search_job(pattern=pattern)
def advanced_search_job(self): def advanced_search_job(self):
return self.browser.advanced_search_job(metier=self.config['metier'].get(), return self.browser.advanced_search_job(metier=self.config['metier'].get(),
@ -344,10 +342,9 @@ class PopolemploiModule(Module, CapJob):
domain=self.config['domain'].get()) domain=self.config['domain'].get())
def get_job_advert(self, _id, advert=None): def get_job_advert(self, _id, advert=None):
with self.browser: return self.browser.get_job_advert(_id, advert)
return self.browser.get_job_advert(_id, advert)
def fill_obj(self, advert, fields): def fill_obj(self, advert, fields):
self.get_job_advert(advert.id, advert) return self.get_job_advert(advert.id, advert)
OBJECTS = {PopolemploiJobAdvert: fill_obj} OBJECTS = {BaseJobAdvert: fill_obj}

View file

@ -17,83 +17,44 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.html import html2text from weboob.capabilities.job import BaseJobAdvert
from weboob.deprecated.browser import Page from weboob.browser.pages import HTMLPage
import dateutil.parser from weboob.browser.elements import ItemElement, ListElement, method
import re from weboob.browser.filters.standard import Regexp, CleanText, Date, Env, BrowserURL
from weboob.browser.filters.html import Link, CleanHTML
from .job import PopolemploiJobAdvert
class SearchPage(Page): class SearchPage(HTMLPage):
def iter_job_adverts(self): @method
rows = self.document.getroot().xpath('//table[@class="definition-table ordered"]/tbody/tr') class iter_job_adverts(ListElement):
for row in rows: item_xpath = '//table[@class="definition-table ordered"]/tbody/tr'
advert = self.create_job_advert(row)
if advert:
yield advert
def create_job_advert(self, row): class item(ItemElement):
re_id = re.compile('/candidat/rechercheoffres/resultats\.composantresultatrechercheoffre\.tableauresultatrechercheoffre:detailoffre/(.*?)\?(.*?)', re.DOTALL) klass = BaseJobAdvert
a = self.parser.select(row, 'td[@headers="offre"]/a', 1, method='xpath')
if re_id.match(a.attrib['href']): obj_id = Regexp(Link('td[@headers="offre"]/a'), '.*detailoffre/(.*?)(?:\?|;).*')
_id = u'%s' % (re_id.search(a.attrib['href']).group(1)) obj_contract_type = CleanText('td[@headers="contrat"]')
advert = PopolemploiJobAdvert(_id) obj_title = CleanText('td[@headers="offre"]/a')
advert.contract_type = u'%s' % self.parser.select(row, 'td[@headers="contrat"]', 1, method='xpath').text obj_society_name = CleanText('td/div/p/span[@class="company"]/span', default='')
advert.title = u'%s' % a.text_content().strip() obj_place = CleanText('td[@headers="lieu"]')
society = self.parser.select(row, 'td/div/p/span[@class="company"]', method='xpath') obj_publication_date = Date(CleanText('td[@headers="dateEmission"]'))
if society:
advert.society_name = society[0].text
advert.place = u'%s' % self.parser.select(row, 'td[@headers="lieu"]', 1, method='xpath').text_content()
date = self.parser.select(row, 'td[@headers="dateEmission"]', 1, method='xpath')
advert.publication_date = dateutil.parser.parse(date.text, dayfirst=True).date()
return advert
class AdvertPage(Page): class AdvertPage(HTMLPage):
def get_job_advert(self, url, advert): @method
content = self.document.getroot().xpath('//div[@id="offre-body"]')[0] class get_job_advert(ItemElement):
if not advert: klass = BaseJobAdvert
_id = self.parser.select(content, 'div/div/ul/li/div[@class="value"]/span', 1, method='xpath').text
advert = PopolemploiJobAdvert(_id)
advert.title = u'%s' % self.parser.select(content, 'h4', 1, method='xpath').text.strip() obj_id = Env('id')
advert.job_name = u'%s' % self.parser.select(content, 'h4', 1, method='xpath').text.strip() obj_url = BrowserURL('advert', id=Env('id'))
obj_title = CleanText('//div[@id="offre-body"]/h4[@itemprop="title"]')
description = self.parser.select(content, 'p[@itemprop="description"]', 1, method='xpath') obj_job_name = CleanText('//div[@id="offre-body"]/h4[@itemprop="title"]')
advert.description = html2text(self.parser.tostring(description)) obj_description = CleanHTML('//div[@id="offre-body"]/p[@itemprop="description"]')
obj_society_name = CleanText('//div[@id="offre-body"]/div[@class="vcard"]/p[@class="title"]/span',
society_name = self.parser.select(content, 'div[@class="vcard"]/p[@class="title"]/span', method='xpath') default='')
obj_contract_type = CleanText('//div[@id="offre-body"]/dl/dd/span[@itemprop="employmentType"]')
if society_name: obj_place = CleanText('//div[@id="offre-body"]/dl/dd/ul/li[@itemprop="addressRegion"]')
advert.society_name = u'%s' % society_name[0].text obj_formation = CleanText('//div[@id="offre-body"]/dl/dd/span[@itemprop="qualifications"]')
obj_pay = CleanText('//div[@id="offre-body"]/dl/dd/span[@itemprop="baseSalary"]')
advert.url = url obj_experience = CleanText('//div[@id="offre-body"]/dl/dd/span[@itemprop="experienceRequirements"]')
obj_publication_date = Date(CleanText('//span[@itemprop="datePosted"]'))
place = u'%s' % self.parser.select(content,
'dl/dd/ul/li[@itemprop="addressRegion"]',
1, method='xpath').text
advert.place = place.strip()
contract_type = u'%s' % self.parser.select(content,
'dl/dd/span[@itemprop="employmentType"]',
1, method='xpath').text
advert.contract_type = contract_type.strip()
experience = u'%s' % self.parser.select(content,
'dl/dd/span[@itemprop="experienceRequirements"]',
1, method='xpath').text
advert.experience = experience.strip()
formation = u'%s' % self.parser.select(content,
'dl/dd/span[@itemprop="qualifications"]',
1, method='xpath').text
advert.formation = formation.strip()
pay = u'%s' % self.parser.select(content,
'dl/dd/span[@itemprop="baseSalary"]',
1, method='xpath').text
advert.pay = pay.strip()
return advert