[popolemploi] fix search and upgrade to browser2

This commit is contained in:
Bezleputh 2014-12-15 15:15:53 +01:00 committed by Florent
commit 72f4e6c224
4 changed files with 67 additions and 149 deletions

View file

@ -17,55 +17,41 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser.decorators import id2url
from weboob.deprecated.browser import Browser
import urllib
from .pages import SearchPage, AdvertPage
from .job import PopolemploiJobAdvert
from weboob.browser import PagesBrowser, URL
from urllib import quote_plus, quote
__all__ = ['PopolemploiBrowser']
# NOTE(review): this span is taken from a rendered commit diff. Two versions
# of the class are interleaved (one built on the deprecated `Browser`, one on
# `PagesBrowser`) and the original indentation is not visible here.

# Variant built on the deprecated Browser base class.
class PopolemploiBrowser(Browser):
PROTOCOL = 'http'
DOMAIN = 'http://www.pole-emploi.fr/accueil/'
ENCODING = None
# Variant built on PagesBrowser.
class PopolemploiBrowser(PagesBrowser):
# Deprecated-Browser variant: maps URL regexes to the page class handling them.
PAGES = {
'https?://candidat.pole-emploi.fr/candidat/rechercheoffres/resultats(.*?)': SearchPage,
'https?://candidat.pole-emploi.fr/candidat/rechercheoffres/detail/(?P<id>.+)': AdvertPage,
}
# PagesBrowser variant: a base URL plus URL objects that bind patterns to
# page classes. `search` is given two patterns (a path with a `search` group
# and an absolute offre.pole-emploi.fr URL with a `pattern` group), both
# associated with SearchPage.
BASEURL = 'http://candidat.pole-emploi.fr'
advert = URL('candidat/rechercheoffres/detail/(?P<id>.*)', AdvertPage)
search = URL('candidat/rechercheoffres/resultats/(?P<search>.*?)',
'http://offre.pole-emploi.fr/resultat\?offresPartenaires=true&libMetier=(?P<pattern>.*?)', SearchPage)
# Simple search: returns the job adverts iterated from the search result page.
# NOTE(review): `pattern` defaults to None, yet both variants below apply a
# string operation to it (`.replace` / `quote_plus`) - confirm that callers
# always pass a string.
def search_job(self, pattern=None):
# Deprecated-Browser variant: build the URL by hand (spaces become '+'),
# load it, assert the resulting page is a SearchPage, then iterate.
self.location('http://offre.pole-emploi.fr/resultat?offresPartenaires=true&libMetier=%s'
% pattern.replace(' ', '+'))
assert self.is_on_page(SearchPage)
return self.page.iter_job_adverts()
# PagesBrowser variant: fill the `pattern` group of the `search` URL with the
# quote_plus-encoded pattern and iterate the adverts of the returned page.
return self.search.go(pattern=quote_plus(pattern)).iter_job_adverts()
# Advanced search: packs the criteria into a single underscore-separated
# string that is placed in the results URL.
# NOTE(review): `place` defaults to None but is split on '|' unconditionally,
# and items 1 and 2 of the split are read - confirm callers always pass a
# value containing at least two '|' separators.
def advanced_search_job(self, metier='', place=None, contrat=None, salary=None,
qualification=None, limit_date=None, domain=None):
splitted_place = place.split('|')
# Deprecated-Browser variant of the criteria string. `metier` is UTF-8 encoded
# and percent-quoted, then every '%' is replaced by '$00'.
params = 'A_%s_%s_%s__%s_P_%s_%s_%s_______INDIFFERENT______________%s' % (urllib.quote(metier.encode('utf-8')).replace('%', '$00'),
splitted_place[1],
splitted_place[2],
contrat,
domain,
salary,
qualification,
limit_date
)
# PagesBrowser variant: same fields in the same order; the template ends with
# an extra '___' suffix compared with the one above.
search = 'A_%s_%s_%s__%s_P_%s_%s_%s_______INDIFFERENT______________%s___' % (quote(metier.encode('utf-8')).replace('%', '$00'),
splitted_place[1],
splitted_place[2],
contrat,
domain,
salary,
qualification,
limit_date
)
# Deprecated-Browser variant: load the results URL built from `params`.
self.location('http://candidat.pole-emploi.fr/candidat/rechercheoffres/resultats/%s' % params)
# PagesBrowser variant: fill the `search` group of the `search` URL.
return self.search.go(search=search).iter_job_adverts()
# Deprecated-Browser variant (continued): check the page type, then iterate.
assert self.is_on_page(SearchPage)
return self.page.iter_job_adverts()
# Deprecated-Browser variant: decorated with id2url using
# PopolemploiJobAdvert.id2url; loads `url`, asserts the page is an AdvertPage
# and lets the page build or fill the advert.
@id2url(PopolemploiJobAdvert.id2url)
def get_job_advert(self, url, advert):
self.location(url)
assert self.is_on_page(AdvertPage)
return self.page.get_job_advert(url, advert)
# PagesBrowser variant: opens the `advert` URL for the given id and passes the
# advert to the page's get_job_advert as `obj`.
# NOTE(review): the parameter name `id` shadows the builtin of the same name.
def get_job_advert(self, id, advert):
return self.advert.go(id=id).get_job_advert(obj=advert)

View file

@ -1,26 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.job import BaseJobAdvert
class PopolemploiJobAdvert(BaseJobAdvert):
    """Job advert subclass that can turn an advert id into its detail URL."""

    @classmethod
    def id2url(cls, _id):
        """Return the candidat.pole-emploi.fr detail page URL for ``_id``."""
        detail_url = 'http://candidat.pole-emploi.fr/candidat/rechercheoffres/detail/%s'
        return detail_url % _id

View file

@ -16,7 +16,7 @@
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.job import BaseJobAdvert
from weboob.tools.backend import Module, BackendConfig
from weboob.capabilities.job import CapJob
@ -24,7 +24,6 @@ from weboob.tools.value import Value
from weboob.tools.ordereddict import OrderedDict
from .browser import PopolemploiBrowser
from .job import PopolemploiJobAdvert
__all__ = ['PopolemploiModule']
@ -331,8 +330,7 @@ class PopolemploiModule(Module, CapJob):
Value('domain', label=u'Domain', choices=domain_choices, default=''))
# Module-level entry point for the simple search: forwards `pattern` to the
# browser's search_job and returns its result.
# NOTE(review): two variants from the commit diff are interleaved below - one
# wrapping the call in `with self.browser:` and one calling the browser
# directly.
def search_job(self, pattern=None):
with self.browser:
return self.browser.search_job(pattern=pattern)
return self.browser.search_job(pattern=pattern)
def advanced_search_job(self):
return self.browser.advanced_search_job(metier=self.config['metier'].get(),
@ -344,10 +342,9 @@ class PopolemploiModule(Module, CapJob):
domain=self.config['domain'].get())
# Fetch a single job advert: forwards `_id` and the optional `advert` object
# to the browser's get_job_advert and returns its result.
# NOTE(review): two variants from the commit diff are interleaved below - one
# wrapping the call in `with self.browser:` and one calling the browser
# directly.
def get_job_advert(self, _id, advert=None):
with self.browser:
return self.browser.get_job_advert(_id, advert)
return self.browser.get_job_advert(_id, advert)
# Fill callback for adverts: re-fetches the advert by its id, passing the
# existing object along. `fields` is accepted but not used in the lines shown.
# NOTE(review): two variants from the commit diff are interleaved below - the
# first discards the result of get_job_advert, the second returns it.
def fill_obj(self, advert, fields):
self.get_job_advert(advert.id, advert)
return self.get_job_advert(advert.id, advert)
# Registration of fill_obj per advert class; one variant keys on
# PopolemploiJobAdvert, the other on BaseJobAdvert.
OBJECTS = {PopolemploiJobAdvert: fill_obj}
OBJECTS = {BaseJobAdvert: fill_obj}

View file

@ -17,83 +17,44 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.html import html2text
from weboob.deprecated.browser import Page
import dateutil.parser
import re
from .job import PopolemploiJobAdvert
from weboob.capabilities.job import BaseJobAdvert
from weboob.browser.pages import HTMLPage
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.standard import Regexp, CleanText, Date, Env, BrowserURL
from weboob.browser.filters.html import Link, CleanHTML
# NOTE(review): this span is taken from a rendered commit diff. Two versions
# of SearchPage are interleaved (one deriving from the deprecated `Page`, one
# from `HTMLPage`) and the original indentation is not visible here.

# Deprecated-Page variant: iterate the rows of the results table and yield the
# advert built from each row, skipping rows for which none was built.
class SearchPage(Page):
def iter_job_adverts(self):
rows = self.document.getroot().xpath('//table[@class="definition-table ordered"]/tbody/tr')
for row in rows:
advert = self.create_job_advert(row)
if advert:
yield advert
# HTMLPage variant: the same table rows are selected through `item_xpath` of a
# ListElement declared with the `method` decorator.
class SearchPage(HTMLPage):
@method
class iter_job_adverts(ListElement):
item_xpath = '//table[@class="definition-table ordered"]/tbody/tr'
# Deprecated-Page variant: build a PopolemploiJobAdvert from one table row.
# The advert id is the first regex group captured from the offer link's href;
# the remaining fields are read from the cells identified by their `headers`
# attribute. The publication date is parsed with dayfirst=True.
def create_job_advert(self, row):
re_id = re.compile('/candidat/rechercheoffres/resultats\.composantresultatrechercheoffre\.tableauresultatrechercheoffre:detailoffre/(.*?)\?(.*?)', re.DOTALL)
a = self.parser.select(row, 'td[@headers="offre"]/a', 1, method='xpath')
if re_id.match(a.attrib['href']):
_id = u'%s' % (re_id.search(a.attrib['href']).group(1))
advert = PopolemploiJobAdvert(_id)
advert.contract_type = u'%s' % self.parser.select(row, 'td[@headers="contrat"]', 1, method='xpath').text
advert.title = u'%s' % a.text_content().strip()
society = self.parser.select(row, 'td/div/p/span[@class="company"]', method='xpath')
if society:
advert.society_name = society[0].text
advert.place = u'%s' % self.parser.select(row, 'td[@headers="lieu"]', 1, method='xpath').text_content()
date = self.parser.select(row, 'td[@headers="dateEmission"]', 1, method='xpath')
advert.publication_date = dateutil.parser.parse(date.text, dayfirst=True).date()
return advert
# HTMLPage variant: declarative item producing a BaseJobAdvert per row.
# obj_id captures the text following 'detailoffre/' in the offer link, up to
# the first '?' or ';'. The company name falls back to '' when absent.
# NOTE(review): the variant above parses the date with dayfirst=True, while
# Date() is called here without such an argument - confirm day/month order is
# still interpreted as intended.
class item(ItemElement):
klass = BaseJobAdvert
obj_id = Regexp(Link('td[@headers="offre"]/a'), '.*detailoffre/(.*?)(?:\?|;).*')
obj_contract_type = CleanText('td[@headers="contrat"]')
obj_title = CleanText('td[@headers="offre"]/a')
obj_society_name = CleanText('td/div/p/span[@class="company"]/span', default='')
obj_place = CleanText('td[@headers="lieu"]')
obj_publication_date = Date(CleanText('td[@headers="dateEmission"]'))
# NOTE(review): this span is taken from a rendered commit diff. Two versions
# of AdvertPage are interleaved (one deriving from the deprecated `Page`, one
# from `HTMLPage`) and the original indentation is not visible here.

# Deprecated-Page variant: fill (or create, when `advert` is falsy) a job
# advert from the first <div id="offre-body"> of the document. When created
# here, the advert id is read from a span inside that div.
class AdvertPage(Page):
def get_job_advert(self, url, advert):
content = self.document.getroot().xpath('//div[@id="offre-body"]')[0]
if not advert:
_id = self.parser.select(content, 'div/div/ul/li/div[@class="value"]/span', 1, method='xpath').text
advert = PopolemploiJobAdvert(_id)
# HTMLPage variant: get_job_advert is declared as an ItemElement producing a
# BaseJobAdvert.
class AdvertPage(HTMLPage):
@method
class get_job_advert(ItemElement):
klass = BaseJobAdvert
# Deprecated-Page variant (continued): title and job_name are both read from
# the same <h4>; the description paragraph is serialized and converted with
# html2text; the company name is set only when its span is found; the
# remaining fields come from elements selected by their `itemprop` attribute
# and are stripped of surrounding whitespace.
advert.title = u'%s' % self.parser.select(content, 'h4', 1, method='xpath').text.strip()
advert.job_name = u'%s' % self.parser.select(content, 'h4', 1, method='xpath').text.strip()
description = self.parser.select(content, 'p[@itemprop="description"]', 1, method='xpath')
advert.description = html2text(self.parser.tostring(description))
society_name = self.parser.select(content, 'div[@class="vcard"]/p[@class="title"]/span', method='xpath')
if society_name:
advert.society_name = u'%s' % society_name[0].text
advert.url = url
place = u'%s' % self.parser.select(content,
'dl/dd/ul/li[@itemprop="addressRegion"]',
1, method='xpath').text
advert.place = place.strip()
contract_type = u'%s' % self.parser.select(content,
'dl/dd/span[@itemprop="employmentType"]',
1, method='xpath').text
advert.contract_type = contract_type.strip()
experience = u'%s' % self.parser.select(content,
'dl/dd/span[@itemprop="experienceRequirements"]',
1, method='xpath').text
advert.experience = experience.strip()
formation = u'%s' % self.parser.select(content,
'dl/dd/span[@itemprop="qualifications"]',
1, method='xpath').text
advert.formation = formation.strip()
pay = u'%s' % self.parser.select(content,
'dl/dd/span[@itemprop="baseSalary"]',
1, method='xpath').text
advert.pay = pay.strip()
return advert
# HTMLPage variant (continued): declarative field definitions. The id comes
# from the 'id' environment value and the URL is built from the browser's
# `advert` URL with that same id. Every content xpath is anchored on
# <div id="offre-body">, except the publication date, which is looked up
# anywhere in the document; the variant above does not set a publication date.
obj_id = Env('id')
obj_url = BrowserURL('advert', id=Env('id'))
obj_title = CleanText('//div[@id="offre-body"]/h4[@itemprop="title"]')
obj_job_name = CleanText('//div[@id="offre-body"]/h4[@itemprop="title"]')
obj_description = CleanHTML('//div[@id="offre-body"]/p[@itemprop="description"]')
obj_society_name = CleanText('//div[@id="offre-body"]/div[@class="vcard"]/p[@class="title"]/span',
default='')
obj_contract_type = CleanText('//div[@id="offre-body"]/dl/dd/span[@itemprop="employmentType"]')
obj_place = CleanText('//div[@id="offre-body"]/dl/dd/ul/li[@itemprop="addressRegion"]')
obj_formation = CleanText('//div[@id="offre-body"]/dl/dd/span[@itemprop="qualifications"]')
obj_pay = CleanText('//div[@id="offre-body"]/dl/dd/span[@itemprop="baseSalary"]')
obj_experience = CleanText('//div[@id="offre-body"]/dl/dd/span[@itemprop="experienceRequirements"]')
obj_publication_date = Date(CleanText('//span[@itemprop="datePosted"]'))