[apec] adapt to the new version of the website and use browser2

This commit is contained in:
Bezleputh 2015-07-17 17:10:33 +02:00
commit 184bd6869a
3 changed files with 329 additions and 171 deletions

View file

@ -17,50 +17,97 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser.decorators import id2url
from weboob.deprecated.browser import Browser
import urllib
from .pages import SearchPage, AdvertPage
from .job import ApecJobAdvert
from weboob.browser.profiles import Profile
from weboob.browser import PagesBrowser, URL
from .pages import IdsPage, OffrePage
__all__ = ['ApecBrowser']
class ApecBrowser(Browser):
PROTOCOL = 'https'
DOMAIN = 'www.apec.fr'
ENCODING = 'ISO-8859-1'
class JsonProfile(Profile):
    """Browser profile that declares every request body as JSON."""

    def setup_session(self, session):
        """Set the ``Content-Type`` header of ``session`` to JSON."""
        session.headers.update({"Content-Type": "application/json"})
PAGES = {
'https://cadres.apec.fr/liste-offres-emploi-cadres/71____(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)___offre-d-emploi.html': SearchPage,
'https://cadres.apec.fr/MesOffres/RechercheOffres/ApecRechercheOffre.jsp\?keywords=(.*?)': SearchPage,
'https://cadres.apec.fr/offres-emploi-cadres/offres-emploi-cadres/\d*_\d*_\d*_(.*?)________(.*?).html(.*?)': AdvertPage,
}
class ApecBrowser(PagesBrowser):
BASEURL = 'https://cadres.apec.fr'
PROFILE = JsonProfile()
start = 0
json_count = URL('/cms/webservices/rechercheOffre/count', IdsPage)
json_ids = URL('/cms/webservices/rechercheOffre/ids', IdsPage)
json_offre = URL('/cms/webservices/offre/public\?numeroOffre=(?P<_id>.*)', OffrePage)
def create_parameters(self, pattern='', fonctions='[]', lieux='[]', secteursActivite='[]', typesContrat='[]', typesConvention='[]', niveauxExperience='[]', salaire_min='', salaire_max='', date_publication='', start=0, range=20):
    """Build the JSON body posted to the offer-search web services.

    :param pattern: free-text keywords; ``None`` or ``''`` means no keyword
    :param fonctions: already serialised JSON array, inserted verbatim
    :param lieux: already serialised JSON array, inserted verbatim
    :param secteursActivite: already serialised JSON array, inserted verbatim
    :param typesContrat: already serialised JSON array, inserted verbatim
    :param typesConvention: already serialised JSON array, inserted verbatim
    :param niveauxExperience: already serialised JSON array, inserted verbatim
    :param salaire_min: value of ``salaireMinimum``; omitted when empty
    :param salaire_max: value of ``salaireMaximum``; omitted when empty
    :param date_publication: value of ``anciennetePublication``; omitted
                             when empty
    :param start: value of ``pagination.startIndex``
    :param range: value of ``pagination.range`` (the name shadows the
                  builtin but is kept because callers pass it by keyword)
    :return: the request body as a string
    """
    # Local import so this method does not depend on the file's import block.
    import json

    # The keywords are the only free-text field: serialise them with json so
    # that quotes, backslashes or control characters cannot break the
    # payload.  ``None`` is sent as an empty keyword rather than as "None".
    keywords = json.dumps(pattern or '')

    # Optional filters are only emitted when a value was supplied; the order
    # (min salary, max salary, publication age) matches the original layout.
    optional = ''
    if salaire_min:
        optional += ',"salaireMinimum":%s' % salaire_min
    if salaire_max:
        optional += ',"salaireMaximum":%s' % salaire_max
    if date_publication:
        optional += ',"anciennetePublication":%s' % date_publication

    template = ('{"activeFiltre":true,"motsCles":%s,"fonctions":%s,"lieux":%s,'
                '"secteursActivite":%s,"typesContrat":%s,"typesConvention":%s,'
                '"niveauxExperience":%s%s,'
                '"sorts":[{"type":"SCORE","direction":"DESCENDING"}],'
                '"pagination":{"startIndex":%s,"range":%s},"typeClient":"CADRE"}')
    return template % (keywords, fonctions, lieux, secteursActivite,
                       typesContrat, typesConvention, niveauxExperience,
                       optional, start, range)
def search_job(self, pattern=None):
self.location('https://cadres.apec.fr/MesOffres/RechercheOffres/ApecRechercheOffre.jsp?keywords=%s'
% urllib.quote_plus(pattern.encode(self.ENCODING)))
assert self.is_on_page(SearchPage)
return self.page.iter_job_adverts()
data = self.create_parameters(pattern=pattern)
count = self.json_count.go(data=data).get_adverts_number()
self.start = 0
if count:
ids = self.json_ids.go(data=data).iter_job_adverts(pattern=pattern,
fonctions='[]',
lieux='[]',
secteursActivite='[]',
typesContrat='[]',
typesConvention='[]',
niveauxExperience='[]',
salaire_min='',
salaire_max='',
date_publication='',
start=self.start,
count=count,
range=20)
for _id in ids:
yield self.json_offre.go(_id=_id.id).get_job_advert()
def advanced_search_job(self, region=None, fonction=None, secteur=None, salaire=None, contrat=None, limit_date=None, level=None):
self.location(
'https://cadres.apec.fr/liste-offres-emploi-cadres/71____%s_%s_%s_%s_%s_%s_%s___offre-d-emploi.html'
% (
region,
fonction,
secteur,
salaire,
level,
limit_date,
contrat
))
assert self.is_on_page(SearchPage)
return self.page.iter_job_adverts()
def get_job_advert(self, _id, advert=None):
    """Load the offer numbered ``_id`` and return the parsed job advert.

    ``advert``, when given, is handed to the page as ``obj``.
    """
    offer_page = self.json_offre.go(_id=_id)
    return offer_page.get_job_advert(obj=advert)
@id2url(ApecJobAdvert.id2url)
def get_job_advert(self, url, advert):
self.location(url)
assert self.is_on_page(AdvertPage)
return self.page.get_job_advert(url, advert)
def advanced_search_job(self, region='', fonction='', secteur='', salaire='', contrat='', limit_date='', level=''):
    """Search offers with the configured filters and yield each advert.

    :param region: place id, wrapped in a JSON array as ``lieux``
    :param fonction: function id, wrapped as ``fonctions``
    :param secteur: activity-sector id, wrapped as ``secteursActivite``
    :param salaire: salary range encoded as ``"<min>|<max>"``; any value
                    without a ``|`` means "no salary filter"
    :param contrat: contract-type id, wrapped as ``typesContrat``
    :param limit_date: publication-age id, sent as ``date_publication``
    :param level: experience-level id, wrapped as ``niveauxExperience``
    :return: generator of the adverts returned by ``json_offre``
    """
    # A value with no "|" (empty, or the blank "no preference" key) used to
    # raise IndexError on ``s[1]``; treat it as "no salary filter" instead.
    salaire_min = ''
    salaire_max = ''
    bounds = salaire.split('|') if salaire else []
    if len(bounds) >= 2:
        salaire_min, salaire_max = bounds[0], bounds[1]

    # One set of filters feeds both the first request and the pagination
    # environment, so the two can no longer drift apart.  ``typesConvention``
    # is included because ``next_page`` reads it from the environment.
    filters = {
        'fonctions': '[%s]' % fonction,
        'lieux': '[%s]' % region,
        'secteursActivite': '[%s]' % secteur,
        'typesContrat': '[%s]' % contrat,
        'typesConvention': '[]',
        'niveauxExperience': '[%s]' % level,
        'salaire_min': salaire_min,
        'salaire_max': salaire_max,
        'date_publication': limit_date,
    }

    data = self.create_parameters(**filters)
    count = self.json_count.go(data=data).get_adverts_number()

    # Reset the pagination cursor (the original line was a bare
    # ``self.start`` expression that did nothing), as ``search_job`` does.
    self.start = 0

    if count:
        ids = self.json_ids.go(data=data).iter_job_adverts(pattern='',
                                                           start=self.start,
                                                           count=count,
                                                           range=20,
                                                           **filters)
        for advert in ids:
            # ``ids`` yields advert objects; the offer URL needs their id,
            # exactly as in ``search_job``.
            yield self.json_offre.go(_id=advert.id).get_job_advert()

View file

@ -17,13 +17,12 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.job import BaseJobAdvert
from weboob.tools.backend import Module, BackendConfig
from weboob.capabilities.job import CapJob
from weboob.tools.ordereddict import OrderedDict
from weboob.tools.value import Value
from .browser import ApecBrowser
from .job import ApecJobAdvert
__all__ = ['ApecModule']
@ -38,63 +37,174 @@ class ApecModule(Module, CapJob):
BROWSER = ApecBrowser
places_choices = OrderedDict([(k, u'%s' % (v)) for k, v in sorted({
'00|': u'-- Indifférent --',
'01|700': u'Alsace',
'02|701': u'Aquitaine',
'03|702': u'Auvergne',
'04|703': u'Basse-Normandie',
'05|704': u'Bourgogne',
'06|705': u'Bretagne',
'07|706': u'Centre',
'08|707': u'Champagne',
'09|20': u'Corse',
'10|99712': u'France Outre-Mer',
'11|709': u'Franche-Comté',
'12|710': u'Haute-Normandie',
'13|711': u'Ile-de-France',
'14|712': u'Languedoc-Roussillon',
'15|713': u'Limousin',
'16|714': u'Lorraine',
'17|715': u'Midi-Pyrénées',
'18|716': u'Nord-Pas-de-Calais',
'19|720': u'PACA',
'20|717': u'Pays de La Loire',
'21|718': u'Picardie',
'22|719': u'Poitou-Charentes',
'23|721': u'Rhône-Alpes',
'24|99109': u'Allemagne',
'25|99106': u'Estonie',
'26|99108': u'Lituanie',
'27|99116': u'République Tchèque',
'28|99110': u'Autriche',
'29|99105': u'Finlande',
'30|99137': u'Luxembourg',
'31|99114': u'Roumanie',
'32|99131': u'Belgique',
'33|99126': u'Grèce',
'34|99144': u'Malte',
'35|99132': u'Royaume Uni',
'36|99111': u'Bulgarie',
'37|99112': u'Hongrie',
'38|99135': u'Pays Bas',
'39|99117': u'Slovaquie',
'40|99254': u'Chypre',
'41|99136': u'Irlande',
'42|99122': u'Pologne',
'43|99145': u'Slovénie',
'44|99101': u'Danemark',
'45|99127': u'Italie',
'46|99139': u'Portugal',
'47|99104': u'Suède',
'48|99134': u'Espagne',
'49|99107': u'Lettonie',
'50|99700': u'UE Hors France',
'51|99702': u'Amérique du Nord',
'52|99715': u'Afrique',
'53|99711': u'Océanie',
'54|99701': u'Europe Hors UE',
'55|99714': u'Amérique Latine',
'56|99716': u'Asie',
'001|99700': u'UE Hors France',
'002|99126': u'..Grèce',
'003|99132': u'..Royaume Uni',
'004|99134': u'..Espagne',
'005|99136': u'..Irlande',
'006|99139': u'..Portugal',
'007|99254': u'..Chypre',
'008|99127': u'..Italie',
'009|99131': u'..Belgique',
'010|99135': u'..Pays Bas',
'011|99137': u'..Luxembourg',
'012|99144': u'..Malte',
'013|99145': u'..Slovénie',
'014|99101': u'..Danemark',
'015|99104': u'..Suède',
'016|99105': u'..Finlande',
'017|99106': u'..Estonie',
'018|99107': u'..Lettonie',
'019|99108': u'..Lituanie',
'020|99109': u'..Allemagne',
'021|99110': u'..Autriche',
'022|99111': u'..Bulgarie',
'023|99112': u'..Hongrie',
'024|99114': u'..Roumanie',
'025|99116': u'..République Tchèque',
'026|99117': u'..Slovaquie',
'027|99119': u'..Croatie',
'028|99122': u'..Pologne',
'029|799': u'France',
'030|711': u'..Ile-de-France',
'031|75': u'....Paris',
'032|77': u'....Seine-et-Marne',
'033|78': u'....Yvelines',
'034|91': u'....Essonne',
'035|92': u'....Hauts-de-Seine',
'036|93': u'....Seine-Saint-Denis',
'037|94': u'....Val-de-Marne',
'038|95': u'....Val-d\'Oise',
'039|703': u'..Basse-Normandie',
'040|14': u'....Calvados',
'041|50': u'....Manche',
'042|61': u'....Orne',
'043|705': u'..Bretagne',
'044|22': u'....Côtes d\'Armor',
'045|29': u'....Finistère',
'046|35': u'....Ille-et-Vilaine',
'047|56': u'....Morbihan',
'048|706': u'..Centre',
'049|18': u'....Cher',
'050|28': u'....Eure-et-Loir',
'051|36': u'....Indre',
'052|37': u'....Indre-et-Loire',
'053|41': u'....Loir-et-Cher',
'054|45': u'....Loiret',
'055|710': u'..Haute-Normandie',
'056|27': u'....Eure',
'057|76': u'....Seine-Maritime',
'058|717': u'..Pays de La Loire',
'059|44': u'....Loire-Atlantique',
'060|49': u'....Maine-et-Loire',
'061|53': u'....Mayenne',
'062|72': u'....Sarthe',
'063|85': u'....Vendée',
'064|700': u'..Alsace',
'065|67': u'....Bas-Rhin',
'066|68': u'....Haut-Rhin',
'067|704': u'..Bourgogne',
'068|21': u'....Côte d\'Or',
'069|58': u'....Nièvre',
'070|71': u'....Saône-et-Loire',
'071|89': u'....Yonne',
'072|707': u'..Champagne',
'073|8': u'....Ardennes',
'074|10': u'....Aube',
'075|51': u'....Marne',
'076|52': u'....Haute-Marne',
'077|709': u'..Franche-Comté',
'078|25': u'....Doubs',
'079|39': u'....Jura',
'080|70': u'....Haute-Saône',
'081|90': u'....Territoire de Belfort',
'082|714': u'..Lorraine',
'083|54': u'....Meurthe-et-Moselle',
'084|55': u'....Meuse',
'085|57': u'....Moselle',
'086|88': u'....Vosges',
'087|716': u'..Nord-Pas-de-Calais',
'088|59': u'....Nord',
'089|62': u'....Pas-de-Calais',
'090|718': u'..Picardie',
'091|2': u'....Aisne',
'092|60': u'....Oise',
'093|80': u'....Somme',
'094|20': u'..Corse',
'095|750': u'....Corse du Sud',
'096|751': u'....Haute-Corse',
'097|702': u'..Auvergne',
'098|3': u'....Allier',
'099|15': u'....Cantal',
'100|43': u'....Haute-Loire',
'101|63': u'....Puy-de-Dôme',
'102|720': u'..PACA',
'103|4': u'....Alpes-de-Haute-Provence',
'104|5': u'....Hautes-Alpes',
'105|6': u'....Alpes-Maritimes',
'106|13': u'....Bouches-du-Rhône',
'107|83': u'....Var',
'108|84': u'....Vaucluse',
'109|721': u'..Rhône-Alpes',
'110|1': u'....Ain',
'111|7': u'....Ardèche',
'112|26': u'....Drôme',
'113|38': u'....Isère',
'114|42': u'....Loire',
'115|69': u'....Rhône',
'116|73': u'....Savoie',
'117|74': u'....Haute-Savoie',
'118|701': u'..Aquitaine',
'119|24': u'....Dordogne',
'120|33': u'....Gironde',
'121|40': u'....Landes',
'122|47': u'....Lot-et-Garonne',
'123|64': u'....Pyrénées-Atlantiques',
'124|712': u'..Languedoc-Roussillon',
'125|11': u'....Aude',
'126|30': u'....Gard',
'127|34': u'....Hérault',
'128|48': u'....Lozère',
'129|66': u'....Pyrénées-Orientales',
'130|713': u'..Limousin',
'131|19': u'....Corrèze',
'132|23': u'....Creuse',
'133|87': u'....Haute-Vienne',
'134|715': u'..Midi-Pyrénées',
'135|9': u'....Ariège',
'136|12': u'....Aveyron',
'137|31': u'....Haute-Garonne',
'138|32': u'....Gers',
'139|46': u'....Lot',
'140|65': u'....Hautes-Pyrénées',
'141|81': u'....Tarn',
'142|82': u'....Tarn-et-Garonne',
'143|719': u'..Poitou-Charentes',
'144|16': u'....Charente',
'145|17': u'....Charente-Maritime',
'146|79': u'....Deux-Sèvres',
'147|86': u'....Vienne',
'148|99712': u'..France Outre-Mer',
'149|99519': u'....Terres Australes et Antarctiques Françaises',
'150|97100': u'....Guadeloupe',
'151|97200': u'....Martinique',
'152|97300': u'....Guyane',
'153|97400': u'....La Réunion',
'154|97500': u'....Saint-Pierre-et-Miquelon',
'155|97600': u'....Mayotte',
'156|98300': u'....Polynésie Française',
'157|98600': u'....Wallis et Futuna',
'158|98800': u'....Nouvelle Calédonie',
'159|97800': u'....Saint-Martin',
'160|97700': u'....Saint-Barthélémy',
'161|102099': u'International',
'162|99715': u'..Afrique',
'163|99716': u'..Asie',
'164|99700': u'..UE Hors France',
'165|99701': u'..Europe Hors UE',
'166|99702': u'..Amérique du Nord',
'167|99711': u'..Océanie',
'168|99714': u'..Amérique Latine',
}.iteritems())])
fonction_choices = OrderedDict([(k, u'%s' % (v)) for k, v in sorted({
@ -191,18 +301,18 @@ class ApecModule(Module, CapJob):
type_contrat_choices = OrderedDict([(k, u'%s' % (v)) for k, v in sorted({
' ': u'-- Indifférent --',
'143694': u'CDI',
'143695': u'CDD',
'143696': u'Travail Temporaire',
'101888': u'CDI',
'101887': u'CDD',
'101889': u'Interim',
}.iteritems())])
salary_choices = OrderedDict([(k, u'%s' % (v)) for k, v in sorted({
' ': u'-- Indifférent --',
'101839': u'Moins de 35 K€',
'101840': u'Entre 35 et 49 K€',
'101841': u'Entre 50 et 69 K€',
'101842': u'Entre 70 et 90 K€',
'101843': u'Plus de 90 K€',
'0|35': u'Moins de 35 K€',
'35|50': u'Entre 35 et 49 K€',
'50|70': u'Entre 50 et 69 K€',
'70|90': u'Entre 70 et 90 K€',
'90|1000': u'Plus de 90 K€',
}.iteritems())])
date_choices = OrderedDict([(k, u'%s' % (v)) for k, v in sorted({
@ -214,9 +324,9 @@ class ApecModule(Module, CapJob):
}.iteritems())])
level_choices = OrderedDict([(k, u'%s' % (v)) for k, v in sorted({
' ': u'-- Indifférent --',
'101846': u'Débutant',
'101848': u'Expérimenté',
'101882': u'Tous niveaux d\'expérience',
'101881': u'Débutant',
'101883': u'Expérimenté',
}.iteritems())])
CONFIG = BackendConfig(Value('place', label=u'Lieu', choices=places_choices, default=''),
@ -228,9 +338,8 @@ class ApecModule(Module, CapJob):
Value('level', label=u'Expérience', choices=level_choices, default=''))
def search_job(self, pattern=None):
with self.browser:
for job_advert in self.browser.search_job(pattern=pattern):
yield job_advert
for job_advert in self.browser.search_job(pattern=pattern):
yield self.fill_obj(job_advert)
def decode_choice(self, choice):
splitted_choice = choice.split('|')
@ -247,13 +356,19 @@ class ApecModule(Module, CapJob):
contrat=self.config['contrat'].get(),
limit_date=self.config['limit_date'].get(),
level=self.config['level'].get()):
yield job_advert
yield self.fill_obj(job_advert)
def get_job_advert(self, _id, advert=None):
with self.browser:
return self.browser.get_job_advert(_id, advert)
job_advert = self.browser.get_job_advert(_id, advert)
return self.fill_obj(job_advert)
def fill_obj(self, advert, fields):
self.get_job_advert(advert.id, advert)
def fill_obj(self, advert, fields=None):
if advert.contract_type in self.type_contrat_choices:
advert.contract_type = self.type_contrat_choices[advert.contract_type]
OBJECTS = {ApecJobAdvert: fill_obj}
if advert.experience in self.level_choices:
advert.experience = self.level_choices[advert.experience]
return advert
OBJECTS = {BaseJobAdvert: fill_obj}

View file

@ -16,67 +16,63 @@
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import requests
from weboob.browser.elements import ItemElement, method, DictElement
from weboob.browser.pages import JsonPage, pagination
from weboob.browser.filters.standard import DateTime, Format, Regexp
from weboob.browser.filters.json import Dict
from weboob.browser.filters.html import CleanHTML
from weboob.capabilities.job import BaseJobAdvert
from weboob.capabilities.base import NotAvailable
from weboob.deprecated.browser import Page
from weboob.tools.html import html2text
import dateutil.parser
import re
class IdsPage(JsonPage):
from .job import ApecJobAdvert
def get_adverts_number(self):
    """Return the ``totalCount`` entry of the decoded JSON document."""
    total = self.doc['totalCount']
    return total
@pagination
@method
class iter_job_adverts(DictElement):
item_xpath = 'resultats'
def next_page(self):
    """Return the POST request for the next page of ids, or ``None``.

    Advances the browser's ``start`` cursor by the page size stored in the
    environment (``range``) and, while offers remain, rebuilds the search
    payload from the filters stored in the environment.
    """
    browser = self.page.browser
    env = self.env

    browser.start += env['range']

    # The cursor starts at 0, so a cursor equal to ``count`` is already past
    # the last offer.  The original ``<=`` test sent one extra request in
    # that case.
    if browser.start >= env['count']:
        return None

    data = browser.create_parameters(pattern=env['pattern'],
                                     fonctions=env['fonctions'],
                                     lieux=env['lieux'],
                                     secteursActivite=env['secteursActivite'],
                                     typesContrat=env['typesContrat'],
                                     typesConvention=env['typesConvention'],
                                     niveauxExperience=env['niveauxExperience'],
                                     salaire_min=env['salaire_min'],
                                     salaire_max=env['salaire_max'],
                                     date_publication=env['date_publication'],
                                     start=browser.start,
                                     range=env['range'])
    return requests.Request("POST", self.page.url, data=data)
class item(ItemElement):
klass = BaseJobAdvert
obj_id = Regexp(Dict('@uriOffre'), '.*=(.*)')
class SearchPage(Page):
def iter_job_adverts(self):
re_id_title = re.compile('/offres-emploi-cadres/\d*_\d*_\d*_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?).html', re.DOTALL)
divs = self.document.getroot().xpath("//div[@class='boxContent offre']") + self.document.getroot().xpath("//div[@class='boxContent offre even']")
for div in divs:
a = self.parser.select(div, 'div/div/h3/a', 1, method='xpath')
_id = u'%s/%s' % (re_id_title.search(a.attrib['href']).group(1), re_id_title.search(a.attrib['href']).group(9))
advert = ApecJobAdvert(_id)
advert.title = u'%s' % re_id_title.search(a.attrib['href']).group(9).replace('-', ' ')
l = self.parser.select(div, 'h4', 1).text.split('-')
advert.society_name = u'%s' % l[0].strip()
advert.place = u'%s' % l[-1].strip()
date = self.parser.select(div, 'div/div/div', 1, method='xpath')
advert.publication_date = dateutil.parser.parse(date.text_content().strip()[8:]).date()
yield advert
class OffrePage(JsonPage):
@method
class get_job_advert(ItemElement):
klass = BaseJobAdvert
class AdvertPage(Page):
def get_job_advert(self, url, advert):
re_id_title = re.compile('/offres-emploi-cadres/\d*_\d*_\d*_(.*?)________(.*?).html(.*?)', re.DOTALL)
if advert is None:
_id = u'%s/%s' % (re_id_title.search(url).group(1), re_id_title.search(url).group(2))
advert = ApecJobAdvert(_id)
advert.title = re_id_title.search(url).group(2).replace('-', ' ')
description = self.document.getroot().xpath("//div[@class='contentWithDashedBorderTop marginTop boxContent']/div")[0]
advert.description = html2text(self.parser.tostring(description))
advert.job_name = advert.title
trs = self.document.getroot().xpath("//table[@class='noFieldsTable']/tr")
for tr in trs:
th = self.parser.select(tr, 'th', 1, method='xpath')
td = self.parser.select(tr, 'td', 1, method='xpath')
if u'Date de publication' in u'%s' % th.text_content():
advert.publication_date = dateutil.parser.parse(td.text_content()).date()
elif u'Société' in u'%s' % th.text_content() and not advert.society_name:
society_name = td.text_content()
a = self.parser.select(td, 'a', method='xpath')
if a:
advert.society_name = u'%s' % society_name.replace(a[0].text_content(), '').strip()
else:
advert.society_name = society_name.strip()
elif u'Type de contrat' in u'%s' % th.text_content():
advert.contract_type = u'%s' % td.text_content().strip()
elif u'Lieu' in u'%s' % th.text_content():
advert.place = u'%s' % td.text_content()
elif u'Salaire' in u'%s' % th.text_content():
advert.pay = u'%s' % td.text_content()
elif u'Expérience' in u'%s' % th.text_content():
advert.experience = u'%s' % td.text_content()
advert.url = url
return advert
obj_id = Dict('numeroOffre')
obj_title = Dict('intitule')
obj_description = CleanHTML(Dict('texteHtml'))
obj_job_name = Dict('intitule')
obj_publication_date = DateTime(Dict('datePublication'))
obj_society_name = Dict('nomCommercialEtablissement', default=NotAvailable)
obj_contract_type = Dict('idNomTypeContrat')
obj_place = Dict('lieuTexte')
obj_pay = Dict('salaireTexte')
obj_experience = Dict('idNomNiveauExperience')
obj_url = Format('https://cadres.apec.fr/home/mes-offres/recherche-des-offres-demploi/liste-des-offres-demploi/detail-de-loffre-demploi.html?numIdOffre=%s', Dict('numeroOffre'))