From 83520a177a2fddf5a9de99be092c9d0a2609ae53 Mon Sep 17 00:00:00 2001 From: Bezleputh <carton_ben@yahoo.fr> Date: Tue, 7 Jan 2014 19:45:19 +0100 Subject: [PATCH] [regionsJob] Add new module regionsJob --- modules/regionsjob/__init__.py | 24 +++++ modules/regionsjob/backend.py | 173 +++++++++++++++++++++++++++++++++ modules/regionsjob/browser.py | 71 ++++++++++++++ modules/regionsjob/job.py | 29 ++++++ modules/regionsjob/pages.py | 104 ++++++++++++++++++++ modules/regionsjob/test.py | 37 +++++++ 6 files changed, 438 insertions(+) create mode 100644 modules/regionsjob/__init__.py create mode 100644 modules/regionsjob/backend.py create mode 100644 modules/regionsjob/browser.py create mode 100644 modules/regionsjob/job.py create mode 100644 modules/regionsjob/pages.py create mode 100644 modules/regionsjob/test.py diff --git a/modules/regionsjob/__init__.py b/modules/regionsjob/__init__.py new file mode 100644 index 00000000..e8dfa519 --- /dev/null +++ b/modules/regionsjob/__init__.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2014 Bezleputh +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see <http://www.gnu.org/licenses/>. 
from .backend import RegionsjobBackend


__all__ = ['RegionsjobBackend']


# diff --git a/modules/regionsjob/backend.py b/modules/regionsjob/backend.py
# new file mode 100644
# index 00000000..a71efaf2
# --- /dev/null
# +++ b/modules/regionsjob/backend.py
# @@ -0,0 +1,173 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2014 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.


from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.capabilities.job import ICapJob
from .browser import RegionsjobBrowser
from weboob.tools.ordereddict import OrderedDict
from weboob.tools.value import Value

from .job import RegionsJobAdvert


__all__ = ['RegionsjobBackend']


def _sorted_choices(labels, show_key=False):
    """Build the ordered ``key -> label`` mapping used for a config choice list.

    Entries are ordered by key (plain string comparison, so ``'10'`` sorts
    before ``'2'``). When ``show_key`` is true the key is appended to the
    label in parentheses.
    """
    return OrderedDict([(key, u'%s (%s)' % (label, key) if show_key else u'%s' % label)
                        for key, label in sorted(labels.items())])


class RegionsjobBackend(BaseBackend, ICapJob):
    """Job-search backend for the regionsjob family of regional websites.

    The ``website`` configuration value selects the site that the browser
    queries. The other values (``metier``, ``fonction``, ``secteur``,
    ``contract``, ``experience``) are the criteria read by
    :meth:`advanced_search_job`.
    """

    NAME = 'regionsjob'
    DESCRIPTION = u'regionsjob website'
    MAINTAINER = u'Bezleputh'
    EMAIL = 'carton_ben@yahoo.fr'
    LICENSE = 'AGPLv3+'
    VERSION = '0.h'

    BROWSER = RegionsjobBrowser

    # Selectable sites: domain -> "Name (domain)".
    website_choices = _sorted_choices({
        'www.centrejob.com': u'CentreJob',
        'www.estjob.com': u'EstJob',
        'www.nordjob.com': u'NordJob',
        'www.ouestjob.com': u'OuestJob',
        'www.pacajob.com': u'PacaJob',
        'www.parisjob.com': u'ParisJob',
        'www.rhonealpesjob.com': u'RhoneAlpesJob',
        'www.sudouestjob.com': u'SudOuestJob',
        'www.jobtrotter.com': u'JobTrotter',
    }, show_key=True)

    # Job function codes; '000000' is the "no preference" entry.
    fonction_choices = _sorted_choices({
        '000000': u'indifferent',
        '4': u'Achat',
        '20': u'Assistanat/Adm.ventes/Accueil',
        '1': u'BTP - Gros Oeuvre/Second Oeuvre',
        '37': u'Bureau d\'Etudes/R&D/BTP archi/conception',
        '39': u'Commercial - Technico-Commercial',
        '31': u'Commercial auprès des particuliers',
        '30': u'Commercial auprès des professionnels',
        '5': u'Commercial-Vendeur en magasin',
        '6': u'Compta/Gestion/Finance/Audit',
        '34': u'Direction/Resp. Co. et Centre de Profit',
        '21': u'Import/Export/International',
        '22': u'Informatique - Dével. Hardware',
        '7': u'Informatique - Développement',
        '9': u'Informatique - Systèmes d\'Information',
        '10': u'Informatique - Systèmes/Réseaux',
        '11': u'Ingénierie - Agro/Agri',
        '12': u'Ingénierie - Chimie/Pharmacie/Bio.',
        '13': u'Ingénierie - Electro-tech./Automat.',
        '14': u'Ingénierie - Mécanique/Aéron.',
        '15': u'Ingénierie-Telecoms/Electronique',
        '44': u'Juridique/Droit',
        '36': u'Logistique/Métiers du Transport ',
        '16': u'Marketing/Communication/Graphisme',
        '45': u'Métiers de la distribution - Management/Resp.',
        '40': u'Métiers de la Fonction Publique ',
        '43': u'Négociation/Gestion immobilière',
        '17': u'Production - Gestion/Maintenance',
        '41': u'Production - Opérateur/Manoeuvre',
        '18': u'Qualité/Hygiène/Sécurité/Environnement',
        '26': u'Restauration/Tourisme/Hôtellerie/Loisirs',
        '19': u'RH/Personnel/Formation',
        '25': u'Santé/Social',
        '35': u'SAV/Hotline/Téléconseiller',
        '42': u'Services à la personne/aux entreprises',
    })

    # Business sector codes; '000000' is the "no preference" entry.
    secteur_choices = _sorted_choices({
        '000000': u'indifferent',
        '14': u'Agriculture/Pêche',
        '9': u'Banque/Assurance/Finance',
        '3': u'BTP',
        '4': u'Distribution/Commerce de gros',
        '17': u'Enseignement/Formation',
        '15': u'Immobilier',
        '18': u'Industrie Aéronautique/Aérospatial',
        '2': u'Industrie Agro-alimentaire',
        '5': u'Industrie Auto/Meca/Navale',
        '6': u'Industrie high-tech/Telecom',
        '19': u'Industrie Manufacturière',
        '20': u'Industrie Pétrolière/Pétrochimie',
        '21': u'Industrie Pharmaceutique/Biotechn./Chimie',
        '7': u'Média/Internet/Communication',
        '10': u'Restauration',
        '8': u'Santé/Social/Association',
        '22': u'Secteur Energie/Environnement',
        '11': u'Secteur informatique/SSII',
        '27': u'Service public autres',
        # The apostrophe was written as '' which Python treats as two
        # adjacent literals and silently drops; escape it like the other
        # labels do.
        '1': u'Service public d\'etat',
        '25': u'Service public des collectivités territoriales',
        '26': u'Service public hospitalier',
        '13': u'Services aux Entreprises',
        '23': u'Services aux Personnes/Particuliers',
        '24': u'Tourisme/Hôtellerie/Loisirs',
        '16': u'Transport/Logistique',
    })

    # Qualification / experience level codes.
    experience_choices = _sorted_choices({
        '000000': u'Indifférent',
        '7': u'BEP/CAP',
        '4': u'Employé/Opérateur/Ouvrier Spe/Bac',
        '3': u'Technicien/Employé Bac +2',
        '6': u'Agent de maîtrise/Bac +3/4',
        '2': u'Ingénieur/Cadre/Bac +5',
        '1': u'Cadre dirigeant',
    })

    # Contract type codes.
    contract_choices = _sorted_choices({
        '000000': u'Indifférent',
        '6': u'Alternance',
        '1': u'CDD',
        '2': u'CDI',
        '8': u'Franchise',
        '7': u'Indépendant',
        '3': u'Stage',
        '4': u'Travail temporaire',
    })

    CONFIG = BackendConfig(Value('website', label=u'Region', choices=website_choices),
                           Value('metier', label='Job name', masked=False, default=''),
                           Value('fonction', label=u'Fonction', choices=fonction_choices, default='000000'),
                           Value('secteur', label=u'Secteur', choices=secteur_choices, default='000000'),
                           Value('contract', label=u'Contract', choices=contract_choices, default='000000'),
                           Value('experience', label=u'Experience', choices=experience_choices, default='000000'),
                           )

    def create_default_browser(self):
        """Create the browser for the site chosen in the ``website`` config value."""
        return self.create_browser(self.config['website'].get())

    def search_job(self, pattern=''):
        """Search adverts matching ``pattern`` on the configured site."""
        with self.browser:
            return self.browser.search_job(pattern=pattern)

    def advanced_search_job(self):
        """Search adverts using the criteria stored in the backend configuration.

        The choice codes are converted with ``int`` before being handed to the
        browser, so the ``'000000'`` "no preference" code becomes ``0``.
        """
        # Enter the browser context like the other entry points of this class.
        with self.browser:
            return self.browser.advanced_search_job(metier=self.config['metier'].get(),
                                                    fonction=int(self.config['fonction'].get()),
                                                    secteur=int(self.config['secteur'].get()),
                                                    contract=int(self.config['contract'].get()),
                                                    experience=int(self.config['experience'].get()))

    def get_job_advert(self, _id, advert=None):
        """Fetch the advert identified by ``_id``.

        :param _id: advert identifier, forwarded unchanged to the browser.
        :param advert: optional existing advert object to fill in place.
        """
        with self.browser:
            return self.browser.get_job_advert(_id, advert)

    def fill_obj(self, advert, fields):
        """Complete ``advert`` by re-fetching it; ``fields`` is not consulted."""
        self.get_job_advert(advert.id, advert)

    OBJECTS = {RegionsJobAdvert: fill_obj}


# diff --git a/modules/regionsjob/browser.py b/modules/regionsjob/browser.py
# new file mode 100644
# index 00000000..f4fdc4d8
# --- /dev/null
# +++ b/modules/regionsjob/browser.py
# @@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2014 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import urllib

from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url

from .pages import SearchPage, AdvertPage
from .job import RegionsJobAdvert


__all__ = ['RegionsjobBrowser']


class RegionsjobBrowser(BaseBrowser):
    """Browser bound to a single regional job site.

    The site's domain is passed to the constructor and stored in ``DOMAIN``;
    every URL built by this class uses it.
    """

    PROTOCOL = 'http'
    ENCODING = 'utf-8'

    # Raw literals keep the regex escape ``\?`` intact without relying on
    # Python passing an unknown string escape through unchanged. The
    # resulting pattern strings are identical to the non-raw originals.
    PAGES = {
        r'%s://(.*?)/offre_emploi/index.aspx\?v=___(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_(.*?)_' % (PROTOCOL): SearchPage,
        r'%s://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation' % (PROTOCOL): AdvertPage,
    }

    def __init__(self, website, *args, **kwargs):
        """Remember ``website`` as the domain, then initialise the base browser."""
        # DOMAIN is set before the base constructor runs, as in the original.
        self.DOMAIN = website
        BaseBrowser.__init__(self, *args, **kwargs)

    def _search(self, keywords, fonction='0', experience='0', contract='0', secteur='0'):
        """Open the search page for the given criteria and iterate its adverts.

        The search URL carries ten ``_``-separated fields after ``v=___``.
        Only the function, experience, contract, sector and keyword slots
        are set here; the others are always ``0``.
        """
        criteria = ('0', fonction, experience, '0', contract, '0', '0', secteur, '0',
                    urllib.quote_plus(keywords.encode(self.ENCODING)))
        self.location('%s://%s/offre_emploi/index.aspx?v=___%s_'
                      % (self.PROTOCOL,
                         self.DOMAIN,
                         '_'.join('%s' % value for value in criteria)))
        assert self.is_on_page(SearchPage)
        return self.page.iter_job_adverts(self.DOMAIN)

    def search_job(self, pattern=''):
        """Search adverts by keyword only, leaving every other criterion at ``0``."""
        return self._search(pattern)

    def advanced_search_job(self, metier, fonction, secteur, contract, experience):
        """Search adverts by keyword (``metier``) plus the four coded criteria."""
        return self._search(metier,
                            fonction=fonction,
                            experience=experience,
                            contract=contract,
                            secteur=secteur)

    @id2url(RegionsJobAdvert.id2url)
    def get_job_advert(self, url, advert):
        """Open the advert page at ``url`` and return the parsed advert.

        NOTE(review): the ``id2url`` decorator presumably converts an advert
        id into its URL before this body runs; confirm against weboob.
        """
        self.location(url)
        assert self.is_on_page(AdvertPage)
        return self.page.get_job_advert(url, advert)


# diff --git a/modules/regionsjob/job.py b/modules/regionsjob/job.py
# new file mode 100644
# index 00000000..743592af
# --- /dev/null
# +++ b/modules/regionsjob/job.py
# @@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

from weboob.capabilities.job import BaseJobAdvert


class RegionsJobAdvert(BaseJobAdvert):
    """Job advert whose id has the form ``<domain>|<offer number>``."""

    @classmethod
    def id2url(cls, _id):
        """Return the detail-page URL for an advert id.

        The id is split on ``|``: the first piece is used as the host name
        and the second as the ``numoffre`` query value.
        """
        pieces = _id.split('|')
        domain = pieces[0]
        offer_number = pieces[1]
        template = 'http://%s/offre_emploi/detailoffre.aspx?numoffre=%s&de=consultation'
        return template % (domain, offer_number)


# diff --git a/modules/regionsjob/pages.py b/modules/regionsjob/pages.py
# new file mode 100644
# index 00000000..82dab7d7
# --- /dev/null
# +++ b/modules/regionsjob/pages.py
# @@ -0,0 +1,104 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2014 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

from weboob.tools.misc import html2text
from weboob.tools.browser import BasePage
from .job import RegionsJobAdvert
from datetime import datetime, date
import re

__all__ = ['SearchPage', 'AdvertPage']

# Offer number (group 2) inside a result link's href.
_RE_OFFER_LINK = re.compile(r'(.*?)numoffre=(.*?)&de=consultation', re.DOTALL)
# Domain (group 1) and offer number (group 2) of an advert detail URL.
_RE_ADVERT_URL = re.compile(r'http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation',
                            re.DOTALL)
# Day, month and year separated by single whitespace characters.
_RE_ADVERT_DATE = re.compile(r'(\d{2})\s(\d{2})\s(\d{4})')


class SearchPage(BasePage):
    """Search result page listing job offers."""

    def iter_job_adverts(self, website):
        """Yield one ``RegionsJobAdvert`` per ``<li>`` of the ``liste_offres`` block.

        :param website: domain the page was fetched from; it becomes the
            first half of each advert id (``<website>|<offer number>``).
        """
        today = date.today()
        for li in self.document.getroot().xpath('//div[@id="liste_offres"]/ul/li'):
            a = self.parser.select(li, 'div/span[@class="offres_poste"]/a', 1, method='xpath')
            _id = u'%s|%s' % (website, _RE_OFFER_LINK.search(a.attrib['href']).group(2))
            advert = RegionsJobAdvert(_id)
            advert.title = u'%s' % a.text
            advert.society_name = u'%s' % self.parser.select(li, 'div/span[@class="offres_entreprise"]/span/a',
                                                             1, method='xpath').text
            advert.place = u'%s' % self.parser.select(li, 'div/span[@class="offres_ville"]/span/span/span',
                                                      1, method='xpath').text.strip()
            _date = u'%s' % self.parser.select(li, 'div/span[@class="offres_date"]',
                                               1, method='xpath').text_content()
            splitted_date = _date.split('/')
            day, month = int(splitted_date[0]), int(splitted_date[1])
            # Only day and month are read from the listing. Use the current
            # year unless that would date the advert in the future (e.g. a
            # December offer read in January), in which case use last year.
            year = today.year
            if (month, day) > (today.month, today.day):
                year -= 1
            advert.publication_date = datetime(year, month, day)
            advert.contract_type = u'%s' % self.parser.select(li, 'div/span[@class="offres_poste"]/span',
                                                              1, method='xpath').text
            yield advert


class AdvertPage(BasePage):
    """Detail page of a single job offer."""

    def get_job_advert(self, url, advert):
        """Fill and return the advert described by this page.

        :param url: URL of the page; stored on the advert and, when
            ``advert`` is ``None``, parsed to build the advert id.
        :param advert: existing advert to complete, or ``None`` to create one.
        :returns: the completed advert.
        """
        if advert is None:
            url_match = _RE_ADVERT_URL.search(url)
            _id = u'%s|%s' % (url_match.group(1), url_match.group(2))
            advert = RegionsJobAdvert(_id)

        advert.url = u'%s' % url

        div = self.document.getroot().xpath('//div[@id="annonce"]')[0]

        advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text

        content = self.parser.select(div, 'p', method='xpath')

        # The date and the pay live in the paragraph that FOLLOWS their
        # marker paragraph, hence the two look-ahead flags.
        next_is_date = False
        next_is_pay = False
        description_parts = []

        for p in content:
            css_class = p.attrib.get('class')

            if next_is_date:
                # The original matched against the ``date`` class and called
                # ``datetime.date(...)``, both of which raise TypeError. Parse
                # the paragraph text and build a datetime, as SearchPage does.
                # NOTE(review): assumes this paragraph holds "DD MM YYYY" as
                # the pattern implies; confirm against a live page.
                m = _RE_ADVERT_DATE.search(p.text_content())
                if m:
                    dd = int(m.group(1))
                    mm = int(m.group(2))
                    yyyy = int(m.group(3))
                    advert.publication_date = datetime(yyyy, mm, dd)
                next_is_date = False

            elif next_is_pay:
                advert.pay = html2text(self.parser.tostring(p))
                next_is_pay = False

            elif css_class == 'contrat_loc':
                # Exactly three <strong> elements are required, read in the
                # order company, contract type, place.
                contrat_loc = self.parser.select(div, 'p[@class="contrat_loc"]/strong', 3, method='xpath')
                advert.society_name = u'%s' % contrat_loc[0].text
                advert.contract_type = u'%s' % contrat_loc[1].text
                advert.place = u'%s' % contrat_loc[2].text

            elif css_class == 'date_ref':
                next_is_date = True

            elif css_class == 'rubrique_annonce' and p.text == 'Salaire':
                next_is_pay = True

            else:
                # Every other paragraph, with or without a class, is
                # description text.
                description_parts.append(html2text(self.parser.tostring(p)))

        advert.description = u'%s' % ''.join(description_parts)

        return advert


# diff --git a/modules/regionsjob/test.py b/modules/regionsjob/test.py
# new file mode 100644
# index 00000000..dac71834
# --- /dev/null
# +++ b/modules/regionsjob/test.py
# @@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2014 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.


from weboob.tools.test import BackendTest


class RegionsjobTest(BackendTest):
    """Live checks of the regionsjob backend: simple and advanced search."""

    BACKEND = 'regionsjob'

    def _check_first_advert(self, adverts):
        """Require a non-empty result list and a URL on its first advert."""
        assert len(adverts)
        first_id = adverts[0].id
        advert = self.backend.get_job_advert(first_id, None)
        message = 'URL for announce "%s" not found: %s' % (advert.id, advert.url)
        self.assertTrue(advert.url, message)

    def test_regionjob_search(self):
        """A keyword search returns adverts that can be fetched individually."""
        results = list(self.backend.search_job(u'informaticien'))
        self._check_first_advert(results)

    def test_regionjob_advanced_search(self):
        """A search driven by the backend configuration behaves the same way."""
        results = list(self.backend.advanced_search_job())
        self._check_first_advert(results)