From 57bd31edc0645a7993740ad7b5120a997f58bbe2 Mon Sep 17 00:00:00 2001
From: Bezleputh
Date: Tue, 18 Jun 2013 12:52:23 +0200
Subject: [PATCH] creation of Apec : module that uses ICapJob in order to find adverts on apec website

---
 modules/apec/__init__.py |  24 +++++++++
 modules/apec/backend.py  |  50 ++++++++++++++++++
 modules/apec/browser.py  |  72 ++++++++++++++++++++++++++
 modules/apec/job.py      |  34 ++++++++++++
 modules/apec/pages.py    | 110 ++++++++++++++++++++++++++++++++++++++++
 modules/apec/test.py     |  34 ++++++++++++
 6 files changed, 324 insertions(+)
 create mode 100644 modules/apec/__init__.py
 create mode 100644 modules/apec/backend.py
 create mode 100644 modules/apec/browser.py
 create mode 100644 modules/apec/job.py
 create mode 100644 modules/apec/pages.py
 create mode 100644 modules/apec/test.py

diff --git a/modules/apec/__init__.py b/modules/apec/__init__.py
new file mode 100644
index 00000000..cce56b15
--- /dev/null
+++ b/modules/apec/__init__.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013 Bezleputh
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from .backend import ApecBackend
+
+
+__all__ = ['ApecBackend']
diff --git a/modules/apec/backend.py b/modules/apec/backend.py
new file mode 100644
index 00000000..62254b5a
--- /dev/null
+++ b/modules/apec/backend.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013 Bezleputh
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.tools.backend import BaseBackend
+from weboob.capabilities.job import ICapJob
+
+from .browser import ApecBrowser
+
+
+__all__ = ['ApecBackend']
+
+
+class ApecBackend(BaseBackend, ICapJob):
+    """Job-search backend for the apec website, built on ApecBrowser."""
+
+    NAME = 'apec'
+    DESCRIPTION = u'apec website'
+    MAINTAINER = u'Bezleputh'
+    EMAIL = 'carton_ben@yahoo.fr'
+    VERSION = '0.g'
+
+    BROWSER = ApecBrowser
+
+    def search_job(self, pattern=None):
+        """Yield the job adverts that the browser finds for *pattern*."""
+        with self.browser:
+            for job_advert in self.browser.search_job(pattern):
+                yield job_advert
+
+    def get_job_advert(self, _id, advert=None):
+        """Return the advert designated by *_id*, filling *advert* if given."""
+        with self.browser:
+            return self.browser.get_job_advert(_id, advert)
diff --git a/modules/apec/browser.py b/modules/apec/browser.py
new file mode 100644
index 00000000..95e0d87c
--- /dev/null
+++ b/modules/apec/browser.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013 Bezleputh
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+import urllib
+
+from weboob.tools.browser.decorators import id2url
+from weboob.tools.browser import BaseBrowser
+
+from .pages import SearchPage, AdvertPage
+from .job import ApecJobAdvert
+
+__all__ = ['ApecBrowser']
+
+
+class ApecBrowser(BaseBrowser):
+    """Browser for the job adverts of cadres.apec.fr."""
+
+    PROTOCOL = 'http'
+    DOMAIN = 'www.apec.fr'
+    ENCODING = None
+
+    SEARCH_URL = 'http://cadres.apec.fr/MesOffres/RechercheOffres/ApecRechercheOffre.jsp?keywords=%s'
+
+    # Raw strings, so that the regex escapes (\?, \d, \.) are not subject to
+    # Python's own string-escape processing.
+    PAGES = {
+        r'http://cadres.apec.fr/MesOffres/RechercheOffres/ApecRechercheOffre.jsp\?keywords=(.*?)': SearchPage,
+        r'http://cadres.apec.fr/offres-emploi-cadres/offres-emploi-cadres/\d*_\d*_\d*_(.*?)________(.*?)\.html(.*?)': AdvertPage,
+    }
+
+    def search_job(self, pattern):
+        """Return the adverts matching *pattern*, or an empty list if it is None."""
+        if pattern is None:
+            return []
+
+        # Percent-encode the keywords so that characters such as '&' or '#'
+        # cannot corrupt the query string. quote_plus() also turns spaces into
+        # '+', as the former str.replace(' ', '+') did.
+        # NOTE(review): UTF-8 is assumed for non-ASCII keywords -- confirm
+        # against what the site expects.
+        if isinstance(pattern, unicode):
+            pattern = pattern.encode('utf-8')
+        self.location(self.SEARCH_URL % urllib.quote_plus(pattern))
+        assert self.is_on_page(SearchPage)
+        return self.page.iter_job_adverts()
+
+    @id2url(ApecJobAdvert.id2url)
+    def get_job_advert(self, url, advert):
+        """Open *url* and return the advert parsed from that page.
+
+        NOTE(review): the id2url decorator presumably lets callers pass an
+        advert id instead of a URL -- confirm against its definition.
+        """
+        self.location(url)
+        assert self.is_on_page(AdvertPage)
+        return self.page.get_job_advert(url, advert)
diff --git a/modules/apec/job.py b/modules/apec/job.py
new file mode 100644
index 00000000..69323702
--- /dev/null
+++ b/modules/apec/job.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Bezleputh
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+from weboob.capabilities.job import BaseJobAdvert
+
+
+class ApecJobAdvert(BaseJobAdvert):
+    """Job advert whose id is made of two '/'-separated parts."""
+
+    @classmethod
+    def id2url(cls, _id):
+        """Return the advert page URL for *_id*.
+
+        The first two '/'-separated parts of *_id* are inserted into the URL;
+        an id without a '/' raises IndexError.
+        """
+        parts = _id.split('/')
+        return 'http://cadres.apec.fr/offres-emploi-cadres/offres-emploi-cadres/0_0_0_%s________%s.html' % (parts[0], parts[1])
diff --git a/modules/apec/pages.py b/modules/apec/pages.py
new file mode 100644
index 00000000..20027d29
--- /dev/null
+++ b/modules/apec/pages.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013 Bezleputh
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.tools.browser import BasePage
+import dateutil.parser
+import re
+
+from .job import ApecJobAdvert
+
+__all__ = ['SearchPage', 'AdvertPage']
+
+
+# Captures the two parts of an advert id from an advert URL. The dot before
+# "html" is escaped: with a bare '.', a title containing the text "html" was
+# cut short by the lazy second group.
+_ADVERT_URL_RE = re.compile(r'/offres-emploi-cadres/\d*_\d*_\d*_(.*?)________(.*?)\.html', re.DOTALL)
+
+
+def _parse_advert_url(url):
+    """Return the (id, title) pair encoded in *url*, or None if it does not match."""
+    match = _ADVERT_URL_RE.search(url)
+    if match is None:
+        return None
+    reference, slug = match.groups()
+    return u'%s/%s' % (reference, slug), u'%s' % slug.replace('-', ' ')
+
+
+class SearchPage(BasePage):
+    """Page listing the results of a keyword search."""
+
+    def iter_job_adverts(self):
+        """Return the list of adverts found on the page, in document order."""
+        adverts = []
+        # One XPath query covers both row classes and yields them in document
+        # order; concatenating two separate queries put every
+        # 'boxContent offre' row before any 'boxContent offre even' one.
+        divs = self.document.getroot().xpath("//div[@class='boxContent offre' or @class='boxContent offre even']")
+        for div in divs:
+            a = self.parser.select(div, 'div/h3/a', 1, method='xpath')
+            parsed = _parse_advert_url(a.attrib['href'])
+            if parsed is None:
+                # The link is not an advert URL: skip this row only.
+                continue
+            advert = ApecJobAdvert(parsed[0])
+            advert.title = parsed[1]
+            # The <h4> text is split on '-': the first part is used as the
+            # company name and the last one as the place.
+            parts = self.parser.select(div, 'h4', 1).text.split('-')
+            advert.society_name = u'%s' % parts[0].strip()
+            advert.place = u'%s' % parts[-1].strip()
+            date = self.parser.select(div, 'div/div/div', 1, method='xpath')
+            # The first 8 characters of the cell are dropped before parsing.
+            # NOTE(review): presumably a fixed label; also confirm whether the
+            # site's dates need dateutil's dayfirst=True.
+            advert.publication_date = dateutil.parser.parse(date.text_content().strip()[8:]).date()
+            adverts.append(advert)
+        return adverts
+
+
+class AdvertPage(BasePage):
+    """Page describing a single advert."""
+
+    def get_job_advert(self, url, advert):
+        """Fill *advert* with the details found on the page and return it.
+
+        When *advert* is None, a new ApecJobAdvert is created with the id and
+        title encoded in *url*; ValueError is raised if *url* holds none.
+        """
+        if advert is None:
+            parsed = _parse_advert_url(url)
+            if parsed is None:
+                raise ValueError('No advert id found in URL %r' % url)
+            advert = ApecJobAdvert(parsed[0])
+            advert.title = parsed[1]
+
+        root = self.document.getroot()
+        advert.description = root.xpath("//div[@class='contentWithDashedBorderTop marginTop boxContent']/div")[0].text_content()
+
+        # The cells of the summary table are read by position.
+        td = root.xpath("//table[@class='noFieldsTable']/tr/td")
+        advert.job_name = advert.title
+        advert.publication_date = dateutil.parser.parse(td[2].text_content()).date()
+        # The company cell also contains a link, whose text is removed from
+        # the company name.
+        society_name = td[3].text_content()
+        a = self.parser.select(td[3], 'a', 1, method='xpath').text_content()
+        advert.society_name = u'%s' % society_name.replace(a, '').strip()
+        advert.contract_type = u'%s' % td[4].text_content().strip()
+        advert.place = u'%s' % td[5].text_content().strip()
+        advert.pay = u'%s' % td[7].text_content().strip()
+        advert.experience = u'%s' % td[8].text_content().strip()
+        advert.url = url
+        return advert
diff --git a/modules/apec/test.py b/modules/apec/test.py
new file mode 100644
index 00000000..10cd9606
--- /dev/null
+++ b/modules/apec/test.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013 Bezleputh
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.tools.test import BackendTest
+
+
+class ApecTest(BackendTest):
+    """Live test of the apec backend: search, then fetch the first advert."""
+
+    BACKEND = 'apec'
+
+    def test_apec(self):
+        adverts = list(self.backend.search_job(u'maitre brasseur'))
+        # assertTrue, unlike a bare assert, is not stripped under python -O.
+        self.assertTrue(len(adverts) > 0)
+        advert = self.backend.get_job_advert(adverts[0].id, None)
+        self.assertTrue(advert.url, 'URL for announce "%s" not found: %s' % (advert.id, advert.url))