diff --git a/modules/indeed/__init__.py b/modules/indeed/__init__.py new file mode 100644 index 00000000..038f349e --- /dev/null +++ b/modules/indeed/__init__.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2013 Bezleputh +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . + + +from .backend import IndeedBackend + + +__all__ = ['IndeedBackend'] diff --git a/modules/indeed/backend.py b/modules/indeed/backend.py new file mode 100644 index 00000000..87c68839 --- /dev/null +++ b/modules/indeed/backend.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2013 Bezleputh +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . 
class IndeedBackend(BaseBackend, ICapJob):
    """Weboob job backend for the Indeed website.

    Offers a free-text search, a search driven by the backend configuration
    and retrieval of a single advert, all delegated to ``IndeedBrowser``.
    """

    NAME = 'indeed'
    DESCRIPTION = u'indeed website'
    MAINTAINER = u'Bezleputh'
    EMAIL = 'carton_ben@yahoo.fr'
    LICENSE = 'AGPLv3+'
    VERSION = '0.h'

    BROWSER = IndeedBrowser

    # Choices for the ``contrat`` option, ordered by key. The key is what
    # advanced_search_job() hands to the browser as ``contrat``.
    type_contrat_choices = OrderedDict(sorted({
        'all': u'Tous les emplois',
        'fulltime': u'Temps plein',
        'parttime': u'Temps partiel',
        'contract': u'Durée indéterminée',
        'internship': u'Stage / Apprentissage',
        'temporary': u'Durée déterminée',
    }.items()))

    # Choices for the ``limit_date`` option, ordered by key. The key is what
    # advanced_search_job() hands to the browser as ``limit_date``.
    limit_date_choices = OrderedDict(sorted({
        'any': u'à tout moment',
        '15': u'depuis 15 jours',
        '7': u'depuis 7 jours',
        '3': u'depuis 3 jours',
        '1': u'depuis hier',
        'last': u'depuis ma dernière visite',
    }.items()))

    CONFIG = BackendConfig(Value('metier', label=u'Job name', masked=False, default=''),
                           Value('limit_date', label=u'Date limite', choices=limit_date_choices, default=''),
                           Value('contrat', label=u'Contract', choices=type_contrat_choices, default=''))

    def search_job(self, pattern=None):
        """Return the adverts the browser finds for the free-text ``pattern``."""
        with self.browser:
            return self.browser.search_job(pattern=pattern)

    def advanced_search_job(self):
        """Return the adverts matching the configured search options.

        The ``metier``, ``limit_date`` and ``contrat`` configuration values
        are read and forwarded to the browser.
        """
        # Enter the browser context exactly like search_job() and
        # get_job_advert() do, so all browser access follows one pattern.
        with self.browser:
            return self.browser.advanced_search_job(
                metier=self.config['metier'].get(),
                limit_date=self.config['limit_date'].get(),
                contrat=self.config['contrat'].get())

    def get_job_advert(self, _id, advert=None):
        """Return the advert identified by ``_id``.

        :param _id: advert identifier forwarded to the browser
        :param advert: optional existing advert object to complete
        """
        with self.browser:
            return self.browser.get_job_advert(_id, advert)

    def fill_obj(self, advert, fields):
        """Complete ``advert`` in place; ``fields`` is accepted but not used."""
        self.get_job_advert(advert.id, advert)

    # Maps an object type to the function able to fill it.
    OBJECTS = {IndeedJobAdvert: fill_obj}
# modules/indeed/browser.py

def _quote_param(value):
    """Return ``value`` escaped for use inside a URL query string.

    ``None`` becomes an empty string, spaces become ``+`` and every other
    reserved or non-ASCII character is percent-encoded.
    """
    # Function-scope import so the helper does not rely on the module header.
    import urllib

    if value is None:
        return ''
    if isinstance(value, unicode):
        # quote_plus() works on byte strings in Python 2.
        value = value.encode('utf-8')
    return urllib.quote_plus(str(value))


class IndeedBrowser(BaseBrowser):
    """Browser for www.indeed.fr: job searches and single advert pages."""

    PROTOCOL = 'http'
    DOMAIN = 'www.indeed.fr'
    ENCODING = None
    # URL pattern -> page class used to handle the matching location.
    PAGES = {
        '%s://%s/Emplois-(.*?)' % (PROTOCOL, DOMAIN): SearchPage,
        '%s://%s/emplois(.*?)' % (PROTOCOL, DOMAIN): SearchPage,
        '%s://%s/cmp/(.*?)' % (PROTOCOL, DOMAIN): AdvertPage,
    }

    def search_job(self, pattern=None, metier=None, place=None, contrat=None):
        """Run a free-text search and return the adverts of the result page.

        Only ``pattern`` is used; ``metier``, ``place`` and ``contrat`` are
        accepted but ignored. A ``None`` pattern is sent as an empty query.
        """
        self.location('http://www.indeed.fr/emplois?as_and=%s&limit=50&sort=date&st=employer&sr=directhire'
                      % _quote_param(pattern))
        assert self.is_on_page(SearchPage)
        return self.page.iter_job_adverts()

    def advanced_search_job(self, metier=None, contrat=None, limit_date=None):
        """Search by job title, contract type and maximum advert age.

        :param metier: job title, sent as the ``as_ttl`` parameter
        :param contrat: contract type, sent as the ``jt`` parameter
        :param limit_date: age limit, sent as the ``fromage`` parameter

        Parameters left to ``None`` are sent as empty values.
        """
        self.location('http://www.indeed.fr/emplois?as_ttl=%s&limit=50&sort=date&st=employer&sr=directhire&jt=%s&fromage=%s'
                      % (_quote_param(metier), _quote_param(contrat), _quote_param(limit_date)))
        assert self.is_on_page(SearchPage)
        return self.page.iter_job_adverts()

    @id2url(IndeedJobAdvert.id2url)
    def get_job_advert(self, url, advert):
        """Open the advert page at ``url`` and return the parsed advert.

        ``advert`` is handed to the page, which completes it or creates a
        new object when it is empty.
        """
        self.location(url)
        assert self.is_on_page(AdvertPage)
        return self.page.get_job_advert(url, advert)


# modules/indeed/job.py

class IndeedJobAdvert(BaseJobAdvert):
    """Job advert whose id has the form ``society|title|number``."""

    @classmethod
    def id2url(cls, _id):
        """Build the advert page URL from an advert id.

        Spaces and slashes in the id are replaced by dashes, then the id is
        split on ``|`` and its first three parts are placed in the URL.

        :raises IndexError: if the id has fewer than three ``|``-separated parts
        """
        for char in (' ', '/'):
            _id = _id.replace(char, '-')

        parts = _id.split('|')
        return 'http://www.indeed.fr/cmp/%s/jobs/%s-%s' % (parts[0], parts[1], parts[2])
# First run of digits in a relative date label.
_NUMBER_RE = re.compile(r'\d+')


def _parse_relative_date(text, now=None):
    """Turn a relative date label into an absolute datetime.

    The first number found in ``text`` is read as a count of hours when the
    label contains ``heure`` and as a count of days when it contains
    ``jour``; the result is ``now`` minus that amount.

    :param text: label taken from the page
    :param now: reference time, defaults to ``datetime.datetime.now()``
    :returns: a ``datetime`` or ``None`` when no number or unit is found
    """
    match = _NUMBER_RE.search(text)
    if match is None:
        return None
    if now is None:
        now = datetime.datetime.now()

    amount = int(match.group(0))
    # 'heure' and 'jour' match both the singular and the plural spelling.
    if 'heure' in text:
        return now - datetime.timedelta(hours=amount)
    if 'jour' in text:
        return now - datetime.timedelta(days=amount)
    return None


class SearchPage(BasePage):
    """Search result page listing job postings."""

    def iter_job_adverts(self):
        """Yield an advert for every posting row that can be converted."""
        rows = self.document.getroot().xpath('//div[@itemtype="http://schema.org/JobPosting"]')
        for row in rows:
            advert = self.create_job_advert(row)
            if advert:
                yield advert

    def create_job_advert(self, row):
        """Build an ``IndeedJobAdvert`` from a result row.

        :returns: the advert, or ``None`` when the id, title, company or
                  origin cell is missing, or when the origin cell does not
                  mention ``Indeed``
        """
        select = self.parser.select
        advert_from = select(row, 'table/tr/td/div[@class="iaP"]', method='xpath')
        # The advert number is the element id without its first two characters.
        num_id = row.attrib['id'][2:]
        title = select(row, 'h2/a', 1, method='xpath').attrib['title']
        society_name = select(row, 'span[@class="company"]', 1, method='xpath').text_content().strip()

        if not (num_id and title and society_name and advert_from):
            return None
        if 'Indeed' not in advert_from[0].text_content().strip():
            return None

        advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id)
        advert.title = u'%s' % title
        advert.society_name = u'%s' % society_name
        advert.place = u'%s' % select(row, 'span/span[@class="location"]', 1, method='xpath').text_content().strip()

        date_label = select(row, 'table/tr/td/span[@class="date"]', 1, method='xpath').text_content().strip()
        publication_date = _parse_relative_date(date_label)
        if publication_date is not None:
            advert.publication_date = publication_date
        return advert


class AdvertPage(BasePage):
    """Page showing a single job advert."""

    def get_job_advert(self, url, advert):
        """Fill ``advert`` from the page, creating it first when it is empty.

        :param url: location of the page; its part after the last ``-`` is
                    used as the advert number when a new advert is created
        :param advert: advert to complete, or a false value to create one
        :returns: the completed advert
        """
        select = self.parser.select
        root = self.document.getroot()
        job_header = root.xpath('//div[@id="job_header"]')[0]
        job_title = select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content()

        if not advert:
            society_name = select(job_header, 'span[@class="company"]', 1, method='xpath').text_content().strip()
            title = job_title.strip()
            num_id = url.split('-')[-1]
            # Same id layout as SearchPage.create_job_advert().
            advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id)
            # A freshly created advert has no title/company yet; set them so
            # it carries the same fields as one coming from a search.
            advert.title = u'%s' % title
            advert.society_name = u'%s' % society_name

        advert.place = u'%s' % select(job_header, 'span[@class="location"]', 1, method='xpath').text_content()
        description_content = root.xpath('//span[@class="summary"]')[0]
        advert.description = u'%s' % self.strip_tags(self.parser.tostring(description_content))
        advert.job_name = u'%s' % job_title
        advert.url = url

        date_label = root.xpath('//span[@class="date"]')[0].text_content().strip()
        publication_date = _parse_relative_date(date_label)
        if publication_date is not None:
            advert.publication_date = publication_date

        return advert

    def strip_tags(self, html):
        """Return the text content of ``html`` with the markup removed."""
        stripper = MLStripper()
        stripper.feed(html)
        return stripper.get_data()


class MLStripper(HTMLParser):
    """HTML parser that keeps only the text nodes it is fed."""

    # NOTE(review): only handle_data() is overridden, so character and entity
    # references go to the base class handlers - confirm they are not lost.

    def __init__(self):
        # Run the base initialiser (which resets the parser) instead of
        # calling reset() directly, so the parser state is fully set up.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Collect a chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined together."""
        return ''.join(self.fed)
class IndeedTest(BackendTest):
    """Live checks of the indeed backend: searches and advert retrieval."""

    BACKEND = 'indeed'

    def _check_first_advert(self, adverts, reuse):
        """Fetch the first advert of ``adverts`` and check it has a URL.

        When ``reuse`` is true the listed advert object is passed along to
        be completed; otherwise only its id is given.
        """
        assert len(adverts)
        first = adverts[0]
        existing = first if reuse else None
        advert = self.backend.get_job_advert(first.id, existing)
        self.assertTrue(advert.url, 'URL for announce "%s" not found: %s' % (advert.id, advert.url))

    def test_indeed_search(self):
        found = list(self.backend.search_job('informaticien'))
        self._check_first_advert(found, True)

    def test_indeed_advanced_search(self):
        found = list(self.backend.advanced_search_job())
        self._check_first_advert(found, True)

    def test_indeep_info_from_id(self):
        found = list(self.backend.advanced_search_job())
        self._check_first_advert(found, False)