add indeed module

This commit is contained in:
Bezleputh 2013-09-03 13:02:29 +02:00 committed by Florent
commit 0d59cfcc8c
7 changed files with 345 additions and 0 deletions

View file

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from .backend import IndeedBackend
__all__ = ['IndeedBackend']

79
modules/indeed/backend.py Normal file
View file

@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.tools.ordereddict import OrderedDict
from weboob.capabilities.job import ICapJob
from weboob.tools.value import Value
from .browser import IndeedBrowser
from .job import IndeedJobAdvert
__all__ = ['IndeedBackend']
class IndeedBackend(BaseBackend, ICapJob):
    """Weboob job backend for the indeed website.

    Free-text searches use the pattern supplied by the caller; the
    advanced search takes all of its criteria from the backend
    configuration (``metier``, ``limit_date`` and ``contrat``).
    """

    NAME = 'indeed'
    DESCRIPTION = u'indeed website'
    MAINTAINER = u'Bezleputh'
    EMAIL = 'carton_ben@yahoo.fr'
    LICENSE = 'AGPLv3+'
    VERSION = '0.h'

    BROWSER = IndeedBrowser

    # Contract types selectable in the configuration, sorted by key.
    # The key (not the label) is what gets handed to the browser.
    type_contrat_choices = OrderedDict([(k, u'%s' % (v)) for k, v in sorted({
        'all': u'Tous les emplois',
        'fulltime': u'Temps plein',
        'parttime': u'Temps partiel',
        'contract': u'Durée indéterminée',
        'internship': u'Stage / Apprentissage',
        'temporary': u'Durée déterminée',
    }.items())])

    # Maximum advert age selectable in the configuration, sorted by key.
    limit_date_choices = OrderedDict([(k, u'%s' % (v)) for k, v in sorted({
        'any': u'à tout moment',
        '15': u'depuis 15 jours',
        '7': u'depuis 7 jours',
        '3': u'depuis 3 jours',
        '1': u'depuis hier',
        'last': u'depuis ma dernière visite',
    }.items())])

    CONFIG = BackendConfig(Value('metier', label=u'Job name', masked=False, default=''),
                           Value('limit_date', label=u'Date limite', choices=limit_date_choices, default=''),
                           Value('contrat', label=u'Contract', choices=type_contrat_choices, default=''))

    def search_job(self, pattern=None):
        """Return the adverts the browser finds for the free-text *pattern*."""
        with self.browser:
            return self.browser.search_job(pattern=pattern)

    def advanced_search_job(self):
        """Return the adverts matching the criteria stored in the configuration."""
        # Use the same ``with self.browser`` guard as the other browser
        # calls of this backend; it was missing here.
        with self.browser:
            return self.browser.advanced_search_job(metier=self.config['metier'].get(),
                                                    limit_date=self.config['limit_date'].get(),
                                                    contrat=self.config['contrat'].get(),)

    def get_job_advert(self, _id, advert=None):
        """Return the advert identified by *_id*.

        :param _id: advert identifier handed to the browser
        :param advert: optional existing advert for the browser to complete
        """
        with self.browser:
            return self.browser.get_job_advert(_id, advert)

    def fill_obj(self, advert, fields):
        """Complete *advert* in place by loading its page.

        *fields* is accepted for the OBJECTS callback signature but is
        not used: the whole advert is always reloaded.
        """
        self.get_job_advert(advert.id, advert)

    # Maps object classes to the method able to complete them.
    OBJECTS = {IndeedJobAdvert: fill_obj}

57
modules/indeed/browser.py Normal file
View file

@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url
from .pages import SearchPage, AdvertPage
from .job import IndeedJobAdvert
__all__ = ['IndeedBrowser']
class IndeedBrowser(BaseBrowser):
    """Browser for www.indeed.fr search result and advert pages."""

    PROTOCOL = 'http'
    DOMAIN = 'www.indeed.fr'
    ENCODING = None

    # URL patterns mapped to the page class used to handle them.
    PAGES = {
        '%s://%s/Emplois-(.*?)' % (PROTOCOL, DOMAIN): SearchPage,
        '%s://%s/emplois(.*?)' % (PROTOCOL, DOMAIN): SearchPage,
        '%s://%s/cmp/(.*?)' % (PROTOCOL, DOMAIN): AdvertPage,
    }

    @staticmethod
    def _quote(value):
        """Return *value* encoded for use as a query-string parameter.

        ``None`` becomes an empty string so that the ``None`` defaults of
        the search methods do not crash.  Text is encoded to UTF-8 first,
        then percent-encoded with spaces turned into ``+``.
        """
        # Imported here so the block works on both Python 2 and Python 3
        # without touching the module-level imports.
        try:
            from urllib import quote_plus  # Python 2
        except ImportError:
            from urllib.parse import quote_plus  # Python 3
        if value is None:
            return ''
        if not isinstance(value, bytes):
            value = value.encode('utf-8')
        return quote_plus(value)

    def search_job(self, pattern=None, metier=None, place=None, contrat=None):
        """Run a free-text search and return an iterator over the adverts found.

        Only *pattern* is used; *metier*, *place* and *contrat* are
        accepted but ignored.
        """
        self.location('http://www.indeed.fr/emplois?as_and=%s&limit=50&sort=date&st=employer&sr=directhire'
                      % self._quote(pattern))
        assert self.is_on_page(SearchPage)
        return self.page.iter_job_adverts()

    def advanced_search_job(self, metier=None, contrat=None, limit_date=None):
        """Run a search restricted by job title, contract type and advert age.

        :param metier: job title, sent as ``as_ttl``
        :param contrat: contract type key, sent as ``jt``
        :param limit_date: maximum advert age, sent as ``fromage``
        :returns: an iterator over the adverts found
        """
        self.location('http://www.indeed.fr/emplois?as_ttl=%s&limit=50&sort=date&st=employer&sr=directhire&jt=%s&fromage=%s'
                      % (self._quote(metier), self._quote(contrat), self._quote(limit_date)))
        assert self.is_on_page(SearchPage)
        return self.page.iter_job_adverts()

    # NOTE(review): id2url presumably lets callers pass either an advert id
    # or a full URL, converting ids with IndeedJobAdvert.id2url -- confirm
    # against weboob.tools.browser.decorators.
    @id2url(IndeedJobAdvert.id2url)
    def get_job_advert(self, url, advert):
        """Load the advert page at *url* and return the advert it describes.

        *advert* is handed to the page unchanged; it may be an existing
        advert to complete or a falsy value.
        """
        self.location(url)
        assert self.is_on_page(AdvertPage)
        return self.page.get_job_advert(url, advert)

BIN
modules/indeed/favicon.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.6 KiB

34
modules/indeed/job.py Normal file
View file

@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.job import BaseJobAdvert
class IndeedJobAdvert(BaseJobAdvert):
    """Job advert identified by a ``society|title|number`` id."""

    @classmethod
    def id2url(cls, _id):
        """Build the advert page URL from a ``society|title|number`` id.

        Spaces and slashes are replaced by dashes before the id is split
        on ``|``.  Only the first three parts are used; an id with fewer
        parts raises ``IndexError``.
        """
        for char in (" ", "/"):
            _id = _id.replace(char, "-")

        parts = _id.split('|')
        society, title, number = parts[0], parts[1], parts[2]
        return 'http://www.indeed.fr/cmp/%s/jobs/%s-%s' % (society, title, number)

108
modules/indeed/pages.py Normal file
View file

@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
from HTMLParser import HTMLParser
import re
from weboob.tools.browser import BasePage
from .job import IndeedJobAdvert
__all__ = ['SearchPage', 'AdvertPage']
class SearchPage(BasePage):
    """Search result page listing job adverts."""

    def iter_job_adverts(self):
        """Yield an advert for every result row that can be converted to one."""
        rows = self.document.getroot().xpath('//div[@itemtype="http://schema.org/JobPosting"]')
        for row in rows:
            advert = self.create_job_advert(row)
            if advert:
                yield advert

    def create_job_advert(self, row):
        """Build an advert from one result *row*.

        :returns: the advert, or ``None`` when the row lacks an id, a
                  title or a company name, or when its ``iaP`` cell does
                  not mention 'Indeed'
        """
        advert_from = self.parser.select(row, 'table/tr/td/div[@class="iaP"]', method='xpath')
        # The first two characters of the row id are dropped
        # (presumably a fixed prefix -- TODO confirm against the site markup).
        num_id = row.attrib['id'][2:]
        title = self.parser.select(row, 'h2/a', 1, method='xpath').attrib['title']
        society_name = self.parser.select(row, 'span[@class="company"]', 1, method='xpath').text_content().strip()

        if not (num_id and title and society_name and advert_from):
            return None
        if 'Indeed' not in advert_from[0].text_content().strip():
            return None

        advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id)
        advert.title = u'%s' % title
        advert.society_name = u'%s' % society_name
        advert.place = u'%s' % self.parser.select(row, 'span/span[@class="location"]', 1, method='xpath').text_content().strip()

        date = self.parser.select(row, 'table/tr/td/span[@class="date"]', 1, method='xpath').text_content().strip()
        publication_date = self._parse_relative_date(date)
        if publication_date is not None:
            advert.publication_date = publication_date
        return advert

    @staticmethod
    def _parse_relative_date(text):
        """Turn an age text containing a number and a unit into a datetime.

        The first number found in *text* is subtracted from the current
        time as hours when the text contains 'heure' (singular or
        plural) or as days when it contains 'jour'.

        :returns: the computed datetime, or ``None`` when *text* holds no
                  number or no known unit
        """
        number = re.search(r"\d+", text)
        if not number:
            return None
        amount = int(number.group(0))
        now = datetime.datetime.now()
        # 'heure' also matches 'heures'; testing only the plural form
        # skipped adverts that are a single hour old.
        if 'heure' in text:
            return now - datetime.timedelta(hours=amount)
        if 'jour' in text:
            return now - datetime.timedelta(days=amount)
        return None
class AdvertPage(BasePage):
    """Page showing a single job advert."""

    def get_job_advert(self, url, advert):
        """Fill an advert with the details shown on this page.

        :param url: address of the page, stored on the advert
        :param advert: advert to complete; when falsy a new one is built
                       from the page header and the number ending *url*
        :returns: the completed advert
        """
        job_header = self.document.getroot().xpath('//div[@id="job_header"]')[0]
        if not advert:
            title = self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content()
            society_name = self.parser.select(job_header, 'span[@class="company"]', 1, method='xpath').text_content()
            num_id = url.split('-')[-1]
            advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id)
            # Store the same fields the search page sets on its adverts;
            # they were computed here but only used to build the id.
            advert.title = u'%s' % title
            advert.society_name = u'%s' % society_name

        advert.place = u'%s' % self.parser.select(job_header, 'span[@class="location"]', 1, method='xpath').text_content()
        description_content = self.document.getroot().xpath('//span[@class="summary"]')[0]
        advert.description = u'%s' % self.strip_tags(self.parser.tostring(description_content))
        advert.job_name = u'%s' % self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content()
        advert.url = url

        date = self.document.getroot().xpath('//span[@class="date"]')[0].text_content().strip()
        publication_date = self._parse_relative_date(date)
        if publication_date is not None:
            advert.publication_date = publication_date
        return advert

    @staticmethod
    def _parse_relative_date(text):
        """Turn an age text containing a number and a unit into a datetime.

        The first number found in *text* is subtracted from the current
        time as hours when the text contains 'heure' (singular or
        plural) or as days when it contains 'jour'.

        :returns: the computed datetime, or ``None`` when *text* holds no
                  number or no known unit
        """
        number = re.search(r"\d+", text)
        if not number:
            return None
        amount = int(number.group(0))
        now = datetime.datetime.now()
        # 'heure' also matches 'heures'; testing only the plural form
        # skipped adverts that are a single hour old.
        if 'heure' in text:
            return now - datetime.timedelta(hours=amount)
        if 'jour' in text:
            return now - datetime.timedelta(days=amount)
        return None

    def strip_tags(self, html):
        """Return the text content of *html* with every tag removed."""
        s = MLStripper()
        s.feed(html)
        return s.get_data()
class MLStripper(HTMLParser):
    """HTML parser that collects only the text found between tags.

    Feed it markup with ``feed()`` and read the concatenated text back
    with ``get_data()``.
    """

    def __init__(self):
        # Run the base initialiser rather than only reset(): newer
        # versions of HTMLParser create attributes in __init__ that
        # feed() relies on.  The explicit call (instead of super()) also
        # works where HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record one text fragment reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all recorded text fragments joined together."""
        return ''.join(self.fed)

43
modules/indeed/test.py Normal file
View file

@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.test import BackendTest
class IndeedTest(BackendTest):
    """Functional tests for the indeed backend."""

    BACKEND = 'indeed'

    def _assert_first_advert_has_url(self, adverts, reuse_advert):
        """Fetch the first advert of *adverts* and check that it has a URL.

        :param adverts: list of adverts returned by a search
        :param reuse_advert: when true, the advert from the search is
                             passed along to be completed; otherwise the
                             advert is rebuilt from its id alone
        """
        # unittest assertions are used instead of bare ``assert`` so the
        # check survives ``python -O`` and reports a message on failure.
        self.assertTrue(len(adverts) > 0, 'search returned no advert')
        first = adverts[0]
        advert = self.backend.get_job_advert(first.id, first if reuse_advert else None)
        self.assertTrue(advert.url, 'URL for announce "%s" not found: %s' % (advert.id, advert.url))

    def test_indeed_search(self):
        """A free-text search returns adverts that can be completed."""
        adverts = list(self.backend.search_job('informaticien'))
        self._assert_first_advert_has_url(adverts, reuse_advert=True)

    def test_indeed_advanced_search(self):
        """The configured advanced search returns adverts that can be completed."""
        adverts = list(self.backend.advanced_search_job())
        self._assert_first_advert_has_url(adverts, reuse_advert=True)

    def test_indeep_info_from_id(self):
        """An advert can be rebuilt from its id alone."""
        adverts = list(self.backend.advanced_search_job())
        self._assert_first_advert_has_url(adverts, reuse_advert=False)