From 0d59cfcc8c0639a7aeadd3b254ebea744d4a538e Mon Sep 17 00:00:00 2001 From: Bezleputh Date: Tue, 3 Sep 2013 13:02:29 +0200 Subject: [PATCH] add indeed module --- modules/indeed/__init__.py | 24 +++++++++ modules/indeed/backend.py | 79 +++++++++++++++++++++++++++ modules/indeed/browser.py | 57 ++++++++++++++++++++ modules/indeed/favicon.png | Bin 0 -> 3716 bytes modules/indeed/job.py | 34 ++++++++++++ modules/indeed/pages.py | 108 +++++++++++++++++++++++++++++++++++++ modules/indeed/test.py | 43 +++++++++++++++ 7 files changed, 345 insertions(+) create mode 100644 modules/indeed/__init__.py create mode 100644 modules/indeed/backend.py create mode 100644 modules/indeed/browser.py create mode 100644 modules/indeed/favicon.png create mode 100644 modules/indeed/job.py create mode 100644 modules/indeed/pages.py create mode 100644 modules/indeed/test.py diff --git a/modules/indeed/__init__.py b/modules/indeed/__init__.py new file mode 100644 index 00000000..038f349e --- /dev/null +++ b/modules/indeed/__init__.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2013 Bezleputh +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . 
# -*- coding: utf-8 -*-

from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.tools.ordereddict import OrderedDict
from weboob.capabilities.job import ICapJob
from weboob.tools.value import Value
from .browser import IndeedBrowser
from .job import IndeedJobAdvert

__all__ = ['IndeedBackend']


class IndeedBackend(BaseBackend, ICapJob):
    """Job backend for the indeed website.

    Every ICapJob operation is delegated to the IndeedBrowser instance
    available as ``self.browser``.
    """
    NAME = 'indeed'
    DESCRIPTION = u'indeed website'
    MAINTAINER = u'Bezleputh'
    EMAIL = 'carton_ben@yahoo.fr'
    LICENSE = 'AGPLv3+'
    VERSION = '0.h'

    BROWSER = IndeedBrowser

    # Contract-type filter, ordered by key. The key is what the browser's
    # advanced search puts in the ``jt`` query parameter; the value is the
    # label shown for the choice.
    type_contrat_choices = OrderedDict(sorted({
        'all': u'Tous les emplois',
        'fulltime': u'Temps plein',
        'parttime': u'Temps partiel',
        'contract': u'Durée indéterminée',
        'internship': u'Stage / Apprentissage',
        'temporary': u'Durée déterminée',
    }.items()))

    # Advert-age filter, ordered by key. The key is what the browser's
    # advanced search puts in the ``fromage`` query parameter.
    limit_date_choices = OrderedDict(sorted({
        'any': u'à tout moment',
        '15': u'depuis 15 jours',
        '7': u'depuis 7 jours',
        '3': u'depuis 3 jours',
        '1': u'depuis hier',
        'last': u'depuis ma dernière visite',
    }.items()))

    CONFIG = BackendConfig(Value('metier', label=u'Job name', masked=False, default=''),
                           Value('limit_date', label=u'Date limite', choices=limit_date_choices, default=''),
                           Value('contrat', label=u'Contract', choices=type_contrat_choices, default=''))

    def search_job(self, pattern=None):
        """Return the adverts found by the browser's simple search for ``pattern``."""
        with self.browser:
            return self.browser.search_job(pattern=pattern)

    def advanced_search_job(self):
        """Return the adverts matching the configured metier / limit_date / contrat."""
        # Same ``with self.browser`` guard as the other two browser calls of
        # this class; the original omitted it here only.
        # NOTE(review): presumably this serializes access to the shared
        # browser -- confirm against BaseBrowser.
        with self.browser:
            return self.browser.advanced_search_job(metier=self.config['metier'].get(),
                                                    limit_date=self.config['limit_date'].get(),
                                                    contrat=self.config['contrat'].get())

    def get_job_advert(self, _id, advert=None):
        """Load the advert page for ``_id`` and return the advert.

        When ``advert`` is given, the browser fills that object; otherwise the
        advert page builds a new IndeedJobAdvert.
        """
        with self.browser:
            return self.browser.get_job_advert(_id, advert)

    def fill_obj(self, advert, fields):
        """Complete ``advert`` in place by reloading its page.

        ``fields`` is accepted for the OBJECTS callback signature but is not
        used: the whole advert page is always read.
        """
        self.get_job_advert(advert.id, advert)

    # Maps an object class to the method that completes objects of that class.
    OBJECTS = {IndeedJobAdvert: fill_obj}
from weboob.tools.browser import BaseBrowser

from weboob.tools.browser.decorators import id2url
from .pages import SearchPage, AdvertPage
from .job import IndeedJobAdvert


__all__ = ['IndeedBrowser']


class IndeedBrowser(BaseBrowser):
    """Browser for www.indeed.fr: runs job searches and opens advert pages."""
    PROTOCOL = 'http'
    DOMAIN = 'www.indeed.fr'
    ENCODING = None
    # URL pattern -> page class used for URLs matching that pattern.
    PAGES = {
        '%s://%s/Emplois-(.*?)' % (PROTOCOL, DOMAIN): SearchPage,
        '%s://%s/emplois(.*?)' % (PROTOCOL, DOMAIN): SearchPage,
        '%s://%s/cmp/(.*?)' % (PROTOCOL, DOMAIN): AdvertPage,
    }

    def search_job(self, pattern=None, metier=None, place=None, contrat=None):
        """Open the search results for ``pattern`` and return an iterator of adverts.

        ``metier``, ``place`` and ``contrat`` are accepted but not used by this
        search. A ``None`` pattern is sent as an empty query instead of
        raising AttributeError on ``None.replace``.
        """
        keywords = (pattern or '').replace(' ', '+')
        self.location('http://www.indeed.fr/emplois?as_and=%s&limit=50&sort=date&st=employer&sr=directhire'
                      % keywords)
        assert self.is_on_page(SearchPage)
        return self.page.iter_job_adverts()

    def advanced_search_job(self, metier=None, contrat=None, limit_date=None):
        """Open the results filtered by job title, contract type and advert age.

        ``metier`` goes into ``as_ttl``, ``contrat`` into ``jt`` and
        ``limit_date`` into ``fromage``. ``None`` values are sent as empty
        parameters rather than crashing (``metier``) or being formatted as the
        literal text ``None`` (``contrat``, ``limit_date``).
        """
        job_title = (metier or '').replace(' ', '+')
        contract = '' if contrat is None else contrat
        max_age = '' if limit_date is None else limit_date
        self.location('http://www.indeed.fr/emplois?as_ttl=%s&limit=50&sort=date&st=employer&sr=directhire&jt=%s&fromage=%s'
                      % (job_title, contract, max_age))
        assert self.is_on_page(SearchPage)
        return self.page.iter_job_adverts()

    @id2url(IndeedJobAdvert.id2url)
    def get_job_advert(self, url, advert):
        """Open ``url`` and return the advert read from that page.

        ``advert`` may be an existing advert to fill, or None to have the page
        create one.
        """
        self.location(url)
        assert self.is_on_page(AdvertPage)
        return self.page.get_job_advert(url, advert)
zohz8r73`WJw(m4}W(P6TFC+p4dL7{B7fs;ABjWmKg!=PN)nf?iCFu7D9DkZ*q#|oS zFW~C4JBf3?JB@8G5^)6dQ)2NUF+R7XgHzuZLZDP{*X9j zae_9Vp%txjina_33A%305V--?TsDcthemG*GP&*=ebLaL-AA)2<~|-!-^lwfDnA; zj1Io`sUG@iwJ8d^imgT)_glik041cVXsfwOIhIhFQcj!#aRmZ|y#D(1n*xQmy;$>Yd|7Zt; zv~qIF_DYRrXu!Z35D`RCR4fQJ`cq?3eY<1UV{*kvS~yZFcbH{${oVMpwEW1%|77_r zj)1Fg>SO99gENj#@Z~eQ80N%ePw?Vv1{<~(XfF{1B0Tx3%{f!7UB^YK7^kD!B zQEUa$A}0WO?bTiM+2B#&xKbwF4shYBKJI=j$Dh7Ei9`2`aNU|AuD@-7UPmmSpjz|; z<_qq+at|aCEjWpPz$4>TE^;{b>W{gKiN$ z)J`K%jTlkUK`ie#{oCLVf6cJs;~gBlkKpb{GpxLAh;Bz(>`t}FQ!f$ct(-x(BZwrb z&wdgESNwW_zii42se#S-5I_64j?GiK zX8Rnnu(_YpCGJ?CL8L%s1aarzGIU}(<^6wyj9B+%)<`F9DP^-N_8icD7XfY3P!pu| zAS(j6<>3_F1UUcH1ctzt=S}Idj-pCT{p{%}gIF3oxphaOFlWO6ClRoD%CrboecW7< z*ERr6zqS>aB|F{g0Kb1I#raDU7*oj2+=+t8@oFZ|ME05la?>2OqCPPm6A^~Mya_1D zZ7cA4y(PrHm$bO60~)=j^@07@9{`VR%yG`i9hlOWU3_Td5n*_*GlSB%FDFdmgGWUr zc=FY^M?Xx76OV|S-E19I_q>PyBA19~4I1p&fF9F^tr?C!B*H-piiI>JhADC0(hdwm zWL)!N&mM64aj|8)7B7tPo`q3>%(EBpBa36~I}g@|+3^GV&#gx=BZ8putXB10-}f0z%}P~ zF-!?#Orz=be+^NUrlBY0w)!M%UaG4+zc zL{D(sJ0mQ8Z-P8m6Z0T62+&Ie_dlNF;f*<7nI`7HO>oMh7{|UNMn5IS>dA0$p~aQR zj^}q)F=Z=^?WNaBAoA$bIgU9jDnimNXT3S|B8{tS=%etOko-hXaO=7ZCma!BuRRI} zLLwlNVz+hpyW#E~=Bg$$G!|~c>lPtU@fgoP{~~9(`L!;S$j`{mCeqVOfFvp6_peS9 zTb?tx|DPFt_h5$2Qw9%wXA=9*i^fKaW*brRziW0k0on4NO(6VPOODac8h!v^UAIl0 z3wj;ktZSz6)UyTzkY+$;ibW9tI&o3KupNl8hc3!vDf%%1{J>Yx=HkJHx%3NfZS4oM6Z@UUa?^adhS6w9H&e1S7dTj(qjHO7R#z?lz+xaCNH zd2mBS3pj^hIPj(qUlW+D(S4Cny0gt2`>BV!rf7 zJlPyihDThi4;3kDFO4%lvCWFK%Zi{avf__Sk<~5*Lbl3RZ<%5Q?%|B3y@Q_cQ?uBL z6=q#oApT?3HjJWdD93K}SwDEwH=8Sq@)VNuJDhIPjV+Pd#DE?(7T0A>FV3bar#uvMH z!x;K3usoxn-Bw%71-_(6TH}D%8i-?(oxlh^A#x9}aDh;Zv;aJSLJ=7BWRU6D4g-Cl z-F8ctFr4I|Emk?+qyaB(5Ti;=)@V%ZP%H)Qm~jJ^YEh#r6#+A-cz74Uw769%6X)0l zTVkhw!m6 zskJwKphq3Q5C`5`-vIRRM2D#o4B3j_3OS4LL0~s5QyADPwQwBS11-O$a^}zqbF?!9 z^rOhj8q@ZAn`<+x6jge#WYx~N=uanUkRV53Qlp)#`rhZw)2q6I65!okVmu7>xV-M* z^Snrod~&$bAax;;itMV?94p9Cm8G5R0_82_WW>bFCQ(G90 z9*ooQd zn&(B7SRB+Q4&Z71j=>rO8*k5Pqxv1A#=uQmtR+05=hs}L0<^y1D?K>H_mX2sS|17B 
from weboob.capabilities.job import BaseJobAdvert


class IndeedJobAdvert(BaseJobAdvert):
    """Job advert whose id has the form ``company|title|numeric_id``."""

    @classmethod
    def id2url(cls, _id):
        """Build the advert page URL from an id of the form ``company|title|numeric_id``.

        Spaces and slashes become dashes. The company is the first
        ``|``-separated field and the numeric id the last one; anything in
        between belongs to the title and is joined with dashes. A title that
        itself contains ``|`` therefore no longer pushes the numeric id out of
        the URL.

        Raises IndexError when the id has fewer than three fields, as the
        original indexing did.
        """
        for separator in (" ", "/"):
            _id = _id.replace(separator, "-")

        fields = _id.split('|')
        if len(fields) < 3:
            raise IndexError('advert id must look like "company|title|id": %r' % (_id,))

        company = fields[0]
        num_id = fields[-1]
        title = '-'.join(fields[1:-1])
        return 'http://www.indeed.fr/cmp/%s/jobs/%s-%s' % (company, title, num_id)
import datetime
from HTMLParser import HTMLParser
import re
from weboob.tools.browser import BasePage
from .job import IndeedJobAdvert

__all__ = ['SearchPage', 'AdvertPage']


def _parse_relative_date(text):
    """Turn a relative age containing a number and 'heure(s)' or 'jour(s)' into a datetime.

    Returns ``now - N hours`` or ``now - N days``, or None when the text has no
    number or names neither unit. The singular 'heure' is matched as well as
    'heures', the same way 'jour' already covers 'jours'.
    """
    found = re.search(r"\d+", text)
    if found is None:
        return None
    amount = int(found.group(0))
    now = datetime.datetime.now()
    if 'heure' in text:
        return now - datetime.timedelta(hours=amount)
    if 'jour' in text:
        return now - datetime.timedelta(days=amount)
    return None


class SearchPage(BasePage):
    """Search results page: one advert per schema.org JobPosting div."""

    def iter_job_adverts(self):
        """Yield an IndeedJobAdvert for every result row that can be converted."""
        rows = self.document.getroot().xpath('//div[@itemtype="http://schema.org/JobPosting"]')
        for row in rows:
            advert = self.create_job_advert(row)
            if advert:
                yield advert

    def create_job_advert(self, row):
        """Build an advert from one result row, or return None if the row is skipped.

        A row is kept only when it has an id, a title, a company name and an
        ``iaP`` cell whose text contains 'Indeed'.
        """
        advert_from = self.parser.select(row, 'table/tr/td/div[@class="iaP"]', method='xpath')
        # The numeric id is the row's ``id`` attribute minus its first two characters.
        num_id = row.attrib['id'][2:]
        title = self.parser.select(row, 'h2/a', 1, method='xpath').attrib['title']
        society_name = self.parser.select(row, 'span[@class="company"]', 1, method='xpath').text_content().strip()

        if not (num_id and title and society_name and advert_from):
            return None
        if 'Indeed' not in advert_from[0].text_content().strip():
            return None

        advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id)
        advert.title = u'%s' % title
        advert.society_name = u'%s' % society_name
        advert.place = u'%s' % self.parser.select(row, 'span/span[@class="location"]', 1, method='xpath').text_content().strip()

        date_text = self.parser.select(row, 'table/tr/td/span[@class="date"]', 1, method='xpath').text_content().strip()
        publication_date = _parse_relative_date(date_text)
        # Leave the attribute untouched when the age cannot be interpreted.
        if publication_date is not None:
            advert.publication_date = publication_date
        return advert


class AdvertPage(BasePage):
    """Page of a single advert."""

    def get_job_advert(self, url, advert):
        """Fill ``advert`` from this page, creating it first when none is given.

        Sets place, description, job_name, url and, when the displayed age can
        be interpreted, publication_date. Returns the advert.
        """
        job_header = self.document.getroot().xpath('//div[@id="job_header"]')[0]
        if not advert:
            title = self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content()
            society_name = self.parser.select(job_header, 'span[@class="company"]', 1, method='xpath').text_content()
            # The numeric id is whatever follows the last dash of the URL.
            num_id = url.split('-')[-1]
            advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id)

        advert.place = u'%s' % self.parser.select(job_header, 'span[@class="location"]', 1, method='xpath').text_content()
        description_content = self.document.getroot().xpath('//span[@class="summary"]')[0]
        advert.description = u'%s' % self.strip_tags(self.parser.tostring(description_content))
        advert.job_name = u'%s' % self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content()
        advert.url = url

        date_text = self.document.getroot().xpath('//span[@class="date"]')[0].text_content().strip()
        publication_date = _parse_relative_date(date_text)
        if publication_date is not None:
            advert.publication_date = publication_date

        return advert

    def strip_tags(self, html):
        """Return the text data of ``html`` with the markup removed."""
        stripper = MLStripper()
        stripper.feed(html)
        return stripper.get_data()


class MLStripper(HTMLParser):
    """HTML parser that collects only the text data it is fed."""

    def __init__(self):
        # Run the base initializer instead of calling reset() directly, so any
        # state the base class sets up in __init__ is present.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Record one chunk of text data."""
        self.fed.append(d)

    def get_data(self):
        """Return all recorded text chunks joined together."""
        return ''.join(self.fed)
from weboob.tools.test import BackendTest


class IndeedTest(BackendTest):
    """Checks that searches return adverts and that the first one gets a URL."""
    BACKEND = 'indeed'

    def _check_first_result(self, results, reuse_advert):
        """Require a non-empty result and a URL on the first advert once loaded.

        ``reuse_advert`` selects whether the found advert object is handed to
        get_job_advert (True) or only its id (False).
        """
        found = list(results)
        assert len(found)
        first = found[0]
        advert = self.backend.get_job_advert(first.id, first if reuse_advert else None)
        self.assertTrue(advert.url, 'URL for announce "%s" not found: %s' % (advert.id, advert.url))

    def test_indeed_search(self):
        self._check_first_result(self.backend.search_job('informaticien'), True)

    def test_indeed_advanced_search(self):
        self._check_first_result(self.backend.advanced_search_job(), True)

    def test_indeep_info_from_id(self):
        self._check_first_result(self.backend.advanced_search_job(), False)