From b9260c7bc09a50cc53721f380a39718960fd8c97 Mon Sep 17 00:00:00 2001 From: Bezleputh Date: Mon, 14 Apr 2014 13:52:34 +0200 Subject: [PATCH] [indeed] adapt to browser2 --- modules/indeed/backend.py | 23 ++++---- modules/indeed/browser.py | 49 ++++++---------- modules/indeed/job.py | 34 ----------- modules/indeed/pages.py | 118 ++++++++++++++++++-------------------- 4 files changed, 86 insertions(+), 138 deletions(-) delete mode 100644 modules/indeed/job.py diff --git a/modules/indeed/backend.py b/modules/indeed/backend.py index 86211af6..025b0b29 100644 --- a/modules/indeed/backend.py +++ b/modules/indeed/backend.py @@ -20,10 +20,9 @@ from weboob.tools.backend import BaseBackend, BackendConfig from weboob.tools.ordereddict import OrderedDict -from weboob.capabilities.job import ICapJob +from weboob.capabilities.job import ICapJob, BaseJobAdvert from weboob.tools.value import Value from .browser import IndeedBrowser -from .job import IndeedJobAdvert __all__ = ['IndeedBackend'] @@ -73,21 +72,19 @@ class IndeedBackend(BaseBackend, ICapJob): Value('radius', label=u'Radius', choices=radius_choices, default='')) def search_job(self, pattern=None): - with self.browser: - return self.browser.search_job(pattern=pattern) + return self.browser.search_job(metier=pattern) def advanced_search_job(self): - return self.browser.advanced_search_job(metier=self.config['metier'].get(), - limit_date=self.config['limit_date'].get(), - contrat=self.config['contrat'].get(), - place=self.config['place'].get(), - radius=self.config['radius'].get()) + return self.browser.search_job(metier=self.config['metier'].get(), + limit_date=self.config['limit_date'].get(), + contrat=self.config['contrat'].get(), + place=self.config['place'].get(), + radius=self.config['radius'].get()) def get_job_advert(self, _id, advert=None): - with self.browser: - return self.browser.get_job_advert(_id, advert) + return self.browser.get_job_advert(_id, advert) def fill_obj(self, advert, fields): - 
self.get_job_advert(advert.id, advert) + return self.get_job_advert(advert.id, advert) - OBJECTS = {IndeedJobAdvert: fill_obj} + OBJECTS = {BaseJobAdvert: fill_obj} diff --git a/modules/indeed/browser.py b/modules/indeed/browser.py index 40aad6ab..09376e50 100644 --- a/modules/indeed/browser.py +++ b/modules/indeed/browser.py @@ -17,42 +17,31 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see <http://www.gnu.org/licenses/>. - -from weboob.tools.browser import BaseBrowser - -from weboob.tools.browser.decorators import id2url +from weboob.tools.browser2 import PagesBrowser, URL from .pages import SearchPage, AdvertPage -from .job import IndeedJobAdvert __all__ = ['IndeedBrowser'] -class IndeedBrowser(BaseBrowser): - PROTOCOL = 'http' - DOMAIN = 'www.indeed.fr' - ENCODING = 'UTF-8' - PAGES = { - '%s://%s/Emplois-(.*?)' % (PROTOCOL, DOMAIN): SearchPage, - '%s://%s/emplois(.*?)' % (PROTOCOL, DOMAIN): SearchPage, - '%s://%s/cmp/(.*?)' % (PROTOCOL, DOMAIN): AdvertPage, - '%s://%s/voir-emploi\?(.*?)' % (PROTOCOL, DOMAIN): AdvertPage, - } +class IndeedBrowser(PagesBrowser): - def search_job(self, pattern=None, metier=None, place=None, contrat=None): - self.location('http://www.indeed.fr/emplois?as_and=%s&limit=50&sort=date&st=employer&sr=directhire' - % pattern.replace(' ', '+')) - assert self.is_on_page(SearchPage) + BASEURL = 'http://www.indeed.fr' + + search_page = URL('/emplois(?P<parameters>.*)', SearchPage) + advert_page = URL('/cmp/(?P<company>.*)/jobs/(?P<title>.*)-(?P<nb>.*)', AdvertPage) + + def search_job(self, metier='', contrat='', limit_date='', radius='', place=''): + params = '?as_ttl=%s&limit=10&sort=date&st=employer&sr=directhire&jt=%s&fromage=%s&radius=%s'\ + % (metier.replace(' ', '+'), contrat, limit_date, radius) + if place: + params = '%s&l=%s' % (params, place) + self.search_page.go(parameters=params) + assert self.search_page.is_here(parameters=params) return self.page.iter_job_adverts() - def advanced_search_job(self, metier=None, contrat=None, 
limit_date=None, radius=None, place=None): - self.location( - 'http://www.indeed.fr/emplois?as_ttl=%s&limit=50&sort=date&st=employer&sr=directhire&jt=%s&fromage=%s&radius=%s&l=%s' % (metier.replace(' ', '+'), contrat, limit_date, radius, place)) - assert self.is_on_page(SearchPage) - return self.page.iter_job_adverts() - - @id2url(IndeedJobAdvert.id2url) - def get_job_advert(self, url, advert): - self.location(url) - assert self.is_on_page(AdvertPage) - return self.page.get_job_advert(url, advert) + def get_job_advert(self, _id, advert): + splitted_id = _id.split('#') + return self.advert_page.go(nb=splitted_id[0], + title=splitted_id[1], + company=splitted_id[2]).get_job_advert(obj=advert) diff --git a/modules/indeed/job.py b/modules/indeed/job.py deleted file mode 100644 index 4c377a20..00000000 --- a/modules/indeed/job.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright(C) 2013 Bezleputh -# -# This file is part of weboob. -# -# weboob is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# weboob is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with weboob. If not, see <http://www.gnu.org/licenses/>. 
- -from weboob.capabilities.job import BaseJobAdvert - - -class IndeedJobAdvert(BaseJobAdvert): - - @classmethod - def id2url(cls, _id): - dico_car_part = {" ": "-", - "/": "-", - } - for cle, valeur in dico_car_part.items(): - _id = _id.replace(cle, valeur) - - splitted_id = _id.split('|') - return 'http://www.indeed.fr/cmp/%s/jobs/%s-%s' % (splitted_id[0], splitted_id[1], splitted_id[2]) diff --git a/modules/indeed/pages.py b/modules/indeed/pages.py index 33bf98ee..88d13720 100644 --- a/modules/indeed/pages.py +++ b/modules/indeed/pages.py @@ -17,74 +17,70 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see <http://www.gnu.org/licenses/>. -import datetime +from datetime import timedelta, datetime import re -from weboob.tools.browser import BasePage -from weboob.tools.misc import html2text -from .job import IndeedJobAdvert +from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, pagination +from weboob.tools.browser2.filters import Filter, CleanText, Regexp, Format, Env, CleanHTML, Attr +from weboob.capabilities.job import BaseJobAdvert __all__ = ['SearchPage', 'AdvertPage'] -class SearchPage(BasePage): - def iter_job_adverts(self): - rows = self.document.getroot().xpath('//div[@itemtype="http://schema.org/JobPosting"]') - for row in rows: - advert = self.create_job_advert(row) - if advert: - yield advert - - def create_job_advert(self, row): - - advert_from = self.parser.select(row, 'table/tr/td/div[@class="iaP"]', method='xpath') - num_id = row.attrib['id'][2:] - title = self.parser.select(row, 'h2/a', 1, method='xpath').attrib['title'] - society_name = self.parser.select(row, 'span[@class="company"]', 1, method='xpath').text_content().strip() - if num_id and title and society_name and advert_from and len(advert_from) > 0: - - advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id) - advert.title = u'%s' % title - advert.society_name = u'%s' % society_name - 
advert.place = u'%s' % self.parser.select(row, 'span/span[@class="location"]', 1, method='xpath').text_content().strip() - - date = self.parser.select(row, 'table/tr/td/span[@class="date"]', 1, method='xpath').text_content().strip() - now = datetime.datetime.now() - number = re.search("\d+", date) - if number: - if 'heures' in date: - date = now - datetime.timedelta(hours=int(number.group(0))) - advert.publication_date = datetime.datetime.combine(date, datetime.time()) - elif 'jour' in date: - date = now - datetime.timedelta(days=int(number.group(0))) - advert.publication_date = datetime.datetime.combine(date, datetime.time()) - return advert - return None - - -class AdvertPage(BasePage): - def get_job_advert(self, url, advert): - job_header = self.document.getroot().xpath('//div[@id="job_header"]')[0] - if not advert: - title = self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content() - society_name = self.parser.select(job_header, 'span[@class="company"]', 1, method='xpath').text_content() - num_id = url.split('-')[-1] - advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id) - - advert.place = u'%s' % self.parser.select(job_header, 'span[@class="location"]', 1, method='xpath').text_content() - description_content = self.document.getroot().xpath('//span[@class="summary"]')[0] - advert.description = html2text(self.parser.tostring(description_content)) - advert.job_name = u'%s' % self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content() - advert.url = url - - date = self.document.getroot().xpath('//span[@class="date"]')[0].text_content().strip() - now = datetime.datetime.now() +class IndeedDate(Filter): + def filter(self, date): + now = datetime.now() number = re.search("\d+", date) if number: if 'heures' in date: - date = now - datetime.timedelta(hours=int(number.group(0))) - advert.publication_date = date + return now - timedelta(hours=int(number.group(0))) elif 'jour' in date: - date = now 
- datetime.timedelta(days=int(number.group(0))) - advert.publication_date = date + return now - timedelta(days=int(number.group(0))) + return now - return advert + +class SearchPage(HTMLPage): + @pagination + @method + class iter_job_adverts(ListElement): + item_xpath = '//div[@itemtype="http://schema.org/JobPosting"]' + + def next_page(self): + for a in self.page.doc.xpath('//a'): + if a.xpath('span[@class="pn"]/span[@class="np"]') and "Suivant" in a.xpath('span[@class="pn"]/span[@class="np"]')[0].text: + return a.attrib['href'] + + class Item(ItemElement): + klass = BaseJobAdvert + + obj_id = CleanText(Format('%s#%s#%s', + Regexp(Attr('.', 'id'), '^..(.*)'), + Attr('h2/a', 'title'), + CleanText('span[@class="company"]')), + replace=[(" ", "-"), ("/", "-")]) + obj_title = Attr('h2/a', 'title') + obj_society_name = CleanText('span[@class="company"]') + obj_place = CleanText('span/span[@class="location"]') + obj_publication_date = IndeedDate(CleanText('table/tr/td/span[@class="date"]')) + + +class AdvertPage(HTMLPage): + + @method + class get_job_advert(ItemElement): + klass = BaseJobAdvert + + def parse(self, el): + self.env['url'] = self.page.url + self.env['num_id'] = self.page.url.split('-')[-1] + + obj_id = Format('%s#%s#%s', + Env('num_id'), + CleanText('//div[@id="job_header"]/b[@class="jobtitle"]'), + CleanText('//div[@id="job_header"]/span[@class="company"]'), + ) + obj_title = CleanText('//div[@id="job_header"]/b[@class="jobtitle"]') + obj_place = CleanText('//div[@id="job_header"]/span[@class="location"]') + obj_description = CleanHTML('//span[@class="summary"]') + obj_job_name = CleanText('//div[@id="job_header"]/b[@class="jobtitle"]') + obj_url = Env('url') + obj_publication_date = IndeedDate(CleanText('//span[@class="date"]'))