[indeed] adapt to browser2

This commit is contained in:
Bezleputh 2014-04-14 13:52:34 +02:00
commit b9260c7bc0
4 changed files with 82 additions and 134 deletions

View file

@@ -20,10 +20,9 @@
from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.tools.ordereddict import OrderedDict
from weboob.capabilities.job import ICapJob
from weboob.capabilities.job import ICapJob, BaseJobAdvert
from weboob.tools.value import Value
from .browser import IndeedBrowser
from .job import IndeedJobAdvert
__all__ = ['IndeedBackend']
@@ -73,21 +72,19 @@ class IndeedBackend(BaseBackend, ICapJob):
Value('radius', label=u'Radius', choices=radius_choices, default=''))
def search_job(self, pattern=None):
    """Search job adverts matching *pattern*.

    The diff residue left both the old browser1 call (wrapped in
    `with self.browser:` and passing `pattern=`) and the new browser2 call;
    only the browser2 form is kept: PagesBrowser needs no context manager,
    and the browser method's keyword is `metier`.
    """
    return self.browser.search_job(metier=pattern)
def advanced_search_job(self):
    """Run a search using the backend's configured criteria.

    The diff residue contained two versions; the kept one delegates to the
    browser's single `search_job` entry point (the separate
    `advanced_search_job` browser method was removed in the browser2 port).
    Each criterion comes from the backend's Value config.
    """
    return self.browser.search_job(metier=self.config['metier'].get(),
                                   limit_date=self.config['limit_date'].get(),
                                   contrat=self.config['contrat'].get(),
                                   place=self.config['place'].get(),
                                   radius=self.config['radius'].get())
def get_job_advert(self, _id, advert=None):
    """Fetch a single advert by *_id*, optionally filling an existing *advert*.

    Stale `with self.browser:` wrapper from the browser1 version removed;
    browser2's PagesBrowser needs no context manager.
    """
    return self.browser.get_job_advert(_id, advert)
def fill_obj(self, advert, fields):
    """Fill *advert* in place by refetching its page; *fields* is unused.

    The diff residue duplicated this call with and without `return`; the
    new version returns the (re)filled advert, as the OBJECTS machinery
    expects a value back.
    """
    return self.get_job_advert(advert.id, advert)
# Map capability object types to their fill method.  Keyed on the generic
# BaseJobAdvert: the browser2 pages build BaseJobAdvert instances directly,
# and the IndeedJobAdvert subclass was removed in this commit.  The stale
# duplicate assignment keyed on IndeedJobAdvert is dropped.
OBJECTS = {BaseJobAdvert: fill_obj}

View file

@@ -17,42 +17,31 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url
from weboob.tools.browser2 import PagesBrowser, URL
from .pages import SearchPage, AdvertPage
from .job import IndeedJobAdvert
__all__ = ['IndeedBrowser']
# NOTE(review): legacy browser1 definition removed by this commit; it is
# shadowed by the PagesBrowser-based IndeedBrowser defined just below and
# references BaseBrowser, which the new import block no longer provides.
class IndeedBrowser(BaseBrowser):
# Scheme, host and charset used to expand the PAGES url patterns below.
PROTOCOL = 'http'
DOMAIN = 'www.indeed.fr'
ENCODING = 'UTF-8'
# url-regexp -> page-class routing table of the old BaseBrowser API.
PAGES = {
'%s://%s/Emplois-(.*?)' % (PROTOCOL, DOMAIN): SearchPage,
'%s://%s/emplois(.*?)' % (PROTOCOL, DOMAIN): SearchPage,
'%s://%s/cmp/(.*?)' % (PROTOCOL, DOMAIN): AdvertPage,
'%s://%s/voir-emploi\?(.*?)' % (PROTOCOL, DOMAIN): AdvertPage,
}
class IndeedBrowser(PagesBrowser):
    """Browser for indeed.fr, ported to weboob's browser2 (PagesBrowser/URL).

    Diff residue removed: the old browser1 `search_job`, the whole
    `advanced_search_job` (merged into `search_job` by this commit), and the
    old `@id2url`-decorated `get_job_advert` were interleaved with the new
    code; only the browser2 versions are kept.
    """

    BASEURL = 'http://www.indeed.fr'

    # URL patterns routed to their page classes; named groups are filled
    # by the .go(...) calls below.
    search_page = URL('/emplois(?P<parameters>.*)', SearchPage)
    advert_page = URL('/cmp/(?P<company>.*)/jobs/(?P<title>.*)-(?P<nb>.*)', AdvertPage)

    def search_job(self, metier='', contrat='', limit_date='', radius='', place=''):
        """Search adverts; all criteria are optional query parameters.

        `place` is appended only when set, the other criteria are always
        sent (empty values are harmless in the query string).
        """
        params = '?as_ttl=%s&limit=10&sort=date&st=employer&sr=directhire&jt=%s&fromage=%s&radius=%s'\
            % (metier.replace(' ', '+'), contrat, limit_date, radius)
        if place:
            params = '%s&l=%s' % (params, place)
        self.search_page.go(parameters=params)
        assert self.search_page.is_here(parameters=params)
        return self.page.iter_job_adverts()

    def get_job_advert(self, _id, advert):
        """Fetch one advert; *_id* is 'nb#title#company' as built by SearchPage."""
        splitted_id = _id.split('#')
        return self.advert_page.go(nb=splitted_id[0],
                                   title=splitted_id[1],
                                   company=splitted_id[2]).get_job_advert(obj=advert)

View file

@@ -1,34 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.job import BaseJobAdvert
class IndeedJobAdvert(BaseJobAdvert):
    """Indeed-specific job advert whose id encodes 'company|title|number'."""

    @classmethod
    def id2url(cls, _id):
        """Build the advert URL from an id of the form 'company|title|num'.

        Spaces and slashes are turned into dashes so each piece is
        URL-path safe before being spliced into the /cmp/ URL.
        """
        sanitized = _id.replace(' ', '-').replace('/', '-')
        parts = sanitized.split('|')
        return 'http://www.indeed.fr/cmp/%s/jobs/%s-%s' % (parts[0], parts[1], parts[2])

View file

@@ -17,74 +17,70 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
from datetime import timedelta, datetime
import re
from weboob.tools.browser import BasePage
from weboob.tools.misc import html2text
from .job import IndeedJobAdvert
from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, pagination
from weboob.tools.browser2.filters import Filter, CleanText, Regexp, Format, Env, CleanHTML, Attr
from weboob.capabilities.job import BaseJobAdvert
__all__ = ['SearchPage', 'AdvertPage']
# NOTE(review): legacy browser1 SearchPage removed by this commit; superseded
# by the HTMLPage/ListElement version later in this file.
class SearchPage(BasePage):
def iter_job_adverts(self):
# One advert per schema.org JobPosting <div> in the results page.
rows = self.document.getroot().xpath('//div[@itemtype="http://schema.org/JobPosting"]')
for row in rows:
advert = self.create_job_advert(row)
if advert:
yield advert
def create_job_advert(self, row):
# Presence of the "iaP" element is used only as a filter below —
# rows lacking it are skipped (returns None).
advert_from = self.parser.select(row, 'table/tr/td/div[@class="iaP"]', method='xpath')
# The first two characters of the row's id attribute are stripped.
num_id = row.attrib['id'][2:]
title = self.parser.select(row, 'h2/a', 1, method='xpath').attrib['title']
society_name = self.parser.select(row, 'span[@class="company"]', 1, method='xpath').text_content().strip()
if num_id and title and society_name and advert_from and len(advert_from) > 0:
# Composite id 'company|title|number' (see IndeedJobAdvert.id2url).
advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id)
advert.title = u'%s' % title
advert.society_name = u'%s' % society_name
advert.place = u'%s' % self.parser.select(row, 'span/span[@class="location"]', 1, method='xpath').text_content().strip()
date = self.parser.select(row, 'table/tr/td/span[@class="date"]', 1, method='xpath').text_content().strip()
now = datetime.datetime.now()
# Indeed shows relative dates ("... N heures/jours"); the first
# number found is subtracted from now() as hours or days.
number = re.search("\d+", date)
if number:
if 'heures' in date:
date = now - datetime.timedelta(hours=int(number.group(0)))
advert.publication_date = datetime.datetime.combine(date, datetime.time())
elif 'jour' in date:
date = now - datetime.timedelta(days=int(number.group(0)))
advert.publication_date = datetime.datetime.combine(date, datetime.time())
return advert
return None
# NOTE(review): this span was diff residue interleaving the removed
# browser1 AdvertPage.get_job_advert with the new IndeedDate filter; the
# stale old lines referenced an undefined `advert` and `datetime.timedelta`
# (invalid under `from datetime import timedelta, datetime`).  Only the
# clean new filter is kept; the browser2 AdvertPage follows later in the file.
class IndeedDate(Filter):
    """Convert Indeed's relative French date strings to a datetime.

    Indeed renders dates like "il y a 3 heures" / "il y a 2 jours"; the
    first number found is subtracted from now() as hours or days.  When no
    number or no known unit is present, now() is returned as a fallback.
    """

    def filter(self, date):
        now = datetime.now()
        number = re.search(r"\d+", date)
        if number:
            if 'heures' in date:
                return now - timedelta(hours=int(number.group(0)))
            elif 'jour' in date:
                return now - timedelta(days=int(number.group(0)))
        return now
# browser2 replacement for the old BasePage-based SearchPage above.
class SearchPage(HTMLPage):
@pagination
@method
class iter_job_adverts(ListElement):
# One item per schema.org JobPosting block in the results page.
item_xpath = '//div[@itemtype="http://schema.org/JobPosting"]'
def next_page(self):
# Follow the "Suivant" (next) pagination link when present.
for a in self.page.doc.xpath('//a'):
if a.xpath('span[@class="pn"]/span[@class="np"]') and "Suivant" in a.xpath('span[@class="pn"]/span[@class="np"]')[0].text:
return a.attrib['href']
class Item(ItemElement):
klass = BaseJobAdvert
# Composite id 'nb#title#company', parsed back by
# IndeedBrowser.get_job_advert (split on '#').  The Regexp drops the
# first two characters of the row id; spaces and slashes are replaced
# with dashes to keep the id URL-safe.
obj_id = CleanText(Format('%s#%s#%s',
Regexp(Attr('.', 'id'), '^..(.*)'),
Attr('h2/a', 'title'),
CleanText('span[@class="company"]')),
replace=[(" ", "-"), ("/", "-")])
obj_title = Attr('h2/a', 'title')
obj_society_name = CleanText('span[@class="company"]')
obj_place = CleanText('span/span[@class="location"]')
# Relative date string converted to a datetime by the IndeedDate filter.
obj_publication_date = IndeedDate(CleanText('table/tr/td/span[@class="date"]'))
# browser2 replacement for the old BasePage-based AdvertPage.
class AdvertPage(HTMLPage):
@method
class get_job_advert(ItemElement):
klass = BaseJobAdvert
def parse(self, el):
# Stash the page URL; its trailing '-<num>' segment is the advert number.
self.env['url'] = self.page.url
self.env['num_id'] = self.page.url.split('-')[-1]
# Composite id 'num#title#company', same layout as SearchPage.obj_id.
# NOTE(review): unlike SearchPage, no space/slash replacement is applied
# here — confirm ids built from both pages stay consistent.
obj_id = Format('%s#%s#%s',
Env('num_id'),
CleanText('//div[@id="job_header"]/b[@class="jobtitle"]'),
CleanText('//div[@id="job_header"]/span[@class="company"]'),
)
obj_title = CleanText('//div[@id="job_header"]/b[@class="jobtitle"]')
obj_place = CleanText('//div[@id="job_header"]/span[@class="location"]')
obj_description = CleanHTML('//span[@class="summary"]')
obj_job_name = CleanText('//div[@id="job_header"]/b[@class="jobtitle"]')
obj_url = Env('url')
# Relative date string converted to a datetime by the IndeedDate filter.
obj_publication_date = IndeedDate(CleanText('//span[@class="date"]'))