# -*- coding: utf-8 -*-

# Copyright(C) 2014      Bezleputh
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

from weboob.tools.misc import html2text
from weboob.tools.browser import BasePage
from .job import RegionsJobAdvert
from datetime import datetime, date
import re


__all__ = ['SearchPage', 'AdvertPage']


class SearchPage(BasePage):
    """Search results page: one ``<li>`` per job advert in ``div#liste_offres``."""

    def iter_job_adverts(self, website):
        """Yield one RegionsJobAdvert per entry of the offers list.

        :param website: value used as the first half of the advert id,
                        which has the form ``<website>|<numoffre>``
        :return: generator of RegionsJobAdvert with title, place,
                 publication date, contract type and, when the entry has
                 one, society name filled in
        """
        re_id = re.compile(r'(.*?)numoffre=(.*?)&de=consultation', re.DOTALL)
        today = date.today()

        lis = self.document.getroot().xpath('//div[@id="liste_offres"]/ul/li')
        for li in lis:
            a = self.parser.select(li, 'div/span[@class="offres_poste"]/a', 1, method='xpath')
            # The advert number is taken from the "numoffre" query parameter
            # of the link to the advert.
            _id = u'%s|%s' % (website, re_id.search(a.attrib['href']).group(2))
            advert = RegionsJobAdvert(_id)
            advert.title = u'%s' % a.text

            # The company link is optional, so no match count is requested.
            society_name = self.parser.select(li, 'div/span[@class="offres_entreprise"]/span/a',
                                              method='xpath')
            if len(society_name) > 0:
                advert.society_name = u'%s' % society_name[0].text

            advert.place = u'%s' % self.parser.select(li, 'div/span[@class="offres_ville"]/span/span/span',
                                                      1, method='xpath').text.strip()

            _date = u'%s' % self.parser.select(li, 'div/span[@class="offres_date"]',
                                               1, method='xpath').text_content()
            # The date is read as "day/month"; the year is not taken from the
            # page, so the current year is assumed.
            splitted_date = _date.split('/')
            publication_date = datetime(today.year, int(splitted_date[1]), int(splitted_date[0]))
            if publication_date.date() > today:
                # An advert cannot have been published in the future: a
                # December entry read in January belongs to the previous year.
                try:
                    publication_date = publication_date.replace(year=today.year - 1)
                except ValueError:
                    # 29 February does not exist in the previous year; keep
                    # the date computed above.
                    pass
            advert.publication_date = publication_date

            advert.contract_type = u'%s' % self.parser.select(li, 'div/span[@class="offres_poste"]/span',
                                                              1, method='xpath').text
            yield advert


class AdvertPage(BasePage):
    """Detail page of a single job advert (``div#annonce``)."""

    def get_job_advert(self, url, advert):
        """Fill a RegionsJobAdvert from the paragraphs of ``div#annonce``.

        :param url: URL of the advert page; stored in ``advert.url`` and,
                    when *advert* is None, parsed to build the advert id
                    ``<host>|<numoffre>``
        :param advert: RegionsJobAdvert to complete, or None to create one
        :return: the completed advert
        """
        re_id = re.compile(r'http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation',
                           re.DOTALL)
        re_date = re.compile(r'(\d{2})\s(\d{2})\s(\d{4})')

        if advert is None:
            url_match = re_id.search(url)
            _id = u'%s|%s' % (url_match.group(1), url_match.group(2))
            advert = RegionsJobAdvert(_id)

        advert.url = u'%s' % url

        div = self.document.getroot().xpath('//div[@id="annonce"]')[0]

        advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text

        content = self.parser.select(div, 'p', method='xpath')

        # Some paragraphs only announce what the following paragraph holds;
        # these flags carry that information to the next iteration.
        next_is_date = False
        next_is_pay = False
        description_parts = []

        for p in content:
            if next_is_date:
                # NOTE(review): assumes the "dd mm yyyy" date is in the text
                # of the paragraph following the 'date_ref' one, possibly
                # after some leading text -- confirm against a live page.
                m = re_date.search(p.text_content())
                if m:
                    dd = int(m.group(1))
                    mm = int(m.group(2))
                    yyyy = int(m.group(3))
                    advert.publication_date = date(yyyy, mm, dd)
                next_is_date = False

            elif next_is_pay:
                advert.pay = html2text(self.parser.tostring(p))
                next_is_pay = False

            elif 'class' in p.attrib:
                if p.attrib['class'] == 'contrat_loc':
                    _p = self.parser.select(div, 'p[@class="contrat_loc"]', 1, method='xpath')
                    # One "label:value" pair per CRLF-separated line; lines
                    # that do not split into exactly two parts are ignored.
                    content_p = _p.text_content().strip().split('\r\n')
                    for el in content_p:
                        splitted_el = el.split(':')
                        if len(splitted_el) == 2:
                            if splitted_el[0] == 'Entreprise':
                                advert.society_name = splitted_el[1]
                            elif splitted_el[0] == 'Contrat':
                                advert.contract_type = splitted_el[1]
                            elif splitted_el[0] == 'Localisation':
                                advert.place = splitted_el[1]

                elif p.attrib['class'] == 'date_ref':
                    next_is_date = True

                elif p.attrib['class'] == 'rubrique_annonce' and p.text == 'Salaire':
                    next_is_pay = True

                else:
                    description_parts.append(html2text(self.parser.tostring(p)))
            else:
                description_parts.append(html2text(self.parser.tostring(p)))

        advert.description = u'%s' % u''.join(description_parts)

        return advert