[regionsjob] adapt to browser2

This commit is contained in:
Bezleputh 2014-04-08 00:08:39 +02:00 committed by Florent
commit ec07532a63
4 changed files with 81 additions and 178 deletions

View file

@ -17,100 +17,59 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.misc import html2text
from weboob.tools.browser import BasePage
from .job import RegionsJobAdvert
from datetime import datetime, date
import re
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement
from weboob.tools.browser2.filters import Link, CleanText, Regexp, Format, Env, DateGuesser, CleanHTML, DateTime
from weboob.tools.date import LinearDateGuesser
from weboob.capabilities.job import BaseJobAdvert
__all__ = ['SearchPage']
class SearchPage(BasePage):
def iter_job_adverts(self, website):
re_id = re.compile('(.*?)numoffre=(.*?)&de=consultation', re.DOTALL)
lis = self.document.getroot().xpath('//div[@id="liste_offres"]/ul/li')
for li in lis:
a = self.parser.select(li, 'div/span[@class="offres_poste"]/a', 1, method='xpath')
_id = u'%s|%s' % (website, re_id.search(a.attrib['href']).group(2))
advert = RegionsJobAdvert(_id)
advert.title = u'%s' % a.text
class SearchPage(HTMLPage):
@method
class iter_job_adverts(ListElement):
item_xpath = '//div[@id="liste_offres"]/ul/li'
society_name = self.parser.select(li, 'div/span[@class="offres_entreprise"]/span/a',
method='xpath')
if len(society_name) > 0:
advert.society_name = u'%s' % society_name[0].text
class item(ItemElement):
klass = BaseJobAdvert
advert.place = u'%s' % self.parser.select(li, 'div/span[@class="offres_ville"]/span/span/span',
1, method='xpath').text.strip()
_date = u'%s' % self.parser.select(li, 'div/span[@class="offres_date"]',
1, method='xpath').text_content()
year = date.today().year
splitted_date = _date.split('/')
advert.publication_date = datetime(year, int(splitted_date[1]), int(splitted_date[0]))
advert.contract_type = u'%s' % self.parser.select(li, 'div/span[@class="offres_poste"]/span',
1, method='xpath').text
yield advert
# Field declarations for one advert of the result list; every xpath below is
# relative (no leading '//').
# The id joins the 'domain' environment value and the text captured after
# 'numoffre=' in the link selected by the "offres_poste" anchor, separated by '#'.
obj_id = Format(u'%s#%s',
Env('domain'),
Regexp(Link('div/span[@class="offres_poste"]/a'), '.*?numoffre=(.*?)&de=consultation'))
# Title and contract type both come from the "offres_poste" span: the anchor
# text and the nested span text respectively.
obj_title = CleanText('div/span[@class="offres_poste"]/a')
obj_society_name = CleanText('div/span[@class="offres_entreprise"]/span/a')
obj_place = CleanText('div/span[@class="offres_ville"]/span/span/span')
obj_contract_type = CleanText('div/span[@class="offres_poste"]/span')
# The date is derived from the "offres_date" span text through DateGuesser,
# using a LinearDateGuesser instance.
# NOTE(review): presumably the listing shows day/month without a year — confirm.
obj_publication_date = DateGuesser(CleanText('div/span[@class="offres_date"]'), LinearDateGuesser())
class AdvertPage(BasePage):
def get_job_advert(self, url, advert):
re_id = re.compile('http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation', re.DOTALL)
if advert is None:
_id = u'%s|%s' % (re_id.search(url).group(1), re_id.search(url).group(2))
advert = RegionsJobAdvert(_id)
class AdvertPage(HTMLPage):
@method
class get_job_advert(ItemElement):
klass = BaseJobAdvert
advert.url = u'%s' % url
def parse(self, el):
if self.obj.id:
advert = self.obj
advert.url = self.page.url
advert.description = Format(u'%s\r\n%s',
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'),
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))(el)
advert.pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')(el)
raise SkipItem()
div = self.document.getroot().xpath('//div[@id="annonce"]')[0]
self.env['url'] = self.page.url
advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
obj_description = Format(u'%s%s',
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'),
CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))
content = self.parser.select(div, 'p', method='xpath')
next_is_date = False
next_is_pay = False
description = ''
for p in content:
if next_is_date:
m = re.match('(\d{2})\s(\d{2})\s(\d{4})', date)
if m:
dd = int(m.group(1))
mm = int(m.group(2))
yyyy = int(m.group(3))
advert.publication_date = datetime.date(yyyy, mm, dd)
next_is_date = False
elif next_is_pay:
advert.pay = html2text(self.parser.tostring(p))
next_is_pay = False
elif 'class' in p.attrib:
if p.attrib['class'] == 'contrat_loc':
_p = self.parser.select(div, 'p[@class="contrat_loc"]', 1, method='xpath')
content_p = _p.text_content().strip().split('\r\n')
for el in content_p:
splitted_el = el.split(':')
if len(splitted_el) == 2:
if splitted_el[0] == 'Entreprise':
advert.society_name = splitted_el[1]
elif splitted_el[0] == 'Contrat':
advert.contract_type = splitted_el[1]
elif splitted_el[0] == 'Localisation':
advert.place = splitted_el[1]
elif p.attrib['class'] == 'date_ref':
next_is_date = True
elif p.attrib['class'] == 'rubrique_annonce' and p.text == 'Salaire':
next_is_pay = True
else:
description = description + html2text(self.parser.tostring(p))
else:
description = description + html2text(self.parser.tostring(p))
advert.description = u'%s' % description
return advert
# Field declarations for the advert built from the detail page.
# The id and url are read from the environment keys '_id' and 'url'.
obj_id = Env('_id')
obj_url = Env('url')
# Publication date: capture a "dd/dd/dddd" group from the text of the
# "date_ref" paragraph and pass it to DateTime. The pattern is a raw string so
# that `\d` reaches the regex engine verbatim instead of relying on Python
# preserving an unrecognised string escape (which emits a warning on recent
# interpreters).
# NOTE(review): the captured text is presumably day-first — confirm that
# DateTime parses it that way.
obj_publication_date = DateTime(Regexp(CleanText('//div[@id="annonce"]/p[@class="date_ref"]'),
                                       r'(\d{2}/\d{2}/\d{4})'))
obj_title = CleanText('//div[@id="annonce"]/h1')
# Company, contract type and place are the 1st, 2nd and 3rd <strong> children
# of the "contrat_loc" paragraph.
obj_society_name = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[1]')
obj_contract_type = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[2]')
obj_place = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[3]')
# Pay is the text of the first <p> sibling following a "rubrique_annonce" paragraph.
obj_pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')