[monster] fix and adapt to browser2

This commit is contained in:
Bezleputh 2014-10-20 15:23:38 +02:00
commit 549551a629
4 changed files with 81 additions and 178 deletions

View file

@@ -18,109 +18,64 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser import Page
from weboob.tools.html import html2text
import re
from datetime import datetime, time, timedelta
from .job import MonsterJobAdvert
from weboob.browser.pages import HTMLPage, pagination
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.standard import CleanText, Regexp, Filter, Env, BrowserURL, Join
from weboob.browser.filters.html import Link, CleanHTML
from weboob.capabilities.job import BaseJobAdvert
from weboob.capabilities.base import NotAvailable
# Search-results page built on the deprecated-browser Page class: yields one
# MonsterJobAdvert per accepted row of the "listingsTable" table.
# NOTE(review): this view has lost the original indentation; the nesting
# described in the comments below is inferred from syntax — confirm against
# the real file.
class SearchPage(Page):
def iter_job_adverts(self):
# The advert id is whatever sits between the site root and ".aspx" in the link.
re_id = re.compile('http://offre-emploi.monster.fr/(.*?).aspx', re.DOTALL)
trs = self.document.getroot().xpath("//table[@class='listingsTable']/tbody/tr")
for tr in trs:
# Only rows that carry a class attribute other than 'aceHidden' are parsed.
if 'class' in tr.attrib and tr.attrib['class'] != 'aceHidden':
# The job-title link provides both the id (from its href) and the title text.
a = self.parser.select(tr, 'td/div/div[@class="jobTitleContainer"]/a', 1, method='xpath')
_id = u'%s' % re_id.search(a.attrib['href']).group(1)
advert = MonsterJobAdvert(_id)
advert.society_name = u'%s' % self.parser.select(tr, 'td/div/div[@class="companyContainer"]/div/a',
1, method='xpath').attrib['title']
advert.title = u'%s' % a.text
# `date` is first the stripped text of the "fnt20" cell; it is rebound to a
# datetime below once an hour/day offset has been recognised.
date = self.parser.select(tr, 'td/div/div[@class="fnt20"]', 1, method='xpath').text_content().strip()
now = datetime.now()
# The first run of digits is used as the amount of hours or days.
number = re.search("\d+", date)
if number:
if 'heures' in date:
date = now - timedelta(hours=int(number.group(0)))
# combine(..., time()) keeps the day and resets the time of day to 00:00.
advert.publication_date = datetime.combine(date, time())
elif 'jour' in date:
date = now - timedelta(days=int(number.group(0)))
advert.publication_date = datetime.combine(date, time())
# NOTE(review): presumably paired with `if number:` (no digits -> today at
# midnight); the pairing cannot be read from this view — confirm.
else:
advert.publication_date = datetime.combine(now, time.min)
# The place is optional: it is only set when a location link is found.
place = self.parser.select(tr, 'td/div/div[@class="jobLocationSingleLine"]/a', method='xpath')
if len(place) != 0:
advert.place = u'%s' % place[0].attrib['title']
yield advert
# Advert detail page built on the deprecated-browser Page class.
class AdvertPage(Page):
# Entry point: picks the page layout and delegates to the matching fill_* method.
# When no advert is passed in, a new MonsterJobAdvert is created with an id
# extracted from `url`.
def get_job_advert(self, url, advert):
# Same id pattern as the search page: the URL part before ".aspx".
re_id = re.compile('http://offre-emploi.monster.fr/(.*?).aspx', re.DOTALL)
if advert is None:
_id = u'%s' % re_id.search(url).group(1)
advert = MonsterJobAdvert(_id)
advert.url = url
# Two layouts are recognised: a "jobcopy" container and a "divtxt" container.
div_normal = self.document.getroot().xpath('//div[@id="jobcopy"]')
div_special = self.document.getroot().xpath('//div[@id="divtxt"]')
# "jobcopy" is tested first, so it wins when both containers are present.
if len(div_normal) > 0:
return self.fill_normal_advert(advert, div_normal[0])
elif len(div_special) > 0:
return self.fill_special_advert(advert, div_special[0])
class MonsterDate(Filter):
    """Convert Monster's relative publication text into a datetime.

    The first run of digits in the text is read as a number of hours (text
    containing "heure") or days (text containing "jour") elapsed since the
    advert was published. The result is always truncated to midnight.
    """

    def filter(self, date):
        """Return the publication day described by *date*, at 00:00.

        :param date: relative publication text taken from the page
        :returns: ``datetime`` at midnight of the computed day; today's
            midnight when no hour/day offset can be recognised
        """
        now = datetime.now()
        number = re.search(r"\d+", date)
        if number:
            amount = int(number.group(0))
            # "heure" also matches the plural "heures".
            if 'heure' in date:
                return datetime.combine(now - timedelta(hours=amount), time.min)
            if 'jour' in date:
                return datetime.combine(now - timedelta(days=amount), time.min)
        # No usable offset (no digits, or an unrecognised unit): fall back to
        # today rather than silently returning None.
        return datetime.combine(now, time.min)
# Fill `advert` from the container passed as `div` (get_job_advert passes the
# "divtxt" div here): title from the "poste" div, description from the
# "jobBodyContent" div converted with html2text, and the contract type from the
# employmentType span when one is found.
def fill_special_advert(self, advert, div):
advert.title = u'%s' % self.parser.select(div, 'div[@class="poste"]', 1, method='xpath').text
description = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
advert.description = html2text(self.parser.tostring(description))
# The left-hand menu block; [0] raises IndexError if the div is absent.
titresmenuG = self.document.getroot().xpath('//div[@id="divmenuGauche"]')[0]
contract_type = self.parser.select(titresmenuG, '//span[@itemprop="employmentType"]', method='xpath')
if len(contract_type) != 0:
advert.contract_type = u'%s' % contract_type[0].text_content()
# Search-results page for the new browser API: a paginated ListElement that
# produces one BaseJobAdvert per listing row.
# NOTE(review): in this view the declarative class body is interleaved with
# statements that use `self.parser`, `advert`, `titresmenuG` and `jobsummary`;
# they look like parts of fill_special_advert / fill_normal_advert from the
# Page-based implementation — confirm against the real file.
class SearchPage(HTMLPage):
@pagination
@method
class iter_job_adverts(ListElement):
# Rows of the listings table whose class is exactly "odd" or "even".
item_xpath = '//table[@class="listingsTable"]/tbody/tr[@class="odd"] | //table[@class="listingsTable"]/tbody/tr[@class="even"]'
return self.fill_advert(advert, titresmenuG)
# Link filter applied to the anchor titled "Suivant"; None when there is none.
def next_page(self):
return Link('//a[@title="Suivant"]', default=None)(self)
# Title from the h1 of `div`, description from its "jobBodyContent" div via html2text.
def fill_normal_advert(self, advert, div):
advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
description = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
advert.description = html2text(self.parser.tostring(description))
# One advert object per matched row.
class item(ItemElement):
klass = BaseJobAdvert
# The summary block; [0] raises IndexError if the div is absent.
jobsummary = self.document.getroot().xpath('//div[@id="jobsummary_content"]')[0]
contract_type = self.parser.select(jobsummary, 'dl/dd[@class="multipleddlast"]/span', method='xpath')
if len(contract_type) != 0:
advert.contract_type = u'%s' % contract_type[0].text_content()
# Id is the part of the title link between the ":80/" site root and ".aspx".
obj_id = Regexp(Link('./td/div/div[@class="jobTitleContainer"]/a'),
'http://offre-emploi.monster.fr:80/(.*?).aspx')
obj_society_name = CleanText('./td/div/div[@class="companyContainer"]/div/a')
obj_title = CleanText('./td/div/div[@class="jobTitleContainer"]/a')
# Relative date text from the "fnt20" cell, converted by MonsterDate.
obj_publication_date = MonsterDate(CleanText('td/div/div[@class="fnt20"]'))
# Read from the location link's title attribute; NotAvailable is the default.
obj_place = CleanText('./td/div/div[@class="jobLocationSingleLine"]/a/@title', default=NotAvailable)
society_name = self.parser.select(jobsummary, '//span[@itemprop="name"]', method='xpath')
if len(society_name) != 0:
advert.society_name = u'%s' % society_name[0].text_content()
return self.fill_advert(advert, jobsummary)
# Advert detail page for the new browser API: get_job_advert is an ItemElement
# that declares how each BaseJobAdvert field is read from the page.
class AdvertPage(HTMLPage):
@method
class get_job_advert(ItemElement):
klass = BaseJobAdvert
# NOTE(review): fill_advert relies on `self.parser`, which none of the obj_*
# declarations around it use; it looks like a method of the Page-based
# implementation interleaved by the diff view — confirm.
# It copies place, pay, formation and experience from itemprop-tagged spans
# (each only when a match is found) and returns the advert.
# NOTE(review): every XPath below starts with `//`; whether select() evaluates
# it relative to `jobsummary` is not visible here — verify.
def fill_advert(self, advert, jobsummary):
place = self.parser.select(jobsummary, '//span[@itemprop="jobLocation"]', method='xpath')
if len(place) != 0:
advert.place = u'%s' % place[0].text_content()
pay = self.parser.select(jobsummary, '//span[@itemprop="baseSalary"]', method='xpath')
if len(pay) != 0:
advert.pay = u'%s' % pay[0].text_content()
formation = self.parser.select(jobsummary, '//span[@itemprop="educationRequirements"]', method='xpath')
if len(formation) != 0:
advert.formation = u'%s' % formation[0].text_content()
experience = self.parser.select(jobsummary, '//span[@itemprop="qualifications"]', method='xpath')
if len(experience) != 0:
advert.experience = u'%s' % experience[0].text_content()
return advert
# The id comes from the `_id` environment value; the URL is built from that
# same value through BrowserURL('advert', ...).
obj_id = Env('_id')
obj_url = BrowserURL('advert', _id=Env('_id'))
obj_title = CleanText('//div[@id="jobcopy"]/h1[@itemprop="title"]')
obj_description = CleanHTML('//div[@id="jobBodyContent"]')
# Joins, with the '%s ' pattern, every dd whose class starts with "multipledd".
obj_contract_type = Join('%s ', '//dd[starts-with(@class, "multipledd")]')
obj_society_name = CleanText('//dd[@itemprop="hiringOrganization"]')
# The remaining fields are read from schema.org-style itemprop spans.
obj_place = CleanText('//span[@itemprop="jobLocation"]')
obj_pay = CleanText('//span[@itemprop="baseSalary"]')
obj_formation = CleanText('//span[@itemprop="educationRequirements"]')
obj_experience = CleanText('//span[@itemprop="qualifications"]')