[indeed] adapt to browser2

This commit is contained in:
Bezleputh 2014-04-14 13:52:34 +02:00
commit b9260c7bc0
4 changed files with 82 additions and 134 deletions

View file

@@ -17,74 +17,70 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
from datetime import timedelta, datetime
import re
from weboob.tools.browser import BasePage
from weboob.tools.misc import html2text
from .job import IndeedJobAdvert
from weboob.tools.browser2.page import HTMLPage, method, ListElement, ItemElement, pagination
from weboob.tools.browser2.filters import Filter, CleanText, Regexp, Format, Env, CleanHTML, Attr
from weboob.capabilities.job import BaseJobAdvert
__all__ = ['SearchPage', 'AdvertPage']
class SearchPage(BasePage):
    """Search results page built on ``BasePage`` (``weboob.tools.browser``).

    NOTE(review): a ``class SearchPage(HTMLPage)`` further down this file
    rebinds the same name; confirm which definition is meant to survive.
    """

    def iter_job_adverts(self):
        """Yield one advert per job-posting row that could be parsed."""
        rows = self.document.getroot().xpath('//div[@itemtype="http://schema.org/JobPosting"]')
        for row in rows:
            advert = self.create_job_advert(row)
            if advert:
                yield advert

    def create_job_advert(self, row):
        """Build an ``IndeedJobAdvert`` from a single result row.

        :param row: element matched by the job-posting xpath
        :returns: the advert, or ``None`` when the row has no id, title,
                  company name or ``div[@class="iaP"]`` element
        """
        advert_from = self.parser.select(row, 'table/tr/td/div[@class="iaP"]', method='xpath')
        # The first two characters of the element id are dropped.
        num_id = row.attrib['id'][2:]
        title = self.parser.select(row, 'h2/a', 1, method='xpath').attrib['title']
        society_name = self.parser.select(row, 'span[@class="company"]', 1, method='xpath').text_content().strip()

        # An empty selection is already falsy, so no separate length check.
        if not (num_id and title and society_name and advert_from):
            return None

        advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id)
        advert.title = u'%s' % title
        advert.society_name = u'%s' % society_name
        advert.place = u'%s' % self.parser.select(row, 'span/span[@class="location"]', 1, method='xpath').text_content().strip()

        date = self.parser.select(row, 'table/tr/td/span[@class="date"]', 1, method='xpath').text_content().strip()
        number = re.search(r"\d+", date)
        if number:
            amount = int(number.group(0))
            # 'heure' / 'jour' match both the singular and the plural form.
            if 'heure' in date:
                age = timedelta(hours=amount)
            elif 'jour' in date:
                age = timedelta(days=amount)
            else:
                age = None
            if age is not None:
                # ``datetime`` is the class here: the file's
                # ``from datetime import timedelta, datetime`` follows
                # ``import datetime`` and wins the name.
                published = datetime.now() - age
                # Keep only the date part; the time is reset to midnight.
                advert.publication_date = datetime.combine(published, datetime.min.time())
        return advert
class AdvertPage(BasePage):
    """Advert detail page built on ``BasePage`` (``weboob.tools.browser``).

    NOTE(review): a ``class AdvertPage(HTMLPage)`` further down this file
    rebinds the same name; confirm which definition is meant to survive.
    NOTE(review): the nesting below was reconstructed from unindented
    source; verify which statements belong under ``if not advert``.
    """

    def get_job_advert(self, url, advert):
        """Fill (or create, when ``advert`` is falsy) an advert from this page.

        :param url: URL of the advert; its last ``-`` separated segment is
                    used as the numeric id when a new advert is created
        :param advert: existing advert to complete, or a falsy value
        :returns: the completed advert
        """
        job_header = self.document.getroot().xpath('//div[@id="job_header"]')[0]
        if not advert:
            title = self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content()
            society_name = self.parser.select(job_header, 'span[@class="company"]', 1, method='xpath').text_content()
            num_id = url.split('-')[-1]
            advert = IndeedJobAdvert(society_name + "|" + title + "|" + num_id)

        advert.place = u'%s' % self.parser.select(job_header, 'span[@class="location"]', 1, method='xpath').text_content()
        description_content = self.document.getroot().xpath('//span[@class="summary"]')[0]
        advert.description = html2text(self.parser.tostring(description_content))
        advert.job_name = u'%s' % self.parser.select(job_header, 'b[@class="jobtitle"]', 1, method='xpath').text_content()
        advert.url = url

        date = self.document.getroot().xpath('//span[@class="date"]')[0].text_content().strip()
        # ``datetime`` is the class here: the file's
        # ``from datetime import timedelta, datetime`` follows
        # ``import datetime`` and wins the name.
        now = datetime.now()
        number = re.search(r"\d+", date)
        if number:
            amount = int(number.group(0))
            # 'heure' / 'jour' match both the singular and the plural form.
            if 'heure' in date:
                advert.publication_date = now - timedelta(hours=amount)
            elif 'jour' in date:
                advert.publication_date = now - timedelta(days=amount)
        return advert


class IndeedDate(Filter):
    """Filter turning a relative age text into a ``datetime``."""

    def filter(self, date):
        """Return the current time minus the age expressed in ``date``.

        The first run of digits in ``date`` is read as a number of hours
        when the text contains ``heure`` and as a number of days when it
        contains ``jour``. In every other case the current time is
        returned unchanged.
        """
        now = datetime.now()
        number = re.search(r"\d+", date)
        if number:
            amount = int(number.group(0))
            # 'heure' also covers the singular form, which the plural-only
            # 'heures' test missed.
            if 'heure' in date:
                return now - timedelta(hours=amount)
            if 'jour' in date:
                return now - timedelta(days=amount)
        return now
class SearchPage(HTMLPage):
    """Search results page exposing the adverts as a paginated list."""

    @pagination
    @method
    class iter_job_adverts(ListElement):
        # One item is built per element matched by this xpath.
        item_xpath = '//div[@itemtype="http://schema.org/JobPosting"]'

        def next_page(self):
            """Return the href of the first "Suivant" link, or ``None``."""
            for a in self.page.doc.xpath('//a'):
                labels = a.xpath('span[@class="pn"]/span[@class="np"]')
                # ``.text`` is None when the span has no leading text;
                # fall back to '' so such a link is skipped, not a crash.
                if labels and "Suivant" in (labels[0].text or ''):
                    return a.attrib['href']

        class Item(ItemElement):
            klass = BaseJobAdvert

            # id = "<element id minus its first two chars>#<title>#<company>",
            # with spaces and slashes replaced by dashes.
            obj_id = CleanText(Format('%s#%s#%s',
                                      Regexp(Attr('.', 'id'), '^..(.*)'),
                                      Attr('h2/a', 'title'),
                                      CleanText('span[@class="company"]')),
                               replace=[(" ", "-"), ("/", "-")])
            obj_title = Attr('h2/a', 'title')
            obj_society_name = CleanText('span[@class="company"]')
            obj_place = CleanText('span/span[@class="location"]')
            obj_publication_date = IndeedDate(CleanText('table/tr/td/span[@class="date"]'))
class AdvertPage(HTMLPage):
    """Detail page of a single job advert."""

    @method
    class get_job_advert(ItemElement):
        klass = BaseJobAdvert

        def parse(self, el):
            """Publish the page URL and its trailing segment for ``Env``."""
            page_url = self.page.url
            self.env['url'] = page_url
            # num_id is whatever follows the last '-' in the URL.
            self.env['num_id'] = page_url.rsplit('-', 1)[-1]

        # NOTE(review): unlike the search page, no space/slash replacement
        # is applied to this id - confirm both ids are meant to match.
        obj_id = Format(
            '%s#%s#%s',
            Env('num_id'),
            CleanText('//div[@id="job_header"]/b[@class="jobtitle"]'),
            CleanText('//div[@id="job_header"]/span[@class="company"]'),
        )
        # Title and job name are both read from the same header element.
        obj_title = CleanText('//div[@id="job_header"]/b[@class="jobtitle"]')
        obj_place = CleanText('//div[@id="job_header"]/span[@class="location"]')
        obj_description = CleanHTML('//span[@class="summary"]')
        obj_job_name = CleanText('//div[@id="job_header"]/b[@class="jobtitle"]')
        obj_url = Env('url')
        obj_publication_date = IndeedDate(CleanText('//span[@class="date"]'))