[cci] adapt to browser2

This commit is contained in:
Bezleputh 2014-04-10 10:08:12 +02:00
commit bd38a16d76
3 changed files with 59 additions and 69 deletions

View file

@@ -18,8 +18,9 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.backend import BaseBackend from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.capabilities.job import ICapJob, BaseJobAdvert from weboob.capabilities.job import ICapJob, BaseJobAdvert
from weboob.tools.value import Value
from .browser import CciBrowser from .browser import CciBrowser
@@ -37,19 +38,18 @@ class CciBackend(BaseBackend, ICapJob):
BROWSER = CciBrowser BROWSER = CciBrowser
CONFIG = BackendConfig(Value('metier', label='Job name', masked=False, default=''))
def search_job(self, pattern=None): def search_job(self, pattern=None):
with self.browser: return self.browser.search_job(pattern)
for job_advert in self.browser.search_job(pattern):
yield job_advert
def advanced_search_job(self): def advanced_search_job(self):
return [] return self.browser.search_job(pattern=self.config['metier'].get())
def get_job_advert(self, _id, advert=None): def get_job_advert(self, _id, advert=None):
with self.browser: return self.browser.get_job_advert(_id, advert)
return self.browser.get_job_advert(_id, advert)
def fill_obj(self, advert, fields): def fill_obj(self, advert, fields):
self.get_job_advert(advert.id, advert) return self.get_job_advert(advert.id, advert)
OBJECTS = {BaseJobAdvert: fill_obj} OBJECTS = {BaseJobAdvert: fill_obj}

View file

@@ -17,8 +17,8 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser2 import PagesBrowser, URL
from weboob.tools.browser import BaseBrowser from weboob.capabilities.job import BaseJobAdvert
from .pages import SearchPage from .pages import SearchPage
@@ -26,21 +26,15 @@ from .pages import SearchPage
__all__ = ['CciBrowser'] __all__ = ['CciBrowser']
class CciBrowser(BaseBrowser): class CciBrowser(PagesBrowser):
PROTOCOL = 'http' BASEURL = 'http://www.cci.fr'
DOMAIN = 'www.cci.fr/web/recrutement/les-offres-d-emploi'
ENCODING = "UTF-8"
PAGES = { search_page = URL('/web/recrutement/les-offres-d-emploi', SearchPage)
'%s://%s' % (PROTOCOL, DOMAIN): SearchPage,
}
def search_job(self, pattern): def search_job(self, pattern):
self.location('%s://%s' % (self.PROTOCOL, self.DOMAIN)) return self.search_page.go().iter_job_adverts(pattern=pattern)
assert self.is_on_page(SearchPage)
return self.page.iter_job_adverts(pattern)
def get_job_advert(self, _id, advert): def get_job_advert(self, _id, advert):
self.location('%s://%s' % (self.PROTOCOL, self.DOMAIN)) if advert is None:
assert self.is_on_page(SearchPage) advert = BaseJobAdvert(_id)
return self.page.get_job_advert(_id, advert) return self.search_page.stay_or_go().get_job_advert(obj=advert)

View file

@@ -17,63 +17,59 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import dateutil.parser from weboob.tools.browser2.page import HTMLPage, method, ItemElement, TableElement
from weboob.tools.browser2.filters import Filter, Link, CleanText, Format, Env, DateTime, CleanHTML, TableCell, Join
from weboob.tools.browser import BasePage
from weboob.capabilities.job import BaseJobAdvert from weboob.capabilities.job import BaseJobAdvert
__all__ = ['SearchPage'] __all__ = ['SearchPage']
class SearchPage(BasePage): class Child(Filter):
def iter_job_adverts(self, pattern): def filter(self, el):
trs = self.document.getroot().xpath("//tr[@class='texteCol2TableauClair']") \ return list(el[0].iterchildren())
+ self.document.getroot().xpath("//tr[@class='texteCol2TableauFonce']")
for tr in trs:
tds = self.parser.select(tr, 'td', method='xpath')
a = self.parser.select(tds[2], 'a', 1, method='xpath')
advert = BaseJobAdvert(a.attrib['href'].replace('#', ''))
advert.title = u'%s' % a.text_content()
advert.society_name = u'CCI %s' % tds[3].text
advert.place = u'%s' % tds[0].text
advert.job_name = u'%s' % tds[1].text
if pattern is not None:
if pattern in advert.title or pattern in advert.job_name:
yield advert
else:
yield advert
def get_job_advert(self, _id, advert): class SearchPage(HTMLPage):
if advert is None: @method
advert = BaseJobAdvert(_id) class iter_job_adverts(TableElement):
item_xpath = "//tr[(@class='texteCol2TableauClair' or @class='texteCol2TableauFonce')]"
head_xpath = "//tr[1]/td[@class='titreCol2Tableau']/text()"
items = self.document.getroot().xpath("//div[@id='divrecueil']")[0] col_place = u'Région'
keep_next = False col_job_name = u'Filière'
for item in items: col_id = u'Intitulé du poste'
col_society_name = u'CCI(R)'
if keep_next: class item(ItemElement):
if item.tag == 'div' and item.attrib['id'] == u'offre': klass = BaseJobAdvert
first_div = self.parser.select(item, 'div/span', 2, method='xpath')
advert.society_name = u'CCI %s' % first_div[0].text_content()
advert.job_name = u'%s' % first_div[1].text_content()
second_div = self.parser.select(item, 'div/fieldset', 2, method='xpath') def validate(self, advert):
if advert and 'pattern' in self.env and self.env['pattern']:
return self.env['pattern'].upper() in advert.title.upper() or \
self.env['pattern'].upper() in advert.job_name.upper()
return True
ps_1 = self.parser.select(second_div[0], 'p[@class="normal"]', method='xpath') obj_id = CleanText(Link(Child(TableCell('id'))), replace=[('#', '')])
h2s_1 = self.parser.select(second_div[0], 'h2[@class="titreParagraphe"]', method='xpath') obj_title = Format('%s - %s', CleanText(TableCell('id')), CleanText(TableCell('job_name')))
description = "" obj_society_name = Format(u'CCI %s', CleanText(TableCell('society_name')))
if len(ps_1) == 5 and len(h2s_1) == 5: obj_place = CleanText(TableCell('place'))
for i in range(0, 5): obj_job_name = CleanText(TableCell('id'))
description += "\r\n-- %s --\r\n" % h2s_1[i].text
description += "%s\r\n" % ps_1[i].text_content()
advert.description = description
advert.url = self.url + '#' + advert.id
date = self.parser.select(item, 'div/fieldset/p[@class="dateOffre"]', 1, method='xpath')
advert.publication_date = dateutil.parser.parse(date.text_content()).date()
break
if item.tag == 'a' and u'%s' % item.attrib['name'] == u'%s' % _id: @method
keep_next = True class get_job_advert(ItemElement):
klass = BaseJobAdvert
return advert obj_url = Format('%s#%s', Env('url'), Env('id'))
obj_description = Join('%s\r\n',
'div/fieldset/*[(@class="titreParagraphe" or @class="normal")]',
textCleaner=CleanHTML)
obj_title = CleanText('div/span[@class="intituleposte"]')
obj_job_name = CleanText('div/span[@class="intituleposte"]')
obj_society_name = Format('CCI %s', CleanText('div/span[@class="crci crcititle"]'))
obj_publication_date = DateTime(CleanText('div/fieldset/p[@class="dateOffre"]'), dayfirst=True)
def parse(self, el):
self.el = el.xpath("//a[@name='%s']/following-sibling::div[1]" % self.obj.id)[0]
self.env['url'] = self.page.url
self.env['id'] = self.obj.id