[monster] fix and adapt to browser2

2014-10-20 15:23:38 +02:00 · 2014-10-20 15:23:38 +02:00 · 549551a629
commit 549551a629
parent 015626b87e
4 changed files with 81 additions and 178 deletions
--- a/modules/monster/pages.py
+++ b/modules/monster/pages.py
@ -18,109 +18,64 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.


-from weboob.deprecated.browser import Page
-from weboob.tools.html import html2text
 import re
 from datetime import datetime, time, timedelta
-from .job import MonsterJobAdvert
+
+from weboob.browser.pages import HTMLPage, pagination
+from weboob.browser.elements import ItemElement, ListElement, method
+from weboob.browser.filters.standard import CleanText, Regexp, Filter, Env, BrowserURL, Join
+from weboob.browser.filters.html import Link, CleanHTML
+from weboob.capabilities.job import BaseJobAdvert
+from weboob.capabilities.base import NotAvailable


-class SearchPage(Page):
-    def iter_job_adverts(self):
-        re_id = re.compile('http://offre-emploi.monster.fr/(.*?).aspx', re.DOTALL)
-        trs = self.document.getroot().xpath("//table[@class='listingsTable']/tbody/tr")
-        for tr in trs:
-            if 'class' in tr.attrib and tr.attrib['class'] != 'aceHidden':
-                a = self.parser.select(tr, 'td/div/div[@class="jobTitleContainer"]/a', 1, method='xpath')
-                _id = u'%s' % re_id.search(a.attrib['href']).group(1)
-                advert = MonsterJobAdvert(_id)
-                advert.society_name = u'%s' % self.parser.select(tr, 'td/div/div[@class="companyContainer"]/div/a',
-                                                                 1, method='xpath').attrib['title']
-                advert.title = u'%s' % a.text
-
-                date = self.parser.select(tr, 'td/div/div[@class="fnt20"]', 1, method='xpath').text_content().strip()
-                now = datetime.now()
-                number = re.search("\d+", date)
-                if number:
-                    if 'heures' in date:
-                        date = now - timedelta(hours=int(number.group(0)))
-                        advert.publication_date = datetime.combine(date, time())
-                    elif 'jour' in date:
-                        date = now - timedelta(days=int(number.group(0)))
-                        advert.publication_date = datetime.combine(date, time())
-                else:
-                    advert.publication_date = datetime.combine(now, time.min)
-
-                place = self.parser.select(tr, 'td/div/div[@class="jobLocationSingleLine"]/a', method='xpath')
-                if len(place) != 0:
-                    advert.place = u'%s' % place[0].attrib['title']
-
-                yield advert
-
-
-class AdvertPage(Page):
-    def get_job_advert(self, url, advert):
-        re_id = re.compile('http://offre-emploi.monster.fr/(.*?).aspx', re.DOTALL)
-        if advert is None:
-            _id = u'%s' % re_id.search(url).group(1)
-            advert = MonsterJobAdvert(_id)
-
-        advert.url = url
-
-        div_normal = self.document.getroot().xpath('//div[@id="jobcopy"]')
-        div_special = self.document.getroot().xpath('//div[@id="divtxt"]')
-        if len(div_normal) > 0:
-            return self.fill_normal_advert(advert, div_normal[0])
-
-        elif len(div_special) > 0:
-            return self.fill_special_advert(advert, div_special[0])
-
+class MonsterDate(Filter):
+    """Convert a relative "posted ... ago" label into a publication date.
+
+    The label is searched for its first run of digits and for a unit keyword
+    ('heure' or 'jour'); the matching amount of time is subtracted from the
+    current local time and the result is truncated to midnight.
+    """
+
+    def filter(self, date):
+        """Return the publication date described by the label ``date``.
+
+        :param date: text of the label, e.g. one containing "heures" or "jour"
+        :returns: a ``datetime`` at midnight of the computed day; today's
+                  midnight when the label holds no number; ``NotAvailable``
+                  when it holds a number with a unit that is not recognised
+        """
+        now = datetime.now()
+        number = re.search(r"\d+", date)
+        if number is None:
+            # No figure at all in the label: the advert is dated today.
+            return datetime.combine(now, time.min)
+
+        amount = int(number.group(0))
+        # 'heure' also matches the plural 'heures'. Testing only the plural
+        # made a singular label fall through and return None silently.
+        if 'heure' in date:
+            return datetime.combine(now - timedelta(hours=amount), time())
+        if 'jour' in date:
+            return datetime.combine(now - timedelta(days=amount), time())
+
+        # A figure with an unknown unit: say so explicitly rather than
+        # returning None implicitly.
+        # NOTE(review): NotAvailable is chosen as the "no value" marker;
+        # confirm it is what callers of publication_date expect.
+        return NotAvailable
-        else:
-            return advert

-    def fill_special_advert(self, advert, div):
-        advert.title = u'%s' % self.parser.select(div, 'div[@class="poste"]', 1, method='xpath').text
-        description = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
-        advert.description = html2text(self.parser.tostring(description))

-        titresmenuG = self.document.getroot().xpath('//div[@id="divmenuGauche"]')[0]
-        contract_type = self.parser.select(titresmenuG, '//span[@itemprop="employmentType"]', method='xpath')
-        if len(contract_type) != 0:
-            advert.contract_type = u'%s' % contract_type[0].text_content()
+class SearchPage(HTMLPage):
+    """Search result page listing job adverts in the ``listingsTable`` table."""
+
+    @pagination
+    @method
+    class iter_job_adverts(ListElement):
+        """Yield one advert per ``odd``/``even`` row of the listing table."""
+
+        item_xpath = '//table[@class="listingsTable"]/tbody/tr[@class="odd"] | //table[@class="listingsTable"]/tbody/tr[@class="even"]'

-        return self.fill_advert(advert, titresmenuG)
+        def next_page(self):
+            """Return the href of the link titled "Suivant", or None if absent."""
+            # NOTE(review): presumably consumed by @pagination to fetch the
+            # following page - confirm against the decorator.
+            return Link('//a[@title="Suivant"]', default=None)(self)

-    def fill_normal_advert(self, advert, div):
-        advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
-        description = self.parser.select(div, 'div[@id="jobBodyContent"]', 1, method='xpath')
-        advert.description = html2text(self.parser.tostring(description))
+        class item(ItemElement):
+            """Map one table row onto a BaseJobAdvert."""
+
+            klass = BaseJobAdvert

-        jobsummary = self.document.getroot().xpath('//div[@id="jobsummary_content"]')[0]
-        contract_type = self.parser.select(jobsummary, 'dl/dd[@class="multipleddlast"]/span', method='xpath')
-        if len(contract_type) != 0:
-            advert.contract_type = u'%s' % contract_type[0].text_content()
+            # The id is the part of the advert URL between the host and the
+            # ".aspx" suffix. The dots are escaped so they only match a
+            # literal '.'; unescaped, ".aspx" matched any character followed
+            # by "aspx" and could cut the captured id short.
+            obj_id = Regexp(Link('./td/div/div[@class="jobTitleContainer"]/a'),
+                            r'http://offre-emploi\.monster\.fr:80/(.*?)\.aspx')
+            obj_society_name = CleanText('./td/div/div[@class="companyContainer"]/div/a')
+            obj_title = CleanText('./td/div/div[@class="jobTitleContainer"]/a')
+            # Relative label text is turned into a datetime by MonsterDate.
+            obj_publication_date = MonsterDate(CleanText('./td/div/div[@class="fnt20"]'))
+            # The place is optional in a row, hence the NotAvailable default.
+            obj_place = CleanText('./td/div/div[@class="jobLocationSingleLine"]/a/@title', default=NotAvailable)

-        society_name = self.parser.select(jobsummary, '//span[@itemprop="name"]', method='xpath')
-        if len(society_name) != 0:
-            advert.society_name = u'%s' % society_name[0].text_content()

-        return self.fill_advert(advert, jobsummary)
+class AdvertPage(HTMLPage):
+    """Detail page of a single job advert."""
+
+    @method
+    class get_job_advert(ItemElement):
+        """Fill a BaseJobAdvert from the advert detail page."""
+
+        klass = BaseJobAdvert

-    def fill_advert(self, advert, jobsummary):
-        place = self.parser.select(jobsummary, '//span[@itemprop="jobLocation"]', method='xpath')
-        if len(place) != 0:
-            advert.place = u'%s' % place[0].text_content()
-
-        pay = self.parser.select(jobsummary, '//span[@itemprop="baseSalary"]', method='xpath')
-        if len(pay) != 0:
-            advert.pay = u'%s' % pay[0].text_content()
-
-        formation = self.parser.select(jobsummary, '//span[@itemprop="educationRequirements"]', method='xpath')
-        if len(formation) != 0:
-            advert.formation = u'%s' % formation[0].text_content()
-
-        experience = self.parser.select(jobsummary, '//span[@itemprop="qualifications"]', method='xpath')
-        if len(experience) != 0:
-            advert.experience = u'%s' % experience[0].text_content()
-
-        return advert
+        # The id is read from the '_id' environment value rather than parsed
+        # from the document.
+        obj_id = Env('_id')
+        # NOTE(review): presumably builds the URL from the browser's 'advert'
+        # URL pattern with the same id - confirm against the browser class.
+        obj_url = BrowserURL('advert', _id=Env('_id'))
+        obj_title = CleanText('//div[@id="jobcopy"]/h1[@itemprop="title"]')
+        obj_description = CleanHTML('//div[@id="jobBodyContent"]')
+        # Every <dd> whose class starts with "multipledd" contributes to the
+        # contract type; the pieces are combined with the '%s ' pattern.
+        obj_contract_type = Join('%s ', '//dd[starts-with(@class, "multipledd")]')
+        obj_society_name = CleanText('//dd[@itemprop="hiringOrganization"]')
+        # The remaining fields are read from itemprop-tagged <span> elements.
+        obj_place = CleanText('//span[@itemprop="jobLocation"]')
+        obj_pay = CleanText('//span[@itemprop="baseSalary"]')
+        obj_formation = CleanText('//span[@itemprop="educationRequirements"]')
+        obj_experience = CleanText('//span[@itemprop="qualifications"]')