[regionsjob] adapt to browser2

2014-04-08 00:08:39 +02:00 · 2014-04-08 00:08:39 +02:00 · ec07532a63
commit ec07532a63
parent 34a7481895
4 changed files with 81 additions and 178 deletions
--- a/modules/regionsjob/pages.py
+++ b/modules/regionsjob/pages.py
@@ -17,100 +17,59 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.

-from weboob.tools.misc import html2text
-from weboob.tools.browser import BasePage
-from .job import RegionsJobAdvert
-from datetime import datetime, date
-import re
+from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement
+from weboob.tools.browser2.filters import Link, CleanText, Regexp, Format, Env, DateGuesser, CleanHTML, DateTime
+from weboob.tools.date import LinearDateGuesser
+from weboob.capabilities.job import BaseJobAdvert

 __all__ = ['SearchPage']


-class SearchPage(BasePage):
-    def iter_job_adverts(self, website):
-        re_id = re.compile('(.*?)numoffre=(.*?)&de=consultation', re.DOTALL)
-        lis = self.document.getroot().xpath('//div[@id="liste_offres"]/ul/li')
-        for li in lis:
-            a = self.parser.select(li, 'div/span[@class="offres_poste"]/a', 1, method='xpath')
-            _id = u'%s|%s' % (website, re_id.search(a.attrib['href']).group(2))
-            advert = RegionsJobAdvert(_id)
-            advert.title = u'%s' % a.text
+class SearchPage(HTMLPage):
+    @method
+    class iter_job_adverts(ListElement):
+        item_xpath = '//div[@id="liste_offres"]/ul/li'

-            society_name = self.parser.select(li, 'div/span[@class="offres_entreprise"]/span/a',
-                                              method='xpath')
-            if len(society_name) > 0:
-                advert.society_name = u'%s' % society_name[0].text
+        class item(ItemElement):
+            klass = BaseJobAdvert

-            advert.place = u'%s' % self.parser.select(li, 'div/span[@class="offres_ville"]/span/span/span',
-                                                      1, method='xpath').text.strip()
-            _date = u'%s' % self.parser.select(li, 'div/span[@class="offres_date"]',
-                                               1, method='xpath').text_content()
-            year = date.today().year
-            splitted_date = _date.split('/')
-            advert.publication_date = datetime(year, int(splitted_date[1]), int(splitted_date[0]))
-            advert.contract_type = u'%s' % self.parser.select(li, 'div/span[@class="offres_poste"]/span',
-                                                              1, method='xpath').text
-            yield advert
+            obj_id = Format(u'%s#%s',
+                            Env('domain'),
+                            Regexp(Link('div/span[@class="offres_poste"]/a'), '.*?numoffre=(.*?)&de=consultation'))
+            obj_title = CleanText('div/span[@class="offres_poste"]/a')
+            obj_society_name = CleanText('div/span[@class="offres_entreprise"]/span/a')
+            obj_place = CleanText('div/span[@class="offres_ville"]/span/span/span')
+            obj_contract_type = CleanText('div/span[@class="offres_poste"]/span')
+            obj_publication_date = DateGuesser(CleanText('div/span[@class="offres_date"]'), LinearDateGuesser())


-class AdvertPage(BasePage):
-    def get_job_advert(self, url, advert):
-        re_id = re.compile('http://(.*?)/offre_emploi/detailoffre.aspx\?numoffre=(.*?)&de=consultation', re.DOTALL)
-        if advert is None:
-            _id = u'%s|%s' % (re_id.search(url).group(1), re_id.search(url).group(2))
-            advert = RegionsJobAdvert(_id)
+class AdvertPage(HTMLPage):
+    @method
+    class get_job_advert(ItemElement):
+        klass = BaseJobAdvert

-        advert.url = u'%s' % url
+        def parse(self, el):
+            if self.obj.id:
+                advert = self.obj
+                advert.url = self.page.url
+                advert.description = Format(u'%s\r\n%s',
+                                            CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'),
+                                            CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))(el)
+                advert.pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')(el)
+                raise SkipItem()

-        div = self.document.getroot().xpath('//div[@id="annonce"]')[0]
+            self.env['url'] = self.page.url

-        advert.title = u'%s' % self.parser.select(div, 'h1', 1, method='xpath').text
+        obj_description = Format(u'%s%s',
+                                 CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]'),
+                                 CleanHTML('//div[@id="annonce"]/p[@id="description_annonce"]/following-sibling::p[1]'))

-        content = self.parser.select(div, 'p', method='xpath')
-
-        next_is_date = False
-        next_is_pay = False
-        description = ''
-
-        for p in content:
-            if next_is_date:
-                m = re.match('(\d{2})\s(\d{2})\s(\d{4})', date)
-                if m:
-                    dd = int(m.group(1))
-                    mm = int(m.group(2))
-                    yyyy = int(m.group(3))
-                    advert.publication_date = datetime.date(yyyy, mm, dd)
-                next_is_date = False
-
-            elif next_is_pay:
-                advert.pay = html2text(self.parser.tostring(p))
-                next_is_pay = False
-
-            elif 'class' in p.attrib:
-                if p.attrib['class'] == 'contrat_loc':
-                    _p = self.parser.select(div, 'p[@class="contrat_loc"]', 1, method='xpath')
-                    content_p = _p.text_content().strip().split('\r\n')
-                    for el in content_p:
-                        splitted_el = el.split(':')
-                        if len(splitted_el) == 2:
-                            if splitted_el[0] == 'Entreprise':
-                                advert.society_name = splitted_el[1]
-                            elif splitted_el[0] == 'Contrat':
-                                advert.contract_type = splitted_el[1]
-                            elif splitted_el[0] == 'Localisation':
-                                advert.place = splitted_el[1]
-
-                elif p.attrib['class'] == 'date_ref':
-                    next_is_date = True
-
-                elif p.attrib['class'] == 'rubrique_annonce' and p.text == 'Salaire':
-                    next_is_pay = True
-
-                else:
-                    description = description + html2text(self.parser.tostring(p))
-            else:
-                description = description + html2text(self.parser.tostring(p))
-
-        advert.description = u'%s' % description
-
-        return advert
+        obj_id = Env('_id')
+        obj_url = Env('url')
+        obj_publication_date = DateTime(Regexp(CleanText('//div[@id="annonce"]/p[@class="date_ref"]'),
+                                               '(\d{2}/\d{2}/\d{4})'))
+        obj_title = CleanText('//div[@id="annonce"]/h1')
+        obj_society_name = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[1]')
+        obj_contract_type = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[2]')
+        obj_place = CleanText('//div[@id="annonce"]/p[@class="contrat_loc"]/strong[3]')
+        obj_pay = CleanText('//div[@id="annonce"]/p[@class="rubrique_annonce"]/following-sibling::p[1]')