[pap] adapt to browser2

2014-10-09 15:53:42 +02:00 · 2014-10-09 15:53:42 +02:00 · f00c9233c3
commit f00c9233c3
parent c3c2ed404a
3 changed files with 144 additions and 145 deletions
--- a/modules/pap/browser.py
+++ b/modules/pap/browser.py
@@ -20,35 +20,37 @@

 import urllib

-from weboob.tools.json import json
-
-from weboob.deprecated.browser import Browser
+from weboob.browser import PagesBrowser, URL
 from weboob.capabilities.housing import Query

-from .pages import SearchResultsPage, HousingPage
+from .pages import SearchResultsPage, HousingPage, CitiesPage


 __all__ = ['PapBrowser']


-class PapBrowser(Browser):
-    PROTOCOL = 'http'
-    DOMAIN = 'www.pap.fr'
-    ENCODING = 'utf-8'
-    PAGES = {
-         'http://www.pap.fr/annonce/.*':  SearchResultsPage,
-         'http://www.pap.fr/annonces/.*': HousingPage,
-        }
+class PapBrowser(PagesBrowser):
+
+    BASEURL = 'http://www.pap.fr'
+    search_page = URL('annonce/.*', SearchResultsPage)
+    housing = URL('annonces/(?P<_id>.*)', HousingPage)
+    cities = URL('index/ac-geo2\?q=(?P<pattern>.*)', CitiesPage)

    def search_geo(self, pattern):
-        fp = self.openurl(self.buildurl('http://www.pap.fr/index/ac-geo', q=pattern.encode('utf-8')))
-        return json.load(fp)
+        return self.cities.open(pattern=pattern).iter_cities()

    TYPES = {Query.TYPE_RENT: 'location',
-             Query.TYPE_SALE: 'vente',
-            }
+             Query.TYPE_SALE: 'vente'}
+
+    RET = {Query.HOUSE_TYPES.HOUSE: 'maison',
+           Query.HOUSE_TYPES.APART: 'appartement',
+           Query.HOUSE_TYPES.LAND: 'terrain',
+           Query.HOUSE_TYPES.PARKING: 'garage-parking',
+           Query.HOUSE_TYPES.OTHER: 'divers'}
+
+    def search_housings(self, type, cities, nb_rooms, area_min, area_max, cost_min, cost_max, house_types):
+        self.session.headers.update({'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'})

-    def search_housings(self, type, cities, nb_rooms, area_min, area_max, cost_min, cost_max):
        data = {'geo_objets_ids': ','.join(cities),
                'surface[min]':   area_min or '',
                'surface[max]':   area_max or '',
@@ -57,21 +59,19 @@ class PapBrowser(Browser):
                'produit':        self.TYPES.get(type, 'location'),
                'recherche':      1,
                'nb_resultats_par_page': 40,
-                'submit':         'rechercher',
-                'typesbien[]':    'appartement',
-               }
+                }

        if nb_rooms:
            data['nb_pieces[min]'] = nb_rooms
            data['nb_pieces[max]'] = nb_rooms

-        self.location('/annonce/', urllib.urlencode(data))
-        assert self.is_on_page(SearchResultsPage)
+        ret = []
+        for house_type in house_types:
+            if house_type in self.RET:
+                ret.append(self.RET.get(house_type))

-        return self.page.iter_housings()
+        _data = '%s%s%s' % (urllib.urlencode(data), '&typesbien%5B%5D=', '&typesbien%5B%5D='.join(ret))
+        return self.search_page.go(data=_data).iter_housings()

-    def get_housing(self, housing):
-        self.location('/annonces/%s' % urllib.quote(housing))
-
-        assert self.is_on_page(HousingPage)
-        return self.page.get_housing()
+    def get_housing(self, _id, housing=None):
+        return self.housing.go(_id=_id).get_housing(obj=housing)
--- a/modules/pap/module.py
+++ b/modules/pap/module.py
@@ -18,8 +18,7 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.


-
-from weboob.capabilities.housing import CapHousing, City, Housing, HousingPhoto
+from weboob.capabilities.housing import CapHousing, Housing, HousingPhoto
 from weboob.tools.backend import Module

 from .browser import PapBrowser
@@ -38,41 +37,35 @@ class PapModule(Module, CapHousing):
    BROWSER = PapBrowser

    def search_housings(self, query):
-        cities = [c.id for c in query.cities if c.backend == self.name]
+        cities = ['%s' % c.id for c in query.cities if c.backend == self.name]
        if len(cities) == 0:
            return list()

-        with self.browser:
-            return self.browser.search_housings(query.type, cities, query.nb_rooms,
-                                                query.area_min, query.area_max,
-                                                query.cost_min, query.cost_max)
+        return self.browser.search_housings(query.type, cities, query.nb_rooms,
+                                            query.area_min, query.area_max,
+                                            query.cost_min, query.cost_max,
+                                            query.house_types)

    def get_housing(self, housing):
        if isinstance(housing, Housing):
            id = housing.id
        else:
            id = housing
+            housing = None

-        with self.browser:
-            return self.browser.get_housing(id)
+        return self.browser.get_housing(id, housing)

    def search_city(self, pattern):
-        with self.browser:
-            for city in self.browser.search_geo(pattern):
-                c = City(city['id'])
-                c.name = unicode(city['name'])
-                yield c
+        return self.browser.search_geo(pattern)

    def fill_housing(self, housing, fields):
-        with self.browser:
-            return self.browser.get_housing(housing.id)
+        return self.browser.get_housing(housing.id, housing)

    def fill_photo(self, photo, fields):
-        with self.browser:
-            if 'data' in fields and photo.url and not photo.data:
-                photo.data = self.browser.readurl(photo.url)
+        if 'data' in fields and photo.url and not photo.data:
+            photo.data = self.browser.readurl(photo.url)
        return photo

    OBJECTS = {Housing: fill_housing,
               HousingPhoto: fill_photo,
-              }
+               }
--- a/modules/pap/pages.py
+++ b/modules/pap/pages.py
@@ -18,105 +18,111 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.


-import re
 from decimal import Decimal
-from dateutil.parser import parse as parse_date

-from weboob.deprecated.browser import Page
+from weboob.tools.date import parse_french_date
+from weboob.browser.pages import HTMLPage, JsonPage, pagination
+from weboob.browser.elements import ItemElement, ListElement, method
+from weboob.browser.filters.standard import CleanText, CleanDecimal, Regexp, Env, BrowserURL, Format
+from weboob.browser.filters.html import Link, XPath, CleanHTML
+from weboob.browser.filters.json import Dict
 from weboob.capabilities.base import NotAvailable
-from weboob.capabilities.housing import Housing
+from weboob.capabilities.housing import Housing, City, HousingPhoto


-class SearchResultsPage(Page):
-    DATE_RE = re.compile('Annonce \w+ du (.*)')
-    MONTHS = {u'janvier':   'january',
-              u'février':   'february',
-              u'mars':      'march',
-              u'avril':     'april',
-              u'mai':       'may',
-              u'juin':      'june',
-              u'juillet':   'july',
-              u'août':      'august',
-              u'septembre': 'september',
-              u'octobre':   'october',
-              u'novembre':  'november',
-              u'décembre':  'december',
-             }
-
-    def iter_housings(self):
-        for div in self.document.getroot().cssselect('div.annonce-resume'):
-            a = div.cssselect('td.lien-annonce')[0].find('a')
-            if a is None:
-                # not a real announce.
-                continue
-
-            id = a.attrib['href'].split('-')[-1]
-            housing = Housing(id)
-            housing.title = a.text.strip()
-            m = re.match('(\w+) (.+) (\d+)\xa0m\xb2 (.*)', housing.title)
-            if m:
-                housing.area = Decimal(m.group(3))
-
-            housing.cost = Decimal(div.cssselect('td.prix')[0].text.strip(u' \t\u20ac\xa0€\n\r').replace('.', '').replace(',', '.'))
-            housing.currency = u'€'
-
-            m = self.DATE_RE.match(div.cssselect('p.date-publication')[0].text.strip())
-            if m:
-                date = m.group(1)
-                for fr, en in self.MONTHS.iteritems():
-                    date = date.replace(fr, en)
-                housing.date = parse_date(date)
-
-            metro = div.cssselect('p.metro')
-            if len(metro) > 0:
-                housing.station = unicode(metro[0].text.strip())
-            else:
-                housing.station = NotAvailable
-
-            p = div.cssselect('p.annonce-resume-texte')[0]
-            b = p.findall('b')
-            if len(b) > 0:
-                housing.text = b[0].tail.strip()
-                housing.location = unicode(b[0].text)
-            else:
-                housing.text = p.text.strip()
-
-            housing.photos = NotAvailable
-
-            yield housing
-
-
-class HousingPage(Page):
-    def get_housing(self):
-        div = self.parser.select(self.document.getroot(), 'div#annonce_detail', 1)
-        housing = Housing(self.url.split('-')[-1])
-
-        parts = div.find('h1').text.split(' - ')
-        housing.title = parts[0].strip()
-        housing.cost = Decimal(parts[1].strip(u' \t\u20ac\xa0€\n\r').replace('.', '').replace(',', '.'))
-        housing.currency = u'€'
-
-        m = re.match('(\w+) (.+) (\d+)\xa0m\xb2 (.*)', housing.title)
-        if m:
-            housing.area = Decimal(m.group(3))
-
-        housing.date = housing.station = housing.location = housing.phone = NotAvailable
-
-        metro = div.cssselect('p.metro')
-        if len(metro) > 0:
-            housing.station = metro[0].text.strip()
-
-        p = div.cssselect('p.annonce-detail-texte')[0]
-        b = p.findall('b')
-        if len(b) > 0:
-            housing.text = b[0].tail.strip()
-            housing.location = unicode(b[0].text)
-            if len(b) > 1:
-                housing.phone = b[1].text
+class DictElement(ListElement):
+    def find_elements(self):
+        if self.item_xpath is not None:
+            for el in self.el:
+                yield el
        else:
-            housing.text = p.text.strip()
+            yield self.el

-        housing.details = NotAvailable
-        housing.photos = NotAvailable

-        return housing
+class CitiesPage(JsonPage):
+    @method
+    class iter_cities(DictElement):
+        item_xpath = '.'
+
+        class item(ItemElement):
+            klass = City
+
+            obj_id = Dict('id')
+            obj_name = Dict('name')
+
+
+class SearchResultsPage(HTMLPage):
+    @pagination
+    @method
+    class iter_housings(ListElement):
+        item_xpath = '//li[@class="annonce"]'
+
+        def next_page(self):
+            return Link('//ul[@class="pagination"]/li[@class="next"]/a')(self)
+
+        class item(ItemElement):
+            klass = Housing
+
+            obj_id = Regexp(Link('./div[@class="header-annonce"]/a'), '/annonces/(.*)')
+            obj_title = CleanText('./div[@class="header-annonce"]/a')
+            obj_area = CleanDecimal(Regexp(CleanText('./div[@class="header-annonce"]/a/span[@class="desc"]'),
+                                           '(.*?)(\d*) m\xb2(.*?)', '\\2'), default=NotAvailable)
+            obj_cost = CleanDecimal(CleanText('./div[@class="header-annonce"]/a/span[@class="prix"]'),
+                                    replace_dots=(',', '.'), default=Decimal(0))
+            obj_currency = Regexp(CleanText('./div[@class="header-annonce"]/a/span[@class="prix"]'),
+                                  '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€')
+
+            def obj_date(self):
+                _date = CleanText('./div[@class="header-annonce"]/span[@class="date"]')(self)
+                return parse_french_date(_date)
+
+            obj_station = CleanText('./div/div/div[@class="metro"]', default=NotAvailable)
+            obj_location = CleanText('./div[@class="clearfix"]/div/a/span/img/@alt')
+            obj_text = CleanText('./div[@class="clearfix"]/div[@class="description clearfix"]/p')
+
+            def obj_photos(self):
+                photos = []
+                for img in XPath('//div[@class="vignette-annonce"]/a/span/img/@src')(self):
+                    photos.append(HousingPhoto(u'%s' % img))
+                return photos
+
+
+class HousingPage(HTMLPage):
+    @method
+    class get_housing(ItemElement):
+        klass = Housing
+
+        obj_id = Env('_id')
+        obj_title = CleanText('//h1[@class="desc clearfix"]/span[@class="title"]')
+        obj_cost = CleanDecimal('//h1[@class="desc clearfix"]/span[@class="prix"]')
+        obj_currency = Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="prix"]'),
+                              '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€')
+        obj_area = CleanDecimal(Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="title"]'),
+                                '(.*?)(\d*) m\xb2(.*?)', '\\2'), default=NotAvailable)
+        obj_location = CleanText('//div[@class="text-annonce"]/h2')
+        obj_text = CleanHTML('//div[@class="text-annonce"]/p')
+        obj_station = CleanText('//div[@class="metro"]')
+        obj_phone = CleanText('//span[@class="telephone hide-tel"]')
+        obj_url = BrowserURL('housing', _id=Env('_id'))
+
+        def obj_details(self):
+            details = dict()
+            for item in XPath('//div[@class="footer-descriptif"]/ul/li')(self):
+                key = CleanText('./span[@class="label"]')(item)
+                value = CleanText('.', replace=[(key, '')])(item)
+                if value and key:
+                    details[key] = value
+
+            key = CleanText('//div[@class="classe-energie-content"]/div/div/span')(self)
+            value = Format('%s(%s)', CleanText('//div[@class="classe-energie-content"]/div/div/p'),
+                           CleanText('//div[@class="classe-energie-content"]/div/@class',
+                                     replace=[('-', ' ')]))(self)
+            if value and key:
+                details[key] = value
+            return details
+
+        def obj_photos(self):
+            photos = []
+            for img in XPath('//div[@class="showcase-thumbnail"]/img/@src')(self):
+                photos.append(HousingPhoto(u'%s' % img))
+            return photos