From 2a860200b962763df5576ffbdfcb5b92fa20e2d5 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Tue, 14 Feb 2012 16:52:30 +0100 Subject: [PATCH] use mobile API to parse housing informations --- modules/seloger/browser.py | 11 ++---- modules/seloger/pages.py | 75 ++++++++++++++++++-------------------- 2 files changed, 39 insertions(+), 47 deletions(-) diff --git a/modules/seloger/browser.py b/modules/seloger/browser.py index b4bf3516..f6abb863 100644 --- a/modules/seloger/browser.py +++ b/modules/seloger/browser.py @@ -20,10 +20,9 @@ import json -from weboob.capabilities.base import NotAvailable from weboob.tools.browser import BaseBrowser -from .pages import SearchResultsPage, HousingPage, HousingPhotosPage +from .pages import SearchResultsPage, HousingPage __all__ = ['SeLogerBrowser'] @@ -36,8 +35,7 @@ class SeLogerBrowser(BaseBrowser): PAGES = { 'http://www.seloger.com/(pre)?recherche.htm.*': SearchResultsPage, 'http://www.seloger.com/annonces.htm.*': SearchResultsPage, - 'http://www.seloger.com/annonces/.*': HousingPage, - 'http://www.seloger.com/\d+/incl_detail_annonce_load_diapo_new.htm': HousingPhotosPage, + 'http://ws.seloger.com/annonceDetail.xml\?idAnnonce=(\d+)': HousingPage, } def search_geo(self, pattern): @@ -61,12 +59,9 @@ class SeLogerBrowser(BaseBrowser): return self.page.iter_housings() def get_housing(self, id, obj=None): - self.location('/%d/detail_new.htm' % int(id)) + self.location(self.buildurl('http://ws.seloger.com/annonceDetail.xml', idAnnonce=id)) assert self.is_on_page(HousingPage) housing = self.page.get_housing(obj) - self.location('/%d/incl_detail_annonce_load_diapo_new.htm' % int(id)) - housing.photos = list(self.page.iter_photos()) or NotAvailable - return housing diff --git a/modules/seloger/pages.py b/modules/seloger/pages.py index 5f7284b4..96eb5bd8 100644 --- a/modules/seloger/pages.py +++ b/modules/seloger/pages.py @@ -18,21 +18,21 @@ # along with weboob. If not, see . -import re from datetime import date +from dateutil.parser import parse as parse_date -from weboob.tools.browser import BasePage, BrokenPageError +from weboob.tools.browser import BasePage from weboob.capabilities.base import NotAvailable from weboob.capabilities.housing import Housing, HousingPhoto -__all__ = ['SearchResultsPage', 'HousingPage', 'HousingPhotosPage'] +__all__ = ['SearchResultsPage', 'HousingPage'] -def sanitarize_cost(t): - return int(float(t.strip(u' \t\u20ac\xa0c€\n\r').replace(u'\xa0', u'').replace(',', '.'))) - class SearchResultsPage(BasePage): + def sanitarize_cost(t): + return int(float(t.strip(u' \t\u20ac\xa0c€\n\r').replace(u'\xa0', u'').replace(',', '.'))) + def iter_housings(self): for div in self.document.getroot().cssselect('div.ann_ann_border'): id = div.find('a').attrib['id'][3:] @@ -42,11 +42,11 @@ class SearchResultsPage(BasePage): housing.title = head.xpath('.//span[@class="mea1"]/a')[0].text.strip() parts = head.xpath('.//span[@class="mea2"]')[0].text.strip().split('+') - housing.cost = sanitarize_cost(parts[0]) + housing.cost = self.sanitarize_cost(parts[0]) if len(parts) > 1: for span in head.xpath('.//span[@class="addprixfr"]/span/strong'): if span.text.strip() == u'Charges\xa0:': - housing.cost += sanitarize_cost(span.tail) + housing.cost += self.sanitarize_cost(span.tail) housing.currency = u'€' sub = div.xpath('.//div[@class="rech_desc_right_photo"]')[0] @@ -63,45 +63,42 @@ class SearchResultsPage(BasePage): class HousingPage(BasePage): def get_housing(self, housing=None): - doc = self.document.getroot() if housing is None: - housing = Housing(self.url.split('/')[-1].rstrip('.htm')) + housing = Housing(self.groups[0]) - housing.title = doc.xpath('//head/title')[0].text - housing.text = doc.xpath('//head/meta[@name="description"]')[0].attrib['content'] - txt = doc.xpath('//meta[@itemprop="price"]')[0].attrib['content'].strip() - m = re.match(u'(\d+)\xa0\u20ac(\+ch|cc)(Charges: (\d+)\u20ac)?', txt) - if not m: - raise BrokenPageError('Unable to parse price %r' % txt) + details = self.document.getroot().xpath('//detailannonce')[0] + if details.find('titre') is None: + return None - housing.cost = sanitarize_cost(m.group(1)) - if m.group(4): - housing.cost += sanitarize_cost(m.group(4)) + housing.title = details.find('titre').text + housing.text = details.find('descriptif').text.strip() + housing.cost = int(details.find('prix').text) housing.currency = u'€' - - housing.date = date(*map(int, reversed(doc.xpath('//span[@class="maj"]/b')[0].text.split(' / ')))) - housing.area = int(doc.xpath('//li[@title="Surface"]/b')[0].text.strip(u'\xa0m\xb2')) + housing.date = parse_date(details.find('dtfraicheur').text) + housing.area = float(details.find('surface').text) + housing.phone = details.find('contact').find('telephone').text try: - housing.station = doc.xpath('//dd[@class="metro_paris"]')[0].text.strip() - except IndexError: + housing.station = details.find('proximite').text + except AttributeError: housing.station = NotAvailable - try: - housing.phone = doc.xpath('//div[@class="tel"]')[0].text.strip() - except IndexError: - housing.phone = NotAvailable - - try: - housing.location = doc.xpath('//div[@class="adresse"]/b')[0].tail.strip().replace('\r\n', ' ') - except IndexError: + housing.location = details.find('adresse').text + if not housing.location and details.find('quartier') is not None: + housing.location = details.find('quartier').text + if not housing.location: housing.location = NotAvailable - return housing + housing.photos = [] + for photo in details.xpath('./photos/photo'): + if photo.find('bigurl').text: + url = photo.find('bigurl').text + else: + url = photo.find('stdurl').text + housing.photos.append(HousingPhoto(url)) -class HousingPhotosPage(BasePage): - def iter_photos(self): - for i, li in enumerate(self.document.getroot().xpath('//li')): - photo = HousingPhoto(i) - photo.url = li.attrib['rel'].split('|')[0] - yield photo + housing.details = {} + for detail in details.xpath('./details/detail'): + housing.details[detail.find('libelle').text.strip()] = detail.find('valeur').text or 'N/A' + + return housing