diff --git a/modules/pap/browser.py b/modules/pap/browser.py index 0816be73..8f1c3edb 100644 --- a/modules/pap/browser.py +++ b/modules/pap/browser.py @@ -20,35 +20,37 @@ import urllib -from weboob.tools.json import json - -from weboob.deprecated.browser import Browser +from weboob.browser import PagesBrowser, URL from weboob.capabilities.housing import Query -from .pages import SearchResultsPage, HousingPage +from .pages import SearchResultsPage, HousingPage, CitiesPage __all__ = ['PapBrowser'] -class PapBrowser(Browser): - PROTOCOL = 'http' - DOMAIN = 'www.pap.fr' - ENCODING = 'utf-8' - PAGES = { - 'http://www.pap.fr/annonce/.*': SearchResultsPage, - 'http://www.pap.fr/annonces/.*': HousingPage, - } +class PapBrowser(PagesBrowser): + + BASEURL = 'http://www.pap.fr' + search_page = URL('annonce/.*', SearchResultsPage) + housing = URL('annonces/(?P<_id>.*)', HousingPage) + cities = URL('index/ac-geo2\?q=(?P.*)', CitiesPage) def search_geo(self, pattern): - fp = self.openurl(self.buildurl('http://www.pap.fr/index/ac-geo', q=pattern.encode('utf-8'))) - return json.load(fp) + return self.cities.open(pattern=pattern).iter_cities() TYPES = {Query.TYPE_RENT: 'location', - Query.TYPE_SALE: 'vente', - } + Query.TYPE_SALE: 'vente'} + + RET = {Query.HOUSE_TYPES.HOUSE: 'maison', + Query.HOUSE_TYPES.APART: 'appartement', + Query.HOUSE_TYPES.LAND: 'terrain', + Query.HOUSE_TYPES.PARKING: 'garage-parking', + Query.HOUSE_TYPES.OTHER: 'divers'} + + def search_housings(self, type, cities, nb_rooms, area_min, area_max, cost_min, cost_max, house_types): + self.session.headers.update({'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'}) - def search_housings(self, type, cities, nb_rooms, area_min, area_max, cost_min, cost_max): data = {'geo_objets_ids': ','.join(cities), 'surface[min]': area_min or '', 'surface[max]': area_max or '', @@ -57,21 +59,19 @@ class PapBrowser(Browser): 'produit': self.TYPES.get(type, 'location'), 'recherche': 1, 'nb_resultats_par_page': 40, - 
'submit': 'rechercher', - 'typesbien[]': 'appartement', - } + } if nb_rooms: data['nb_pieces[min]'] = nb_rooms data['nb_pieces[max]'] = nb_rooms - self.location('/annonce/', urllib.urlencode(data)) - assert self.is_on_page(SearchResultsPage) + ret = [] + for house_type in house_types: + if house_type in self.RET: + ret.append(self.RET.get(house_type)) - return self.page.iter_housings() + _data = '%s%s%s' % (urllib.urlencode(data), '&typesbien%5B%5D=', '&typesbien%5B%5D='.join(ret)) + return self.search_page.go(data=_data).iter_housings() - def get_housing(self, housing): - self.location('/annonces/%s' % urllib.quote(housing)) - - assert self.is_on_page(HousingPage) - return self.page.get_housing() + def get_housing(self, _id, housing=None): + return self.housing.go(_id=_id).get_housing(obj=housing) diff --git a/modules/pap/module.py b/modules/pap/module.py index a8057671..0c422eda 100644 --- a/modules/pap/module.py +++ b/modules/pap/module.py @@ -18,8 +18,7 @@ # along with weboob. If not, see . 
- -from weboob.capabilities.housing import CapHousing, City, Housing, HousingPhoto +from weboob.capabilities.housing import CapHousing, Housing, HousingPhoto from weboob.tools.backend import Module from .browser import PapBrowser @@ -38,41 +37,35 @@ class PapModule(Module, CapHousing): BROWSER = PapBrowser def search_housings(self, query): - cities = [c.id for c in query.cities if c.backend == self.name] + cities = ['%s' % c.id for c in query.cities if c.backend == self.name] if len(cities) == 0: return list() - with self.browser: - return self.browser.search_housings(query.type, cities, query.nb_rooms, - query.area_min, query.area_max, - query.cost_min, query.cost_max) + return self.browser.search_housings(query.type, cities, query.nb_rooms, + query.area_min, query.area_max, + query.cost_min, query.cost_max, + query.house_types) def get_housing(self, housing): if isinstance(housing, Housing): id = housing.id else: id = housing + housing = None - with self.browser: - return self.browser.get_housing(id) + return self.browser.get_housing(id, housing) def search_city(self, pattern): - with self.browser: - for city in self.browser.search_geo(pattern): - c = City(city['id']) - c.name = unicode(city['name']) - yield c + return self.browser.search_geo(pattern) def fill_housing(self, housing, fields): - with self.browser: - return self.browser.get_housing(housing.id) + return self.browser.get_housing(housing.id, housing) def fill_photo(self, photo, fields): - with self.browser: - if 'data' in fields and photo.url and not photo.data: - photo.data = self.browser.readurl(photo.url) + if 'data' in fields and photo.url and not photo.data: + photo.data = self.browser.readurl(photo.url) return photo OBJECTS = {Housing: fill_housing, HousingPhoto: fill_photo, - } + } diff --git a/modules/pap/pages.py b/modules/pap/pages.py index 30d744c2..03d6211e 100644 --- a/modules/pap/pages.py +++ b/modules/pap/pages.py @@ -18,105 +18,111 @@ # along with weboob. If not, see . 
-import re from decimal import Decimal -from dateutil.parser import parse as parse_date -from weboob.deprecated.browser import Page +from weboob.tools.date import parse_french_date +from weboob.browser.pages import HTMLPage, JsonPage, pagination +from weboob.browser.elements import ItemElement, ListElement, method +from weboob.browser.filters.standard import CleanText, CleanDecimal, Regexp, Env, BrowserURL, Format +from weboob.browser.filters.html import Link, XPath, CleanHTML +from weboob.browser.filters.json import Dict from weboob.capabilities.base import NotAvailable -from weboob.capabilities.housing import Housing +from weboob.capabilities.housing import Housing, City, HousingPhoto -class SearchResultsPage(Page): - DATE_RE = re.compile('Annonce \w+ du (.*)') - MONTHS = {u'janvier': 'january', - u'février': 'february', - u'mars': 'march', - u'avril': 'april', - u'mai': 'may', - u'juin': 'june', - u'juillet': 'july', - u'août': 'august', - u'septembre': 'september', - u'octobre': 'october', - u'novembre': 'november', - u'décembre': 'december', - } - - def iter_housings(self): - for div in self.document.getroot().cssselect('div.annonce-resume'): - a = div.cssselect('td.lien-annonce')[0].find('a') - if a is None: - # not a real announce. 
class DictElement(ListElement):
    """List element that walks ``self.el`` directly.

    When ``item_xpath`` is set, every member obtained by iterating
    ``self.el`` is yielded as an item; otherwise ``self.el`` itself is the
    only item.  ``item_xpath`` is only compared against ``None`` here, it is
    never evaluated as a path.
    """

    def find_elements(self):
        if self.item_xpath is not None:
            for el in self.el:
                yield el
        else:
            yield self.el


class CitiesPage(JsonPage):
    """JSON answer of the city lookup."""

    @method
    class iter_cities(DictElement):
        """Build one ``City`` per entry of the document."""

        # Any non-None value makes DictElement iterate over the document.
        item_xpath = '.'

        class item(ItemElement):
            klass = City

            obj_id = Dict('id')
            obj_name = Dict('name')


class SearchResultsPage(HTMLPage):
    """Listing of search results."""

    @pagination
    @method
    class iter_housings(ListElement):
        """Build one ``Housing`` per ``<li class="annonce">`` node."""

        item_xpath = '//li[@class="annonce"]'

        def next_page(self):
            """Return the link carried by the pagination's "next" anchor."""
            return Link('//ul[@class="pagination"]/li[@class="next"]/a')(self)

        class item(ItemElement):
            klass = Housing

            obj_id = Regexp(Link('./div[@class="header-annonce"]/a'), '/annonces/(.*)')
            obj_title = CleanText('./div[@class="header-annonce"]/a')
            obj_area = CleanDecimal(Regexp(CleanText('./div[@class="header-annonce"]/a/span[@class="desc"]'),
                                           '(.*?)(\d*) m\xb2(.*?)', '\\2'), default=NotAvailable)
            obj_cost = CleanDecimal(CleanText('./div[@class="header-annonce"]/a/span[@class="prix"]'),
                                    replace_dots=(',', '.'), default=Decimal(0))
            obj_currency = Regexp(CleanText('./div[@class="header-annonce"]/a/span[@class="prix"]'),
                                  '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€')

            def obj_date(self):
                """Parse the text of the header's ``span.date`` as a French date."""
                _date = CleanText('./div[@class="header-annonce"]/span[@class="date"]')(self)
                return parse_french_date(_date)

            # Was '[@cladd=metro]': a misspelt attribute compared with a
            # child node set, which can never match.  HousingPage selects
            # the same information with @class="metro".
            obj_station = CleanText('./div/div/div[@class="metro"]', default=NotAvailable)
            obj_location = CleanText('./div[@class="clearfix"]/div/a/span/img/@alt')
            obj_text = CleanText('./div[@class="clearfix"]/div[@class="description clearfix"]/p')

            def obj_photos(self):
                """Return a ``HousingPhoto`` for each thumbnail of this item.

                The path is relative ('.//'): an absolute '//' path is
                resolved from the document root and would attach the
                thumbnails of every listing on the page to each housing.
                """
                sources = XPath('.//div[@class="vignette-annonce"]/a/span/img/@src')(self)
                return [HousingPhoto(u'%s' % src) for src in sources]


class HousingPage(HTMLPage):
    """Detail page of a single housing."""

    @method
    class get_housing(ItemElement):
        """Build a ``Housing`` from the whole page.

        The id and the url are taken from the ``_id`` environment value.
        """

        klass = Housing

        obj_id = Env('_id')
        obj_title = CleanText('//h1[@class="desc clearfix"]/span[@class="title"]')
        obj_cost = CleanDecimal('//h1[@class="desc clearfix"]/span[@class="prix"]')
        obj_currency = Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="prix"]'),
                              '.*([%s%s%s])' % (u'€', u'$', u'£'), default=u'€')
        obj_area = CleanDecimal(Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="title"]'),
                                       '(.*?)(\d*) m\xb2(.*?)', '\\2'), default=NotAvailable)
        obj_location = CleanText('//div[@class="text-annonce"]/h2')
        obj_text = CleanHTML('//div[@class="text-annonce"]/p')
        obj_station = CleanText('//div[@class="metro"]')
        obj_phone = CleanText('//span[@class="telephone hide-tel"]')
        obj_url = BrowserURL('housing', _id=Env('_id'))

        def obj_details(self):
            """Collect label/value pairs into a dict.

            One entry is added per ``<li>`` of the "footer-descriptif"
            block: the key is the text of its ``span.label`` and the value
            is the text of the ``<li>`` with that label removed.  A further
            entry is built from the "classe-energie-content" block.
            Entries whose key or value is empty are dropped.
            """
            details = dict()
            for item in XPath('//div[@class="footer-descriptif"]/ul/li')(self):
                key = CleanText('./span[@class="label"]')(item)
                value = CleanText('.', replace=[(key, '')])(item)
                if value and key:
                    details[key] = value

            key = CleanText('//div[@class="classe-energie-content"]/div/div/span')(self)
            value = Format('%s(%s)', CleanText('//div[@class="classe-energie-content"]/div/div/p'),
                           CleanText('//div[@class="classe-energie-content"]/div/@class',
                                     replace=[('-', ' ')]))(self)
            if value and key:
                details[key] = value
            return details

        def obj_photos(self):
            """Return a ``HousingPhoto`` for each showcase thumbnail of the page."""
            sources = XPath('//div[@class="showcase-thumbnail"]/img/@src')(self)
            return [HousingPhoto(u'%s' % src) for src in sources]