[pap] adapt to browser2

This commit is contained in:
Bezleputh 2014-10-09 15:53:42 +02:00
commit f00c9233c3
3 changed files with 144 additions and 145 deletions

View file

@ -20,35 +20,37 @@
import urllib
from weboob.tools.json import json
from weboob.deprecated.browser import Browser
from weboob.browser import PagesBrowser, URL
from weboob.capabilities.housing import Query
from .pages import SearchResultsPage, HousingPage
from .pages import SearchResultsPage, HousingPage, CitiesPage
__all__ = ['PapBrowser']
class PapBrowser(Browser):
PROTOCOL = 'http'
DOMAIN = 'www.pap.fr'
ENCODING = 'utf-8'
PAGES = {
'http://www.pap.fr/annonce/.*': SearchResultsPage,
'http://www.pap.fr/annonces/.*': HousingPage,
}
class PapBrowser(PagesBrowser):
BASEURL = 'http://www.pap.fr'
search_page = URL('annonce/.*', SearchResultsPage)
housing = URL('annonces/(?P<_id>.*)', HousingPage)
cities = URL('index/ac-geo2\?q=(?P<pattern>.*)', CitiesPage)
def search_geo(self, pattern):
fp = self.openurl(self.buildurl('http://www.pap.fr/index/ac-geo', q=pattern.encode('utf-8')))
return json.load(fp)
return self.cities.open(pattern=pattern).iter_cities()
TYPES = {Query.TYPE_RENT: 'location',
Query.TYPE_SALE: 'vente',
}
Query.TYPE_SALE: 'vente'}
RET = {Query.HOUSE_TYPES.HOUSE: 'maison',
Query.HOUSE_TYPES.APART: 'appartement',
Query.HOUSE_TYPES.LAND: 'terrain',
Query.HOUSE_TYPES.PARKING: 'garage-parking',
Query.HOUSE_TYPES.OTHER: 'divers'}
def search_housings(self, type, cities, nb_rooms, area_min, area_max, cost_min, cost_max, house_types):
self.session.headers.update({'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'})
def search_housings(self, type, cities, nb_rooms, area_min, area_max, cost_min, cost_max):
data = {'geo_objets_ids': ','.join(cities),
'surface[min]': area_min or '',
'surface[max]': area_max or '',
@ -57,21 +59,19 @@ class PapBrowser(Browser):
'produit': self.TYPES.get(type, 'location'),
'recherche': 1,
'nb_resultats_par_page': 40,
'submit': 'rechercher',
'typesbien[]': 'appartement',
}
}
if nb_rooms:
data['nb_pieces[min]'] = nb_rooms
data['nb_pieces[max]'] = nb_rooms
self.location('/annonce/', urllib.urlencode(data))
assert self.is_on_page(SearchResultsPage)
ret = []
for house_type in house_types:
if house_type in self.RET:
ret.append(self.RET.get(house_type))
return self.page.iter_housings()
_data = '%s%s%s' % (urllib.urlencode(data), '&typesbien%5B%5D=', '&typesbien%5B%5D='.join(ret))
return self.search_page.go(data=_data).iter_housings()
def get_housing(self, housing):
self.location('/annonces/%s' % urllib.quote(housing))
assert self.is_on_page(HousingPage)
return self.page.get_housing()
def get_housing(self, _id, housing=None):
return self.housing.go(_id=_id).get_housing(obj=housing)

View file

@ -18,8 +18,7 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.housing import CapHousing, City, Housing, HousingPhoto
from weboob.capabilities.housing import CapHousing, Housing, HousingPhoto
from weboob.tools.backend import Module
from .browser import PapBrowser
@ -38,41 +37,35 @@ class PapModule(Module, CapHousing):
BROWSER = PapBrowser
def search_housings(self, query):
cities = [c.id for c in query.cities if c.backend == self.name]
cities = ['%s' % c.id for c in query.cities if c.backend == self.name]
if len(cities) == 0:
return list()
with self.browser:
return self.browser.search_housings(query.type, cities, query.nb_rooms,
query.area_min, query.area_max,
query.cost_min, query.cost_max)
return self.browser.search_housings(query.type, cities, query.nb_rooms,
query.area_min, query.area_max,
query.cost_min, query.cost_max,
query.house_types)
def get_housing(self, housing):
if isinstance(housing, Housing):
id = housing.id
else:
id = housing
housing = None
with self.browser:
return self.browser.get_housing(id)
return self.browser.get_housing(id, housing)
def search_city(self, pattern):
with self.browser:
for city in self.browser.search_geo(pattern):
c = City(city['id'])
c.name = unicode(city['name'])
yield c
return self.browser.search_geo(pattern)
def fill_housing(self, housing, fields):
with self.browser:
return self.browser.get_housing(housing.id)
return self.browser.get_housing(housing.id, housing)
def fill_photo(self, photo, fields):
with self.browser:
if 'data' in fields and photo.url and not photo.data:
photo.data = self.browser.readurl(photo.url)
if 'data' in fields and photo.url and not photo.data:
photo.data = self.browser.readurl(photo.url)
return photo
OBJECTS = {Housing: fill_housing,
HousingPhoto: fill_photo,
}
}

View file

@ -18,105 +18,111 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
from decimal import Decimal
from dateutil.parser import parse as parse_date
from weboob.deprecated.browser import Page
from weboob.tools.date import parse_french_date
from weboob.browser.pages import HTMLPage, JsonPage, pagination
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.standard import CleanText, CleanDecimal, Regexp, Env, BrowserURL, Format
from weboob.browser.filters.html import Link, XPath, CleanHTML
from weboob.browser.filters.json import Dict
from weboob.capabilities.base import NotAvailable
from weboob.capabilities.housing import Housing
from weboob.capabilities.housing import Housing, City, HousingPhoto
class SearchResultsPage(Page):
    """Search-results page parsed with the deprecated ``Page`` API.

    Every ``div.annonce-resume`` element of the document that contains a
    link is converted into a :class:`Housing` by :meth:`iter_housings`.
    """

    # Captures the date that follows "Annonce <word> du " in the publication line.
    DATE_RE = re.compile('Annonce \w+ du (.*)')

    # French month names and the English names substituted for them before
    # the date string is handed to ``parse_date``.
    MONTHS = {u'janvier': 'january',
              u'février': 'february',
              u'mars': 'march',
              u'avril': 'april',
              u'mai': 'may',
              u'juin': 'june',
              u'juillet': 'july',
              u'août': 'august',
              u'septembre': 'september',
              u'octobre': 'october',
              u'novembre': 'november',
              u'décembre': 'december',
              }

    def iter_housings(self):
        """Yield a :class:`Housing` for each advert block of the page.

        Blocks whose ``td.lien-annonce`` cell holds no ``<a>`` element are
        skipped. ``area`` and ``date`` are only set when their regular
        expression matches; ``photos`` is always ``NotAvailable``.
        """
        for div in self.document.getroot().cssselect('div.annonce-resume'):
            a = div.cssselect('td.lien-annonce')[0].find('a')
            if a is None:
                # not a real announce.
                continue

            # The identifier is the last dash-separated token of the link.
            housing_id = a.attrib['href'].split('-')[-1]
            housing = Housing(housing_id)
            housing.title = a.text.strip()

            m = re.match('(\w+) (.+) (\d+)\xa0m\xb2 (.*)', housing.title)
            if m:
                housing.area = Decimal(m.group(3))

            # Dots are dropped and the comma becomes the decimal point.
            raw_cost = div.cssselect('td.prix')[0].text.strip(u' \t\u20ac\xa0\n\r')
            housing.cost = Decimal(raw_cost.replace('.', '').replace(',', '.'))
            # The euro sign is stripped from the price just above, so the
            # currency is the euro (the literal had lost its character).
            housing.currency = u'\u20ac'

            m = self.DATE_RE.match(div.cssselect('p.date-publication')[0].text.strip())
            if m:
                date = m.group(1)
                for fr, en in self.MONTHS.iteritems():
                    date = date.replace(fr, en)
                housing.date = parse_date(date)

            metro = div.cssselect('p.metro')
            if len(metro) > 0:
                housing.station = unicode(metro[0].text.strip())
            else:
                housing.station = NotAvailable

            p = div.cssselect('p.annonce-resume-texte')[0]
            b = p.findall('b')
            if len(b) > 0:
                # The first <b> holds the location; the text following it is
                # the description.
                housing.text = b[0].tail.strip()
                housing.location = unicode(b[0].text)
            else:
                housing.text = p.text.strip()

            housing.photos = NotAvailable
            yield housing
class HousingPage(Page):
def get_housing(self):
div = self.parser.select(self.document.getroot(), 'div#annonce_detail', 1)
housing = Housing(self.url.split('-')[-1])
parts = div.find('h1').text.split(' - ')
housing.title = parts[0].strip()
housing.cost = Decimal(parts[1].strip(u' \t\u20ac\xa0\n\r').replace('.', '').replace(',', '.'))
housing.currency = u''
m = re.match('(\w+) (.+) (\d+)\xa0m\xb2 (.*)', housing.title)
if m:
housing.area = Decimal(m.group(3))
housing.date = housing.station = housing.location = housing.phone = NotAvailable
metro = div.cssselect('p.metro')
if len(metro) > 0:
housing.station = metro[0].text.strip()
p = div.cssselect('p.annonce-detail-texte')[0]
b = p.findall('b')
if len(b) > 0:
housing.text = b[0].tail.strip()
housing.location = unicode(b[0].text)
if len(b) > 1:
housing.phone = b[1].text
class DictElement(ListElement):
def find_elements(self):
if self.item_xpath is not None:
for el in self.el:
yield el
else:
housing.text = p.text.strip()
yield self.el
housing.details = NotAvailable
housing.photos = NotAvailable
return housing
class CitiesPage(JsonPage):
    """JSON page from which :class:`City` objects are built."""

    @method
    class iter_cities(DictElement):
        """Produce one :class:`City` per element found by ``DictElement``."""

        item_xpath = '.'

        class item(ItemElement):
            """Map the ``id`` and ``name`` keys of an entry onto a City."""

            klass = City

            obj_id = Dict('id')
            obj_name = Dict('name')
class SearchResultsPage(HTMLPage):
    """HTML search-results page; adverts are the ``<li class="annonce">`` nodes."""

    @pagination
    @method
    class iter_housings(ListElement):
        """Iterate over the adverts of the page, following the "next" link."""

        item_xpath = '//li[@class="annonce"]'

        def next_page(self):
            """Return the target of the pagination "next" link."""
            return Link('//ul[@class="pagination"]/li[@class="next"]/a')(self)

        class item(ItemElement):
            """Build a :class:`Housing` from one advert node.

            All XPath expressions are relative to the advert node so that
            each housing only receives its own data.
            """

            klass = Housing

            obj_id = Regexp(Link('./div[@class="header-annonce"]/a'), '/annonces/(.*)')
            obj_title = CleanText('./div[@class="header-annonce"]/a')
            obj_area = CleanDecimal(Regexp(CleanText('./div[@class="header-annonce"]/a/span[@class="desc"]'),
                                           '(.*?)(\d*) m\xb2(.*?)', '\\2'), default=NotAvailable)
            obj_cost = CleanDecimal(CleanText('./div[@class="header-annonce"]/a/span[@class="prix"]'),
                                    replace_dots=(',', '.'), default=Decimal(0))
            # The euro sign had been lost from both the character class and
            # the default value; it is written as an escape to stay ASCII-safe.
            obj_currency = Regexp(CleanText('./div[@class="header-annonce"]/a/span[@class="prix"]'),
                                  '.*([%s%s%s])' % (u'\u20ac', u'$', u'£'), default=u'\u20ac')

            def obj_date(self):
                """Parse the French publication date of the advert."""
                _date = CleanText('./div[@class="header-annonce"]/span[@class="date"]')(self)
                return parse_french_date(_date)

            # Was '@cladd=metro': a misspelt attribute compared with an
            # unquoted name, which could never match.
            obj_station = CleanText('./div/div/div[@class="metro"]', default=NotAvailable)
            obj_location = CleanText('./div[@class="clearfix"]/div/a/span/img/@alt')
            obj_text = CleanText('./div[@class="clearfix"]/div[@class="description clearfix"]/p')

            def obj_photos(self):
                """Return the thumbnails of this advert as HousingPhoto objects."""
                # './/' keeps the search inside the current advert; a leading
                # '//' selects from the document root and would attach every
                # thumbnail of the page to every housing.
                sources = XPath('.//div[@class="vignette-annonce"]/a/span/img/@src')(self)
                return [HousingPhoto(u'%s' % src) for src in sources]
class HousingPage(HTMLPage):
    """HTML page describing a single advert."""

    @method
    class get_housing(ItemElement):
        """Fill a :class:`Housing` from the advert page.

        The identifier is taken from the ``_id`` environment value.
        """

        klass = Housing

        obj_id = Env('_id')
        obj_title = CleanText('//h1[@class="desc clearfix"]/span[@class="title"]')
        obj_cost = CleanDecimal('//h1[@class="desc clearfix"]/span[@class="prix"]')
        # The euro sign had been lost from both the character class and the
        # default value; it is written as an escape to stay ASCII-safe.
        obj_currency = Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="prix"]'),
                              '.*([%s%s%s])' % (u'\u20ac', u'$', u'£'), default=u'\u20ac')
        obj_area = CleanDecimal(Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="title"]'),
                                       '(.*?)(\d*) m\xb2(.*?)', '\\2'), default=NotAvailable)
        obj_location = CleanText('//div[@class="text-annonce"]/h2')
        obj_text = CleanHTML('//div[@class="text-annonce"]/p')
        obj_station = CleanText('//div[@class="metro"]')
        obj_phone = CleanText('//span[@class="telephone hide-tel"]')
        obj_url = BrowserURL('housing', _id=Env('_id'))

        def obj_details(self):
            """Return a dict of label/value pairs describing the housing.

            Entries come from the ``footer-descriptif`` list, plus one entry
            built from the ``classe-energie-content`` block. Pairs with an
            empty key or value are left out.
            """
            details = dict()
            for item in XPath('//div[@class="footer-descriptif"]/ul/li')(self):
                key = CleanText('./span[@class="label"]')(item)
                # The value is the text of the entry with its label removed.
                value = CleanText('.', replace=[(key, '')])(item)
                if value and key:
                    details[key] = value

            key = CleanText('//div[@class="classe-energie-content"]/div/div/span')(self)
            value = Format('%s(%s)', CleanText('//div[@class="classe-energie-content"]/div/div/p'),
                           CleanText('//div[@class="classe-energie-content"]/div/@class',
                                     replace=[('-', ' ')]))(self)
            if value and key:
                details[key] = value
            return details

        def obj_photos(self):
            """Return the showcase thumbnails as HousingPhoto objects."""
            photos = []
            for img in XPath('//div[@class="showcase-thumbnail"]/img/@src')(self):
                photos.append(HousingPhoto(u'%s' % img))
            return photos