[pap] adapt to browser2

This commit is contained in:
Bezleputh 2014-10-09 15:53:42 +02:00
commit f00c9233c3
3 changed files with 144 additions and 145 deletions

View file

@ -20,35 +20,37 @@
import urllib import urllib
from weboob.tools.json import json from weboob.browser import PagesBrowser, URL
from weboob.deprecated.browser import Browser
from weboob.capabilities.housing import Query from weboob.capabilities.housing import Query
from .pages import SearchResultsPage, HousingPage from .pages import SearchResultsPage, HousingPage, CitiesPage
__all__ = ['PapBrowser'] __all__ = ['PapBrowser']
class PapBrowser(Browser): class PapBrowser(PagesBrowser):
PROTOCOL = 'http'
DOMAIN = 'www.pap.fr' BASEURL = 'http://www.pap.fr'
ENCODING = 'utf-8' search_page = URL('annonce/.*', SearchResultsPage)
PAGES = { housing = URL('annonces/(?P<_id>.*)', HousingPage)
'http://www.pap.fr/annonce/.*': SearchResultsPage, cities = URL('index/ac-geo2\?q=(?P<pattern>.*)', CitiesPage)
'http://www.pap.fr/annonces/.*': HousingPage,
}
def search_geo(self, pattern): def search_geo(self, pattern):
fp = self.openurl(self.buildurl('http://www.pap.fr/index/ac-geo', q=pattern.encode('utf-8'))) return self.cities.open(pattern=pattern).iter_cities()
return json.load(fp)
TYPES = {Query.TYPE_RENT: 'location', TYPES = {Query.TYPE_RENT: 'location',
Query.TYPE_SALE: 'vente', Query.TYPE_SALE: 'vente'}
}
RET = {Query.HOUSE_TYPES.HOUSE: 'maison',
Query.HOUSE_TYPES.APART: 'appartement',
Query.HOUSE_TYPES.LAND: 'terrain',
Query.HOUSE_TYPES.PARKING: 'garage-parking',
Query.HOUSE_TYPES.OTHER: 'divers'}
def search_housings(self, type, cities, nb_rooms, area_min, area_max, cost_min, cost_max, house_types):
self.session.headers.update({'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'})
def search_housings(self, type, cities, nb_rooms, area_min, area_max, cost_min, cost_max):
data = {'geo_objets_ids': ','.join(cities), data = {'geo_objets_ids': ','.join(cities),
'surface[min]': area_min or '', 'surface[min]': area_min or '',
'surface[max]': area_max or '', 'surface[max]': area_max or '',
@ -57,21 +59,19 @@ class PapBrowser(Browser):
'produit': self.TYPES.get(type, 'location'), 'produit': self.TYPES.get(type, 'location'),
'recherche': 1, 'recherche': 1,
'nb_resultats_par_page': 40, 'nb_resultats_par_page': 40,
'submit': 'rechercher', }
'typesbien[]': 'appartement',
}
if nb_rooms: if nb_rooms:
data['nb_pieces[min]'] = nb_rooms data['nb_pieces[min]'] = nb_rooms
data['nb_pieces[max]'] = nb_rooms data['nb_pieces[max]'] = nb_rooms
self.location('/annonce/', urllib.urlencode(data)) ret = []
assert self.is_on_page(SearchResultsPage) for house_type in house_types:
if house_type in self.RET:
ret.append(self.RET.get(house_type))
return self.page.iter_housings() _data = '%s%s%s' % (urllib.urlencode(data), '&typesbien%5B%5D=', '&typesbien%5B%5D='.join(ret))
return self.search_page.go(data=_data).iter_housings()
def get_housing(self, housing): def get_housing(self, _id, housing=None):
self.location('/annonces/%s' % urllib.quote(housing)) return self.housing.go(_id=_id).get_housing(obj=housing)
assert self.is_on_page(HousingPage)
return self.page.get_housing()

View file

@ -18,8 +18,7 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.housing import CapHousing, Housing, HousingPhoto
from weboob.capabilities.housing import CapHousing, City, Housing, HousingPhoto
from weboob.tools.backend import Module from weboob.tools.backend import Module
from .browser import PapBrowser from .browser import PapBrowser
@ -38,41 +37,35 @@ class PapModule(Module, CapHousing):
BROWSER = PapBrowser BROWSER = PapBrowser
def search_housings(self, query): def search_housings(self, query):
cities = [c.id for c in query.cities if c.backend == self.name] cities = ['%s' % c.id for c in query.cities if c.backend == self.name]
if len(cities) == 0: if len(cities) == 0:
return list() return list()
with self.browser: return self.browser.search_housings(query.type, cities, query.nb_rooms,
return self.browser.search_housings(query.type, cities, query.nb_rooms, query.area_min, query.area_max,
query.area_min, query.area_max, query.cost_min, query.cost_max,
query.cost_min, query.cost_max) query.house_types)
def get_housing(self, housing): def get_housing(self, housing):
if isinstance(housing, Housing): if isinstance(housing, Housing):
id = housing.id id = housing.id
else: else:
id = housing id = housing
housing = None
with self.browser: return self.browser.get_housing(id, housing)
return self.browser.get_housing(id)
def search_city(self, pattern): def search_city(self, pattern):
with self.browser: return self.browser.search_geo(pattern)
for city in self.browser.search_geo(pattern):
c = City(city['id'])
c.name = unicode(city['name'])
yield c
def fill_housing(self, housing, fields): def fill_housing(self, housing, fields):
with self.browser: return self.browser.get_housing(housing.id, housing)
return self.browser.get_housing(housing.id)
def fill_photo(self, photo, fields): def fill_photo(self, photo, fields):
with self.browser: if 'data' in fields and photo.url and not photo.data:
if 'data' in fields and photo.url and not photo.data: photo.data = self.browser.readurl(photo.url)
photo.data = self.browser.readurl(photo.url)
return photo return photo
OBJECTS = {Housing: fill_housing, OBJECTS = {Housing: fill_housing,
HousingPhoto: fill_photo, HousingPhoto: fill_photo,
} }

View file

@ -18,105 +18,111 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
from decimal import Decimal from decimal import Decimal
from dateutil.parser import parse as parse_date
from weboob.deprecated.browser import Page from weboob.tools.date import parse_french_date
from weboob.browser.pages import HTMLPage, JsonPage, pagination
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.standard import CleanText, CleanDecimal, Regexp, Env, BrowserURL, Format
from weboob.browser.filters.html import Link, XPath, CleanHTML
from weboob.browser.filters.json import Dict
from weboob.capabilities.base import NotAvailable from weboob.capabilities.base import NotAvailable
from weboob.capabilities.housing import Housing from weboob.capabilities.housing import Housing, City, HousingPhoto
class SearchResultsPage(Page): class DictElement(ListElement):
DATE_RE = re.compile('Annonce \w+ du (.*)') def find_elements(self):
MONTHS = {u'janvier': 'january', if self.item_xpath is not None:
u'février': 'february', for el in self.el:
u'mars': 'march', yield el
u'avril': 'april',
u'mai': 'may',
u'juin': 'june',
u'juillet': 'july',
u'août': 'august',
u'septembre': 'september',
u'octobre': 'october',
u'novembre': 'november',
u'décembre': 'december',
}
def iter_housings(self):
for div in self.document.getroot().cssselect('div.annonce-resume'):
a = div.cssselect('td.lien-annonce')[0].find('a')
if a is None:
# not a real announce.
continue
id = a.attrib['href'].split('-')[-1]
housing = Housing(id)
housing.title = a.text.strip()
m = re.match('(\w+) (.+) (\d+)\xa0m\xb2 (.*)', housing.title)
if m:
housing.area = Decimal(m.group(3))
housing.cost = Decimal(div.cssselect('td.prix')[0].text.strip(u' \t\u20ac\xa0\n\r').replace('.', '').replace(',', '.'))
housing.currency = u''
m = self.DATE_RE.match(div.cssselect('p.date-publication')[0].text.strip())
if m:
date = m.group(1)
for fr, en in self.MONTHS.iteritems():
date = date.replace(fr, en)
housing.date = parse_date(date)
metro = div.cssselect('p.metro')
if len(metro) > 0:
housing.station = unicode(metro[0].text.strip())
else:
housing.station = NotAvailable
p = div.cssselect('p.annonce-resume-texte')[0]
b = p.findall('b')
if len(b) > 0:
housing.text = b[0].tail.strip()
housing.location = unicode(b[0].text)
else:
housing.text = p.text.strip()
housing.photos = NotAvailable
yield housing
class HousingPage(Page):
def get_housing(self):
div = self.parser.select(self.document.getroot(), 'div#annonce_detail', 1)
housing = Housing(self.url.split('-')[-1])
parts = div.find('h1').text.split(' - ')
housing.title = parts[0].strip()
housing.cost = Decimal(parts[1].strip(u' \t\u20ac\xa0\n\r').replace('.', '').replace(',', '.'))
housing.currency = u''
m = re.match('(\w+) (.+) (\d+)\xa0m\xb2 (.*)', housing.title)
if m:
housing.area = Decimal(m.group(3))
housing.date = housing.station = housing.location = housing.phone = NotAvailable
metro = div.cssselect('p.metro')
if len(metro) > 0:
housing.station = metro[0].text.strip()
p = div.cssselect('p.annonce-detail-texte')[0]
b = p.findall('b')
if len(b) > 0:
housing.text = b[0].tail.strip()
housing.location = unicode(b[0].text)
if len(b) > 1:
housing.phone = b[1].text
else: else:
housing.text = p.text.strip() yield self.el
housing.details = NotAvailable
housing.photos = NotAvailable
return housing class CitiesPage(JsonPage):
@method
class iter_cities(DictElement):
item_xpath = '.'
class item(ItemElement):
klass = City
obj_id = Dict('id')
obj_name = Dict('name')
class SearchResultsPage(HTMLPage):
    """Search results listing: yields one Housing per ``<li class="annonce">``."""

    @pagination
    @method
    class iter_housings(ListElement):
        # Each matching <li> becomes the context node of one `item` below.
        item_xpath = '//li[@class="annonce"]'

        def next_page(self):
            # Link to follow for the next page of results (used by @pagination).
            return Link('//ul[@class="pagination"]/li[@class="next"]/a')(self)

        class item(ItemElement):
            klass = Housing

            # The id is whatever follows "/annonces/" in the announce link.
            obj_id = Regexp(Link('./div[@class="header-annonce"]/a'), '/annonces/(.*)')
            obj_title = CleanText('./div[@class="header-annonce"]/a')
            # Second regexp group = the digits immediately preceding " m\xb2".
            obj_area = CleanDecimal(Regexp(CleanText('./div[@class="header-annonce"]/a/span[@class="desc"]'),
                                           '(.*?)(\d*) m\xb2(.*?)', '\\2'), default=NotAvailable)
            obj_cost = CleanDecimal(CleanText('./div[@class="header-annonce"]/a/span[@class="prix"]'),
                                    replace_dots=(',', '.'), default=Decimal(0))
            # Currency symbol captured from the price text. The euro sign is
            # written as an escape so it cannot be lost to an encoding mishap
            # (it was missing from the character class, so euro prices always
            # fell back to the empty default).
            obj_currency = Regexp(CleanText('./div[@class="header-annonce"]/a/span[@class="prix"]'),
                                  '.*([%s%s%s])' % (u'\u20ac', u'$', u'\xa3'), default=u'')

            def obj_date(self):
                # The publication date is written in French on the page.
                _date = CleanText('./div[@class="header-annonce"]/span[@class="date"]')(self)
                return parse_french_date(_date)

            # Was '@cladd=metro': a misspelt attribute compared against an
            # unquoted name, which can never match, so the station was always
            # NotAvailable.
            # NOTE(review): the './div/div/div' nesting is kept from the
            # original -- confirm against the live markup.
            obj_station = CleanText('./div/div/div[@class="metro"]', default=NotAvailable)
            obj_location = CleanText('./div[@class="clearfix"]/div/a/span/img/@alt')
            obj_text = CleanText('./div[@class="clearfix"]/div[@class="description clearfix"]/p')

            def obj_photos(self):
                # './/' keeps the lookup inside the current <li>; a leading
                # '//' is evaluated from the document root and would attach
                # every thumbnail of the page to each housing.
                return [HousingPhoto(u'%s' % img)
                        for img in XPath('.//div[@class="vignette-annonce"]/a/span/img/@src')(self)]
class HousingPage(HTMLPage):
    """Detail page of a single announce."""

    @method
    class get_housing(ItemElement):
        klass = Housing

        # '_id' is read from the element environment.
        obj_id = Env('_id')
        obj_title = CleanText('//h1[@class="desc clearfix"]/span[@class="title"]')
        # NOTE(review): unlike the search results page, no replace_dots is
        # passed here -- confirm the price on this page uses the separators
        # CleanDecimal expects by default.
        obj_cost = CleanDecimal('//h1[@class="desc clearfix"]/span[@class="prix"]')
        # Currency symbol captured from the price text. The euro sign is
        # written as an escape so it cannot be lost to an encoding mishap (it
        # was missing from the character class, so euro prices always fell
        # back to the empty default).
        obj_currency = Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="prix"]'),
                              '.*([%s%s%s])' % (u'\u20ac', u'$', u'\xa3'), default=u'')
        # Second regexp group = the digits immediately preceding " m\xb2".
        obj_area = CleanDecimal(Regexp(CleanText('//h1[@class="desc clearfix"]/span[@class="title"]'),
                                       '(.*?)(\d*) m\xb2(.*?)', '\\2'), default=NotAvailable)
        obj_location = CleanText('//div[@class="text-annonce"]/h2')
        obj_text = CleanHTML('//div[@class="text-annonce"]/p')
        obj_station = CleanText('//div[@class="metro"]')
        obj_phone = CleanText('//span[@class="telephone hide-tel"]')
        obj_url = BrowserURL('housing', _id=Env('_id'))

        def obj_details(self):
            """Collect the label/value pairs of the footer plus the energy class."""
            details = {}
            for item in XPath('//div[@class="footer-descriptif"]/ul/li')(self):
                key = CleanText('./span[@class="label"]')(item)
                # The value is the <li> text with its own label removed.
                value = CleanText('.', replace=[(key, '')])(item)
                if value and key:
                    details[key] = value

            # Energy class block: the <span> text is the key, the value is the
            # <p> text followed by the container's class attribute (dashes
            # replaced by spaces) in parentheses.
            key = CleanText('//div[@class="classe-energie-content"]/div/div/span')(self)
            value = Format('%s(%s)', CleanText('//div[@class="classe-energie-content"]/div/div/p'),
                           CleanText('//div[@class="classe-energie-content"]/div/@class',
                                     replace=[('-', ' ')]))(self)
            if value and key:
                details[key] = value
            return details

        def obj_photos(self):
            # One HousingPhoto per thumbnail image of the showcase.
            return [HousingPhoto(u'%s' % img)
                    for img in XPath('//div[@class="showcase-thumbnail"]/img/@src')(self)]