Support pagination

Signed-off-by: Vicnet <vo.publique@gmail.com>
This commit is contained in:
Vicnet 2014-02-19 14:04:53 +01:00 committed by Florent
commit 39ada3289f
2 changed files with 32 additions and 7 deletions

View file

@ -19,6 +19,7 @@
import urllib import urllib
import re
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
@ -51,18 +52,31 @@ class LaCentraleBrowser(BaseBrowser):
def iter_prices(self, product): def iter_prices(self, product):
if not self.is_on_page(ListingAutoPage): if not self.is_on_page(ListingAutoPage):
url = '/listing_auto.php?witchSearch=0' url = '/listing_auto.php?num=1&witchSearch=0'
url += self.buildUrl(product, 'Citadine={}','urban') url += self.buildUrl(product, 'Citadine={}','urban')
url += self.buildUrl(product, 'prix_maxi={}','maxprice') url += self.buildUrl(product, 'prix_maxi={}','maxprice')
url += self.buildUrl(product, 'km_maxi={}','maxdist') url += self.buildUrl(product, 'km_maxi={}','maxdist')
url += self.buildUrl(product, 'nbportes=%3D{}','nbdoors') url += self.buildUrl(product, 'nbportes=%3D{}','nbdoors')
url += self.buildUrl(product, 'cp={}','dept') url += self.buildUrl(product, 'cp={}','dept')
url += self.buildUrl(product, 'origin={}','origin') url += self.buildUrl(product, 'origin={}','origin')
print url #print url
self.location(url) self.location(url)
assert self.is_on_page(ListingAutoPage) assert self.is_on_page(ListingAutoPage)
return self.page.iter_prices()
numpage = 1
while True:
# parse the current page
for price in self.page.iter_prices(numpage):
yield price
# check if next page
numpage = self.page.get_next()
if not numpage:
break
url = re.sub('num=(\d+)','num={}'.format(numpage),url)
self.location(url)
assert self.is_on_page(ListingAutoPage)
# def iter_prices(self, zipcode, product): # def iter_prices(self, zipcode, product):
# data = {'aff_param_0_0': '', # data = {'aff_param_0_0': '',

View file

@ -25,6 +25,7 @@ from weboob.tools.browser import BasePage, BrokenPageError
from weboob.capabilities import NotAvailable from weboob.capabilities import NotAvailable
from weboob.capabilities.pricecomparison import Product, Price, Shop from weboob.capabilities.pricecomparison import Product, Price, Shop
import re import re
from decimal import Decimal
__all__ = ['MainPage','ListingAutoPage'] __all__ = ['MainPage','ListingAutoPage']
@ -38,6 +39,7 @@ class MainPage(BasePage):
yield product yield product
class ListingAutoPage(BasePage): class ListingAutoPage(BasePage):
def _extract(self, tr, name): def _extract(self, tr, name):
'Extract content from td element with class name' 'Extract content from td element with class name'
td = tr.cssselect('td.' + name + ' a') td = tr.cssselect('td.' + name + ' a')
@ -45,9 +47,9 @@ class ListingAutoPage(BasePage):
return '' return ''
return td[-1].text_content().strip() return td[-1].text_content().strip()
def iter_prices(self): def iter_prices(self, numpage):
for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id]'): for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id]'):
id = tr.attrib['id'][3:] id = '{numpage}.{id}'.format(numpage=numpage, id=tr.attrib['id'][3:])
title = self._extract(tr, 'lcbrand') title = self._extract(tr, 'lcbrand')
if not title: if not title:
continue continue
@ -61,13 +63,22 @@ class ListingAutoPage(BasePage):
cost = ', ' + self._extract(tr, 'lcprice') cost = ', ' + self._extract(tr, 'lcprice')
price = Price(id) price = Price(id)
price.cost = int(re.findall(r'\d+',cost.replace(' ',''))[0]) price.cost = Decimal(re.findall(r'\d+',cost.replace(' ',''))[0])
price.currency = u'' price.currency = u''
price.message = unicode(title) price.message = unicode(title)
price.set_empty_fields(NotAvailable) price.set_empty_fields(NotAvailable)
yield price yield price
def get_next(self):
    """Return the number of the next result page, or None when there is none.

    Walks the ``a.page`` links of the document and stops at the first one
    whose previous sibling is a ``span`` element (presumably the marker of
    the current page -- confirm against the site markup). The page number
    is read from the ``num=<digits>`` part of that link's ``href``.

    Returns:
        The page number as an ``int``, or ``None`` if no such link exists or
        its ``href`` is missing or carries no ``num=`` parameter.
    """
    for a in self.document.getroot().cssselect('a.page'):
        s = a.getprevious()
        if s is None or s.tag != 'span':
            continue
        # A link without an href cannot carry a page number; treat it like
        # an href lacking "num=" instead of letting re.search() raise.
        m = re.search(r'num=(\d+)', a.get('href') or '')
        if not m:
            return None
        return int(m.group(1))
    return None
#class ComparisonResultsPage(BasePage): #class ComparisonResultsPage(BasePage):
#def get_product_name(self): #def get_product_name(self):