Better id and some improvements

Signed-off-by: Vicnet <vo.publique@gmail.com>
This commit is contained in:
Vicnet 2014-03-14 14:14:00 +01:00 committed by Florent
commit 8f09828ebf
3 changed files with 138 additions and 95 deletions

View file

@ -28,6 +28,7 @@ from .browser import LaCentraleBrowser
__all__ = ['LaCentraleBackend'] __all__ = ['LaCentraleBackend']
# I implement capability
class LaCentraleBackend(BaseBackend, ICapPriceComparison): class LaCentraleBackend(BaseBackend, ICapPriceComparison):
NAME = 'lacentrale' NAME = 'lacentrale'
MAINTAINER = u'Vicnet' MAINTAINER = u'Vicnet'
@ -66,29 +67,35 @@ class LaCentraleBackend(BaseBackend, ICapPriceComparison):
for product in self.browser.iter_products(criteria): for product in self.browser.iter_products(criteria):
yield product yield product
# inherited from ICapPriceComparison
def iter_prices(self, product):
    """Return the prices the browser finds for *product*.

    Inherited from ICapPriceComparison. The lookup is delegated to
    ``self.browser.iter_prices`` and runs inside the browser's context
    manager.
    """
    browser = self.browser
    with browser:
        return browser.iter_prices(product)
# def get_price(self, id):
# inherited from ICapPriceComparison # inherited from ICapPriceComparison
def get_price(self, id):
    """Return the price for the car listing identified by *id*.

    *id* is the code part of the URL of one car page. The lookup is
    delegated to ``self.browser.get_price`` and runs inside the browser's
    context manager.
    """
    browser = self.browser
    with browser:
        return browser.get_price(id)
## inherited from ICapPriceComparison
#with self.browser: #with self.browser:
#if isinstance(id, Price): #if isinstance(id, Price):
#print "get_price by price", id
#price = id #price = id
#else: #else:
# p_id, s_id = id.split('.', 2) ##p_id, s_id = id.split('.', 2)
# product = Product(p_id) ##product = Product(p_id)
# for price in self.iter_prices(product): ##for price in self.iter_prices(product):
# if price.id == id: ##if price.id == id:
# break ##break
# else: ##else:
#return None #return None
##price.shop.info = self.browser.get_shop_info(price.id.split('.', 2)[-1])
# price.shop.info = self.browser.get_shop_info(price.id.split('.', 2)[-1])
#return price #return price
#def fill_price(self, price, fields): #def fill_price(self, price, fields):
#print "VO lacentrale fill_price", price
#return self.get_price(price) #return self.get_price(price)
#OBJECTS = {Price: fill_price, } #OBJECTS = {Price: fill_price, }

View file

@ -23,12 +23,13 @@ import re
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from .pages import MainPage, ListingAutoPage from .pages import MainPage, ListingAutoPage, AnnoncePage
__all__ = ['LaCentraleBrowser'] __all__ = ['LaCentraleBrowser']
# I manage URLs and page location, then hand off to the page
class LaCentraleBrowser(BaseBrowser): class LaCentraleBrowser(BaseBrowser):
PROTOCOL = 'http' PROTOCOL = 'http'
DOMAIN = 'www.lacentrale.fr' DOMAIN = 'www.lacentrale.fr'
@ -36,6 +37,7 @@ class LaCentraleBrowser(BaseBrowser):
PAGES = { PAGES = {
'http://www.lacentrale.fr/': MainPage, 'http://www.lacentrale.fr/': MainPage,
'http://www.lacentrale.fr/listing_auto.php?.*': ListingAutoPage, 'http://www.lacentrale.fr/listing_auto.php?.*': ListingAutoPage,
'http://www.lacentrale.fr/auto-occasion-annonce-.*': AnnoncePage,
} }
def iter_products(self, criteria): def iter_products(self, criteria):
@ -44,20 +46,22 @@ class LaCentraleBrowser(BaseBrowser):
assert self.is_on_page(MainPage) assert self.is_on_page(MainPage)
return self.page.iter_products(criteria) return self.page.iter_products(criteria)
def buildUrl(self, product, request, criteria): def _buildUrl(self, product, request, criteria):
if product._criteria.has_key(criteria): if product._criteria.has_key(criteria):
return '&' + request.format(product._criteria.get(criteria)) return '&' + request.format(product._criteria.get(criteria))
return '' return ''
def iter_prices(self, product): def iter_prices(self, product):
# convert product criteria to url encoding
if not self.is_on_page(ListingAutoPage): if not self.is_on_page(ListingAutoPage):
#TODO use urllib.urlencode(data) ?
url = '/listing_auto.php?num=1&witchSearch=0' url = '/listing_auto.php?num=1&witchSearch=0'
url += self.buildUrl(product, 'Citadine={}','urban') url += self._buildUrl(product, 'Citadine={}','urban')
url += self.buildUrl(product, 'prix_maxi={}','maxprice') url += self._buildUrl(product, 'prix_maxi={}','maxprice')
url += self.buildUrl(product, 'km_maxi={}','maxdist') url += self._buildUrl(product, 'km_maxi={}','maxdist')
url += self.buildUrl(product, 'nbportes=%3D{}','nbdoors') url += self._buildUrl(product, 'nbportes=%3D{}','nbdoors')
url += self.buildUrl(product, 'cp={}','dept') url += self._buildUrl(product, 'cp={}','dept')
url += self.buildUrl(product, 'origin={}','origin') url += self._buildUrl(product, 'origine={}','origin')
#print url #print url
self.location(url) self.location(url)
@ -77,23 +81,8 @@ class LaCentraleBrowser(BaseBrowser):
self.location(url) self.location(url)
assert self.is_on_page(ListingAutoPage) assert self.is_on_page(ListingAutoPage)
def get_price(self, id):
    """Open the listing page for *id* and return the price parsed from it.

    *id* is the numeric part of the listing URL, e.g. ``'23440064'`` for
    ``/auto-occasion-annonce-23440064.html``. Asserts that the browser
    landed on an AnnoncePage before delegating to that page's
    ``get_price``.
    """
    listing_path = '/auto-occasion-annonce-' + id + '.html'
    self.location(listing_path)
    assert self.is_on_page(AnnoncePage)
    current_page = self.page
    return current_page.get_price(id)
# 'toDelete': -1,
# }
# self.location('/index.php?module=dbgestion&action=search', urllib.urlencode(data))
#
# assert self.is_on_page(ComparisonResultsPage)
# return self.page.iter_results(product)
#
# def get_shop_info(self, id):
# data = {'pdv_id': id,
# 'module': 'dbgestion',
# 'action': 'getPopupInfo'}
# self.location('/index.php?module=dbgestion&action=getPopupInfo', urllib.urlencode(data))
#
# assert self.is_on_page(ShopInfoPage)
# return self.page.get_info()

View file

@ -22,13 +22,14 @@
#import re #import re
from weboob.tools.browser import BasePage, BrokenPageError from weboob.tools.browser import BasePage, BrokenPageError
from weboob.capabilities import NotAvailable from weboob.capabilities import NotAvailable, NotLoaded
from weboob.capabilities.pricecomparison import Product, Price, Shop from weboob.capabilities.pricecomparison import Product, Price, Shop
import re import re
from decimal import Decimal from decimal import Decimal
__all__ = ['MainPage','ListingAutoPage'] __all__ = ['MainPage','ListingAutoPage']
# I manage main page, ie do nothing yet
class MainPage(BasePage): class MainPage(BasePage):
def iter_products(self, criteria): def iter_products(self, criteria):
product = Product(1) product = Product(1)
@ -38,6 +39,23 @@ class MainPage(BasePage):
product._criteria = criteria product._criteria = criteria
yield product yield product
def get_decimal(s):
    """Return the first run of digits in *s*, as a string.

    Plain space characters are removed first, so a space-grouped number
    such as ``'12 500'`` is read as ``'12500'``. Raises IndexError when
    *s* contains no digit at all.
    """
    compact = s.replace(' ', '')
    digit_runs = re.findall(r'\d+', compact)
    return digit_runs[0]
def new_price(id, product, cost, title):
    """Build a Price for *product* from raw listing text.

    :param id: identifier given to the Price and reused for its Shop.
    :param product: stored as the price's ``product``.
    :param cost: text containing the amount; its first run of digits
        (see ``get_decimal``) becomes a Decimal ``cost``.
    :param title: stored, converted to unicode, as the price's ``message``.
    :returns: the new Price, with a Shop attached as ``shop``.
    """
    offer = Price(id)
    offer.product = product
    amount_digits = get_decimal(cost)
    offer.cost = Decimal(amount_digits)
    offer.currency = u''
    offer.message = unicode(title)
    # Passes NotAvailable to set_empty_fields before the shop is attached;
    # presumably this fills the fields left unset above - confirm against
    # the weboob base object.
    offer.set_empty_fields(NotAvailable)
    offer.shop = Shop(offer.id)
    # The original author noted NotLoaded as an alternative value here.
    offer.shop.set_empty_fields(NotAvailable)
    return offer
# I manage listing page and extract information
class ListingAutoPage(BasePage): class ListingAutoPage(BasePage):
def _extract(self, tr, name): def _extract(self, tr, name):
@ -47,9 +65,17 @@ class ListingAutoPage(BasePage):
return '' return ''
return td[-1].text_content().strip() return td[-1].text_content().strip()
def _extract_id(self, tr):
tdas = tr.cssselect('td.lcbrand a')
if tdas is None or len(tdas)==0: return None
tda = tdas[0]
m = re.search('annonce-(\d+)\.html', tda.get('href'))
if not m: return None
return m.group(1)
def iter_prices(self, product, numpage): def iter_prices(self, product, numpage):
for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id]'): for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id],tr.lclineB[id]'):
id = '{numpage}.{id}'.format(numpage=numpage, id=tr.attrib['id'][3:]) id = self._extract_id(tr)
title = self._extract(tr, 'lcbrand') title = self._extract(tr, 'lcbrand')
if not title: if not title:
continue continue
@ -62,16 +88,7 @@ class ListingAutoPage(BasePage):
cost = ', ' + self._extract(tr, 'lcprice') cost = ', ' + self._extract(tr, 'lcprice')
price = Price(id) yield new_price(id, product, cost, title)
price.product = product
price.cost = Decimal(re.findall(r'\d+',cost.replace(' ',''))[0])
price.currency = u''
price.message = unicode(title)
price.shop = Shop(price.id)
price.shop.set_empty_fields(NotAvailable)
price.set_empty_fields(NotAvailable)
yield price
def get_next(self): def get_next(self):
for a in self.document.getroot().cssselect('a.page'): for a in self.document.getroot().cssselect('a.page'):
@ -83,38 +100,68 @@ class ListingAutoPage(BasePage):
return int(m.group(1)) return int(m.group(1))
return None return None
#class ComparisonResultsPage(BasePage): # I manage one car page (annonce) and extract information
class AnnoncePage(BasePage):
    """Page for a single car listing ("annonce"); extracts price and seller."""

    def _extract(self, e, name):
        """Return the stripped text of the first ``li.<name>`` under *e*.

        Returns '' when no such element exists.
        """
        items = e.cssselect('li.' + name)
        if not items:
            return ''
        return items[0].text_content().strip()

    def _extract_info(self, e, name):
        """Return the text of the cell that follows a ``td.InfoLib`` label.

        The first label whose text contains *name* and that has a next
        sibling wins. Returns None when there is none.
        """
        for label in e.cssselect('td.InfoLib'):
            if name not in label.text_content():
                continue
            value_cell = label.getnext()
            if value_cell is None:
                continue
            return value_cell.text_content().strip()
        return None

    def _extract_vendor(self, e, name):
        """Return the value shown next to a ``span.VendeurLib`` header.

        The first header whose text contains *name* wins. Returns None
        when no such header (with a parent element) exists.
        """
        for span in e.cssselect('span.VendeurLib'):
            header = span.text_content()
            if name not in header:
                continue
            parent = span.getparent()
            if parent is None:
                continue
            # Drop the header text plus the single character following it.
            value = parent.text_content()[len(header) + 1:]
            # Special case for non-professional sellers: keep only the
            # first line of the remaining text.
            return value.split('\n', 1)[0].strip()
        return None

    def get_shop(self, id):
        """Return a Shop for *id* filled from the page's ``div#Vendeur``.

        ``name`` is ``<Nom>(<Vendeur>)`` and ``location`` is the address
        lines and phone numbers joined with ', '. A missing vendor label
        contributes an empty string instead of raising TypeError, and
        empty address/phone parts are skipped so no dangling separator is
        produced. Fields left unset are passed to
        ``set_empty_fields(NotAvailable)``.
        """
        shop = Shop(id)
        root = self.document.getroot()
        for e in root.cssselect('div#Vendeur'):
            seller_name = self._extract_vendor(e, 'Nom') or ''
            seller_kind = self._extract_vendor(e, 'Vendeur') or ''
            shop.name = seller_name + '(' + seller_kind + ')'

            parts = []
            for adr in root.cssselect('span#AdresseL1,span#AdresseL2'):
                parts.append(adr.text_content().strip())
            for tel in root.cssselect('span.Tel'):
                # Collapse internal whitespace runs in the phone number.
                parts.append(re.sub(r'\s+', ' ', tel.text_content().strip()))
            shop.location = ', '.join([part for part in parts if part])
        shop.set_empty_fields(NotAvailable)
        return shop

    def get_price(self, id):
        """Return the Price described by the first ``div#DescBar``, or None.

        The message is ``brand, model, version, year, <mileage>km``; a
        part missing from the page is left empty rather than raising.
        IndexError from ``get_decimal`` still propagates (through
        ``new_price``) when the price text contains no digits.
        """
        for e in self.document.getroot().cssselect('div#DescBar'):
            product = Product(1)
            product.name = u'Occasion'
            cost = self._extract(e, 'PriceLc')

            mileage = ''
            km_text = self._extract_info(e, 'Kilom')
            if km_text:
                try:
                    mileage = get_decimal(km_text) + 'km'
                except IndexError:
                    # Label present but without any digits.
                    mileage = ''

            title = ', '.join([
                self._extract(e, 'BrandLc'),
                self._extract(e, 'modeleCom'),
                self._extract_info(e, 'Version') or '',
                self._extract_info(e, 'Ann') or '',
                mileage,
            ])
            price = new_price(id, product, cost, title)
            price.shop = self.get_shop(id)
            return price
        return None