Better id and some improvment

Signed-off-by: Vicnet <vo.publique@gmail.com>
This commit is contained in:
Vicnet 2014-03-14 14:14:00 +01:00 committed by Florent
commit 8f09828ebf
3 changed files with 138 additions and 95 deletions

View file

@ -22,13 +22,14 @@
#import re
from weboob.tools.browser import BasePage, BrokenPageError
from weboob.capabilities import NotAvailable
from weboob.capabilities import NotAvailable, NotLoaded
from weboob.capabilities.pricecomparison import Product, Price, Shop
import re
from decimal import Decimal
__all__ = ['MainPage','ListingAutoPage']
# I manage main page, ie do nothing yet
class MainPage(BasePage):
def iter_products(self, criteria):
product = Product(1)
@ -38,6 +39,23 @@ class MainPage(BasePage):
product._criteria = criteria
yield product
def get_decimal(s):
return re.findall(r'\d+', s.replace(' ',''))[0]
def new_price(id, product, cost, title):
price = Price(id)
price.product = product
price.cost = Decimal(get_decimal(cost))
price.currency = u''
price.message = unicode(title)
price.set_empty_fields(NotAvailable)
price.shop = Shop(price.id)
price.shop.set_empty_fields(NotAvailable) # NotLoaded
return price
# I manage listing page and extract information
class ListingAutoPage(BasePage):
def _extract(self, tr, name):
@ -47,9 +65,17 @@ class ListingAutoPage(BasePage):
return ''
return td[-1].text_content().strip()
def _extract_id(self, tr):
tdas = tr.cssselect('td.lcbrand a')
if tdas is None or len(tdas)==0: return None
tda = tdas[0]
m = re.search('annonce-(\d+)\.html', tda.get('href'))
if not m: return None
return m.group(1)
def iter_prices(self, product, numpage):
for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id]'):
id = '{numpage}.{id}'.format(numpage=numpage, id=tr.attrib['id'][3:])
for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id],tr.lclineB[id]'):
id = self._extract_id(tr)
title = self._extract(tr, 'lcbrand')
if not title:
continue
@ -62,16 +88,7 @@ class ListingAutoPage(BasePage):
cost = ', ' + self._extract(tr, 'lcprice')
price = Price(id)
price.product = product
price.cost = Decimal(re.findall(r'\d+',cost.replace(' ',''))[0])
price.currency = u''
price.message = unicode(title)
price.shop = Shop(price.id)
price.shop.set_empty_fields(NotAvailable)
price.set_empty_fields(NotAvailable)
yield price
yield new_price(id, product, cost, title)
def get_next(self):
for a in self.document.getroot().cssselect('a.page'):
@ -83,38 +100,68 @@ class ListingAutoPage(BasePage):
return int(m.group(1))
return None
#class ComparisonResultsPage(BasePage):
#def get_product_name(self):
#try:
#div = self.parser.select(self.document.getroot(), 'div#moins_plus_ariane', 1)
#except BrokenPageError:
#return NotAvailable
#else:
#m = re.match('Carburant : ([\w\-]+) | .*', div.text)
#return m.group(1)
# I manage one car page (annonce) )and extract information
class AnnoncePage(BasePage):
#def iter_results(self, product=None):
#price = None
#product.name = self.get_product_name()
#for tr in self.document.getroot().cssselect('table#tab_resultat tr'):
#if tr.attrib.get('id', '').startswith('pdv'):
#price = Price('%s.%s' % (product.id, tr.attrib['id'][3:]))
def _extract(self, e, name):
'Extract content from li element with class name'
li = e.cssselect('li.' + name)
if not li:
return ''
return li[0].text_content().strip()
#price.product = product
def _extract_info(self, e, name):
'Extract content from InfoLib'
for td in e.cssselect('td.InfoLib'):
if name in td.text_content():
ntd = td.getnext()
if ntd is None: continue
return ntd.text_content().strip()
return None
#tds = tr.findall('td')
#price.cost = Decimal(tds[4].text.replace(',', '.'))
#price.currency = u'€'
def _extract_vendor(self, e, name):
'Extract content from VendorLib'
for span in e.cssselect('span.VendeurLib'):
if name in span.text_content():
li = span.getparent()
if li is None: continue
# get all text
s = li.text_content()
# get text without header
s = s[len(span.text_content())+1:]
# special case for not pro
if '\n' in s:
s = s[:s.find('\n')]
return s.strip()
return None
#shop = Shop(price.id)
#shop.name = unicode(tds[2].text.strip())
#shop.location = unicode(tds[0].text.strip())
def get_shop(self, id):
shop = Shop(id)
for e in self.document.getroot().cssselect('div#Vendeur'):
shop.name = self._extract_vendor(e,'Nom') + '(' + self._extract_vendor(e,'Vendeur') + ')'
shop.location = ''
for adr in self.document.getroot().cssselect('span#AdresseL1,span#AdresseL2'):
if shop.location:
shop.location += ', '
shop.location += adr.text_content().strip()
for tel in self.document.getroot().cssselect('span.Tel'):
s = tel.text_content().strip()
if shop.location:
shop.location += ', '
shop.location += re.sub('\s+', ' ', s)
shop.set_empty_fields(NotAvailable)
return shop
#price.shop = shop
#price.set_empty_fields(NotAvailable)
#yield price
#class ShopInfoPage(BasePage):
#def get_info(self):
#return self.parser.tostring(self.parser.select(self.document.getroot(), 'div.colg', 1))
def get_price(self, id):
for e in self.document.getroot().cssselect('div#DescBar'):
product = Product(1)
product.name = unicode('Occasion')
cost = self._extract(e,'PriceLc')
title = self._extract(e,'BrandLc')
title += ', ' + self._extract(e,'modeleCom')
title += ', ' + self._extract_info(e,'Version')
title += ', ' + self._extract_info(e,'Ann')
title += ', ' + get_decimal(self._extract_info(e,'Kilom')) + 'km'
price = new_price(id, product, cost, title)
price.shop = self.get_shop(id)
return price