From 8f09828ebf46c293b2171878ee3fa5e56ea00648 Mon Sep 17 00:00:00 2001 From: Vicnet Date: Fri, 14 Mar 2014 14:14:00 +0100 Subject: [PATCH] Better id and some improvment Signed-off-by: Vicnet --- modules/lacentrale/backend.py | 45 +++++++----- modules/lacentrale/browser.py | 45 +++++------- modules/lacentrale/pages.py | 133 +++++++++++++++++++++++----------- 3 files changed, 133 insertions(+), 90 deletions(-) diff --git a/modules/lacentrale/backend.py b/modules/lacentrale/backend.py index 934175f4..d45e7840 100644 --- a/modules/lacentrale/backend.py +++ b/modules/lacentrale/backend.py @@ -28,6 +28,7 @@ from .browser import LaCentraleBrowser __all__ = ['LaCentraleBackend'] +# I implement capability class LaCentraleBackend(BaseBackend, ICapPriceComparison): NAME = 'lacentrale' MAINTAINER = u'Vicnet' @@ -66,29 +67,35 @@ class LaCentraleBackend(BaseBackend, ICapPriceComparison): for product in self.browser.iter_products(criteria): yield product + # inherited from ICapPriceComparison def iter_prices(self, product): # inherited from ICapPriceComparison with self.browser: return self.browser.iter_prices(product) -# def get_price(self, id): - # inherited from ICapPriceComparison -# with self.browser: -# if isinstance(id, Price): -# price = id -# else: -# p_id, s_id = id.split('.', 2) -# product = Product(p_id) -# for price in self.iter_prices(product): -# if price.id == id: -# break -# else: -# return None + # inherited from ICapPriceComparison + def get_price(self, id): + # id is a url code part for one car page + with self.browser: + return self.browser.get_price(id) + ## inherited from ICapPriceComparison + #with self.browser: + #if isinstance(id, Price): + #print "get_price by price", id + #price = id + #else: + ##p_id, s_id = id.split('.', 2) + ##product = Product(p_id) + ##for price in self.iter_prices(product): + ##if price.id == id: + ##break + ##else: + #return None + ##price.shop.info = self.browser.get_shop_info(price.id.split('.', 2)[-1]) + #return price -# price.shop.info = self.browser.get_shop_info(price.id.split('.', 2)[-1]) -# return price + #def fill_price(self, price, fields): + #print "VO lacentrale fill_price", price + #return self.get_price(price) - # def fill_price(self, price, fields): - # return self.get_price(price) - - # OBJECTS = {Price: fill_price, } + #OBJECTS = {Price: fill_price, } diff --git a/modules/lacentrale/browser.py b/modules/lacentrale/browser.py index 6f26c465..a06369f0 100644 --- a/modules/lacentrale/browser.py +++ b/modules/lacentrale/browser.py @@ -23,12 +23,13 @@ import re from weboob.tools.browser import BaseBrowser -from .pages import MainPage, ListingAutoPage +from .pages import MainPage, ListingAutoPage, AnnoncePage __all__ = ['LaCentraleBrowser'] +# I manage urls and page location, then trasnfert to page class LaCentraleBrowser(BaseBrowser): PROTOCOL = 'http' DOMAIN = 'www.lacentrale.fr' @@ -36,6 +37,7 @@ class LaCentraleBrowser(BaseBrowser): PAGES = { 'http://www.lacentrale.fr/': MainPage, 'http://www.lacentrale.fr/listing_auto.php?.*': ListingAutoPage, + 'http://www.lacentrale.fr/auto-occasion-annonce-.*': AnnoncePage, } def iter_products(self, criteria): @@ -44,20 +46,22 @@ class LaCentraleBrowser(BaseBrowser): assert self.is_on_page(MainPage) return self.page.iter_products(criteria) - def buildUrl(self, product, request, criteria): + def _buildUrl(self, product, request, criteria): if product._criteria.has_key(criteria): return '&' + request.format(product._criteria.get(criteria)) return '' def iter_prices(self, product): + # convert product criteria to url encoding if not self.is_on_page(ListingAutoPage): + #TODO use urllib.urlencode(data) ? url = '/listing_auto.php?num=1&witchSearch=0' - url += self.buildUrl(product, 'Citadine={}','urban') - url += self.buildUrl(product, 'prix_maxi={}','maxprice') - url += self.buildUrl(product, 'km_maxi={}','maxdist') - url += self.buildUrl(product, 'nbportes=%3D{}','nbdoors') - url += self.buildUrl(product, 'cp={}','dept') - url += self.buildUrl(product, 'origin={}','origin') + url += self._buildUrl(product, 'Citadine={}','urban') + url += self._buildUrl(product, 'prix_maxi={}','maxprice') + url += self._buildUrl(product, 'km_maxi={}','maxdist') + url += self._buildUrl(product, 'nbportes=%3D{}','nbdoors') + url += self._buildUrl(product, 'cp={}','dept') + url += self._buildUrl(product, 'origine={}','origin') #print url self.location(url) @@ -77,23 +81,8 @@ class LaCentraleBrowser(BaseBrowser): self.location(url) assert self.is_on_page(ListingAutoPage) -# def iter_prices(self, zipcode, product): -# data = {'aff_param_0_0': '', -# 'aff_param_0_1': 'les points de vente', -# 'aff_param_0_3': zipcode, -# 'changeNbPerPage': 'off', -# 'toDelete': -1, -# } -# self.location('/index.php?module=dbgestion&action=search', urllib.urlencode(data)) -# -# assert self.is_on_page(ComparisonResultsPage) -# return self.page.iter_results(product) -# -# def get_shop_info(self, id): -# data = {'pdv_id': id, -# 'module': 'dbgestion', -# 'action': 'getPopupInfo'} -# self.location('/index.php?module=dbgestion&action=getPopupInfo', urllib.urlencode(data)) -# -# assert self.is_on_page(ShopInfoPage) -# return self.page.get_info() + def get_price(self, id): + #/auto-occasion-annonce-23440064.html + self.location('/auto-occasion-annonce-'+id+'.html') + assert self.is_on_page(AnnoncePage) + return self.page.get_price(id) diff --git a/modules/lacentrale/pages.py b/modules/lacentrale/pages.py index d6732e85..3a4956fa 100644 --- a/modules/lacentrale/pages.py +++ b/modules/lacentrale/pages.py @@ -22,13 +22,14 @@ #import re from weboob.tools.browser import BasePage, BrokenPageError -from weboob.capabilities import NotAvailable +from weboob.capabilities import NotAvailable, NotLoaded from weboob.capabilities.pricecomparison import Product, Price, Shop import re from decimal import Decimal __all__ = ['MainPage','ListingAutoPage'] +# I manage main page, ie do nothing yet class MainPage(BasePage): def iter_products(self, criteria): product = Product(1) @@ -38,6 +39,23 @@ class MainPage(BasePage): product._criteria = criteria yield product +def get_decimal(s): + return re.findall(r'\d+', s.replace(' ',''))[0] + +def new_price(id, product, cost, title): + price = Price(id) + price.product = product + price.cost = Decimal(get_decimal(cost)) + price.currency = u'€' + price.message = unicode(title) + price.set_empty_fields(NotAvailable) + + price.shop = Shop(price.id) + price.shop.set_empty_fields(NotAvailable) # NotLoaded + + return price + +# I manage listing page and extract information class ListingAutoPage(BasePage): def _extract(self, tr, name): @@ -47,9 +65,17 @@ class ListingAutoPage(BasePage): return '' return td[-1].text_content().strip() + def _extract_id(self, tr): + tdas = tr.cssselect('td.lcbrand a') + if tdas is None or len(tdas)==0: return None + tda = tdas[0] + m = re.search('annonce-(\d+)\.html', tda.get('href')) + if not m: return None + return m.group(1) + def iter_prices(self, product, numpage): - for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id]'): - id = '{numpage}.{id}'.format(numpage=numpage, id=tr.attrib['id'][3:]) + for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id],tr.lclineB[id]'): + id = self._extract_id(tr) title = self._extract(tr, 'lcbrand') if not title: continue @@ -62,16 +88,7 @@ class ListingAutoPage(BasePage): cost = ', ' + self._extract(tr, 'lcprice') - price = Price(id) - price.product = product - price.cost = Decimal(re.findall(r'\d+',cost.replace(' ',''))[0]) - price.currency = u'€' - price.message = unicode(title) - price.shop = Shop(price.id) - price.shop.set_empty_fields(NotAvailable) - - price.set_empty_fields(NotAvailable) - yield price + yield new_price(id, product, cost, title) def get_next(self): for a in self.document.getroot().cssselect('a.page'): @@ -83,38 +100,68 @@ class ListingAutoPage(BasePage): return int(m.group(1)) return None -#class ComparisonResultsPage(BasePage): - #def get_product_name(self): - #try: - #div = self.parser.select(self.document.getroot(), 'div#moins_plus_ariane', 1) - #except BrokenPageError: - #return NotAvailable - #else: - #m = re.match('Carburant : ([\w\-]+) | .*', div.text) - #return m.group(1) +# I manage one car page (annonce) )and extract information +class AnnoncePage(BasePage): - #def iter_results(self, product=None): - #price = None - #product.name = self.get_product_name() - #for tr in self.document.getroot().cssselect('table#tab_resultat tr'): - #if tr.attrib.get('id', '').startswith('pdv'): - #price = Price('%s.%s' % (product.id, tr.attrib['id'][3:])) + def _extract(self, e, name): + 'Extract content from li element with class name' + li = e.cssselect('li.' + name) + if not li: + return '' + return li[0].text_content().strip() - #price.product = product + def _extract_info(self, e, name): + 'Extract content from InfoLib' + for td in e.cssselect('td.InfoLib'): + if name in td.text_content(): + ntd = td.getnext() + if ntd is None: continue + return ntd.text_content().strip() + return None - #tds = tr.findall('td') - #price.cost = Decimal(tds[4].text.replace(',', '.')) - #price.currency = u'€' + def _extract_vendor(self, e, name): + 'Extract content from VendorLib' + for span in e.cssselect('span.VendeurLib'): + if name in span.text_content(): + li = span.getparent() + if li is None: continue + # get all text + s = li.text_content() + # get text without header + s = s[len(span.text_content())+1:] + # special case for not pro + if '\n' in s: + s = s[:s.find('\n')] + return s.strip() + return None - #shop = Shop(price.id) - #shop.name = unicode(tds[2].text.strip()) - #shop.location = unicode(tds[0].text.strip()) + def get_shop(self, id): + shop = Shop(id) + for e in self.document.getroot().cssselect('div#Vendeur'): + shop.name = self._extract_vendor(e,'Nom') + '(' + self._extract_vendor(e,'Vendeur') + ')' + shop.location = '' + for adr in self.document.getroot().cssselect('span#AdresseL1,span#AdresseL2'): + if shop.location: + shop.location += ', ' + shop.location += adr.text_content().strip() + for tel in self.document.getroot().cssselect('span.Tel'): + s = tel.text_content().strip() + if shop.location: + shop.location += ', ' + shop.location += re.sub('\s+', ' ', s) + shop.set_empty_fields(NotAvailable) + return shop - #price.shop = shop - #price.set_empty_fields(NotAvailable) - #yield price - - -#class ShopInfoPage(BasePage): - #def get_info(self): - #return self.parser.tostring(self.parser.select(self.document.getroot(), 'div.colg', 1)) + def get_price(self, id): + for e in self.document.getroot().cssselect('div#DescBar'): + product = Product(1) + product.name = unicode('Occasion') + cost = self._extract(e,'PriceLc') + title = self._extract(e,'BrandLc') + title += ', ' + self._extract(e,'modeleCom') + title += ', ' + self._extract_info(e,'Version') + title += ', ' + self._extract_info(e,'Ann') + title += ', ' + get_decimal(self._extract_info(e,'Kilom')) + 'km' + price = new_price(id, product, cost, title) + price.shop = self.get_shop(id) + return price