Better id and some improvements

Signed-off-by: Vicnet <vo.publique@gmail.com>
2014-03-14 14:14:00 +01:00 · 2014-03-14 14:14:00 +01:00 · 8f09828ebf
commit 8f09828ebf
parent 4584384db7
3 changed files with 138 additions and 95 deletions
--- a/modules/lacentrale/backend.py
+++ b/modules/lacentrale/backend.py
@ -28,6 +28,7 @@ from .browser import LaCentraleBrowser
 __all__ = ['LaCentraleBackend']
 # I implement capability
 class LaCentraleBackend(BaseBackend, ICapPriceComparison):
    NAME = 'lacentrale'
    MAINTAINER = u'Vicnet'
@ -66,29 +67,35 @@ class LaCentraleBackend(BaseBackend, ICapPriceComparison):
            for product in self.browser.iter_products(criteria):
                yield product
    # inherited from ICapPriceComparison
    def iter_prices(self, product):
        """Return the result of the browser's iter_prices for *product*.

        Inherited from ICapPriceComparison.  The browser call is made
        inside the ``with self.browser`` block.
        """
        with self.browser:
            prices = self.browser.iter_prices(product)
        return prices
 #    def get_price(self, id):
    # inherited from ICapPriceComparison
    def get_price(self, id):
        """Return the browser's price for one car page.

        *id* is the url code part identifying that car page.  The browser
        call is made inside the ``with self.browser`` block.
        """
        with self.browser:
            found = self.browser.get_price(id)
        return found
        ## inherited from ICapPriceComparison
        #with self.browser:
           #if isinstance(id, Price):
                #print "get_price by price", id
                #price = id
           #else:
-#                p_id, s_id = id.split('.', 2)
+               ##p_id, s_id = id.split('.', 2)
-#                product = Product(p_id)
+               ##product = Product(p_id)
-#                for price in self.iter_prices(product):
+               ##for price in self.iter_prices(product):
-#                    if price.id == id:
+                   ##if price.id == id:
-#                        break
+                       ##break
-#                else:
+               ##else:
               #return None
-
+           ##price.shop.info = self.browser.get_shop_info(price.id.split('.', 2)[-1])
 #            price.shop.info = self.browser.get_shop_info(price.id.split('.', 2)[-1])
           #return price
    #def fill_price(self, price, fields):
        #print "VO lacentrale fill_price", price
       #return self.get_price(price)
    #OBJECTS = {Price: fill_price, }
--- a/modules/lacentrale/browser.py
+++ b/modules/lacentrale/browser.py
@ -23,12 +23,13 @@ import re
 from weboob.tools.browser import BaseBrowser
-from .pages import MainPage, ListingAutoPage
+from .pages import MainPage, ListingAutoPage, AnnoncePage
 __all__ = ['LaCentraleBrowser']
 # I manage URLs and page location, then transfer to the page
 class LaCentraleBrowser(BaseBrowser):
    PROTOCOL = 'http'
    DOMAIN = 'www.lacentrale.fr'
@ -36,6 +37,7 @@ class LaCentraleBrowser(BaseBrowser):
    PAGES = {
         'http://www.lacentrale.fr/': MainPage,
         'http://www.lacentrale.fr/listing_auto.php?.*': ListingAutoPage,
         'http://www.lacentrale.fr/auto-occasion-annonce-.*': AnnoncePage,
        }
    def iter_products(self, criteria):
@ -44,20 +46,22 @@ class LaCentraleBrowser(BaseBrowser):
        assert self.is_on_page(MainPage)
        return self.page.iter_products(criteria)
-    def buildUrl(self, product, request, criteria):
+    def _buildUrl(self, product, request, criteria):
        if product._criteria.has_key(criteria):
            return '&' + request.format(product._criteria.get(criteria))
        return ''
    def iter_prices(self, product):
        # convert product criteria to url encoding
        if not self.is_on_page(ListingAutoPage):
            #TODO use urllib.urlencode(data) ?
            url = '/listing_auto.php?num=1&witchSearch=0'
-            url += self.buildUrl(product, 'Citadine={}','urban')
+            url += self._buildUrl(product, 'Citadine={}','urban')
-            url += self.buildUrl(product, 'prix_maxi={}','maxprice')
+            url += self._buildUrl(product, 'prix_maxi={}','maxprice')
-            url += self.buildUrl(product, 'km_maxi={}','maxdist')
+            url += self._buildUrl(product, 'km_maxi={}','maxdist')
-            url += self.buildUrl(product, 'nbportes=%3D{}','nbdoors')
+            url += self._buildUrl(product, 'nbportes=%3D{}','nbdoors')
-            url += self.buildUrl(product, 'cp={}','dept')
+            url += self._buildUrl(product, 'cp={}','dept')
-            url += self.buildUrl(product, 'origin={}','origin')
+            url += self._buildUrl(product, 'origine={}','origin')
            #print url
            self.location(url)
@ -77,23 +81,8 @@ class LaCentraleBrowser(BaseBrowser):
            self.location(url)
            assert self.is_on_page(ListingAutoPage)
-#    def iter_prices(self, zipcode, product):
+    def get_price(self, id):
-#        data = {'aff_param_0_0':            '',
+        #/auto-occasion-annonce-23440064.html
-#                'aff_param_0_1':            'les points de vente',
+        self.location('/auto-occasion-annonce-'+id+'.html')
-#                'aff_param_0_3':            zipcode,
+        assert self.is_on_page(AnnoncePage)
-#                'changeNbPerPage':          'off',
+        return self.page.get_price(id)
 #                'toDelete':                 -1,
 #               }
 #        self.location('/index.php?module=dbgestion&action=search', urllib.urlencode(data))
 #
 #        assert self.is_on_page(ComparisonResultsPage)
 #        return self.page.iter_results(product)
 #
 #    def get_shop_info(self, id):
 #        data = {'pdv_id': id,
 #                'module':   'dbgestion',
 #                'action':   'getPopupInfo'}
 #        self.location('/index.php?module=dbgestion&action=getPopupInfo', urllib.urlencode(data))
 #
 #        assert self.is_on_page(ShopInfoPage)
 #        return self.page.get_info()
--- a/modules/lacentrale/pages.py
+++ b/modules/lacentrale/pages.py
@ -22,13 +22,14 @@
 #import re
 from weboob.tools.browser import BasePage, BrokenPageError
-from weboob.capabilities import NotAvailable
+from weboob.capabilities import NotAvailable, NotLoaded
 from weboob.capabilities.pricecomparison import Product, Price, Shop
 import re
 from decimal import Decimal
 __all__ = ['MainPage','ListingAutoPage']
 # I manage main page, ie do nothing yet
 class MainPage(BasePage):
    def iter_products(self, criteria):
        product = Product(1)
@ -38,6 +39,23 @@ class MainPage(BasePage):
        product._criteria = criteria
        yield product
 def get_decimal(s):
    """Return the first run of digits found in *s* once spaces are removed.

    The result is a string of digits, not a number.  Raises IndexError
    when *s* contains no digit at all.
    """
    compact = s.replace(' ', '')
    digit_runs = re.findall(r'\d+', compact)
    return digit_runs[0]
 def new_price(id, product, cost, title):
    """Build and return a Price with identifier *id* for *product*.

    *cost* is text; its first run of digits (spaces removed, see
    get_decimal) becomes the Decimal cost.  *title* is stored as the
    unicode message and the currency is always the euro sign.
    set_empty_fields(NotAvailable) is applied to the price before the
    shop is attached, then to the freshly created Shop, which reuses
    the price id.
    """
    offer = Price(id)
    offer.product = product
    offer.cost = Decimal(get_decimal(cost))
    offer.currency = u'€'
    offer.message = unicode(title)
    offer.set_empty_fields(NotAvailable)
    # the shop is attached only after the price's own empty fields are set
    offer.shop = Shop(offer.id)
    offer.shop.set_empty_fields(NotAvailable) # NotLoaded
    return offer
 # I manage listing page and extract information
 class ListingAutoPage(BasePage):
    def _extract(self, tr, name):
@ -47,9 +65,17 @@ class ListingAutoPage(BasePage):
            return ''
        return td[-1].text_content().strip()
    def _extract_id(self, tr):
        tdas = tr.cssselect('td.lcbrand a')
        if tdas is None or len(tdas)==0: return None
        tda = tdas[0]
        m = re.search('annonce-(\d+)\.html', tda.get('href'))
        if not m: return None
        return m.group(1)
    def iter_prices(self, product, numpage):
-        for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id]'):
+        for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id],tr.lclineB[id]'):
-            id = '{numpage}.{id}'.format(numpage=numpage, id=tr.attrib['id'][3:])
+            id = self._extract_id(tr)
            title = self._extract(tr, 'lcbrand')
            if not title:
                continue
@ -62,16 +88,7 @@ class ListingAutoPage(BasePage):
            cost = ', ' + self._extract(tr, 'lcprice')
-            price = Price(id)
+            yield new_price(id, product, cost, title)
            price.product = product
            price.cost = Decimal(re.findall(r'\d+',cost.replace(' ',''))[0])
            price.currency = u'€'
            price.message = unicode(title)
            price.shop = Shop(price.id)
            price.shop.set_empty_fields(NotAvailable)
            price.set_empty_fields(NotAvailable)
            yield price
    def get_next(self):
        for a in self.document.getroot().cssselect('a.page'):
@ -83,38 +100,68 @@ class ListingAutoPage(BasePage):
                return int(m.group(1))
        return None
-#class ComparisonResultsPage(BasePage):
+# I manage one car page (annonce) and extract information
-    #def get_product_name(self):
+class AnnoncePage(BasePage):
        #try:
            #div = self.parser.select(self.document.getroot(), 'div#moins_plus_ariane', 1)
        #except BrokenPageError:
            #return NotAvailable
        #else:
            #m = re.match('Carburant : ([\w\-]+) | .*', div.text)
            #return m.group(1)
-    #def iter_results(self, product=None):
+    def _extract(self, e, name):
-        #price = None
+        'Extract content from li element with class name'
-        #product.name = self.get_product_name()
+        li = e.cssselect('li.' + name)
-        #for tr in self.document.getroot().cssselect('table#tab_resultat tr'):
+        if not li:
-            #if tr.attrib.get('id', '').startswith('pdv'):
+            return ''
-                #price = Price('%s.%s' % (product.id, tr.attrib['id'][3:]))
+        return li[0].text_content().strip()
-                #price.product = product
+    def _extract_info(self, e, name):
        'Extract content from InfoLib'
        for td in e.cssselect('td.InfoLib'):
            if name in td.text_content():
                ntd = td.getnext()
                if ntd is None: continue
                return ntd.text_content().strip()
        return None
-                #tds = tr.findall('td')
+    def _extract_vendor(self, e, name):
-                #price.cost = Decimal(tds[4].text.replace(',', '.'))
+        'Extract content from VendorLib'
-                #price.currency = u'€'
+        for span in e.cssselect('span.VendeurLib'):
            if name in span.text_content():
                li = span.getparent()
                if li is None: continue
                # get all text
                s = li.text_content()
                # get text without header
                s = s[len(span.text_content())+1:]
                # special case for not pro
                if '\n' in s:
                    s = s[:s.find('\n')]
                return s.strip()
        return None
-                #shop = Shop(price.id)
+    def get_shop(self, id):
-                #shop.name = unicode(tds[2].text.strip())
+        shop = Shop(id)
-                #shop.location = unicode(tds[0].text.strip())
+        for e in self.document.getroot().cssselect('div#Vendeur'):
            shop.name = self._extract_vendor(e,'Nom') + '(' + self._extract_vendor(e,'Vendeur') + ')'
        shop.location = ''
        for adr in self.document.getroot().cssselect('span#AdresseL1,span#AdresseL2'):
            if shop.location:
                shop.location += ', '
            shop.location += adr.text_content().strip()
        for tel in self.document.getroot().cssselect('span.Tel'):
            s = tel.text_content().strip()
            if shop.location:
                shop.location += ', '
            shop.location += re.sub('\s+', ' ', s)
        shop.set_empty_fields(NotAvailable)
        return shop
-                #price.shop = shop
+    def get_price(self, id):
-                #price.set_empty_fields(NotAvailable)
+        for e in self.document.getroot().cssselect('div#DescBar'):
-                #yield price
+            product = Product(1)
-
+            product.name = unicode('Occasion')
-
+            cost = self._extract(e,'PriceLc')
-#class ShopInfoPage(BasePage):
+            title = self._extract(e,'BrandLc')
-    #def get_info(self):
+            title += ', ' + self._extract(e,'modeleCom')
-        #return self.parser.tostring(self.parser.select(self.document.getroot(), 'div.colg', 1))
+            title += ', ' + self._extract_info(e,'Version')
            title += ', ' + self._extract_info(e,'Ann')
            title += ', ' + get_decimal(self._extract_info(e,'Kilom')) + 'km'
            price = new_price(id, product, cost, title)
            price.shop = self.get_shop(id)
            return price