Better id and some improvment

Signed-off-by: Vicnet <vo.publique@gmail.com>
2014-03-14 14:14:00 +01:00 · 2014-03-14 14:14:00 +01:00 · 8f09828ebf
commit 8f09828ebf
parent 4584384db7
3 changed files with 138 additions and 95 deletions
--- a/modules/lacentrale/pages.py
+++ b/modules/lacentrale/pages.py
@ -22,13 +22,14 @@
 #import re

 from weboob.tools.browser import BasePage, BrokenPageError
-from weboob.capabilities import NotAvailable
+from weboob.capabilities import NotAvailable, NotLoaded
 from weboob.capabilities.pricecomparison import Product, Price, Shop
 import re
 from decimal import Decimal

 __all__ = ['MainPage','ListingAutoPage']

+# I manage main page, ie do nothing yet
 class MainPage(BasePage):
    def iter_products(self, criteria):
        product = Product(1)
@ -38,6 +39,23 @@ class MainPage(BasePage):
        product._criteria = criteria
        yield product

+def get_decimal(s):
+    return re.findall(r'\d+', s.replace(' ',''))[0]
+
+def new_price(id, product, cost, title):
+    price = Price(id)
+    price.product = product
+    price.cost = Decimal(get_decimal(cost))
+    price.currency = u'€'
+    price.message = unicode(title)
+    price.set_empty_fields(NotAvailable)
+    
+    price.shop = Shop(price.id)
+    price.shop.set_empty_fields(NotAvailable) # NotLoaded
+    
+    return price
+
+# I manage listing page and extract information
 class ListingAutoPage(BasePage):

    def _extract(self, tr, name):
@ -47,9 +65,17 @@ class ListingAutoPage(BasePage):
            return ''
        return td[-1].text_content().strip()

+    def _extract_id(self, tr):
+        tdas = tr.cssselect('td.lcbrand a')
+        if tdas is None or len(tdas)==0: return None
+        tda = tdas[0]
+        m = re.search('annonce-(\d+)\.html', tda.get('href'))
+        if not m: return None
+        return m.group(1)
+
    def iter_prices(self, product, numpage):
-        for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id]'):
-            id = '{numpage}.{id}'.format(numpage=numpage, id=tr.attrib['id'][3:])
+        for tr in self.document.getroot().cssselect('tr.lcline[id],tr.lclineJB[id],tr.lclineJ[id],tr.lclineB[id]'):
+            id = self._extract_id(tr)
            title = self._extract(tr, 'lcbrand')
            if not title:
                continue
@ -62,16 +88,7 @@ class ListingAutoPage(BasePage):

            cost = ', ' + self._extract(tr, 'lcprice')

-            price = Price(id)
-            price.product = product
-            price.cost = Decimal(re.findall(r'\d+',cost.replace(' ',''))[0])
-            price.currency = u'€'
-            price.message = unicode(title)
-            price.shop = Shop(price.id)
-            price.shop.set_empty_fields(NotAvailable)
-
-            price.set_empty_fields(NotAvailable)
-            yield price
+            yield new_price(id, product, cost, title)

    def get_next(self):
        for a in self.document.getroot().cssselect('a.page'):
@ -83,38 +100,68 @@ class ListingAutoPage(BasePage):
                return int(m.group(1))
        return None

-#class ComparisonResultsPage(BasePage):
-    #def get_product_name(self):
-        #try:
-            #div = self.parser.select(self.document.getroot(), 'div#moins_plus_ariane', 1)
-        #except BrokenPageError:
-            #return NotAvailable
-        #else:
-            #m = re.match('Carburant : ([\w\-]+) | .*', div.text)
-            #return m.group(1)
+# I manage one car page (annonce) )and extract information
+class AnnoncePage(BasePage):

-    #def iter_results(self, product=None):
-        #price = None
-        #product.name = self.get_product_name()
-        #for tr in self.document.getroot().cssselect('table#tab_resultat tr'):
-            #if tr.attrib.get('id', '').startswith('pdv'):
-                #price = Price('%s.%s' % (product.id, tr.attrib['id'][3:]))
+    def _extract(self, e, name):
+        'Extract content from li element with class name'
+        li = e.cssselect('li.' + name)
+        if not li:
+            return ''
+        return li[0].text_content().strip()

-                #price.product = product
+    def _extract_info(self, e, name):
+        'Extract content from InfoLib'
+        for td in e.cssselect('td.InfoLib'):
+            if name in td.text_content():
+                ntd = td.getnext()
+                if ntd is None: continue
+                return ntd.text_content().strip()
+        return None

-                #tds = tr.findall('td')
-                #price.cost = Decimal(tds[4].text.replace(',', '.'))
-                #price.currency = u'€'
+    def _extract_vendor(self, e, name):
+        'Extract content from VendorLib'
+        for span in e.cssselect('span.VendeurLib'):
+            if name in span.text_content():
+                li = span.getparent()
+                if li is None: continue
+                # get all text
+                s = li.text_content()
+                # get text without header
+                s = s[len(span.text_content())+1:]
+                # special case for not pro
+                if '\n' in s:
+                    s = s[:s.find('\n')]
+                return s.strip()
+        return None

-                #shop = Shop(price.id)
-                #shop.name = unicode(tds[2].text.strip())
-                #shop.location = unicode(tds[0].text.strip())
+    def get_shop(self, id):
+        shop = Shop(id)
+        for e in self.document.getroot().cssselect('div#Vendeur'):
+            shop.name = self._extract_vendor(e,'Nom') + '(' + self._extract_vendor(e,'Vendeur') + ')'
+        shop.location = ''
+        for adr in self.document.getroot().cssselect('span#AdresseL1,span#AdresseL2'):
+            if shop.location:
+                shop.location += ', '
+            shop.location += adr.text_content().strip()
+        for tel in self.document.getroot().cssselect('span.Tel'):
+            s = tel.text_content().strip()
+            if shop.location:
+                shop.location += ', '
+            shop.location += re.sub('\s+', ' ', s)
+        shop.set_empty_fields(NotAvailable)
+        return shop

-                #price.shop = shop
-                #price.set_empty_fields(NotAvailable)
-                #yield price
-
-
-#class ShopInfoPage(BasePage):
-    #def get_info(self):
-        #return self.parser.tostring(self.parser.select(self.document.getroot(), 'div.colg', 1))
+    def get_price(self, id):
+        for e in self.document.getroot().cssselect('div#DescBar'):
+            product = Product(1)
+            product.name = unicode('Occasion')
+            cost = self._extract(e,'PriceLc')
+            title = self._extract(e,'BrandLc')
+            title += ', ' + self._extract(e,'modeleCom')
+            title += ', ' + self._extract_info(e,'Version')
+            title += ', ' + self._extract_info(e,'Ann')
+            title += ', ' + get_decimal(self._extract_info(e,'Kilom')) + 'km'
+            price = new_price(id, product, cost, title)
+            price.shop = self.get_shop(id)
+            return price