gdfsuez-dolcevita.fr website

Signed-off-by: Mathieu Jourdan <mathieu.jourdan@gresille.org> Signed-off-by: Romain Bignon <romain@symlink.me>
2013-05-08 19:45:09 +02:00 · 2013-05-08 19:45:09 +02:00 · 79d1bcfea6
commit 79d1bcfea6
parent 2c633a43c8
7 changed files with 536 additions and 0 deletions
--- a/modules/gdfsuez/pages/history.py
+++ b/modules/gdfsuez/pages/history.py
@@ -0,0 +1,213 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013 Mathieu Jourdan
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+import re
+import os
+import subprocess
+import tempfile
+import shutil
+
+from datetime import datetime, date
+from decimal import Decimal
+
+from weboob.tools.browser import BasePage
+from weboob.capabilities.base import NotAvailable
+from weboob.capabilities.bill import Detail, Bill
+
+__all__ = ['HistoryPage', 'PdfPage']
+
+class HistoryPage(BasePage):
+    """Bill history page.
+
+    Once loaded, ``self.bills`` holds one Bill for the "latest bill" block
+    followed by one Bill per row of the previous-bills table.
+    ``self.details`` is initialised to an empty list and is never filled
+    by this page.
+    """
+
+    def on_loaded(self):
+        """Scrape the latest bill and the table of previous bills.
+
+        Raises IndexError when one of the expected elements is missing
+        from the document.
+        """
+        self.details = []
+        self.bills = []
+
+        # Latest bill
+        div = self.document.xpath('//div[@class="consulter_dernierefacture"]')[0]
+        bdate = div.xpath('p[@class="date"]/span[@class="textetertiaire"]')[0].text
+        bprice = div.xpath('p[@class="montant"]/span[@class="textetertiaire"]')[0].text
+        link = div.xpath('a[@id="display_popin"]')[0].attrib['href']
+        # strip() takes a *set* of characters: it removes any leading or
+        # trailing spaces, euro signs and letters T/C around the amount.
+        price = Decimal(bprice.strip(u' € TTC').replace(',', '.'))
+        self.bills.append(self._create_bill(self._parse_date(bdate), price, link))
+
+        # Previous bills
+        table = self.document.xpath('//table[@class="afficher_factures"]')[0]
+        # The leading '.' keeps the search inside this table; a bare '//'
+        # is absolute and would match the tbody rows of the whole document.
+        for tr in table.xpath('.//tbody/tr'):
+            cells = tr.xpath('td')
+            mydate = self._parse_date(unicode(cells[0].text.strip()))
+            bprice = unicode(cells[1].text)
+            price = Decimal(bprice.strip(u' €').replace(',', '.'))
+            link = cells[3].xpath('a')[0].attrib['href']
+            self.bills.append(self._create_bill(mydate, price, link))
+
+    def _parse_date(self, text):
+        """Convert a 'DD/MM/YYYY' string into a ``date`` object."""
+        # The fields come day first, date() wants the year first.
+        return date(*reversed([int(x) for x in text.split('/')]))
+
+    def _create_bill(self, date, price, link):
+        """Build a Bill from its date, amount and download link.
+
+        The bill id is the date written as YYYYMMDD and the label is the
+        amount; the amount and the link are kept in the private ``_price``
+        and ``_url`` attributes.
+        """
+        bill = Bill()
+        bill.id = str(date).replace('-', '')
+        bill.date = date
+        bill._price = price
+        bill._url = link
+        bill.format = u'pdf'
+        bill.label = unicode(price)
+        return bill
+
+    def get_details(self):
+        """Return the list of details (left empty by ``on_loaded``)."""
+        return self.details
+
+    def get_bills(self):
+        """Return the bills collected by ``on_loaded``."""
+        return self.bills
+
+class PdfPage():
+    """Extract billing details from a PDF bill.
+
+    The PDF is converted to plain text with the external ``ebook-convert``
+    command, then the text is parsed line by line.
+    """
+
+    def __init__(self, file):
+        """Keep the file-like object holding the PDF content."""
+        self.pdf = file
+
+    def _parse_pdf(self):
+        """Convert the PDF to text and return that text.
+
+        The PDF is copied into a temporary file, ``ebook-convert`` writes
+        its output to a sibling ``.txt`` file, and both temporary files are
+        removed before returning, even when the conversion or the read
+        fails. An IOError is raised when the converter produced no output.
+        """
+        # 'wb': the PDF is binary data and must be written untouched.
+        pdffile = tempfile.NamedTemporaryFile(bufsize=100000, mode='wb', suffix='.pdf')
+        # Only swap the extension; str.replace() would also rewrite any
+        # other '.pdf' occurring earlier in the path.
+        temptxt = os.path.splitext(pdffile.name)[0] + '.txt'
+        cmd = "ebook-convert"
+        try:
+            shutil.copyfileobj(self.pdf, pdffile)
+            pdffile.flush()
+            # The converter is chatty: discard its standard output.
+            with open(os.devnull, "w") as devnull:
+                subprocess.call([cmd, pdffile.name, temptxt], stdout=devnull)
+            with open(temptxt, 'r') as txtfile:
+                return txtfile.read()
+        finally:
+            # Closing the NamedTemporaryFile deletes the temporary PDF.
+            pdffile.close()
+            if os.path.exists(temptxt):
+                os.remove(temptxt)
+
+    def _parse_page(self, page):
+        """Parse the text of one energy section into a list of Detail.
+
+        ``page`` must contain the three sub-section titles looked up below
+        (IndexError otherwise). Each sub-section runs from its title to the
+        next 'Total ' marker. Lines starting with a letter open a new
+        detail (or extend the previous label); the following lines provide
+        the VAT rate, the price and possibly the end of a date range.
+        """
+
+        # Regexp
+        footnote = re.compile(r'\([0-9]\) ')                # (f)
+        ht = re.compile(r'HT par mois')
+        base = re.compile(r'la base de')
+        enddate = re.compile(r'\d\d\/\d\d\/\d\d')           # YY/MM/DD
+        endwithdigit = re.compile(r'\d+$')                  # blah blah 42
+        endwitheuro = re.compile('€$')                      # blah 00,00 €
+        textwithcoma = re.compile(r'([a-z]|\d{4})\,')       # blah 2012, blah blah
+
+        # Parsing
+        details = []
+        for title in [  'Abonnement',
+                        'Consommation',
+                        'Contributions et taxes liées à l\'énergie']:
+            section = page.split(title,1)[1].split('Total ')[0]
+
+            # When a line holds '(0)', a newline is missing.
+            section = re.sub(footnote,'\n', section)
+
+            lines = section.split('\n')
+            lines = [x for x in lines if len(x) > 0]  # Remove empty lines
+            detail = None
+
+            for line in lines:
+                if re.match('[A-Za-z]', line[0]):
+
+                    # Things we want to merge with the one just before.
+                    # Without a previous detail there is nothing to merge
+                    # with, so the line then starts a detail of its own.
+                    if 'facturées' in line and detail is not None:
+                        # Long lines are sometimes split, so we try to join them
+                        # That is the case for:
+                        # 'Déduction du montant des consommations
+                        # estimées facturées du 00/00/00 au 00/00/00'
+                        detail.label = detail.label + u' ' + unicode(line, encoding='utf-8')
+
+                    # Things for which we want a new detail
+                    else:
+                        # Entering here, we will instantiate a new detail.
+                        # We have not done so before because of fragmented lines.
+                        if detail is not None and detail.label is not NotAvailable:
+                            # We have a new element, store the previous one
+                            details.append(detail)
+                        detail = Detail()
+                        detail.price = Decimal(0)
+
+                        # If the comma is not a decimal separator, then
+                        # this is probably a loooong sentence: keep only
+                        # what comes before the comma.
+                        line = re.split(textwithcoma, line)[0]
+
+                        # Things we want for sure
+                        if re.findall(enddate, line):
+                            # When a line has been badly split after a date,
+                            # we want the label to end right after the last
+                            # occurrence of that date.
+                            mydate = re.search(enddate, line).group(0)
+                            mylist = line.rpartition(mydate)
+                            label = mylist[0] + mylist[1]
+                            detail.label = unicode(label, encoding='utf-8')
+                        elif re.findall(endwithdigit, line):
+                            # A stray number ends the line: the line should
+                            # have been split before it, so drop the number.
+                            detail.label = unicode(re.split(endwithdigit, line)[0], encoding='utf-8')
+                        # Things we don't want for sure
+                        elif ')' in line and '(' not in line:
+                            # The opening part of the parenthesis was dropped
+                            # earlier; avoid creating a meaningless detail.
+                            detail.label = NotAvailable
+                        elif re.match(base, line):
+                            # This string should always come after a date,
+                            # usually it matches one of the cases above.
+                            # Sometimes it appears on a new line we don't need.
+                            detail.label = NotAvailable
+                        elif re.match(ht, line):
+                            # '00,00 € HT par mois' may have been split after HT
+                            # We don't need the second line
+                            detail.label = NotAvailable
+                        # Things we probably want to keep
+                        else:
+                            # Well, maybe our line is correct, after all.
+                            # Not much to do.
+                            detail.label = unicode(line, encoding='utf-8')
+                        detail.infos = NotAvailable
+                elif ' %' in line:
+                    if isinstance(detail, Detail):
+                        # Sometimes the vat is not on a new line:
+                        # '00,00 00,0 %' instead of '00,0 %'
+                        vat = line.split()[line.count(' ')-1].replace(',', '.')
+                        detail.infos = unicode('TVA: ' + vat)
+                elif ' €' in line:
+                    price = line.replace(',','.')
+                    if isinstance(detail, Detail):
+                        detail.price = Decimal(price.strip(' €'))
+                elif re.match(enddate, line):
+                    # Line holding dates may have been mixed up.
+                    # Ignored, like the VAT and price lines, when no detail
+                    # has been opened yet.
+                    if isinstance(detail, Detail):
+                        label = detail.label.split(' au ')[0] + u' au ' + unicode(line, encoding='utf-8')
+                        detail.label = label
+            # A section without any usable line leaves detail set to None.
+            if detail is not None and detail.label is not NotAvailable:
+                # Do not append empty details to the list
+                # It seemed easier to create details anyway than dealing
+                # with None objects
+                details.append(detail)
+        return details
+
+    def get_details(self, label):
+        """Return the details of the energy section named ``label``.
+
+        ``label`` is u'Gaz naturel' or u'Electricité'. Any other label
+        yields an empty list, without running the PDF conversion.
+        """
+        # Text markers delimiting each energy section in the converted bill.
+        markers = {
+            u'Gaz naturel': ('GAZ NATUREL', 'TOTAL GAZ NATUREL TTC'),
+            u'Electricité': ('ELECTRICITE', 'TOTAL ELECTRICITE TTC'),
+        }
+        if label not in markers:
+            return []
+        start, end = markers[label]
+        txt = self._parse_pdf()
+        page = txt.split(start)[1].split(end)[0]
+        return self._parse_page(page)
+