# -*- coding: utf-8 -*- # Copyright(C) 2013 Mathieu Jourdan # # This file is part of weboob. # # weboob is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # weboob is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . import re import os import subprocess import tempfile import shutil from datetime import date from decimal import Decimal from weboob.tools.browser import BasePage from weboob.capabilities.base import NotAvailable from weboob.capabilities.bill import Detail, Bill __all__ = ['HistoryPage', 'PdfPage'] class HistoryPage(BasePage): def on_loaded(self): self.details = [] self.bills = [] # Latest bill div = self.document.xpath('//div[@class="consulter_dernierefacture"]')[0] bdate = div.xpath('p[@class="date"]/span[@class="textetertiaire"]')[0].text bprice = div.xpath('p[@class="montant"]/span[@class="textetertiaire"]')[0].text link = div.xpath('a[@id="display_popin"]')[0].attrib['href'] mydate = date(*reversed([int(x) for x in bdate.split("/")])) price = Decimal(bprice.strip(u' € TTC').replace(',', '.')) self.bills.append(self._create_bill(mydate, price, link)) # Previous bills table = self.document.xpath('//table[@class="afficher_factures"]')[0] for tr in table[0].xpath('//tbody/tr'): cells = tr.xpath('td') bdate = unicode(cells[0].text.strip()) mydate = date(*reversed([int(x) for x in bdate.split("/")])) bprice = unicode(cells[1].text) price = Decimal(bprice.strip(u' €').replace(',', '.')) link = cells[3].xpath('a')[0].attrib['href'] self.bills.append(self._create_bill(mydate, price, link)) def _create_bill(self, date, price, link): bill = Bill() bill.id = date.__str__().replace('-', '') bill.date = date bill._price = price bill._url = link bill.format = u'pdf' bill.label = unicode(price) return bill def get_details(self): return self.details def get_bills(self): return self.bills class PdfPage(): def __init__(self, file): self.pdf = file def _parse_pdf(self): pdffile = tempfile.NamedTemporaryFile(bufsize=100000, mode='w', suffix='.pdf') temptxt = pdffile.name.replace('.pdf', '.txt') cmd = "ebook-convert" stdout = open("/dev/null", "w") shutil.copyfileobj(self.pdf, pdffile) pdffile.flush() subprocess.call([cmd, pdffile.name, temptxt], stdout=stdout) pdffile.close() txtfile = open(temptxt, 'r') txt = txtfile.read() txtfile.close() os.remove(temptxt) return txt def _parse_page(self, page): # Regexp footnote = re.compile(r'\([0-9]\) ') # (f) ht = re.compile('HT par mois') base = re.compile('la base de') enddate = re.compile('\d\d\/\d\d\/\d\d') # YY/MM/DD endwithdigit = re.compile('\d+$') # blah blah 42 textwithcoma = re.compile('([a-z]|\d{4})\,') # blah 2012, blah blah # Parsing details = [] for title in ['Abonnement', 'Consommation', 'Contributions et taxes liées à l\'énergie']: section = page.split(title, 1)[1].split('Total ')[0] # When a line holds '(0)', a newline is missing. section = re.sub(footnote, '\n', section) lines = section.split('\n') lines = [x for x in lines if len(x) > 0] # Remove empty lines detail = None for line in lines: if re.match('[A-Za-z]', line[0]): # Things we want to merge with the one just before if 'facturées' in line: # Long lines are sometimes split, so we try to join them # That is the case for: # 'Déduction du montant des consommations # estimées facturées du 00/00/00 au 00/00/00' detail.label = detail.label + u' ' + unicode(line, encoding='utf-8') # Things for which we want a new detail else: # Entering here, we will instantiate a new detail. # We hadn't so before because of fragmented lines. if detail is not None and detail.label is not NotAvailable: # We have a new element, return the other one details.append(detail) detail = Detail() detail.price = Decimal(0) # If the coma is not a decimal separator, then # this is is probably a loooong sentence. # When it comes to jokes, keep it short and sweet. line = re.split(textwithcoma, line)[0] # Things we want for sure if re.findall(enddate, line): # When a line has been badly split after a date, # We want the label to end after the date, and maybe # the second part to be the info mydate = re.search(enddate, line).group(0) mylist = line.rpartition(mydate) label = mylist[0] + mylist[1] detail.label = unicode(label, encoding='utf-8') elif re.findall(endwithdigit, line): # What is this stupid number at the end of the line? # Line should have been split before the number detail.label = unicode(re.split(endwithdigit, line)[0], encoding='utf-8') # Things we don't want for sure elif ')' in line and '(' not in line: # First part of the parenthesis should have been drop before # Avoid to create a new empty detail detail.label = NotAvailable elif re.match(base, line): # This string should come always after a date, # usually, it will match one of the cases above. # Sometimes, it appears on a new line we don't need. detail.label = NotAvailable elif re.match(ht, line): # '00,00 € HT par mois' may have been split after HT # We don't need of the second line detail.label = NotAvailable # Things we probably want to keep else: # Well, maybe our line is correct, after all. # Not much to do. detail.label = unicode(line, encoding='utf-8') detail.infos = NotAvailable elif ' %' in line: if isinstance(detail, Detail): # Sometimes the vat is not on a new line: # '00,00 00,0 %' instead of '00,0 %' vat = line.split()[line.count(' ')-1].replace(',', '.') detail.infos = unicode('TVA: ' + vat) elif ' €' in line: price = line.replace(',', '.') if isinstance(detail, Detail): detail.price = Decimal(price.strip(' €')) elif re.match(enddate, line): # Line holding dates may have been mixed up label = detail.label.split(' au ')[0] + u' au ' + unicode(line, encoding='utf-8') detail.label = label if detail.label is not NotAvailable: # Do not append empty details to the list # It seemed easier to create details anyway than dealing # with None objects details.append(detail) return details def get_details(self, label): txt = self._parse_pdf() page = None if label == u'Gaz naturel': page = txt.split('GAZ NATUREL')[1].split('TOTAL GAZ NATUREL TTC')[0] elif label == u'Electricité': page = txt.split('ELECTRICITE')[1].split('TOTAL ELECTRICITE TTC')[0] else: pass return self._parse_page(page)