weboob-devel/modules/gdfsuez/pages/history.py
2014-10-12 02:30:45 +02:00

210 lines
8.9 KiB
Python

# -*- coding: utf-8 -*-
# Copyright(C) 2013 Mathieu Jourdan
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
import os
import subprocess
import tempfile
import shutil
from datetime import date
from decimal import Decimal
from weboob.deprecated.browser import Page
from weboob.capabilities.base import NotAvailable
from weboob.capabilities.bill import Detail, Bill
class HistoryPage(Page):
    """Bill history page.

    ``on_loaded`` reads the "latest bill" box and the table of previous
    bills from ``self.document`` and stores one ``Bill`` per invoice in
    ``self.bills``.  ``self.details`` is initialised to an empty list and is
    never filled by this page.
    """

    def on_loaded(self):
        """Build ``self.bills`` from the latest-bill box and the bills table.

        Every ``[0]`` lookup below assumes the corresponding node exists in
        the document; a missing node surfaces as an ``IndexError``.
        """
        self.details = []
        self.bills = []

        # Latest bill
        div = self.document.xpath('//div[@class="consulter_dernierefacture"]')[0]
        bdate = div.xpath('p[@class="date"]/span[@class="textetertiaire"]')[0].text
        bprice = div.xpath('p[@class="montant"]/span[@class="textetertiaire"]')[0].text
        link = div.xpath('a[@id="display_popin"]')[0].attrib['href']
        price = Decimal(bprice.strip(u' € TTC').replace(',', '.'))
        self.bills.append(self._create_bill(self._parse_date(bdate), price, link))

        # Previous bills
        table = self.document.xpath('//table[@class="afficher_factures"]')[0]
        # Use a path relative to the table: an absolute '//tbody/tr' is
        # evaluated from the document root whatever element it is called on,
        # so it would also pick up rows of unrelated tables.
        for tr in table.xpath('.//tbody/tr'):
            cells = tr.xpath('td')
            bdate = unicode(cells[0].text.strip())
            bprice = unicode(cells[1].text)
            # Drop the surrounding blanks and currency sign before handing
            # the amount to Decimal (presumably rendered like '12,34 €').
            price = Decimal(bprice.strip(u' €').replace(',', '.'))
            link = cells[3].xpath('a')[0].attrib['href']
            self.bills.append(self._create_bill(self._parse_date(bdate), price, link))

    @staticmethod
    def _parse_date(text):
        """Convert a 'day/month/year' string into a ``datetime.date``."""
        # The fields are reversed so that they reach date() as
        # (year, month, day).
        return date(*reversed([int(part) for part in text.split("/")]))

    def _create_bill(self, date, price, link):
        """Return a ``Bill`` filled from a date, a price and a download link.

        The bill id is the ISO date without dashes (e.g. '20130115') and the
        label is the price rendered as text.
        """
        bill = Bill()
        bill.id = str(date).replace('-', '')
        bill.date = date
        bill._price = price
        bill._url = link
        bill.format = u'pdf'
        bill.label = unicode(price)
        return bill

    def get_details(self):
        """Return the details list (always empty for this page)."""
        return self.details

    def get_bills(self):
        """Return the bills collected by ``on_loaded``."""
        return self.bills
class PdfPage():
    """Extracts billing details from a PDF invoice.

    The PDF is converted to plain text with the external ``ebook-convert``
    command, then the text is cut into sections and parsed line by line.
    """

    def __init__(self, file):
        """Keep a reference to *file*, a readable file-like object.

        NOTE(review): assumed to yield the raw PDF content -- confirm
        against the caller.
        """
        self.pdf = file

    def _parse_pdf(self):
        """Convert ``self.pdf`` to text with ``ebook-convert`` and return it.

        The PDF is first copied to a named temporary file because the
        converter works on paths.  Both temporary files are removed before
        returning, including when the conversion fails.
        """
        cmd = "ebook-convert"
        # Binary mode: the PDF content must be copied untouched.
        pdffile = tempfile.NamedTemporaryFile(bufsize=100000, mode='wb', suffix='.pdf')
        # Only swap the extension; a plain str.replace() would also rewrite
        # any '.pdf' found elsewhere in the path.
        temptxt = os.path.splitext(pdffile.name)[0] + '.txt'
        try:
            shutil.copyfileobj(self.pdf, pdffile)
            pdffile.flush()
            # The converter's chatter is discarded.
            with open(os.devnull, 'w') as devnull:
                subprocess.call([cmd, pdffile.name, temptxt], stdout=devnull)
        finally:
            # Closing a NamedTemporaryFile also deletes it.
            pdffile.close()
        try:
            with open(temptxt, 'r') as txtfile:
                return txtfile.read()
        finally:
            if os.path.exists(temptxt):
                os.remove(temptxt)

    def _parse_page(self, page):
        """Parse the text of one energy section into a list of ``Detail``.

        *page* is searched for the 'Abonnement', 'Consommation' and
        'Contributions et taxes' sub-sections; each one runs from its title
        up to the next 'Total '.  A sub-section whose title is absent from
        *page* is skipped.  Lines starting with a letter open (or extend) a
        detail label; the other lines provide the VAT rate, the price or a
        misplaced end date for the current detail.
        """
        # Regexp
        startswithletter = re.compile('[A-Za-z]')
        footnote = re.compile(r'\([0-9]\) ')              # (f)
        ht = re.compile('HT par mois')
        base = re.compile('la base de')
        enddate = re.compile(r'\d\d\/\d\d\/\d\d')         # 00/00/00
        endwithdigit = re.compile(r'\d+$')                # blah blah 42
        textwithcoma = re.compile(r'([a-z]|\d{4})\,')     # blah 2012, blah blah

        # Parsing
        details = []
        for title in ['Abonnement',
                      'Consommation',
                      'Contributions et taxes liées à l\'énergie']:
            if title not in page:
                # Nothing to read for this sub-section.
                continue
            section = page.split(title, 1)[1].split('Total ')[0]

            # When a line holds '(0)', a newline is missing.
            section = footnote.sub('\n', section)

            lines = [x for x in section.split('\n') if x]  # Remove empty lines
            detail = None

            for line in lines:
                if startswithletter.match(line):
                    has_label = (detail is not None
                                 and detail.label is not NotAvailable)

                    # Things we want to merge with the one just before
                    if 'facturées' in line and has_label:
                        # Long lines are sometimes split, so we try to join
                        # them.  That is the case for:
                        # 'Déduction du montant des consommations
                        # estimées facturées du 00/00/00 au 00/00/00'
                        # Without a previous label to extend, the line is
                        # handled as a new detail below.
                        detail.label = detail.label + u' ' + unicode(line, encoding='utf-8')

                    # Things for which we want a new detail
                    else:
                        # Entering here, we will instantiate a new detail.
                        # We had not done so before because of fragmented
                        # lines.
                        if has_label:
                            # We have a new element, store the previous one
                            details.append(detail)
                        detail = Detail()
                        detail.price = Decimal(0)

                        # If the comma is not a decimal separator, then this
                        # is probably a long sentence: keep its first part.
                        # NOTE(review): the split also drops the letter (or
                        # the four digits) captured right before the comma.
                        line = textwithcoma.split(line)[0]

                        # Things we want for sure
                        found = enddate.search(line)
                        if found:
                            # When a line has been badly split after a date,
                            # we want the label to end after the date.
                            head, sep, _ = line.rpartition(found.group(0))
                            detail.label = unicode(head + sep, encoding='utf-8')
                        elif endwithdigit.search(line):
                            # A stray number ends the line: it should have
                            # been split before the number.
                            detail.label = unicode(endwithdigit.split(line)[0], encoding='utf-8')

                        # Things we don't want for sure
                        elif ')' in line and '(' not in line:
                            # The first part of the parenthesis was dropped
                            # before; avoid creating a new empty detail.
                            detail.label = NotAvailable
                        elif base.match(line):
                            # This string should always come after a date;
                            # usually it matches one of the cases above.
                            # Sometimes it appears on a new line we don't
                            # need.
                            detail.label = NotAvailable
                        elif ht.match(line):
                            # '00,00 € HT par mois' may have been split after
                            # HT.  We don't need the second line.
                            detail.label = NotAvailable

                        # Things we probably want to keep
                        else:
                            # Well, maybe our line is correct, after all.
                            detail.label = unicode(line, encoding='utf-8')
                        detail.infos = NotAvailable
                elif ' %' in line:
                    if detail is not None:
                        # Sometimes the vat is not on a new line:
                        # '00,00 00,0 %' instead of '00,0 %'
                        vat = line.split()[line.count(' ')-1].replace(',', '.')
                        detail.infos = unicode('TVA: ' + vat)
                elif ' €' in line:
                    price = line.replace(',', '.')
                    if detail is not None:
                        detail.price = Decimal(price.strip(' €'))
                elif enddate.match(line):
                    # Line holding dates may have been mixed up
                    if detail is not None and detail.label is not NotAvailable:
                        label = detail.label.split(' au ')[0] + u' au ' + unicode(line, encoding='utf-8')
                        detail.label = label
            if detail is not None and detail.label is not NotAvailable:
                # Do not append empty details to the list.  A section
                # without any label line leaves ``detail`` unset.
                details.append(detail)
        return details

    def get_details(self, label):
        """Return the ``Detail`` list of the energy named by *label*.

        *label* is u'Gaz naturel' or u'Electricité'.  Any other value yields
        an empty list without running the PDF conversion.
        """
        markers = {
            u'Gaz naturel': ('GAZ NATUREL', 'TOTAL GAZ NATUREL TTC'),
            u'Electricité': ('ELECTRICITE', 'TOTAL ELECTRICITE TTC'),
        }
        if label not in markers:
            return []
        start, end = markers[label]
        txt = self._parse_pdf()
        page = txt.split(start)[1].split(end)[0]
        return self._parse_page(page)