# -*- coding: utf-8 -*-

# Copyright(C) 2012 Florent Fourcot
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

import re
|
||
import os
|
||
import subprocess
|
||
import tempfile
|
||
import shutil
|
||
|
||
from datetime import datetime, date, time
|
||
from decimal import Decimal
|
||
|
||
from weboob.tools.browser import BasePage
|
||
from weboob.capabilities.bill import Detail, Bill
|
||
|
||
|
||
# Names exported by ``from <this module> import *``.
__all__ = ['HistoryPage', 'PdfPage']
|
||
|
||
|
||
def _get_date(detail):
    """Return the ``datetime`` attribute of *detail*.

    Intended as a ``key`` function for sorting objects by their timestamp.
    """
    return detail.datetime
|
||
|
||
|
||
class PdfPage():
    """Extract consumption details and call records from a PDF bill.

    The PDF is converted to plain text by the external ``ebook-convert``
    command and the resulting text is then parsed line by line.
    """

    # Matches an operation date written as three slash-separated numbers.
    _DATE_RE = re.compile(r"(\d+/\d+/\d+)")

    def __init__(self, file):
        """Store *file*, a readable file-like object holding the PDF data."""
        self.pdf = file

    def _parse_pdf(self):
        """Convert the stored PDF to text with ``ebook-convert`` and return it.

        The PDF is copied into a temporary ``.pdf`` file, converted into a
        sibling ``.txt`` file, and both files are removed before returning,
        including when an error occurs part-way through.

        Errors from launching the converter or from reading its output are
        propagated to the caller.
        """
        # Mode and buffer size are passed positionally: the second parameter
        # is the buffer size on both Python 2 ("bufsize") and 3 ("buffering").
        # The PDF is binary data, hence 'wb'.
        pdffile = tempfile.NamedTemporaryFile('wb', 100000, suffix='.pdf')
        # Swap only the extension; a plain str.replace would also rewrite any
        # other '.pdf' occurring in the temporary directory path.
        temptxt = os.path.splitext(pdffile.name)[0] + '.txt'
        try:
            try:
                shutil.copyfileobj(self.pdf, pdffile)
                pdffile.flush()
                # Discard the converter's standard output.
                with open(os.devnull, 'w') as devnull:
                    subprocess.call(["ebook-convert", pdffile.name, temptxt],
                                    stdout=devnull)
            finally:
                # Closing a NamedTemporaryFile also deletes it.
                pdffile.close()
            with open(temptxt, 'r') as txtfile:
                return txtfile.read()
        finally:
            if os.path.exists(temptxt):
                os.remove(temptxt)

    def get_details(self):
        """Parse the consumption summary and return a list of ``Detail``.

        The section parsed is the text between the third chunk delimited by
        ``CONSOMMATION`` and the ``ACTIVITE DETAILLEE`` marker. Each returned
        ``Detail`` gets a ``label``, a ``price`` (``Decimal``, 0 by default)
        and, when available, ``infos``.
        """
        txt = self._parse_pdf()
        page = txt.split('CONSOMMATION')[2].split('ACTIVITE DETAILLEE')[0]
        # Drop empty lines, then the trailing line (it describes pictures).
        lines = [x for x in page.split('\n') if x][:-1]
        details = []
        detail = None
        # True when the infos of the current element are on the next line.
        twolines = False
        for line in lines:
            if "Votre consommation" in line:
                line = line.split(": ", 1)[1]
            if twolines:
                twolines = False
                detail.infos = unicode(line, encoding='utf-8')
            elif re.match('[A-Za-z]', line[0]):
                # A new element starts here: store the previous one.
                if detail is not None:
                    details.append(detail)
                detail = Detail()
                # Split at the first digit: label before, value from there on.
                parts = re.split(r"(\d)", line, maxsplit=1)
                detail.price = Decimal(0)
                if len(parts) > 2:
                    detail.infos = unicode(parts[1] + parts[2], encoding='utf-8')
                else:
                    twolines = True
                if '€' in line:
                    specialprice = parts[1] + parts[2]
                    detail.price = Decimal(specialprice.replace('€', ''))
                detail.label = unicode(parts[0], encoding='utf-8')
            elif '€' in line:
                detail.price = Decimal(line.replace('€', ''))
            else:
                detail.infos = unicode(line, encoding='utf-8')
        # Do not emit a ``None`` entry when no element was parsed at all.
        if detail is not None:
            details.append(detail)
        return details

    def get_balance(self):
        """Return a ``Detail`` labelled ``Balance``, or ``None`` if not found.

        The price is taken from the first call record whose label contains
        ``Votre solde``.
        """
        for call in self.get_calls():
            if "Votre solde" in call.label:
                detail = Detail()
                detail.price = call.price
                detail.label = u"Balance"
                return detail
        return None

    # Standard PDF text extractors take the text line by line, but the
    # position in the file is not always the "real" display position, which
    # produces unsorted and unparsable data (pdfminer and other Python tools
    # behave this way).
    # This is why "ebook-convert" from the calibre software is used: it is the
    # only one that "reflows" the text and gives relevant results.
    # The bad news is that ebook-convert does not support simple use with
    # stdin/stdout.
    def get_calls(self):
        """Parse the detailed activity and return a sorted list of ``Detail``.

        Every chunk following a ``DEBIT`` marker is treated as one page of
        records. Each ``Detail`` gets a ``datetime``, a ``label`` and a
        ``price``. The result is sorted by ``datetime``, most recent first.
        """
        # Local import: only ``Decimal`` is imported at module level.
        from decimal import InvalidOperation

        txt = self._parse_pdf()
        pages = txt.split("DEBIT")
        pages.pop(0)  # remove headers
        details = []
        for page in pages:
            page = page.split('RÉGLO MOBILE')[0].split('N.B. Prévoir')[0]  # remove footers
            lines = [x for x in page.split('\n') if x]  # Remove empty lines
            if not lines:
                # Nothing to parse on this page.
                continue
            # The original note says each entry has five columns, while the
            # loop below steps through the lines four at a time.
            # NOTE(review): confirm the intended column count.
            numitems = (len(lines) + 1) // 4
            lines.pop(0)  # remove the extra € symbol
            # Running correction applied to the 4-lines-per-entry indexing
            # when an entry uses more or fewer lines than expected.
            modif = 0
            i = 0
            while i < numitems:
                if modif != 0:
                    numitems = (len(lines) + 1 + modif) // 4
                base = i * 4 - modif
                dateop = base
                corres = base + 1
                duree = base + 2
                price = base + 3
                if "Changement vers le Forfait" in lines[base]:
                    modif += 1
                    i += 1
                    continue
                splits = self._DATE_RE.split(lines[dateop])
                # Special case with 5 columns: the operation date is not in
                # the first one, so merge it with the next line and shift.
                if len(splits) < 2:
                    lines[base + 1] = lines[base] + " " + lines[base + 1]
                    dateop = base + 1
                    corres = base + 2
                    duree = base + 3
                    price = base + 4
                    modif -= 1
                    splits = self._DATE_RE.split(lines[dateop])
                detail = Detail()
                # The date is written day/month/year, hence the reversal.
                mydate = date(*reversed([int(x) for x in splits[1].split("/")]))
                mytime = time(*[int(x) for x in splits[2].split(":")])
                detail.datetime = datetime.combine(mydate, mytime)
                if lines[corres] == '-':
                    lines[corres] = ""
                if lines[duree] == '-':
                    lines[duree] = ''
                # Decode every part explicitly: concatenating a unicode string
                # with a non-ASCII byte string raises UnicodeDecodeError.
                detail.label = u" ".join(
                    unicode(part, encoding='utf-8', errors='replace')
                    for part in (splits[0], lines[corres], lines[duree]))
                # Special case with only 3 columns: insert a price.
                if "Activation de votre ligne" in detail.label or u"Résiliation" in detail.label:
                    lines.insert(price, '0')
                try:
                    detail.price = Decimal(lines[price].replace(',', '.'))
                except InvalidOperation:
                    # In some special cases there is no price column.
                    # Try to detect it.
                    if "Inclus" not in lines[price]:
                        modif += 1
                    detail.price = Decimal(0)

                details.append(detail)
                i += 1
        return sorted(details, key=_get_date, reverse=True)
|
||
|
||
|
||
class HistoryPage(BasePage):
    """Page listing the available bills as ``<li id="liMoisN">`` items."""

    def on_loaded(self):
        """Do nothing when the page is loaded."""
        pass

    def _month_items(self):
        """Yield the first ``<li id="liMoisN">`` element for N = 1, 2, ...

        Iteration stops at the first N for which no such element exists.
        """
        index = 1
        while True:
            found = self.document.xpath('//li[@id="liMois%s"]' % index)
            if not found:
                return
            yield found[0]
            index += 1

    def getmaxid(self):
        """Return the highest N of the consecutive ``liMoisN`` items (0 if none)."""
        return sum(1 for _ in self._month_items())

    def date_bills(self, parentid):
        """Yield one ``Bill`` per ``liMoisN`` item, in increasing N order.

        Each bill takes its URL and label from the first ``<a>`` child of the
        item; its id is *parentid* followed by the label with spaces removed.
        """
        for li in self._month_items():
            link = li.xpath('a')[0]
            bill = Bill()
            bill._url = link.attrib['href']
            bill.label = unicode(link.text)
            bill.format = u"pdf"
            bill.id = parentid + bill.label.replace(' ', '')
            yield bill