weboob-devel/modules/leclercmobile/pages/history.py
2014-03-26 21:03:44 +01:00

191 lines
7.2 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
# Copyright(C) 2012 Florent Fourcot
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
import os
import subprocess
import tempfile
import shutil
from datetime import datetime, date, time
from decimal import Decimal
from weboob.tools.browser import BasePage
from weboob.capabilities.bill import Detail, Bill
__all__ = ['HistoryPage', 'PdfPage']
def _get_date(detail):
return detail.datetime
class PdfPage():
def __init__(self, file):
self.pdf = file
def _parse_pdf(self):
pdffile = tempfile.NamedTemporaryFile(bufsize=100000, mode='w', suffix='.pdf')
temptxt = pdffile.name.replace('.pdf', '.txt')
cmd = "ebook-convert"
stdout = open("/dev/null", "w")
shutil.copyfileobj(self.pdf, pdffile)
pdffile.flush()
subprocess.call([cmd, pdffile.name, temptxt], stdout=stdout)
pdffile.close()
txtfile = open(temptxt, 'r')
txt = txtfile.read()
txtfile.close()
os.remove(temptxt)
return txt
def get_details(self):
txt = self._parse_pdf()
page = txt.split('CONSOMMATION')[2].split('ACTIVITE DETAILLEE')[0]
lines = page.split('\n')
lines = [x for x in lines if len(x) > 0] # Remove empty lines
details = []
detail = None
lines.pop(-1) # Line to describes pictures
twolines = False
for line in lines:
if "Votre consommation" in line:
line = line.split(": ", 1)[1]
if twolines:
twolines = False
detail.infos = unicode(line, encoding='utf-8')
elif re.match('[A-Za-z]', line[0]):
# We have a new element, return the other one
if detail is not None:
details.append(detail)
detail = Detail()
split = re.split("(\d)", line, maxsplit=1)
detail.price = Decimal(0)
if len(split) > 2:
detail.infos = unicode(split[1] + split[2], encoding='utf-8')
else:
twolines = True
if '' in line:
specialprice = split[1] + split[2]
detail.price = Decimal(specialprice.replace('', ''))
detail.label = unicode(split[0], encoding='utf-8')
elif '' in line:
detail.price = Decimal(line.replace('', ''))
else:
detail.infos = unicode(line, encoding='utf-8')
details.append(detail)
return details
def get_balance(self):
for calls in self.get_calls():
if "Votre solde" in calls.label:
detail = Detail()
detail.price = calls.price
detail.label = u"Balance"
return detail
# Standard pdf text extractor take text line by line
# But the position in the file is not always the "real" position to display...
# It produce some unsorted and unparsable data
# Example of bad software: pdfminer and others python tools
# This is why we have to use "ebook-convert" from calibre software,
# it is the only one to 'reflow" text and give some relevant results
# The bad new is that ebook-convert doesn't support simple use with stdin/stdout
def get_calls(self):
txt = self._parse_pdf()
pages = txt.split("DEBIT")
pages.pop(0) # remove headers
details = []
for page in pages:
page = page.split('RÉGLO MOBILE')[0].split('N.B. Prévoir')[0] # remove footers
lines = page.split('\n')
lines = [x for x in lines if len(x) > 0] # Remove empty lines
numitems = (len(lines) + 1) / 4 # Each line has five columns
lines.pop(0) # remove the extra € symbol
modif = 0
i = 0
while i < numitems:
if modif != 0:
numitems = ((len(lines) + 1 + modif) / 4)
base = i * 4 - modif
dateop = base
corres = base + 1
duree = base + 2
price = base + 3
if "Changement vers le Forfait" in lines[base]:
modif += 1
i += 1
continue
# Special case with 5 columns, the operation date is not in the first one
if len(re.split("(\d+\/\d+\/\d+)", lines[dateop])) < 2:
lines[base + 1] = lines[base] + " " + lines[base + 1]
dateop = base + 1
corres = base + 2
duree = base + 3
price = base + 4
modif -= 1
detail = Detail()
splits = re.split("(\d+\/\d+\/\d+)", lines[dateop])
mydate = date(*reversed([int(x) for x in splits[1].split("/")]))
mytime = time(*[int(x) for x in splits[2].split(":")])
detail.datetime = datetime.combine(mydate, mytime)
if lines[corres] == '-':
lines[corres] = ""
if lines[duree] == '-':
lines[duree] = ''
detail.label = unicode(splits[0], encoding='utf-8', errors='replace') + u" " + lines[corres] + u" " + lines[duree]
# Special case with only 3 columns, we insert a price
if "Activation de votre ligne" in detail.label or u"Résiliation" in detail.label:
lines.insert(price, '0')
try:
detail.price = Decimal(lines[price].replace(',', '.'))
except:
# In some special cases, there are no price column. Try to detect it
if "Inclus" not in lines[price]:
modif += 1
detail.price = Decimal(0)
details.append(detail)
i += 1
return sorted(details, key=_get_date, reverse=True)
class HistoryPage(BasePage):
def on_loaded(self):
pass
def getmaxid(self):
max = 1
while len(self.document.xpath('//li[@id="liMois%s"]' % max)) > 0:
max += 1
return max - 1
def date_bills(self, parentid):
max = 1
while len(self.document.xpath('//li[@id="liMois%s"]' % max)) > 0:
li = self.document.xpath('//li[@id="liMois%s"]' % max)[0]
max += 1
link = li.xpath('a')[0]
bill = Bill()
bill._url = link.attrib['href']
bill.label = unicode(link.text)
bill.format = u"pdf"
bill.id = parentid + bill.label.replace(' ', '')
yield bill