Use AmericanTransaction.decimal_amount, closest_date, decompress_pdf and ReTokenizer in WellsFargo module. Part of #1641

This commit is contained in:
Oleg Plakhotniuk 2014-10-13 04:24:51 -05:00
commit a5859d384d
2 changed files with 38 additions and 120 deletions

View file

@ -18,10 +18,12 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.bank import Account, Transaction from weboob.capabilities.bank import Account, Transaction
from weboob.tools.capabilities.bank.transactions import \
AmericanTransaction as AmTr
from weboob.browser.pages import HTMLPage, LoggedPage, RawPage from weboob.browser.pages import HTMLPage, LoggedPage, RawPage
from urllib import unquote from urllib import unquote
from requests.cookies import morsel_to_cookie from requests.cookies import morsel_to_cookie
from .parsers import StatementParser, clean_amount, clean_label from .parsers import StatementParser, clean_label
import itertools import itertools
import re import re
import datetime import datetime
@ -138,7 +140,7 @@ class ActivityPage(AccountPage):
account.id = id_ account.id = id_
account.label = name account.label = name
account.currency = currency account.currency = currency
account.balance = clean_amount(balance) account.balance = AmTr.decimal_amount(balance)
account.type = type_ account.type = type_
return account return account
@ -196,9 +198,9 @@ class ActivityCashPage(ActivityPage):
desc = clean_label(desc) desc = clean_label(desc)
deposit = deposit.strip() deposit = deposit.strip()
deposit = clean_amount(deposit or '0') deposit = AmTr.decimal_amount(deposit or '0')
withdraw = withdraw.strip() withdraw = withdraw.strip()
withdraw = clean_amount(withdraw or '0') withdraw = AmTr.decimal_amount(withdraw or '0')
amount = deposit - withdraw amount = deposit - withdraw
@ -273,9 +275,9 @@ class ActivityCardPage(ActivityPage):
ref = re.match('.*<REFERENCE ([^>]+)>.*', ref).group(1) ref = re.match('.*<REFERENCE ([^>]+)>.*', ref).group(1)
if amount.startswith('+'): if amount.startswith('+'):
amount = clean_amount(amount[1:]) amount = AmTr.decimal_amount(amount[1:])
else: else:
amount = -clean_amount(amount) amount = -AmTr.decimal_amount(amount)
trans = Transaction(ref) trans = Transaction(ref)
trans.date = tdate trans.date = tdate

View file

@ -18,19 +18,15 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.bank import Transaction from weboob.capabilities.bank import Transaction
from weboob.tools.capabilities.bank.transactions import AmericanTransaction from weboob.tools.capabilities.bank.transactions import \
from decimal import Decimal AmericanTransaction as AmTr
from tempfile import mkstemp from weboob.tools.date import closest_date
import subprocess from weboob.tools.pdf import decompress_pdf
import os from weboob.tools.tokenizer import ReTokenizer
import re import re
import datetime import datetime
def clean_amount(text):
return Decimal(AmericanTransaction.clean_amount(text))
def clean_label(text): def clean_label(text):
""" """
Web view and statements use different label formatting. Web view and statements use different label formatting.
@ -40,51 +36,33 @@ def clean_label(text):
return re.sub(u' +', u' ', text.strip().upper(), re.UNICODE) return re.sub(u' +', u' ', text.strip().upper(), re.UNICODE)
def full_date(date, date_from, date_to):
    """
    Build a full transaction date from a day-and-month-only date.

    Transaction dates in a statement contain only day and month.
    The statement date range has a year though, so the month and day of
    ``date`` are combined with the year of ``date_from`` and with the
    year of ``date_to`` to produce the candidate dates.

    A candidate lying within [date_from, date_to] is returned if there
    is one; otherwise the candidate closest to ``date_from`` is returned.

    Candidates that do not exist in the calendar (February 29 combined
    with a non-leap year) are skipped, so a statement spanning a leap
    and a non-leap year still resolves to the valid candidate.

    Raises ValueError if neither boundary year yields a valid date.
    """
    candidates = []
    for bound in (date_from, date_to):
        try:
            candidates.append(
                datetime.datetime(bound.year, date.month, date.day))
        except ValueError:
            # This month/day does not exist in bound.year; try the other.
            continue
    if not candidates:
        raise ValueError('day is out of range for month in years %d and %d'
                         % (date_from.year, date_to.year))
    # Ideally, pick the date within given range.
    for candidate in candidates:
        if date_from <= candidate <= date_to:
            return candidate
    # Otherwise, return the candidate closest to the start of the range.
    return min(candidates, key=lambda d: abs(d - date_from))
def decompress_pdf(inpdf):
    """
    Run the given PDF data through ``mutool clean -d`` and return the
    contents of the file it produces.

    The input is written to a temporary file, mutool writes its result
    to a second temporary file, and both files are removed before
    returning, including when writing the input or running mutool fails.

    The exit status of mutool is not checked.
    """
    inh, inname = mkstemp(suffix='.pdf')
    outh, outname = mkstemp(suffix='.pdf')
    try:
        try:
            os.write(inh, inpdf)
        finally:
            # mutool works on file names, so the descriptors are not needed.
            os.close(inh)
            os.close(outh)
        # mutool is a part of MuPDF (http://www.mupdf.com).
        subprocess.call(['mutool', 'clean', '-d', inname, outname])
        # PDF is binary data: read it back without newline translation.
        with open(outname, 'rb') as f:
            return f.read()
    finally:
        os.remove(inname)
        os.remove(outname)
class StatementParser(object): class StatementParser(object):
""" """
Each "read_*" method which takes position as its argument, Each "read_*" method takes position as its argument,
returns next token position if read was successful, and returns next token position if read was successful,
and the same position if it was not. or the same position if it was not.
""" """
LEX = [
('amount', r'^\[\(([0-9,]+\.\d+)\)\] TJ$'),
('date', r'^\[\((\d+/\d+)\)\] TJ$'),
('date_range_1', r'^\[\(([A-z]+ \d+, \d{4})'
r' - ([A-z]+ \d+, \d{4})\)\] TJ$'),
('date_range_2', r'^\[\((\d{2}/\d{2}/\d{4})'
r' to (\d{2}/\d{2}/\d{4})\)\] TJ$'),
('layout_tz', r'^(\d+\.\d{2}) Tz$'),
('layout_tc', r'^(\d+\.\d{2}) Tc$'),
('layout_tw', r'^(\d+\.\d{2}) Tw$'),
('layout_tf', r'^/F(\d) (\d+\.\d{2}) Tf$'),
('layout_tm', r'^' + (r'(\d+\.\d+ )'*6) + r'Tm$'),
('ref', r'^\[\(([0-9A-Z]{17})\)\] TJ$'),
('text', r'^\[\(([^\)]+)\)\] TJ$')
]
def __init__(self, pdf): def __init__(self, pdf):
self._pdf = decompress_pdf(pdf) self._pdf = decompress_pdf(pdf)
self._tok = StatementTokenizer(self._pdf) self._tok = ReTokenizer(self._pdf, '\n', self.LEX)
def read_card_transactions(self): def read_card_transactions(self):
# Early check if this is a card account statement at all. # Early check if this is a card account statement at all.
@ -149,8 +127,8 @@ class StatementParser(object):
or ref_layout is None or ref is None or desc is None or amount is None: or ref_layout is None or ref is None or desc is None or amount is None:
return startPos, None return startPos, None
else: else:
tdate = full_date(tdate, date_from, date_to) tdate = closest_date(tdate, date_from, date_to)
pdate = full_date(pdate, date_from, date_to) pdate = closest_date(pdate, date_from, date_to)
trans = Transaction(ref) trans = Transaction(ref)
trans.date = tdate trans.date = tdate
@ -179,7 +157,7 @@ class StatementParser(object):
if desc is None or date is None or amount is None: if desc is None or date is None or amount is None:
return startPos, None return startPos, None
else: else:
date = full_date(date, date_from, date_to) date = closest_date(date, date_from, date_to)
trans = Transaction(u'') trans = Transaction(u'')
trans.date = date trans.date = date
@ -229,7 +207,7 @@ class StatementParser(object):
return startPos, None return startPos, None
else: else:
# Infer amount type by its indentation in the layout. # Infer amount type by its indentation in the layout.
amount_total = clean_amount('0') amount_total = AmTr.decimal_amount('0')
for (_, _, _, _, indent, _), amount in amounts: for (_, _, _, _, indent, _), amount in amounts:
within = lambda xmin_xmax: xmin_xmax[0] <= indent <= xmin_xmax[1] within = lambda xmin_xmax: xmin_xmax[0] <= indent <= xmin_xmax[1]
if within(range_skip): if within(range_skip):
@ -295,7 +273,7 @@ class StatementParser(object):
def read_amount(self, pos): def read_amount(self, pos):
t = self._tok.tok(pos) t = self._tok.tok(pos)
return (pos+1, clean_amount(t.value())) \ return (pos+1, AmTr.decimal_amount(t.value())) \
if t.is_amount() else (pos, None) if t.is_amount() else (pos, None)
def read_date_range(self, pos): def read_date_range(self, pos):
@ -334,65 +312,3 @@ class StatementParser(object):
return (pos+1, [float(v) for v in t.value()]) \ return (pos+1, [float(v) for v in t.value()]) \
if t.is_layout_tm() else (pos, None) if t.is_layout_tm() else (pos, None)
class StatementTokenizer(object):
    """
    Splits PDF text on newlines and wraps every line in a StatementToken.
    Tokens are addressed by their zero-based line position.
    """

    def __init__(self, pdf):
        self._tok = list(map(StatementToken, pdf.split('\n')))

    def tok(self, index):
        """
        Return the token at the given position, or an end-of-file token
        when the position is outside of the token list.
        """
        if index < 0 or index >= len(self._tok):
            return StatementToken(eof=True)
        return self._tok[index]
class StatementToken(object):
    """
    Simple regex-based lexer.
    There's a lexing table consisting of type-regex tuples.
    Text line is sequentially matched against regexes and first
    successful match defines the type of the token.

    The token value is the single captured group, a tuple of groups
    when the regex captures several, or the whole match when it
    captures none. A line matching no regex yields a token with
    neither type nor value.
    """

    # Order matters: the first matching entry wins, so the specific
    # patterns must precede the catch-all 'text' pattern.
    LEX = [
        ('amount', r'^\[\(([0-9,]+\.\d+)\)\] TJ$'),
        ('date', r'^\[\((\d+/\d+)\)\] TJ$'),
        # [A-Za-z] rather than [A-z]: the latter also matches the
        # punctuation characters located between 'Z' and 'a' in ASCII.
        ('date_range_1', r'^\[\(([A-Za-z]+ \d+, \d{4})'
                         r' - ([A-Za-z]+ \d+, \d{4})\)\] TJ$'),
        ('date_range_2', r'^\[\((\d{2}/\d{2}/\d{4})'
                         r' to (\d{2}/\d{2}/\d{4})\)\] TJ$'),
        ('layout_tz', r'^(\d+\.\d{2}) Tz$'),
        ('layout_tc', r'^(\d+\.\d{2}) Tc$'),
        ('layout_tw', r'^(\d+\.\d{2}) Tw$'),
        ('layout_tf', r'^/F(\d) (\d+\.\d{2}) Tf$'),
        ('layout_tm', r'^' + (r'(\d+\.\d+ )'*6) + r'Tm$'),
        ('ref', r'^\[\(([0-9A-Z]{17})\)\] TJ$'),
        ('text', r'^\[\(([^\)]+)\)\] TJ$')
    ]

    def __init__(self, line=None, eof=False):
        self._eof = eof
        self._value = None
        self._type = None
        if line is None:
            return
        for type_, regex in self.LEX:
            m = re.match(regex, line, flags=re.UNICODE)
            if not m:
                continue
            self._type = type_
            groups = m.groups()
            if len(groups) == 1:
                self._value = groups[0]
            elif groups:
                self._value = groups
            else:
                self._value = m.group(0)
            break

    def is_eof(self):
        """Tell whether this token marks the end of the input."""
        return self._eof

    def value(self):
        """Return the captured value, or None if nothing was matched."""
        return self._value
def _make_type_check(type_):
    """Build a method telling whether a token is of the given type."""
    def is_type(self):
        return self._type == type_
    is_type.__name__ = 'is_%s' % type_
    return is_type


# Give StatementToken one "is_<type>" method per entry of its lexing table.
# A closure is used instead of eval() on generated source code.
for type_, _ in StatementToken.LEX:
    setattr(StatementToken, 'is_%s' % type_, _make_type_check(type_))