Use AmericanTransaction.decimal_amount, closest_date, decompress_pdf and ReTokenizer in WellsFargo module. Part of #1641
This commit is contained in:
parent
3aaf8181a5
commit
a5859d384d
2 changed files with 38 additions and 120 deletions
|
|
@ -18,10 +18,12 @@
|
||||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
from weboob.capabilities.bank import Account, Transaction
|
from weboob.capabilities.bank import Account, Transaction
|
||||||
|
from weboob.tools.capabilities.bank.transactions import \
|
||||||
|
AmericanTransaction as AmTr
|
||||||
from weboob.browser.pages import HTMLPage, LoggedPage, RawPage
|
from weboob.browser.pages import HTMLPage, LoggedPage, RawPage
|
||||||
from urllib import unquote
|
from urllib import unquote
|
||||||
from requests.cookies import morsel_to_cookie
|
from requests.cookies import morsel_to_cookie
|
||||||
from .parsers import StatementParser, clean_amount, clean_label
|
from .parsers import StatementParser, clean_label
|
||||||
import itertools
|
import itertools
|
||||||
import re
|
import re
|
||||||
import datetime
|
import datetime
|
||||||
|
|
@ -138,7 +140,7 @@ class ActivityPage(AccountPage):
|
||||||
account.id = id_
|
account.id = id_
|
||||||
account.label = name
|
account.label = name
|
||||||
account.currency = currency
|
account.currency = currency
|
||||||
account.balance = clean_amount(balance)
|
account.balance = AmTr.decimal_amount(balance)
|
||||||
account.type = type_
|
account.type = type_
|
||||||
return account
|
return account
|
||||||
|
|
||||||
|
|
@ -196,9 +198,9 @@ class ActivityCashPage(ActivityPage):
|
||||||
desc = clean_label(desc)
|
desc = clean_label(desc)
|
||||||
|
|
||||||
deposit = deposit.strip()
|
deposit = deposit.strip()
|
||||||
deposit = clean_amount(deposit or '0')
|
deposit = AmTr.decimal_amount(deposit or '0')
|
||||||
withdraw = withdraw.strip()
|
withdraw = withdraw.strip()
|
||||||
withdraw = clean_amount(withdraw or '0')
|
withdraw = AmTr.decimal_amount(withdraw or '0')
|
||||||
|
|
||||||
amount = deposit - withdraw
|
amount = deposit - withdraw
|
||||||
|
|
||||||
|
|
@ -273,9 +275,9 @@ class ActivityCardPage(ActivityPage):
|
||||||
ref = re.match('.*<REFERENCE ([^>]+)>.*', ref).group(1)
|
ref = re.match('.*<REFERENCE ([^>]+)>.*', ref).group(1)
|
||||||
|
|
||||||
if amount.startswith('+'):
|
if amount.startswith('+'):
|
||||||
amount = clean_amount(amount[1:])
|
amount = AmTr.decimal_amount(amount[1:])
|
||||||
else:
|
else:
|
||||||
amount = -clean_amount(amount)
|
amount = -AmTr.decimal_amount(amount)
|
||||||
|
|
||||||
trans = Transaction(ref)
|
trans = Transaction(ref)
|
||||||
trans.date = tdate
|
trans.date = tdate
|
||||||
|
|
|
||||||
|
|
@ -18,19 +18,15 @@
|
||||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
from weboob.capabilities.bank import Transaction
|
from weboob.capabilities.bank import Transaction
|
||||||
from weboob.tools.capabilities.bank.transactions import AmericanTransaction
|
from weboob.tools.capabilities.bank.transactions import \
|
||||||
from decimal import Decimal
|
AmericanTransaction as AmTr
|
||||||
from tempfile import mkstemp
|
from weboob.tools.date import closest_date
|
||||||
import subprocess
|
from weboob.tools.pdf import decompress_pdf
|
||||||
import os
|
from weboob.tools.tokenizer import ReTokenizer
|
||||||
import re
|
import re
|
||||||
import datetime
|
import datetime
|
||||||
|
|
||||||
|
|
||||||
def clean_amount(text):
|
|
||||||
return Decimal(AmericanTransaction.clean_amount(text))
|
|
||||||
|
|
||||||
|
|
||||||
def clean_label(text):
|
def clean_label(text):
|
||||||
"""
|
"""
|
||||||
Web view and statements use different label formatting.
|
Web view and statements use different label formatting.
|
||||||
|
|
@ -40,51 +36,33 @@ def clean_label(text):
|
||||||
return re.sub(u' +', u' ', text.strip().upper(), re.UNICODE)
|
return re.sub(u' +', u' ', text.strip().upper(), re.UNICODE)
|
||||||
|
|
||||||
|
|
||||||
def full_date(date, date_from, date_to):
|
|
||||||
"""
|
|
||||||
Makes sure that date is close to the given range.
|
|
||||||
Transactions dates in a statement contain only day and month.
|
|
||||||
Statement dates range have a year though.
|
|
||||||
Merge them all together to get a full transaction date.
|
|
||||||
"""
|
|
||||||
dates = [datetime.datetime(d.year, date.month, date.day)
|
|
||||||
for d in (date_from, date_to)]
|
|
||||||
|
|
||||||
# Ideally, pick the date within given range.
|
|
||||||
for d in dates:
|
|
||||||
if date_from <= d <= date_to:
|
|
||||||
return d
|
|
||||||
|
|
||||||
# Otherwise, return the most recent date in the past
|
|
||||||
return min(dates, key=lambda d: abs(d-date_from))
|
|
||||||
|
|
||||||
|
|
||||||
def decompress_pdf(inpdf):
|
|
||||||
inh, inname = mkstemp(suffix='.pdf')
|
|
||||||
outh, outname = mkstemp(suffix='.pdf')
|
|
||||||
os.write(inh, inpdf)
|
|
||||||
os.close(inh)
|
|
||||||
os.close(outh)
|
|
||||||
|
|
||||||
# mutool is a part of MuPDF (http://www.mupdf.com).
|
|
||||||
subprocess.call(['mutool', 'clean', '-d', inname, outname])
|
|
||||||
|
|
||||||
with open(outname) as f:
|
|
||||||
outpdf = f.read()
|
|
||||||
os.remove(inname)
|
|
||||||
os.remove(outname)
|
|
||||||
return outpdf
|
|
||||||
|
|
||||||
|
|
||||||
class StatementParser(object):
|
class StatementParser(object):
|
||||||
"""
|
"""
|
||||||
Each "read_*" method which takes position as its argument,
|
Each "read_*" method takes position as its argument,
|
||||||
returns next token position if read was successful,
|
and returns next token position if read was successful,
|
||||||
and the same position if it was not.
|
or the same position if it was not.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
LEX = [
|
||||||
|
('amount', r'^\[\(([0-9,]+\.\d+)\)\] TJ$'),
|
||||||
|
('date', r'^\[\((\d+/\d+)\)\] TJ$'),
|
||||||
|
('date_range_1', r'^\[\(([A-z]+ \d+, \d{4})'
|
||||||
|
r' - ([A-z]+ \d+, \d{4})\)\] TJ$'),
|
||||||
|
('date_range_2', r'^\[\((\d{2}/\d{2}/\d{4})'
|
||||||
|
r' to (\d{2}/\d{2}/\d{4})\)\] TJ$'),
|
||||||
|
('layout_tz', r'^(\d+\.\d{2}) Tz$'),
|
||||||
|
('layout_tc', r'^(\d+\.\d{2}) Tc$'),
|
||||||
|
('layout_tw', r'^(\d+\.\d{2}) Tw$'),
|
||||||
|
('layout_tf', r'^/F(\d) (\d+\.\d{2}) Tf$'),
|
||||||
|
('layout_tm', r'^' + (r'(\d+\.\d+ )'*6) + r'Tm$'),
|
||||||
|
('ref', r'^\[\(([0-9A-Z]{17})\)\] TJ$'),
|
||||||
|
|
||||||
|
('text', r'^\[\(([^\)]+)\)\] TJ$')
|
||||||
|
]
|
||||||
|
|
||||||
def __init__(self, pdf):
|
def __init__(self, pdf):
|
||||||
self._pdf = decompress_pdf(pdf)
|
self._pdf = decompress_pdf(pdf)
|
||||||
self._tok = StatementTokenizer(self._pdf)
|
self._tok = ReTokenizer(self._pdf, '\n', self.LEX)
|
||||||
|
|
||||||
def read_card_transactions(self):
|
def read_card_transactions(self):
|
||||||
# Early check if this is a card account statement at all.
|
# Early check if this is a card account statement at all.
|
||||||
|
|
@ -149,8 +127,8 @@ class StatementParser(object):
|
||||||
or ref_layout is None or ref is None or desc is None or amount is None:
|
or ref_layout is None or ref is None or desc is None or amount is None:
|
||||||
return startPos, None
|
return startPos, None
|
||||||
else:
|
else:
|
||||||
tdate = full_date(tdate, date_from, date_to)
|
tdate = closest_date(tdate, date_from, date_to)
|
||||||
pdate = full_date(pdate, date_from, date_to)
|
pdate = closest_date(pdate, date_from, date_to)
|
||||||
|
|
||||||
trans = Transaction(ref)
|
trans = Transaction(ref)
|
||||||
trans.date = tdate
|
trans.date = tdate
|
||||||
|
|
@ -179,7 +157,7 @@ class StatementParser(object):
|
||||||
if desc is None or date is None or amount is None:
|
if desc is None or date is None or amount is None:
|
||||||
return startPos, None
|
return startPos, None
|
||||||
else:
|
else:
|
||||||
date = full_date(date, date_from, date_to)
|
date = closest_date(date, date_from, date_to)
|
||||||
|
|
||||||
trans = Transaction(u'')
|
trans = Transaction(u'')
|
||||||
trans.date = date
|
trans.date = date
|
||||||
|
|
@ -229,7 +207,7 @@ class StatementParser(object):
|
||||||
return startPos, None
|
return startPos, None
|
||||||
else:
|
else:
|
||||||
# Infer amount type by its indentation in the layout.
|
# Infer amount type by its indentation in the layout.
|
||||||
amount_total = clean_amount('0')
|
amount_total = AmTr.decimal_amount('0')
|
||||||
for (_, _, _, _, indent, _), amount in amounts:
|
for (_, _, _, _, indent, _), amount in amounts:
|
||||||
within = lambda xmin_xmax: xmin_xmax[0] <= indent <= xmin_xmax[1]
|
within = lambda xmin_xmax: xmin_xmax[0] <= indent <= xmin_xmax[1]
|
||||||
if within(range_skip):
|
if within(range_skip):
|
||||||
|
|
@ -295,7 +273,7 @@ class StatementParser(object):
|
||||||
|
|
||||||
def read_amount(self, pos):
|
def read_amount(self, pos):
|
||||||
t = self._tok.tok(pos)
|
t = self._tok.tok(pos)
|
||||||
return (pos+1, clean_amount(t.value())) \
|
return (pos+1, AmTr.decimal_amount(t.value())) \
|
||||||
if t.is_amount() else (pos, None)
|
if t.is_amount() else (pos, None)
|
||||||
|
|
||||||
def read_date_range(self, pos):
|
def read_date_range(self, pos):
|
||||||
|
|
@ -334,65 +312,3 @@ class StatementParser(object):
|
||||||
return (pos+1, [float(v) for v in t.value()]) \
|
return (pos+1, [float(v) for v in t.value()]) \
|
||||||
if t.is_layout_tm() else (pos, None)
|
if t.is_layout_tm() else (pos, None)
|
||||||
|
|
||||||
|
|
||||||
class StatementTokenizer(object):
|
|
||||||
def __init__(self, pdf):
|
|
||||||
self._tok = [StatementToken(line) for line in pdf.split('\n')]
|
|
||||||
|
|
||||||
def tok(self, index):
|
|
||||||
if 0 <= index < len(self._tok):
|
|
||||||
return self._tok[index]
|
|
||||||
else:
|
|
||||||
return StatementToken(eof=True)
|
|
||||||
|
|
||||||
|
|
||||||
class StatementToken(object):
|
|
||||||
"""
|
|
||||||
Simple regex-based lexer.
|
|
||||||
There's a lexing table consisting of type-regex tuples.
|
|
||||||
Text line is sequentially matched against regexes and first
|
|
||||||
successful match defines the type of the token.
|
|
||||||
"""
|
|
||||||
LEX = [
|
|
||||||
('amount', r'^\[\(([0-9,]+\.\d+)\)\] TJ$'),
|
|
||||||
('date', r'^\[\((\d+/\d+)\)\] TJ$'),
|
|
||||||
('date_range_1', r'^\[\(([A-z]+ \d+, \d{4})'
|
|
||||||
r' - ([A-z]+ \d+, \d{4})\)\] TJ$'),
|
|
||||||
('date_range_2', r'^\[\((\d{2}/\d{2}/\d{4})'
|
|
||||||
r' to (\d{2}/\d{2}/\d{4})\)\] TJ$'),
|
|
||||||
('layout_tz', r'^(\d+\.\d{2}) Tz$'),
|
|
||||||
('layout_tc', r'^(\d+\.\d{2}) Tc$'),
|
|
||||||
('layout_tw', r'^(\d+\.\d{2}) Tw$'),
|
|
||||||
('layout_tf', r'^/F(\d) (\d+\.\d{2}) Tf$'),
|
|
||||||
('layout_tm', r'^' + (r'(\d+\.\d+ )'*6) + r'Tm$'),
|
|
||||||
('ref', r'^\[\(([0-9A-Z]{17})\)\] TJ$'),
|
|
||||||
|
|
||||||
('text', r'^\[\(([^\)]+)\)\] TJ$')
|
|
||||||
]
|
|
||||||
|
|
||||||
def __init__(self, line=None, eof=False):
|
|
||||||
self._eof = eof
|
|
||||||
self._value = None
|
|
||||||
self._type = None
|
|
||||||
if line is not None:
|
|
||||||
for type_, regex in self.LEX:
|
|
||||||
m = re.match(regex, line, flags=re.UNICODE)
|
|
||||||
if m:
|
|
||||||
self._type = type_
|
|
||||||
if len(m.groups()) == 1:
|
|
||||||
self._value = m.groups()[0]
|
|
||||||
elif m.groups():
|
|
||||||
self._value = m.groups()
|
|
||||||
else:
|
|
||||||
self._value = m.group(0)
|
|
||||||
break
|
|
||||||
|
|
||||||
def is_eof(self):
|
|
||||||
return self._eof
|
|
||||||
|
|
||||||
def value(self):
|
|
||||||
return self._value
|
|
||||||
|
|
||||||
for type_, _ in StatementToken.LEX:
|
|
||||||
setattr(StatementToken, 'is_%s' % type_,
|
|
||||||
eval('lambda self: self._type == "%s"' % type_))
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue