From a5859d384d69d7cba1bc12d285c23c072b9bda54 Mon Sep 17 00:00:00 2001 From: Oleg Plakhotniuk Date: Mon, 13 Oct 2014 04:24:51 -0500 Subject: [PATCH] Use AmericanTransaction.decimal_amount, closest_date, decompress_pdf and ReTokenizer in WellsFargo module. Part of #1641 --- modules/wellsfargo/pages.py | 14 ++-- modules/wellsfargo/parsers.py | 148 ++++++++-------------------------- 2 files changed, 40 insertions(+), 122 deletions(-) diff --git a/modules/wellsfargo/pages.py b/modules/wellsfargo/pages.py index 10c3fd3c..435d981d 100644 --- a/modules/wellsfargo/pages.py +++ b/modules/wellsfargo/pages.py @@ -18,10 +18,12 @@ # along with weboob. If not, see <http://www.gnu.org/licenses/>. from weboob.capabilities.bank import Account, Transaction +from weboob.tools.capabilities.bank.transactions import \ + AmericanTransaction as AmTr from weboob.browser.pages import HTMLPage, LoggedPage, RawPage from urllib import unquote from requests.cookies import morsel_to_cookie -from .parsers import StatementParser, clean_amount, clean_label +from .parsers import StatementParser, clean_label import itertools import re import datetime @@ -138,7 +140,7 @@ class ActivityPage(AccountPage): account.id = id_ account.label = name account.currency = currency - account.balance = clean_amount(balance) + account.balance = AmTr.decimal_amount(balance) account.type = type_ return account @@ -196,9 +198,9 @@ class ActivityCashPage(ActivityPage): desc = clean_label(desc) deposit = deposit.strip() - deposit = clean_amount(deposit or '0') + deposit = AmTr.decimal_amount(deposit or '0') withdraw = withdraw.strip() - withdraw = clean_amount(withdraw or '0') + withdraw = AmTr.decimal_amount(withdraw or '0') amount = deposit - withdraw @@ -273,9 +275,9 @@ class ActivityCardPage(ActivityPage): ref = re.match('.*]+)>.*', ref).group(1) if amount.startswith('+'): - amount = clean_amount(amount[1:]) + amount = AmTr.decimal_amount(amount[1:]) else: - amount = -clean_amount(amount) + amount = -AmTr.decimal_amount(amount) trans = 
Transaction(ref) trans.date = tdate diff --git a/modules/wellsfargo/parsers.py b/modules/wellsfargo/parsers.py index 596f0b6a..f1ea148f 100644 --- a/modules/wellsfargo/parsers.py +++ b/modules/wellsfargo/parsers.py @@ -18,19 +18,15 @@ # along with weboob. If not, see <http://www.gnu.org/licenses/>. from weboob.capabilities.bank import Transaction -from weboob.tools.capabilities.bank.transactions import AmericanTransaction -from decimal import Decimal -from tempfile import mkstemp -import subprocess -import os +from weboob.tools.capabilities.bank.transactions import \ + AmericanTransaction as AmTr +from weboob.tools.date import closest_date +from weboob.tools.pdf import decompress_pdf +from weboob.tools.tokenizer import ReTokenizer import re import datetime -def clean_amount(text): - return Decimal(AmericanTransaction.clean_amount(text)) - - def clean_label(text): """ Web view and statements use different label formatting. @@ -40,51 +36,33 @@ def clean_label(text): return re.sub(u' +', u' ', text.strip().upper(), re.UNICODE) -def full_date(date, date_from, date_to): - """ - Makes sure that date is close to the given range. - Transactions dates in a statement contain only day and month. - Statement dates range have a year though. - Merge them all together to get a full transaction date. - """ - dates = [datetime.datetime(d.year, date.month, date.day) - for d in (date_from, date_to)] - - # Ideally, pick the date within given range. - for d in dates: - if date_from <= d <= date_to: - return d - - # Otherwise, return the most recent date in the past - return min(dates, key=lambda d: abs(d-date_from)) - - -def decompress_pdf(inpdf): - inh, inname = mkstemp(suffix='.pdf') - outh, outname = mkstemp(suffix='.pdf') - os.write(inh, inpdf) - os.close(inh) - os.close(outh) - - # mutool is a part of MuPDF (http://www.mupdf.com). 
- subprocess.call(['mutool', 'clean', '-d', inname, outname]) - - with open(outname) as f: - outpdf = f.read() - os.remove(inname) - os.remove(outname) - return outpdf - - class StatementParser(object): """ - Each "read_*" method which takes position as its argument, - returns next token position if read was successful, - and the same position if it was not. + Each "read_*" method takes position as its argument, + and returns next token position if read was successful, + or the same position if it was not. """ + + LEX = [ + ('amount', r'^\[\(([0-9,]+\.\d+)\)\] TJ$'), + ('date', r'^\[\((\d+/\d+)\)\] TJ$'), + ('date_range_1', r'^\[\(([A-z]+ \d+, \d{4})' + r' - ([A-z]+ \d+, \d{4})\)\] TJ$'), + ('date_range_2', r'^\[\((\d{2}/\d{2}/\d{4})' + r' to (\d{2}/\d{2}/\d{4})\)\] TJ$'), + ('layout_tz', r'^(\d+\.\d{2}) Tz$'), + ('layout_tc', r'^(\d+\.\d{2}) Tc$'), + ('layout_tw', r'^(\d+\.\d{2}) Tw$'), + ('layout_tf', r'^/F(\d) (\d+\.\d{2}) Tf$'), + ('layout_tm', r'^' + (r'(\d+\.\d+ )'*6) + r'Tm$'), + ('ref', r'^\[\(([0-9A-Z]{17})\)\] TJ$'), + + ('text', r'^\[\(([^\)]+)\)\] TJ$') + ] + def __init__(self, pdf): self._pdf = decompress_pdf(pdf) - self._tok = StatementTokenizer(self._pdf) + self._tok = ReTokenizer(self._pdf, '\n', self.LEX) def read_card_transactions(self): # Early check if this is a card account statement at all. 
@@ -149,8 +127,8 @@ class StatementParser(object): or ref_layout is None or ref is None or desc is None or amount is None: return startPos, None else: - tdate = full_date(tdate, date_from, date_to) - pdate = full_date(pdate, date_from, date_to) + tdate = closest_date(tdate, date_from, date_to) + pdate = closest_date(pdate, date_from, date_to) trans = Transaction(ref) trans.date = tdate @@ -179,7 +157,7 @@ class StatementParser(object): if desc is None or date is None or amount is None: return startPos, None else: - date = full_date(date, date_from, date_to) + date = closest_date(date, date_from, date_to) trans = Transaction(u'') trans.date = date @@ -229,7 +207,7 @@ class StatementParser(object): return startPos, None else: # Infer amount type by its indentation in the layout. - amount_total = clean_amount('0') + amount_total = AmTr.decimal_amount('0') for (_, _, _, _, indent, _), amount in amounts: within = lambda xmin_xmax: xmin_xmax[0] <= indent <= xmin_xmax[1] if within(range_skip): @@ -295,7 +273,7 @@ class StatementParser(object): def read_amount(self, pos): t = self._tok.tok(pos) - return (pos+1, clean_amount(t.value())) \ + return (pos+1, AmTr.decimal_amount(t.value())) \ if t.is_amount() else (pos, None) def read_date_range(self, pos): @@ -334,65 +312,3 @@ class StatementParser(object): return (pos+1, [float(v) for v in t.value()]) \ if t.is_layout_tm() else (pos, None) - -class StatementTokenizer(object): - def __init__(self, pdf): - self._tok = [StatementToken(line) for line in pdf.split('\n')] - - def tok(self, index): - if 0 <= index < len(self._tok): - return self._tok[index] - else: - return StatementToken(eof=True) - - -class StatementToken(object): - """ - Simple regex-based lexer. - There's a lexing table consisting of type-regex tuples. - Text line is sequentially matched against regexes and first - successful match defines the type of the token. 
- """ - LEX = [ - ('amount', r'^\[\(([0-9,]+\.\d+)\)\] TJ$'), - ('date', r'^\[\((\d+/\d+)\)\] TJ$'), - ('date_range_1', r'^\[\(([A-z]+ \d+, \d{4})' - r' - ([A-z]+ \d+, \d{4})\)\] TJ$'), - ('date_range_2', r'^\[\((\d{2}/\d{2}/\d{4})' - r' to (\d{2}/\d{2}/\d{4})\)\] TJ$'), - ('layout_tz', r'^(\d+\.\d{2}) Tz$'), - ('layout_tc', r'^(\d+\.\d{2}) Tc$'), - ('layout_tw', r'^(\d+\.\d{2}) Tw$'), - ('layout_tf', r'^/F(\d) (\d+\.\d{2}) Tf$'), - ('layout_tm', r'^' + (r'(\d+\.\d+ )'*6) + r'Tm$'), - ('ref', r'^\[\(([0-9A-Z]{17})\)\] TJ$'), - - ('text', r'^\[\(([^\)]+)\)\] TJ$') - ] - - def __init__(self, line=None, eof=False): - self._eof = eof - self._value = None - self._type = None - if line is not None: - for type_, regex in self.LEX: - m = re.match(regex, line, flags=re.UNICODE) - if m: - self._type = type_ - if len(m.groups()) == 1: - self._value = m.groups()[0] - elif m.groups(): - self._value = m.groups() - else: - self._value = m.group(0) - break - - def is_eof(self): - return self._eof - - def value(self): - return self._value - -for type_, _ in StatementToken.LEX: - setattr(StatementToken, 'is_%s' % type_, - eval('lambda self: self._type == "%s"' % type_))