Use AmericanTransaction.decimal_amount, closest_date, decompress_pdf and ReTokenizer in WellsFargo module. Part of #1641

2014-10-13 04:24:51 -05:00 · 2014-10-13 04:24:51 -05:00 · a5859d384d
commit a5859d384d
parent 3aaf8181a5
2 changed files with 38 additions and 120 deletions
--- a/modules/wellsfargo/pages.py
+++ b/modules/wellsfargo/pages.py
@ -18,10 +18,12 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.

 from weboob.capabilities.bank import Account, Transaction
+from weboob.tools.capabilities.bank.transactions import \
+    AmericanTransaction as AmTr
 from weboob.browser.pages import HTMLPage, LoggedPage, RawPage
 from urllib import unquote
 from requests.cookies import morsel_to_cookie
-from .parsers import StatementParser, clean_amount, clean_label
+from .parsers import StatementParser, clean_label
 import itertools
 import re
 import datetime
@ -138,7 +140,7 @@ class ActivityPage(AccountPage):
        account.id = id_
        account.label = name
        account.currency = currency
-        account.balance = clean_amount(balance)
+        account.balance = AmTr.decimal_amount(balance)
        account.type = type_
        return account

@ -196,9 +198,9 @@ class ActivityCashPage(ActivityPage):
            desc = clean_label(desc)

            deposit = deposit.strip()
-            deposit = clean_amount(deposit or '0')
+            deposit = AmTr.decimal_amount(deposit or '0')
            withdraw = withdraw.strip()
-            withdraw = clean_amount(withdraw or '0')
+            withdraw = AmTr.decimal_amount(withdraw or '0')

            amount = deposit - withdraw

@ -273,9 +275,9 @@ class ActivityCardPage(ActivityPage):
            ref = re.match('.*<REFERENCE ([^>]+)>.*', ref).group(1)

            if amount.startswith('+'):
-                amount = clean_amount(amount[1:])
+                amount = AmTr.decimal_amount(amount[1:])
            else:
-                amount = -clean_amount(amount)
+                amount = -AmTr.decimal_amount(amount)

            trans = Transaction(ref)
            trans.date = tdate
--- a/modules/wellsfargo/parsers.py
+++ b/modules/wellsfargo/parsers.py
@ -18,19 +18,15 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.

 from weboob.capabilities.bank import Transaction
-from weboob.tools.capabilities.bank.transactions import AmericanTransaction
-from decimal import Decimal
-from tempfile import mkstemp
-import subprocess
-import os
+from weboob.tools.capabilities.bank.transactions import \
+    AmericanTransaction as AmTr
+from weboob.tools.date import closest_date
+from weboob.tools.pdf import decompress_pdf
+from weboob.tools.tokenizer import ReTokenizer
 import re
 import datetime


-def clean_amount(text):
-    return Decimal(AmericanTransaction.clean_amount(text))
-
-
 def clean_label(text):
    """
    Web view and statements use different label formatting.
@ -40,51 +36,33 @@ def clean_label(text):
    return re.sub(u' +', u' ', text.strip().upper(), re.UNICODE)


-def full_date(date, date_from, date_to):
-    """
-    Makes sure that date is close to the given range.
-    Transactions dates in a statement contain only day and month.
-    Statement dates range have a year though.
-    Merge them all together to get a full transaction date.
-    """
-    dates = [datetime.datetime(d.year, date.month, date.day)
-             for d in (date_from, date_to)]
-
-    # Ideally, pick the date within given range.
-    for d in dates:
-        if date_from <= d <= date_to:
-            return d
-
-    # Otherwise, return the most recent date in the past
-    return min(dates, key=lambda d: abs(d-date_from))
-
-
-def decompress_pdf(inpdf):
-    inh, inname = mkstemp(suffix='.pdf')
-    outh, outname = mkstemp(suffix='.pdf')
-    os.write(inh, inpdf)
-    os.close(inh)
-    os.close(outh)
-
-    # mutool is a part of MuPDF (http://www.mupdf.com).
-    subprocess.call(['mutool', 'clean', '-d', inname, outname])
-
-    with open(outname) as f:
-        outpdf = f.read()
-    os.remove(inname)
-    os.remove(outname)
-    return outpdf
-
-
 class StatementParser(object):
    """
-    Each "read_*" method which takes position as its argument,
-    returns next token position if read was successful,
-    and the same position if it was not.
+    Each "read_*" method takes position as its argument,
+    and returns next token position if read was successful,
+    or the same position if it was not.
    """
+
+    LEX = [
+        ('amount', r'^\[\(([0-9,]+\.\d+)\)\] TJ$'),
+        ('date', r'^\[\((\d+/\d+)\)\] TJ$'),
+        ('date_range_1', r'^\[\(([A-z]+ \d+, \d{4})'
+                         r' - ([A-z]+ \d+, \d{4})\)\] TJ$'),
+        ('date_range_2', r'^\[\((\d{2}/\d{2}/\d{4})'
+                         r' to (\d{2}/\d{2}/\d{4})\)\] TJ$'),
+        ('layout_tz', r'^(\d+\.\d{2}) Tz$'),
+        ('layout_tc', r'^(\d+\.\d{2}) Tc$'),
+        ('layout_tw', r'^(\d+\.\d{2}) Tw$'),
+        ('layout_tf', r'^/F(\d) (\d+\.\d{2}) Tf$'),
+        ('layout_tm', r'^' + (r'(\d+\.\d+ )'*6) + r'Tm$'),
+        ('ref', r'^\[\(([0-9A-Z]{17})\)\] TJ$'),
+
+        ('text', r'^\[\(([^\)]+)\)\] TJ$')
+    ]
+
    def __init__(self, pdf):
        self._pdf = decompress_pdf(pdf)
-        self._tok = StatementTokenizer(self._pdf)
+        self._tok = ReTokenizer(self._pdf, '\n', self.LEX)

    def read_card_transactions(self):
        # Early check if this is a card account statement at all.
@ -149,8 +127,8 @@ class StatementParser(object):
        or ref_layout is None or ref is None or desc is None or amount is None:
            return startPos, None
        else:
-            tdate = full_date(tdate, date_from, date_to)
-            pdate = full_date(pdate, date_from, date_to)
+            tdate = closest_date(tdate, date_from, date_to)
+            pdate = closest_date(pdate, date_from, date_to)

            trans = Transaction(ref)
            trans.date = tdate
@ -179,7 +157,7 @@ class StatementParser(object):
        if desc is None or date is None or amount is None:
            return startPos, None
        else:
-            date = full_date(date, date_from, date_to)
+            date = closest_date(date, date_from, date_to)

            trans = Transaction(u'')
            trans.date = date
@ -229,7 +207,7 @@ class StatementParser(object):
            return startPos, None
        else:
            # Infer amount type by its indentation in the layout.
-            amount_total = clean_amount('0')
+            amount_total = AmTr.decimal_amount('0')
            for (_, _, _, _, indent, _), amount in amounts:
                within = lambda xmin_xmax: xmin_xmax[0] <= indent <= xmin_xmax[1]
                if within(range_skip):
@ -295,7 +273,7 @@ class StatementParser(object):

    def read_amount(self, pos):
        t = self._tok.tok(pos)
-        return (pos+1, clean_amount(t.value())) \
+        return (pos+1, AmTr.decimal_amount(t.value())) \
            if t.is_amount() else (pos, None)

    def read_date_range(self, pos):
@ -334,65 +312,3 @@ class StatementParser(object):
        return (pos+1, [float(v) for v in t.value()]) \
            if t.is_layout_tm() else (pos, None)

-
-class StatementTokenizer(object):
-    def __init__(self, pdf):
-        self._tok = [StatementToken(line) for line in pdf.split('\n')]
-
-    def tok(self, index):
-        if 0 <= index < len(self._tok):
-            return self._tok[index]
-        else:
-            return StatementToken(eof=True)
-
-
-class StatementToken(object):
-    """
-    Simple regex-based lexer.
-    There's a lexing table consisting of type-regex tuples.
-    Text line is sequentially matched against regexes and first
-    successful match defines the type of the token.
-    """
-    LEX = [
-        ('amount', r'^\[\(([0-9,]+\.\d+)\)\] TJ$'),
-        ('date', r'^\[\((\d+/\d+)\)\] TJ$'),
-        ('date_range_1', r'^\[\(([A-z]+ \d+, \d{4})'
-                          r' - ([A-z]+ \d+, \d{4})\)\] TJ$'),
-        ('date_range_2', r'^\[\((\d{2}/\d{2}/\d{4})'
-                         r' to (\d{2}/\d{2}/\d{4})\)\] TJ$'),
-        ('layout_tz', r'^(\d+\.\d{2}) Tz$'),
-        ('layout_tc', r'^(\d+\.\d{2}) Tc$'),
-        ('layout_tw', r'^(\d+\.\d{2}) Tw$'),
-        ('layout_tf', r'^/F(\d) (\d+\.\d{2}) Tf$'),
-        ('layout_tm', r'^' + (r'(\d+\.\d+ )'*6) + r'Tm$'),
-        ('ref', r'^\[\(([0-9A-Z]{17})\)\] TJ$'),
-
-        ('text', r'^\[\(([^\)]+)\)\] TJ$')
-    ]
-
-    def __init__(self, line=None, eof=False):
-        self._eof = eof
-        self._value = None
-        self._type = None
-        if line is not None:
-            for type_, regex in self.LEX:
-                m = re.match(regex, line, flags=re.UNICODE)
-                if m:
-                    self._type = type_
-                    if len(m.groups()) == 1:
-                        self._value = m.groups()[0]
-                    elif m.groups():
-                        self._value = m.groups()
-                    else:
-                        self._value = m.group(0)
-                    break
-
-    def is_eof(self):
-        return self._eof
-
-    def value(self):
-        return self._value
-
-for type_, _ in StatementToken.LEX:
-    setattr(StatementToken, 'is_%s' % type_,
-        eval('lambda self: self._type == "%s"' % type_))