American amount to decimal conversion; PDF decompression function; regexp-based tokenizer. Part of #1641
This commit is contained in:
parent
c79dcc22df
commit
d7acf211bd
5 changed files with 216 additions and 1 deletions
|
|
@ -7,7 +7,9 @@ tests = weboob.tools.capabilities.bank.transactions,
|
|||
weboob.tools.capabilities.paste,
|
||||
weboob.tools.application.formatters.json,
|
||||
weboob.tools.application.formatters.table,
|
||||
weboob.tools.date,
|
||||
weboob.tools.path,
|
||||
weboob.tools.tokenizer,
|
||||
weboob.browser.browsers,
|
||||
weboob.browser.pages,
|
||||
weboob.browser.filters.standard
|
||||
|
|
|
|||
|
|
@ -335,6 +335,14 @@ class AmericanTransaction(Transaction):
|
|||
text = text.replace(',', ' ').replace('.', ',')
|
||||
return FrenchTransaction.clean_amount(text)
|
||||
|
||||
@classmethod
def decimal_amount(klass, text):
    """
    Convert a string containing an amount to Decimal.

    Empty or unparseable input yields Decimal('0') instead of raising.

    Dispatches through ``klass.clean_amount`` (rather than naming
    AmericanTransaction explicitly) so subclasses overriding
    clean_amount() get consistent behavior here too.
    """
    amnt = klass.clean_amount(text)
    return Decimal(amnt) if amnt else Decimal('0')
|
||||
|
||||
|
||||
def test():
|
||||
clean_amount = AmericanTransaction.clean_amount
|
||||
|
|
@ -344,3 +352,7 @@ def test():
|
|||
assert clean_amount('$42.12 USD') == '42.12'
|
||||
assert clean_amount('$12.442,12 USD') == '12442.12'
|
||||
assert clean_amount('$12,442.12 USD') == '12442.12'
|
||||
|
||||
decimal_amount = AmericanTransaction.decimal_amount
|
||||
assert decimal_amount('$12,442.12 USD') == Decimal('12442.12')
|
||||
assert decimal_amount('') == Decimal('0')
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ except ImportError:
|
|||
raise ImportError('Please install python-dateutil')
|
||||
|
||||
|
||||
__all__ = ['local2utc', 'utc2local', 'LinearDateGuesser', 'date', 'datetime', 'new_date', 'new_datetime']
|
||||
__all__ = ['local2utc', 'utc2local', 'LinearDateGuesser', 'date', 'datetime', 'new_date', 'new_datetime', 'closest_date']
|
||||
|
||||
|
||||
def local2utc(dateobj):
|
||||
|
|
@ -315,3 +315,47 @@ def parse_date(string):
|
|||
|
||||
elif string.upper() == "TODAY":
|
||||
return date.today()
|
||||
|
||||
|
||||
def closest_date(date, date_from, date_to):
    """
    Adjusts year so that the date is closest to the given range.

    Transactions dates in a statement usually contain only day and month.
    Statement dates range have a year though.
    Merge them all together to get a full transaction date.

    :param date: the (possibly wrong-year) transaction date.
    :param date_from: start of the statement's date range.
    :param date_to: end of the statement's date range.
    """
    # If the date is within given range, we're done.
    if date_from <= date <= date_to:
        return date

    # Candidate dates: same day/month for every year covered by the range.
    # Years lacking this day (e.g. Feb 29 in a non-leap year) are skipped
    # instead of letting real_datetime() raise ValueError.
    dates = []
    for year in range(date_from.year, date_to.year + 1):
        try:
            dates.append(real_datetime(year, date.month, date.day))
        except ValueError:
            pass

    # Ideally, pick the date within given range.
    for d in dates:
        if date_from <= d <= date_to:
            return d

    # No candidate year has this day/month at all; give the input back.
    if not dates:
        return date

    # Otherwise, pick the candidate closest to the start of the range.
    # NOTE: this is not necessarily "in the past" -- e.g. Jan 15 against a
    # range ending Jan 10 resolves forward to date_to's year.
    return min(dates, key=lambda d: abs(d - date_from))
|
||||
|
||||
|
||||
def test():
    dt = real_datetime
    range1 = [dt(2012, 12, 20), dt(2013, 1, 10)]

    # For a range spanning the 2012/2013 new year, any incoming year for
    # Dec 15 must resolve to 2012, and Jan 15 / Jan 1 must resolve to 2013.
    for year in (2012, 2000, 2020):
        assert closest_date(dt(year, 12, 15), *range1) == dt(2012, 12, 15)
    for year in (2013, 2000, 2020):
        assert closest_date(dt(year, 1, 15), *range1) == dt(2013, 1, 15)
        assert closest_date(dt(year, 1, 1), *range1) == dt(2013, 1, 1)

    # A multi-year range picks the candidate year inside the range.
    range2 = [dt(2012, 12, 20), dt(2014, 1, 10)]
    assert closest_date(dt(2012, 12, 15), *range2) == dt(2013, 12, 15)
    assert closest_date(dt(2014, 1, 15), *range2) == dt(2013, 1, 15)
|
||||
|
|
|
|||
49
weboob/tools/pdf.py
Normal file
49
weboob/tools/pdf.py
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright(C) 2014 Oleg Plakhotniuk
|
||||
#
|
||||
# This file is part of weboob.
|
||||
#
|
||||
# weboob is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# weboob is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from tempfile import mkstemp
|
||||
|
||||
|
||||
__all__ = ['decompress_pdf']
|
||||
|
||||
|
||||
def decompress_pdf(inpdf):
    """
    Takes PDF file contents as a string and returns decompressed version
    of the file contents, suitable for text parsing.

    :param inpdf: raw PDF file contents.
    :raises subprocess.CalledProcessError: if mutool exits with an error.

    External dependencies:
    MuPDF (http://www.mupdf.com).
    """
    inh, inname = mkstemp(suffix='.pdf')
    outh, outname = mkstemp(suffix='.pdf')
    # mutool writes the output file itself; we only need its name.
    os.close(outh)
    try:
        try:
            os.write(inh, inpdf)
        finally:
            os.close(inh)

        # 'mutool clean -d' rewrites the PDF with decompressed streams.
        # check_call makes a mutool failure explicit instead of silently
        # returning truncated/empty output.
        subprocess.check_call(['mutool', 'clean', '-d', inname, outname])

        # PDF is binary data: read in binary mode so platforms with
        # non-transparent text mode don't mangle it.
        with open(outname, 'rb') as f:
            return f.read()
    finally:
        # Always remove the temp files, even when mutool or I/O fails.
        os.remove(inname)
        os.remove(outname)
|
||||
108
weboob/tools/tokenizer.py
Normal file
108
weboob/tools/tokenizer.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright(C) 2014 Oleg Plakhotniuk
|
||||
#
|
||||
# This file is part of weboob.
|
||||
#
|
||||
# weboob is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# weboob is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import re
|
||||
|
||||
|
||||
__all__ = ['ReTokenizer']
|
||||
|
||||
|
||||
class ReTokenizer(object):
    """
    Simple regex-based tokenizer (AKA lexer or lexical analyser).
    Useful for PDF statements parsing.

    1. There's a lexing table consisting of type-regex tuples.
    2. Lexer splits text into chunks using the separator character.
    3. Text chunk is sequentially matched against regexes and first
       successful match defines the type of the token.

    Check out test() function below for examples.
    """

    def __init__(self, text, sep, lex):
        # Keep the lexing table so out-of-range reads can build EOF
        # tokens bound to the same table.
        self._lex = lex
        self._tok = []
        for chunk in text.split(sep):
            self._tok.append(ReToken(lex, chunk))

    def tok(self, index):
        # Any index outside the token list -- negative included -- yields
        # an EOF token rather than raising, so callers can probe freely.
        if index < 0 or index >= len(self._tok):
            return ReToken(self._lex, eof=True)
        return self._tok[index]

    def simple_read(self, token_type, pos, transform=lambda v: v):
        # Try to consume one token of the given type at pos: on success
        # advance and return its (optionally transformed) value, otherwise
        # stay at pos and return no value.
        token = self.tok(pos)
        if getattr(token, 'is_%s' % token_type)():
            return pos + 1, transform(token.value())
        return pos, None
|
||||
|
||||
|
||||
class ReToken(object):
    """
    A single token produced by ReTokenizer.

    The token's type is the first entry of the lexing table whose regex
    matches the chunk. Its value is the single capture group (if the regex
    has exactly one), the tuple of groups (if several), or the whole match
    (if none). Dynamic is_<type>() predicates report the token type;
    is_eof() is True only for the end-of-stream sentinel.
    """

    def __init__(self, lex, chunk=None, eof=False):
        self._lex = lex
        self._eof = eof
        self._value = None
        self._type = None
        if chunk is not None:
            # First matching table entry wins; later entries are ignored.
            for type_, regex in self._lex:
                m = re.match(regex, chunk, flags=re.UNICODE)
                if m:
                    self._type = type_
                    if len(m.groups()) == 1:
                        # Exactly one capture group: unwrap it.
                        self._value = m.group(1)
                    elif m.groups():
                        # Several capture groups: keep them as a tuple.
                        self._value = m.groups()
                    else:
                        # No groups: the whole matched text is the value.
                        self._value = m.group(0)
                    break

    def is_eof(self):
        return self._eof

    def value(self):
        return self._value

    def __getattr__(self, name):
        # Synthesize is_<type>() predicates on the fly.
        if name.startswith('is_'):
            return lambda: self._type == name[3:]
        # Carry the attribute name so the failure is debuggable
        # (a bare AttributeError() gives no hint about what was missing).
        raise AttributeError(name)
|
||||
|
||||
|
||||
def test():
    # Chunks starting with 'f' lex as type 'f', with 'b' as type 'b'.
    tokenizer = ReTokenizer('foo bar baz', ' ', [('f', r'^f'), ('b', r'^b')])

    for index, predicate in [(0, 'is_f'), (1, 'is_b'), (2, 'is_b')]:
        assert getattr(tokenizer.tok(index), predicate)()

    # Indices outside the token list (negative included) are EOF.
    assert tokenizer.tok(-1).is_eof()
    assert tokenizer.tok(3).is_eof()

    assert not tokenizer.tok(-1).is_f()
    assert not tokenizer.tok(0).is_b()
    assert not tokenizer.tok(0).is_eof()

    # Value extraction: whole match, single group, or tuple of groups.
    tokenizer = ReTokenizer('nogroup onegroup multigroup', ' ', [
        ('ng', r'^n.*$'),
        ('og', r'^one(g.*)$'),
        ('mg', r'^(m.*)(g.*)$')])

    expected = [(-1, None), (0, 'nogroup'), (1, 'group'),
                (2, ('multi', 'group'))]
    for index, value in expected:
        assert tokenizer.tok(index).value() == value
|
||||
Loading…
Add table
Add a link
Reference in a new issue