American amount to decimal conversion; PDF decompression function; regexp-based tokenizer. Part of #1641
This commit is contained in:
parent
c79dcc22df
commit
d7acf211bd
5 changed files with 216 additions and 1 deletions
|
|
@ -7,7 +7,9 @@ tests = weboob.tools.capabilities.bank.transactions,
|
||||||
weboob.tools.capabilities.paste,
|
weboob.tools.capabilities.paste,
|
||||||
weboob.tools.application.formatters.json,
|
weboob.tools.application.formatters.json,
|
||||||
weboob.tools.application.formatters.table,
|
weboob.tools.application.formatters.table,
|
||||||
|
weboob.tools.date,
|
||||||
weboob.tools.path,
|
weboob.tools.path,
|
||||||
|
weboob.tools.tokenizer,
|
||||||
weboob.browser.browsers,
|
weboob.browser.browsers,
|
||||||
weboob.browser.pages,
|
weboob.browser.pages,
|
||||||
weboob.browser.filters.standard
|
weboob.browser.filters.standard
|
||||||
|
|
|
||||||
|
|
@ -335,6 +335,14 @@ class AmericanTransaction(Transaction):
|
||||||
text = text.replace(',', ' ').replace('.', ',')
|
text = text.replace(',', ' ').replace('.', ',')
|
||||||
return FrenchTransaction.clean_amount(text)
|
return FrenchTransaction.clean_amount(text)
|
||||||
|
|
||||||
|
@classmethod
def decimal_amount(klass, text):
    """
    Convert a string containing an amount to Decimal.

    The text is first normalized by AmericanTransaction.clean_amount().
    If the cleaned string is empty, Decimal('0') is returned.
    """
    cleaned = AmericanTransaction.clean_amount(text)
    if not cleaned:
        return Decimal('0')
    return Decimal(cleaned)
|
||||||
|
|
||||||
|
|
||||||
def test():
|
def test():
|
||||||
clean_amount = AmericanTransaction.clean_amount
|
clean_amount = AmericanTransaction.clean_amount
|
||||||
|
|
@ -344,3 +352,7 @@ def test():
|
||||||
assert clean_amount('$42.12 USD') == '42.12'
|
assert clean_amount('$42.12 USD') == '42.12'
|
||||||
assert clean_amount('$12.442,12 USD') == '12442.12'
|
assert clean_amount('$12.442,12 USD') == '12442.12'
|
||||||
assert clean_amount('$12,442.12 USD') == '12442.12'
|
assert clean_amount('$12,442.12 USD') == '12442.12'
|
||||||
|
|
||||||
|
decimal_amount = AmericanTransaction.decimal_amount
|
||||||
|
assert decimal_amount('$12,442.12 USD') == Decimal('12442.12')
|
||||||
|
assert decimal_amount('') == Decimal('0')
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,7 @@ except ImportError:
|
||||||
raise ImportError('Please install python-dateutil')
|
raise ImportError('Please install python-dateutil')
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['local2utc', 'utc2local', 'LinearDateGuesser', 'date', 'datetime', 'new_date', 'new_datetime']
|
__all__ = ['local2utc', 'utc2local', 'LinearDateGuesser', 'date', 'datetime', 'new_date', 'new_datetime', 'closest_date']
|
||||||
|
|
||||||
|
|
||||||
def local2utc(dateobj):
|
def local2utc(dateobj):
|
||||||
|
|
@ -315,3 +315,47 @@ def parse_date(string):
|
||||||
|
|
||||||
elif string.upper() == "TODAY":
|
elif string.upper() == "TODAY":
|
||||||
return date.today()
|
return date.today()
|
||||||
|
|
||||||
|
|
||||||
|
def closest_date(date, date_from, date_to):
    """
    Adjust the year of a date so that it is closest to the given range.

    Transaction dates in a statement usually contain only day and month,
    whereas the statement date range does have a year.
    Merge them all together to get a full transaction date.

    :param date: date whose month and day are kept; its year may be replaced
    :param date_from: start of the range (inclusive)
    :param date_to: end of the range (inclusive)
    :return: `date` itself when it already lies within the range; otherwise
             a new datetime with the same month and day and a year taken
             from the range, preferably one that falls within the range,
             else the one closest to `date_from`
    :raises ValueError: if the month/day combination does not exist in any
                        year of the range (e.g. February 29 when the range
                        contains no leap year)
    """
    # If the date is within given range, we're done.
    if date_from <= date <= date_to:
        return date

    # Build one candidate per year of the range. A day such as February 29
    # does not exist in every year, so skip the years where it is invalid
    # instead of failing on the first one.
    dates = []
    for year in range(date_from.year, date_to.year + 1):
        try:
            dates.append(real_datetime(year, date.month, date.day))
        except ValueError:
            continue

    if not dates:
        raise ValueError(
            'month %d, day %d does not exist in any year from %d to %d'
            % (date.month, date.day, date_from.year, date_to.year))

    # Ideally, pick the date within given range.
    for d in dates:
        if date_from <= d <= date_to:
            return d

    # Otherwise, return the candidate closest to the start of the range.
    return min(dates, key=lambda d: abs(d - date_from))
|
||||||
|
|
||||||
|
|
||||||
|
def test():
    """
    Self-check for closest_date(): for each date range, feed dates with
    correct, too-early and too-late years and verify the adjusted result.
    """
    dt = real_datetime

    # Each scenario: (date range, [(input date, expected result), ...]).
    scenarios = [
        ([dt(2012,12,20), dt(2013,1,10)], [
            (dt(2012,12,15), dt(2012,12,15)),
            (dt(2000,12,15), dt(2012,12,15)),
            (dt(2020,12,15), dt(2012,12,15)),

            (dt(2013,1,15), dt(2013,1,15)),
            (dt(2000,1,15), dt(2013,1,15)),
            (dt(2020,1,15), dt(2013,1,15)),

            (dt(2013,1,1), dt(2013,1,1)),
            (dt(2000,1,1), dt(2013,1,1)),
            (dt(2020,1,1), dt(2013,1,1)),
        ]),
        ([dt(2012,12,20), dt(2014,1,10)], [
            (dt(2012,12,15), dt(2013,12,15)),
            (dt(2014,1,15), dt(2013,1,15)),
        ]),
    ]

    for date_range, cases in scenarios:
        for given, expected in cases:
            assert closest_date(given, *date_range) == expected
|
||||||
|
|
|
||||||
49
weboob/tools/pdf.py
Normal file
49
weboob/tools/pdf.py
Normal file
|
|
@ -0,0 +1,49 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright(C) 2014 Oleg Plakhotniuk
|
||||||
|
#
|
||||||
|
# This file is part of weboob.
|
||||||
|
#
|
||||||
|
# weboob is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# weboob is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
from tempfile import mkstemp
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['decompress_pdf']
|
||||||
|
|
||||||
|
|
||||||
|
def decompress_pdf(inpdf):
    """
    Takes PDF file contents as a string and returns decompressed version
    of the file contents, suitable for text parsing.

    The contents are written to a temporary file, processed with
    ``mutool clean -d`` into a second temporary file, and read back.
    Both temporary files are removed before returning, even when an
    error occurs along the way.

    External dependencies:
    MuPDF (http://www.mupdf.com).

    :param inpdf: raw contents of the PDF file
    :return: contents of the file produced by mutool
    :raises OSError: if the ``mutool`` executable cannot be started
    """

    inh, inname = mkstemp(suffix='.pdf')
    try:
        # fdopen() takes ownership of the descriptor: the whole buffer is
        # written and the descriptor is closed when the block exits.
        with os.fdopen(inh, 'wb') as f:
            f.write(inpdf)

        outh, outname = mkstemp(suffix='.pdf')
        try:
            # The output file only has to exist; mutool writes it by name.
            os.close(outh)

            # NOTE: mutool's exit status is deliberately not checked, as
            # before; on failure the output file is simply left empty.
            subprocess.call(['mutool', 'clean', '-d', inname, outname])

            # PDF is a binary format: read it back without text decoding
            # or newline translation.
            with open(outname, 'rb') as f:
                return f.read()
        finally:
            os.remove(outname)
    finally:
        os.remove(inname)
|
||||||
108
weboob/tools/tokenizer.py
Normal file
108
weboob/tools/tokenizer.py
Normal file
|
|
@ -0,0 +1,108 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright(C) 2014 Oleg Plakhotniuk
|
||||||
|
#
|
||||||
|
# This file is part of weboob.
|
||||||
|
#
|
||||||
|
# weboob is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU Affero General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# weboob is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU Affero General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Affero General Public License
|
||||||
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ['ReTokenizer']
|
||||||
|
|
||||||
|
|
||||||
|
class ReTokenizer(object):
    """
    Simple regex-based tokenizer (AKA lexer or lexical analyser).
    Useful for PDF statements parsing.

    1. The lexing table is a sequence of (type, regex) tuples.
    2. The text is split into chunks on the separator string.
    3. Every chunk is tried against the regexes in table order; the first
       regex that matches determines the token type.

    Check out test() function below for examples.
    """

    def __init__(self, text, sep, lex):
        """
        Split `text` on `sep` and turn every chunk into a ReToken
        classified with the lexing table `lex`.
        """
        self._lex = lex
        self._tok = []
        for piece in text.split(sep):
            self._tok.append(ReToken(lex, piece))

    def tok(self, index):
        """
        Return the token at position `index`, or a fresh end-of-file token
        when `index` is negative or past the last token.
        """
        if not (0 <= index < len(self._tok)):
            return ReToken(self._lex, eof=True)
        return self._tok[index]

    def simple_read(self, token_type, pos, transform=lambda v: v):
        """
        Try to read a token of type `token_type` at position `pos`.

        Returns (pos + 1, transform(value)) when the token at `pos` has the
        requested type, and (pos, None) otherwise.
        """
        token = self.tok(pos)
        checker = getattr(token, 'is_%s' % token_type)
        if not checker():
            return (pos, None)
        return (pos+1, transform(token.value()))
|
||||||
|
|
||||||
|
|
||||||
|
class ReToken(object):
    """
    A single token: a text chunk classified against a lexing table.

    The type of the token is the type paired with the first regex of the
    table that matches the chunk. The value of the token is:

    * the whole matched text when the regex has no groups,
    * the group's text when the regex has exactly one group,
    * a tuple of all groups when the regex has several groups.

    A token whose chunk matches no regex, or which is built without a
    chunk (such as an end-of-file token), has no type and a value of None.

    The type is queried through dynamic ``is_<type>()`` methods,
    e.g. ``token.is_amount()``.
    """

    def __init__(self, lex, chunk=None, eof=False):
        """
        :param lex: lexing table, an iterable of (type, regex) tuples
        :param chunk: text to classify, or None for a token without text
        :param eof: True to mark this token as the end-of-file token
        """
        self._lex = lex
        self._eof = eof
        self._value = None
        self._type = None

        if chunk is None:
            return

        for type_, regex in self._lex:
            m = re.match(regex, chunk, flags=re.UNICODE)
            if not m:
                continue
            self._type = type_
            groups = m.groups()
            if len(groups) == 1:
                self._value = groups[0]
            elif groups:
                self._value = groups
            else:
                self._value = m.group(0)
            # First successful match wins.
            break

    def is_eof(self):
        """Tell whether this is the end-of-file token."""
        return self._eof

    def value(self):
        """Return the value extracted from the chunk, or None."""
        return self._value

    def __getattr__(self, name):
        """
        Provide the dynamic ``is_<type>()`` predicates.

        Only invoked for attributes not found through normal lookup, so
        the explicitly defined is_eof() is not affected.
        """
        if name.startswith('is_'):
            return lambda: self._type == name[3:]
        # Name the missing attribute so that failures are diagnosable.
        raise AttributeError("%r object has no attribute %r"
                             % (type(self).__name__, name))
|
||||||
|
|
||||||
|
|
||||||
|
def test():
    """
    Self-check for ReTokenizer: token typing, end-of-file handling for
    out-of-range positions, and value extraction for regexes with zero,
    one and several groups.
    """
    t = ReTokenizer('foo bar baz', ' ', [('f', r'^f'), ('b', r'^b')])

    assert t.tok(0).is_f()
    assert t.tok(1).is_b()
    assert t.tok(2).is_b()

    # Positions outside of the token list yield an end-of-file token.
    assert t.tok(-1).is_eof()
    assert t.tok(3).is_eof()

    assert not t.tok(-1).is_f()
    assert not t.tok(0).is_b()
    assert not t.tok(0).is_eof()

    t = ReTokenizer('nogroup onegroup multigroup', ' ', [
        ('ng', r'^n.*$'),
        ('og', r'^one(g.*)$'),
        ('mg', r'^(m.*)(g.*)$')])

    # None is a singleton: compare by identity rather than equality.
    assert t.tok(-1).value() is None
    assert t.tok(0).value() == 'nogroup'
    assert t.tok(1).value() == 'group'
    assert t.tok(2).value() == ('multi', 'group')
|
||||||
Loading…
Add table
Add a link
Reference in a new issue