American amount to decimal conversion; PDF decompression function; regexp-based tokenizer. Part of #1641
This commit is contained in:
parent
c79dcc22df
commit
d7acf211bd
5 changed files with 216 additions and 1 deletions
|
|
@ -7,7 +7,9 @@ tests = weboob.tools.capabilities.bank.transactions,
|
|||
weboob.tools.capabilities.paste,
|
||||
weboob.tools.application.formatters.json,
|
||||
weboob.tools.application.formatters.table,
|
||||
weboob.tools.date,
|
||||
weboob.tools.path,
|
||||
weboob.tools.tokenizer,
|
||||
weboob.browser.browsers,
|
||||
weboob.browser.pages,
|
||||
weboob.browser.filters.standard
|
||||
|
|
|
|||
|
|
@ -335,6 +335,14 @@ class AmericanTransaction(Transaction):
|
|||
text = text.replace(',', ' ').replace('.', ',')
|
||||
return FrenchTransaction.clean_amount(text)
|
||||
|
||||
@classmethod
def decimal_amount(klass, text):
    """
    Convert a string containing an amount to Decimal.

    Empty or unparseable input yields Decimal('0') instead of raising.

    Dispatches through ``klass.clean_amount`` (rather than naming
    AmericanTransaction explicitly) so subclasses overriding
    clean_amount() get consistent behavior here too.
    """
    amnt = klass.clean_amount(text)
    return Decimal(amnt) if amnt else Decimal('0')
|
||||
|
||||
|
||||
def test():
|
||||
clean_amount = AmericanTransaction.clean_amount
|
||||
|
|
@ -344,3 +352,7 @@ def test():
|
|||
assert clean_amount('$42.12 USD') == '42.12'
|
||||
assert clean_amount('$12.442,12 USD') == '12442.12'
|
||||
assert clean_amount('$12,442.12 USD') == '12442.12'
|
||||
|
||||
decimal_amount = AmericanTransaction.decimal_amount
|
||||
assert decimal_amount('$12,442.12 USD') == Decimal('12442.12')
|
||||
assert decimal_amount('') == Decimal('0')
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ except ImportError:
|
|||
raise ImportError('Please install python-dateutil')
|
||||
|
||||
|
||||
__all__ = ['local2utc', 'utc2local', 'LinearDateGuesser', 'date', 'datetime', 'new_date', 'new_datetime']
|
||||
__all__ = ['local2utc', 'utc2local', 'LinearDateGuesser', 'date', 'datetime', 'new_date', 'new_datetime', 'closest_date']
|
||||
|
||||
|
||||
def local2utc(dateobj):
|
||||
|
|
@ -315,3 +315,47 @@ def parse_date(string):
|
|||
|
||||
elif string.upper() == "TODAY":
|
||||
return date.today()
|
||||
|
||||
|
||||
def closest_date(date, date_from, date_to):
    """
    Adjusts year so that the date is closest to the given range.

    Transactions dates in a statement usually contain only day and month.
    Statement dates range have a year though.
    Merge them all together to get a full transaction date.

    :param date: the (possibly wrong-year) transaction date.
    :param date_from: start of the statement's date range.
    :param date_to: end of the statement's date range.
    """
    # If the date is within given range, we're done.
    if date_from <= date <= date_to:
        return date

    # Candidate dates: same day/month for every year covered by the range.
    # Years lacking this day (e.g. Feb 29 in a non-leap year) are skipped
    # instead of letting real_datetime() raise ValueError.
    dates = []
    for year in range(date_from.year, date_to.year + 1):
        try:
            dates.append(real_datetime(year, date.month, date.day))
        except ValueError:
            pass

    # Ideally, pick the date within given range.
    for d in dates:
        if date_from <= d <= date_to:
            return d

    # No candidate year has this day/month at all; give the input back.
    if not dates:
        return date

    # Otherwise, pick the candidate closest to the start of the range.
    # NOTE: this is not necessarily "in the past" -- e.g. Jan 15 against a
    # range ending Jan 10 resolves forward to date_to's year.
    return min(dates, key=lambda d: abs(d - date_from))
|
||||
|
||||
|
||||
def test():
    dt = real_datetime
    range1 = [dt(2012, 12, 20), dt(2013, 1, 10)]

    # For a range spanning the 2012/2013 new year, any incoming year for
    # Dec 15 must resolve to 2012, and Jan 15 / Jan 1 must resolve to 2013.
    for year in (2012, 2000, 2020):
        assert closest_date(dt(year, 12, 15), *range1) == dt(2012, 12, 15)
    for year in (2013, 2000, 2020):
        assert closest_date(dt(year, 1, 15), *range1) == dt(2013, 1, 15)
        assert closest_date(dt(year, 1, 1), *range1) == dt(2013, 1, 1)

    # A multi-year range picks the candidate year inside the range.
    range2 = [dt(2012, 12, 20), dt(2014, 1, 10)]
    assert closest_date(dt(2012, 12, 15), *range2) == dt(2013, 12, 15)
    assert closest_date(dt(2014, 1, 15), *range2) == dt(2013, 1, 15)
|
||||
|
|
|
|||
49
weboob/tools/pdf.py
Normal file
49
weboob/tools/pdf.py
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright(C) 2014 Oleg Plakhotniuk
|
||||
#
|
||||
# This file is part of weboob.
|
||||
#
|
||||
# weboob is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# weboob is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from tempfile import mkstemp
|
||||
|
||||
|
||||
__all__ = ['decompress_pdf']
|
||||
|
||||
|
||||
def decompress_pdf(inpdf):
    """
    Takes PDF file contents as a string and returns decompressed version
    of the file contents, suitable for text parsing.

    :param inpdf: raw PDF file contents.
    :raises subprocess.CalledProcessError: if mutool exits with an error.

    External dependencies:
    MuPDF (http://www.mupdf.com).
    """
    inh, inname = mkstemp(suffix='.pdf')
    outh, outname = mkstemp(suffix='.pdf')
    # mutool writes the output file itself; we only need its name.
    os.close(outh)
    try:
        try:
            os.write(inh, inpdf)
        finally:
            os.close(inh)

        # 'mutool clean -d' rewrites the PDF with decompressed streams.
        # check_call makes a mutool failure explicit instead of silently
        # returning truncated/empty output.
        subprocess.check_call(['mutool', 'clean', '-d', inname, outname])

        # PDF is binary data: read in binary mode so platforms with
        # non-transparent text mode don't mangle it.
        with open(outname, 'rb') as f:
            return f.read()
    finally:
        # Always remove the temp files, even when mutool or I/O fails.
        os.remove(inname)
        os.remove(outname)
|
||||
108
weboob/tools/tokenizer.py
Normal file
108
weboob/tools/tokenizer.py
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright(C) 2014 Oleg Plakhotniuk
|
||||
#
|
||||
# This file is part of weboob.
|
||||
#
|
||||
# weboob is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# weboob is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import re
|
||||
|
||||
|
||||
__all__ = ['ReTokenizer']
|
||||
|
||||
|
||||
class ReTokenizer(object):
    """
    Simple regex-based tokenizer (AKA lexer or lexical analyser).
    Useful for PDF statements parsing.

    1. There's a lexing table consisting of type-regex tuples.
    2. Lexer splits text into chunks using the separator character.
    3. Text chunk is sequentially matched against regexes and first
       successful match defines the type of the token.

    Check out test() function below for examples.
    """

    def __init__(self, text, sep, lex):
        # Keep the lexing table so out-of-range reads can build EOF
        # tokens bound to the same table.
        self._lex = lex
        self._tok = []
        for chunk in text.split(sep):
            self._tok.append(ReToken(lex, chunk))

    def tok(self, index):
        # Any index outside the token list -- negative included -- yields
        # an EOF token rather than raising, so callers can probe freely.
        if index < 0 or index >= len(self._tok):
            return ReToken(self._lex, eof=True)
        return self._tok[index]

    def simple_read(self, token_type, pos, transform=lambda v: v):
        # Try to consume one token of the given type at pos: on success
        # advance and return its (optionally transformed) value, otherwise
        # stay at pos and return no value.
        token = self.tok(pos)
        if getattr(token, 'is_%s' % token_type)():
            return pos + 1, transform(token.value())
        return pos, None
|
||||
|
||||
|
||||
class ReToken(object):
    """
    A single token produced by ReTokenizer.

    The token's type is the first entry of the lexing table whose regex
    matches the chunk. Its value is the single capture group (if the regex
    has exactly one), the tuple of groups (if several), or the whole match
    (if none). Dynamic is_<type>() predicates report the token type;
    is_eof() is True only for the end-of-stream sentinel.
    """

    def __init__(self, lex, chunk=None, eof=False):
        self._lex = lex
        self._eof = eof
        self._value = None
        self._type = None
        if chunk is not None:
            # First matching table entry wins; later entries are ignored.
            for type_, regex in self._lex:
                m = re.match(regex, chunk, flags=re.UNICODE)
                if m:
                    self._type = type_
                    if len(m.groups()) == 1:
                        # Exactly one capture group: unwrap it.
                        self._value = m.group(1)
                    elif m.groups():
                        # Several capture groups: keep them as a tuple.
                        self._value = m.groups()
                    else:
                        # No groups: the whole matched text is the value.
                        self._value = m.group(0)
                    break

    def is_eof(self):
        return self._eof

    def value(self):
        return self._value

    def __getattr__(self, name):
        # Synthesize is_<type>() predicates on the fly.
        if name.startswith('is_'):
            return lambda: self._type == name[3:]
        # Carry the attribute name so the failure is debuggable
        # (a bare AttributeError() gives no hint about what was missing).
        raise AttributeError(name)
|
||||
|
||||
|
||||
def test():
    # Chunks starting with 'f' lex as type 'f', with 'b' as type 'b'.
    tokenizer = ReTokenizer('foo bar baz', ' ', [('f', r'^f'), ('b', r'^b')])

    for index, predicate in [(0, 'is_f'), (1, 'is_b'), (2, 'is_b')]:
        assert getattr(tokenizer.tok(index), predicate)()

    # Indices outside the token list (negative included) are EOF.
    assert tokenizer.tok(-1).is_eof()
    assert tokenizer.tok(3).is_eof()

    assert not tokenizer.tok(-1).is_f()
    assert not tokenizer.tok(0).is_b()
    assert not tokenizer.tok(0).is_eof()

    # Value extraction: whole match, single group, or tuple of groups.
    tokenizer = ReTokenizer('nogroup onegroup multigroup', ' ', [
        ('ng', r'^n.*$'),
        ('og', r'^one(g.*)$'),
        ('mg', r'^(m.*)(g.*)$')])

    expected = [(-1, None), (0, 'nogroup'), (1, 'group'),
                (2, ('multi', 'group'))]
    for index, value in expected:
        assert tokenizer.tok(index).value() == value
|
||||
Loading…
Add table
Add a link
Reference in a new issue