Citibank module (https://online.citibank.com). Implements #1642
This commit is contained in:
parent
a5859d384d
commit
5997448169
6 changed files with 586 additions and 0 deletions
214
modules/citibank/parser.py
Normal file
214
modules/citibank/parser.py
Normal file
|
|
@ -0,0 +1,214 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright(C) 2014 Oleg Plakhotniuk
|
||||
#
|
||||
# This file is part of weboob.
|
||||
#
|
||||
# weboob is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# weboob is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from weboob.capabilities.bank import Transaction
|
||||
from weboob.tools.capabilities.bank.transactions import \
|
||||
AmericanTransaction as AmTr
|
||||
from weboob.tools.date import closest_date
|
||||
from weboob.tools.pdf import decompress_pdf
|
||||
from weboob.tools.tokenizer import ReTokenizer
|
||||
|
||||
import datetime
|
||||
import re
|
||||
|
||||
|
||||
def clean_label(text):
|
||||
"""
|
||||
Web view and statements use different label formatting.
|
||||
User shouldn't be able to see the difference, so we
|
||||
need to make labels from both sources look the same.
|
||||
"""
|
||||
for pattern in [r' \d+\.\d+ +POUND STERLING',
|
||||
u'Subject to Foreign Fee',
|
||||
u'Description']:
|
||||
text = re.sub(pattern, u'', text, re.UNICODE)
|
||||
return re.sub(r' +', u' ', text.strip().upper(), re.UNICODE)
|
||||
|
||||
|
||||
def formatted(read_func):
|
||||
"""
|
||||
Reads boilerplate PDF formatting around the data of interest.
|
||||
"""
|
||||
def wrapped(self, pos):
|
||||
startPos = pos
|
||||
pos, ws = self.read_whitespace(pos)
|
||||
pos, bt = self.read_layout_bt(pos)
|
||||
pos, tf = self.read_layout_tf(pos)
|
||||
pos, tm = self.read_layout_tm(pos)
|
||||
pos, data = read_func(self, pos)
|
||||
pos, et = self.read_layout_et(pos)
|
||||
if ws is None or bt is None or tf is None \
|
||||
or tm is None or data is None or et is None:
|
||||
return startPos, None
|
||||
else:
|
||||
return pos, data
|
||||
return wrapped
|
||||
|
||||
|
||||
class StatementParser(object):
|
||||
"""
|
||||
Each "read_*" method takes position as its argument,
|
||||
and returns next token position if read was successful,
|
||||
or the same position if it was not.
|
||||
"""
|
||||
|
||||
LEX = [
|
||||
('date_range', r'^\((\d{2}/\d{2}/\d{2})-(\d{2}/\d{2}/\d{2})\) Tj$'),
|
||||
('amount', r'^\((-?\$\d+\.\d{2})\) Tj$'),
|
||||
('date', r'^\((\d{2}/\d{2})\) Tj$'),
|
||||
('text', r'^\((.*)\) Tj$'),
|
||||
('layout_tf', r'^.* Tf$'),
|
||||
('layout_tm', r'^' + (6*r'([^ ]+) ') + r'Tm$'),
|
||||
('layout_bt', r'^BT$'),
|
||||
('layout_et', r'^ET$'),
|
||||
('whitespace', r'^$')
|
||||
]
|
||||
|
||||
def __init__(self, pdf):
|
||||
self._pdf = decompress_pdf(pdf)
|
||||
self._tok = ReTokenizer(self._pdf, '\n', self.LEX)
|
||||
|
||||
def read_transactions(self):
|
||||
# Read statement dates range.
|
||||
date_from, date_to = self.read_first_date_range()
|
||||
|
||||
# Read transactions.
|
||||
pos = 0
|
||||
while not self._tok.tok(pos).is_eof():
|
||||
pos, trans = self.read_transaction(pos, date_from, date_to)
|
||||
if trans:
|
||||
yield trans
|
||||
else:
|
||||
pos += 1
|
||||
|
||||
def read_first_date_range(self):
|
||||
pos = 0
|
||||
while not self._tok.tok(pos).is_eof():
|
||||
pos, date_range = self.read_date_range(pos)
|
||||
if date_range is not None:
|
||||
return date_range
|
||||
else:
|
||||
pos += 1
|
||||
|
||||
def read_date_range(self, pos):
|
||||
t = self._tok.tok(pos)
|
||||
if t.is_date_range():
|
||||
return (pos+1, [datetime.datetime.strptime(v, '%m/%d/%y')
|
||||
for v in t.value()])
|
||||
else:
|
||||
return (pos, None)
|
||||
|
||||
def read_transaction(self, pos, date_from, date_to):
|
||||
startPos = pos
|
||||
|
||||
pos, tdate = self.read_date(pos)
|
||||
pos, pdate = self.read_date(pos)
|
||||
|
||||
# Early check to call read_multiline_desc() only when needed.
|
||||
if tdate is None:
|
||||
return startPos, None
|
||||
|
||||
pos, desc = self.read_multiline_desc(pos)
|
||||
pos, amount = self.read_amount(pos)
|
||||
|
||||
if desc is None or amount is None:
|
||||
return startPos, None
|
||||
else:
|
||||
# Sometimes one date is missing.
|
||||
pdate = pdate or tdate
|
||||
|
||||
tdate = closest_date(tdate, date_from, date_to)
|
||||
pdate = closest_date(pdate, date_from, date_to)
|
||||
|
||||
trans = Transaction()
|
||||
trans.date = tdate
|
||||
trans.rdate = pdate
|
||||
trans.type = Transaction.TYPE_UNKNOWN
|
||||
trans.raw = desc
|
||||
trans.label = desc
|
||||
trans.amount = -amount
|
||||
return pos, trans
|
||||
|
||||
def read_multiline_desc(self, pos):
|
||||
"""
|
||||
Read transaction description which can span over multiple lines.
|
||||
Amount must always follow the multiline description.
|
||||
But multiline description might be split by page break.
|
||||
After reading first line of the description, we skip everything
|
||||
which is not an amount and which has different horizontal offset
|
||||
than the first read line.
|
||||
"""
|
||||
startPos = pos
|
||||
|
||||
descs = []
|
||||
xofs = None
|
||||
while not self._tok.tok(pos).is_eof():
|
||||
pos, desc_tm = self.read_text(pos)
|
||||
if desc_tm is None:
|
||||
if not descs:
|
||||
break
|
||||
prev_pos = pos
|
||||
pos, amount = self.read_amount(pos)
|
||||
if amount is not None:
|
||||
pos = prev_pos
|
||||
break
|
||||
pos += 1
|
||||
else:
|
||||
desc, tm = desc_tm
|
||||
if xofs is None:
|
||||
_, _, _, _, xofs, _ = tm
|
||||
_, _, _, _, xofs_new, _ = tm
|
||||
if xofs == xofs_new:
|
||||
descs.append(desc)
|
||||
else:
|
||||
pos += 1
|
||||
|
||||
if descs:
|
||||
return pos, clean_label(' '.join(descs))
|
||||
else:
|
||||
return startPos, None
|
||||
|
||||
def __getattr__(self, name):
|
||||
if name.startswith('read_'):
|
||||
return lambda pos: self._tok.simple_read(name[5:], pos)
|
||||
raise AttributeError()
|
||||
|
||||
@formatted
|
||||
def read_date(self, pos):
|
||||
return self._tok.simple_read('date', pos,
|
||||
lambda v: datetime.datetime.strptime(v, '%m/%d'))
|
||||
|
||||
@formatted
|
||||
def read_amount(self, pos):
|
||||
return self._tok.simple_read('amount', pos, AmTr.decimal_amount)
|
||||
|
||||
def read_text(self, pos):
|
||||
startPos = pos
|
||||
pos, ws = self.read_whitespace(pos)
|
||||
pos, bt = self.read_layout_bt(pos)
|
||||
pos, tf = self.read_layout_tf(pos)
|
||||
pos, tm = self.read_layout_tm(pos)
|
||||
pos, text = self._tok.simple_read('text', pos,
|
||||
lambda v: unicode(v, errors='ignore'))
|
||||
pos, et = self.read_layout_et(pos)
|
||||
if ws is None or bt is None or tf is None \
|
||||
or tm is None or text is None or et is None:
|
||||
return startPos, None
|
||||
else:
|
||||
return pos, (text, tm)
|
||||
Loading…
Add table
Add a link
Reference in a new issue