From 3aa13dcef1cb0796ccfd6d0ba033ffa08beccb66 Mon Sep 17 00:00:00 2001 From: Oleg Plakhotniuk Date: Thu, 26 Feb 2015 03:52:47 -0600 Subject: [PATCH] [citibank] Use Selenium only to log in. Closes #1740 --- modules/citibank/browser.py | 394 ++++++++++++++++-------------------- 1 file changed, 171 insertions(+), 223 deletions(-) diff --git a/modules/citibank/browser.py b/modules/citibank/browser.py index ee4cb8e2..78c9d095 100644 --- a/modules/citibank/browser.py +++ b/modules/citibank/browser.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2014 Oleg Plakhotniuk +# Copyright(C) 2014, 2015 Oleg Plakhotniuk # # This file is part of weboob. # @@ -18,6 +18,8 @@ # along with weboob. If not, see . +from weboob.browser import LoginBrowser, URL, need_login +from weboob.browser.pages import HTMLPage, JsonPage, RawPage from weboob.capabilities.bank import Account, AccountNotFound, Transaction from weboob.exceptions import BrowserIncorrectPassword, BrowserUnavailable from weboob.tools.capabilities.bank.transactions import \ @@ -25,42 +27,109 @@ from weboob.tools.capabilities.bank.transactions import \ from .parser import StatementParser, clean_label -from time import sleep -from tempfile import mkdtemp -from shutil import rmtree -from itertools import chain -import datetime +import gc import re -import os -import subprocess +from datetime import datetime +from time import sleep +from urllib import unquote +from shutil import rmtree __all__ = ['Citibank'] -def retrying(func): - def inner(*args, **kwargs): - MAX_RETRIES = 10 - MAX_DELAY = 10 - for i in xrange(MAX_RETRIES): - try: - return func(*args, **kwargs) - except OnceAgain: - sleep(min(1 << i, MAX_DELAY)) - raise BrowserUnavailable('Unexpected site behavior. ' - 'Perhaps this module needs some fixing...') - return inner +class SomePage(HTMLPage): + @property + def logged(self): + return bool(self.doc.xpath(u'//a[text()="Sign Off"]')) -class OnceAgain(Exception): - pass +class AccountsPage(JsonPage): + logged = True + + def inner_ids_dict(self): + return dict((prod['parsedAccountName'][-4:], prod['accountInstanceId']) + for cat in self.doc['accountsSummaryViewObj']['categoryList'] + for prod in cat['products'] if cat['categoryType'] == 'CRD') -class Citibank(object): +class AccDetailsPage(JsonPage): + logged = True + + def account(self): + detact = self.doc['accountDetailsAndActivity'] + details = detact['accountDetails'] + account = Account() + account.type = Account.TYPE_CARD + account.label = re.sub(r'<[^>]+>', '', detact['accountName']) + account.id = account.label[-4:] + account._innerId = details['instanceID'] + for bal in details['accountBalances']: + label, value = bal['label'], (bal['value'] or ['0'])[0] + if label == u'Current Balance:': + account.currency = Account.get_currency(value) + account.balance = -AmTr.decimal_amount(value) + elif label == u'Total Revolving Credit Line:': + account.cardlimit = AmTr.decimal_amount(value) + elif label.startswith(u'Minimum Payment Due'): + d = re.match(r'.*(..-..-....):$', label).group(1) + account.paydate = datetime.strptime(d, '%m-%d-%Y') + account.paymin = AmTr.decimal_amount(value) + return account + + def transactions(self): + return sorted(self.unsorted_trans(), + lambda a, b: cmp(a.date, b.date), reverse=True) + + def unsorted_trans(self): + for jnl in self.doc['accountDetailsAndActivity']['accountActivity'] \ + ['postedTransactionJournals']: + tdate = jnl['columns'][0]['activityColumn'][0] + label = jnl['columns'][1]['activityColumn'][0] + amount = jnl['columns'][3]['activityColumn'][0] + xdescs = dict((x['label'], x['value'][0]) + for x in jnl['extendedDescriptions']) + pdate = xdescs[u'Posted Date :'] + ref = xdescs.get(u'Reference Number:') or u'' + + if amount.startswith(u'(') and amount.endswith(u')'): + amount = AmTr.decimal_amount(amount[1:-1]) + else: + amount = -AmTr.decimal_amount(amount) + label = clean_label(label) + + trans = Transaction(ref) + trans.date = datetime.strptime(tdate, '%m-%d-%Y') + trans.rdate = datetime.strptime(pdate, '%m-%d-%Y') + trans.type = Transaction.TYPE_UNKNOWN + trans.raw = label + trans.label = label + trans.amount = amount + yield trans + + +class StatementsPage(SomePage): + def dates(self): + return self.doc.xpath( + '//select[@id="currentStatementsDate"]/option/text()')[1:] + + +class StatementPage(RawPage): + logged = True + + def __init__(self, *args, **kwArgs): + RawPage.__init__(self, *args, **kwArgs) + self._parser = StatementParser(self.doc) + + def transactions(self): + return sorted(self._parser.read_transactions(), + cmp=lambda t1, t2: cmp(t2.date, t1.date)) + + +class Citibank(LoginBrowser): """ - Citibank website uses lots of Javascript, some of which seems to be - dynamically generated and intentionally obfuscated. - Our answer to THAT is... heavy artillery firing Selenium rounds! + Citibank website uses some kind of Javascript magic during login + negotiation, hence a real browser is being used to log in. External dependencies: Firefox (https://www.mozilla.org/firefox). @@ -68,218 +137,97 @@ class Citibank(object): Python bindings for Selenium (https://pypi.python.org/pypi/selenium). Xvfb (http://www.x.org/releases/X11R7.6/doc/man/man1/Xvfb.1.xhtml). - Tested on Arch Linux snapshot of 2014-08-25 (official and user packages). - Don't forget to do "export DISPLAY=:0". + Tested on Arch Linux snapshot of 2014-11-11 (official and user packages). Only a single credit card account is currently supported. Contributions are welcome! """ - def __init__(self, username, password, logger, **kwargs): - self._logger = logger - self._username = username - self._password = password + MAX_RETRIES = 10 + MAX_DELAY = 10 + BASEURL = 'https://online.citibank.com' + home = URL(r'/US/JPS/portal/Home.do', SomePage) + accounts = URL(r'/US/REST/accountsPanel' + r'/getCustomerAccounts.jws\?ttc=(?P.*)$', + AccountsPage) + accdetails = URL(r'/US/REST/accountDetailsActivity' + r'/getAccountDetailsActivity.jws\?accountID=(?P.*)$', + AccDetailsPage) + statements = URL(r'/US/NCSC/doccenter/flow.action\?TTC=1079&' + 'accountID=(?P.*)$', StatementsPage) + statement = URL(r'/US/REST/doccenterresource/downloadStatementsPdf.jws\?' + r'selectedIndex=0&date=(?P....-..-..)&' + r'downloadFormat=pdf', StatementPage) + unknown = URL('/.*$', SomePage) def get_account(self, id_): - account = next(self.iter_accounts()) - if account.id != id_: - raise AccountNotFound() - return account + innerId = self.to_accounts().inner_ids_dict().get(id_) + if innerId: + return self.to_account(innerId).account() + raise AccountNotFound() def iter_accounts(self): - self.start() - self._account_link().click() - self.wait_ajax() - label = self.wait('label#accountSingle .cS-accountMenuAccount')[0].text - bal = self._browser.find_elements_by_xpath(u'//span[contains(text(),' - '"Current Balance:")]/../../div[@class="cT-valueItem"]')[0].text - cardlimit = self._browser.find_elements_by_xpath( - u'//span[contains(text(),"Total Revolving Credit Line:")]' - '/../../div[@class="cT-valueItem"]')[0].text - paydue=self._browser.find_elements_by_xpath(u'//span[contains(text(),' - '"Minimum Payment Due") and @role="gridcell"]')[0].text - paydate = re.match(u'Minimum Payment Due.On (..-..-....):', paydue, - re.DOTALL + re.UNICODE).group(1) - paymin = self._browser.find_elements_by_xpath( - u'//span[contains(text(),"Minimum Payment Due")]' - '/../../div[contains(@class,"cT-valueItem")]')[0].text - account = Account() - account.id = label[-4:] - account.label = label - account.currency = Account.get_currency(bal) - account.balance = -AmTr.decimal_amount(bal) - account.cardlimit = AmTr.decimal_amount(cardlimit) - account.paydate = datetime.datetime.strptime(paydate, '%m-%d-%Y') - account.paymin = AmTr.decimal_amount(paymin) - account.type = Account.TYPE_CARD - self.finish() - yield account + for innerId in self.to_accounts().inner_ids_dict().values(): + yield self.to_account(innerId).account() def iter_history(self, account): - for trans in chain(self.iter_history_recent(account), - self.iter_history_statements(account)): + innerId = account._innerId + for trans in self.to_account(innerId).transactions(): yield trans + for date in self.to_statements(innerId).dates(): + for trans in self.to_statement(date).transactions(): + yield trans - def start(self): + def to_account(self, innerId): + return self.to_page(self.accdetails, accountID=innerId) + + def to_accounts(self): + return self.to_page(self.accounts, ttc='742') + + def to_statements(self, innerId): + return self.to_page(self.statements, accountID=innerId) + + def to_statement(self, date): + return self.to_page(self.statement, date=date) + + @need_login + def to_page(self, url, **data): + return self.page if url.is_here(**data) else url.go(data=data, **data) + + def do_login(self): # To avoid ImportError during e.g. building modules list. from selenium import webdriver - HOME_URL = 'https://online.citibank.com/US/JPS/portal/Home.do' - WIDTH = 1920 - HEIGHT = 10000 # So that everything fits... - - self._downloads = mkdtemp() - self._logger.debug('Saving downloaded files to %s' % self._downloads) - prof = webdriver.FirefoxProfile() - prof.set_preference('browser.download.folderList', 2) - prof.set_preference('browser.download.dir', self._downloads) - prof.set_preference('browser.helperApps.neverAsk.saveToDisk', - 'application/pdf') - prof.set_preference('pdfjs.disabled', True) - self._browser = webdriver.Firefox(prof) - self._browser.set_window_size(WIDTH, HEIGHT) - - self._browser.get('https://online.citibank.com') - self.wait('input[name="usernameMasked"]')[0].send_keys(self._username) - self.wait('input[name="password"]')[0].send_keys(self._password) - self.wait('form[name="SignonForm"]')[0].submit() - self._browser.get(HOME_URL) - if self._browser.current_url != HOME_URL: + browser = webdriver.Firefox() + browser.get(self.BASEURL) + browser.execute_script(''' + $("input[name='username']").val("%s"); + $("input[name='password']").val("%s"); + ''' % (self.username, self.password)) + self.wait(browser, 'form[name="SignonForm"]')[0].submit() + self.session.cookies.clear() + for c in browser.get_cookies(): + self.session.cookies.set(name=c['name'], value=unquote(c['value'])) + browser.close() + prof_dir = browser.firefox_profile.profile_dir + browser = None + gc.collect() # Make sure Firefox process is dead. + for i in xrange(self.MAX_RETRIES): + try: + rmtree(prof_dir) + break + except OSError: + sleep(min(1 << i, self.MAX_DELAY)) + if not self.home.go().logged: raise BrowserIncorrectPassword() - def finish(self): - prof_dir = self._browser.firefox_profile.profile_dir - self._browser.close() - del self._browser - rmtree(self._downloads) - del self._downloads - rmtree(prof_dir) - - def iter_history_recent(self, account): - self.start() - if account.id != self._account_id(): - raise AccountNotFound() - self._account_link().click() - self.wait_ajax() - for span in self.find('span.cM-maximizeButton'): - span.click() - for tr in self.find('tr.payments,tr.purchase'): - trdata = lambda n: tr.find_element_by_css_selector( - 'td.cT-bodyTableColumn%i span.cT-line1' % n).text - treid = tr.get_attribute('id').replace('rowID', 'rowIDExt') - tredata = {} - for tre in self.find('tr#%s' % treid): - labels = [x.text for x in tre.find_elements_by_css_selector( - 'div.cT-labelItem')] - values = [x.text for x in tre.find_elements_by_css_selector( - 'div.cT-valueItem')] - tredata = dict(zip(labels, values)) - - ref = tredata.get(u'Reference Number:', u'') - tdate = trdata(1) - pdate = tredata.get(u'Posted Date :', tdate) - desc = clean_label(trdata(2)) - amount = trdata(4) - - tdate = datetime.datetime.strptime(tdate, '%m-%d-%Y') - pdate = datetime.datetime.strptime(pdate, '%m-%d-%Y') - - if amount.startswith(u'(') and amount.endswith(u')'): - amount = AmTr.decimal_amount(amount[1:-1]) - else: - amount = -AmTr.decimal_amount(amount) - - trans = Transaction(ref) - trans.date = tdate - trans.rdate = pdate - trans.type = Transaction.TYPE_UNKNOWN - trans.raw = desc - trans.label = desc - trans.amount = amount - yield trans - - self.finish() - - def iter_history_statements(self, account): - # To avoid ImportError during e.g. building modules list. - from selenium.webdriver.common.keys import Keys - from selenium.common.exceptions import MoveTargetOutOfBoundsException,\ - ElementNotVisibleException - self.start() - if account.id != self._account_id(): - raise AccountNotFound() - self.wait('a#cmlink_ViewPastStmtLink')[0].click() - opts = self.wait('option#currentStatementDateOptions') - for i, opt in enumerate(opts): - # We're interested only in finalized statements. - if u'Unbilled' in opt.get_attribute('value'): - continue - self.wait('div#currentStatementsDate-button')[0].click() - ul = self.wait('ul#currentStatementsDate-menu')[0] - while True: - try: - self.wait('li#currentStatementDateOptions span')[i].click() - break - except (MoveTargetOutOfBoundsException, - ElementNotVisibleException): - ul.send_keys(Keys.ARROW_DOWN) - self.wait('a#downloadCurrentStatements')[0].click() - pdfname = self.wait_file('.pdf') - pdfpath = os.path.join(self._downloads, pdfname) - with open(pdfpath, 'rb') as f: - parser = StatementParser(f.read()) - os.remove(pdfpath) - # Transactions in a statement can go in different order. - ts = sorted(parser.read_transactions(), - cmp=lambda t1, t2: cmp(t2.date, t1.date)) - for t in ts: - yield t - self.finish() - - def find(self, selector): - self._logger.debug('Finding selector """%s""" on page %s' % ( - selector, self._browser.current_url)) - return self._browser.find_elements_by_css_selector(selector) - - @retrying - def wait(self, selector): - els = self.find(selector) - if not els: - raise OnceAgain() - return els - - @retrying - def wait_ajax(self): - from selenium.common.exceptions import StaleElementReferenceException - self._logger.debug('Waiting for async requests to finish on page %s' - % self._browser.current_url) - els = self._browser.find_elements_by_xpath( - u'//*[contains(text(),"Please wait")]') - try: - if not els or any(x.is_displayed() for x in els): - raise OnceAgain() - except StaleElementReferenceException: - raise OnceAgain() - - @retrying - def wait_file(self, suffix): - self._logger.debug('Waiting for file "*%s" to finish downloading.' % - suffix) - for name in os.listdir(self._downloads): - if not name.endswith(suffix): - continue - path = os.path.join(self._downloads, name) - # Wait until file is not empty. - if not os.stat(path).st_size: - continue - # Wait until no processes are accessing the file. - if subprocess.call(['fuser', '-s', path]) == 0: - continue - return name - raise OnceAgain() - - def _account_link(self): - return self.wait('a#cmlink_AccountNameLink')[0] - - def _account_id(self): - return re.match('.*-([0-9]+)$', self._account_link().text).group(1) + def wait(self, browser, selector): + self.logger.debug('Finding selector """%s""" on page %s' % ( + selector, browser.current_url)) + for i in xrange(self.MAX_RETRIES): + els = browser.find_elements_by_css_selector(selector) + if els: + return els + sleep(min(1 << i, self.MAX_DELAY)) + raise BrowserUnavailable('Unexpected site behavior. ' + 'Perhaps this module needs some fixing...')