CrAgr: optimized operations retrieval for CA Centre by adding a start_offset parameter to AccountsList.get_history

This commit is contained in:
Xavier G 2011-09-11 16:25:45 +02:00 committed by Romain Bignon
commit 43d562a338
2 changed files with 35 additions and 19 deletions

View file

@ -131,26 +131,29 @@ class Cragr(BaseBrowser):
return None
def get_history(self, account):
page_url = account.link_id
history_url = account.link_id
operations_count = 0
while (page_url):
# 1st, go on the account page
self.logger.debug('going on: %s' % page_url)
self.location('https://%s%s' % (self.DOMAIN, page_url))
# then, expand all history
# (it's not a next page, but more operations on one page)
# tested on CA centre
while True:
history_url = self.page.expand_history_page_url()
if not history_url :
break
self.location(history_url)
# 1st, go on the account page
self.logger.debug('going on: %s' % history_url)
self.location('https://%s%s' % (self.DOMAIN, history_url))
for page_operation in self.page.get_history(operations_count):
# Some regions have a "Show more" (well, actually "Voir les 25
# suivants") link we have to use to get all the operations.
# However, it does not show only the 25 next results, it *adds* them
# to the current view. Therefore, we have to parse each new page using
# an offset, in order to ignore all already-fetched operations.
# This especially occurs on CA Centre.
use_expand_url = bool(self.page.expand_history_page_url())
while (history_url):
# we skip "operations_count" operations on each page if we are in the case described above
operations_offset = operations_count if use_expand_url else 0
for page_operation in self.page.get_history(operations_count, operations_offset):
operations_count += 1
yield page_operation
page_url = self.page.next_page_url()
history_url = self.page.expand_history_page_url() if use_expand_url else self.page.next_page_url()
self.logger.debug('going on: %s' % history_url)
self.location('https://%s%s' % (self.DOMAIN, history_url))
def dict_find_value(self, dictionary, value):
"""

View file

@ -173,10 +173,12 @@ class AccountsList(CragrBasePage):
data = re.sub(' +', ' ', data.replace("\n", ' ').strip())
return data
def get_history(self, start_index = 0):
def get_history(self, start_index = 0, start_offset = 0):
"""
Returns the history of a specific account. Note that this function
expects the current page page to be the one dedicated to this history.
expects the current page to be the one dedicated to this history.
start_index is the id used for the first created operation.
start_offset allows ignoring the `n' first Operations on the page.
"""
# tested on CA Lorraine, Paris, Toulouse
# avoid parsing the page as an account-dedicated page if it is not the case
@ -185,6 +187,7 @@ class AccountsList(CragrBasePage):
index = start_index
operation = False
skipped = 0
body_elmt_list = self.document.xpath('/html/body/*')
@ -229,6 +232,9 @@ class AccountsList(CragrBasePage):
if table_layout:
lines = self.document.xpath('id("operationsContent")//table[@class="tb"]/tr')
for line in lines:
if skipped < start_offset:
skipped += 1
continue
operation = Operation(index)
index += 1
operation.date = self.extract_text(line[0])
@ -237,6 +243,10 @@ class AccountsList(CragrBasePage):
yield operation
elif (not alternate_layout):
for body_elmt in interesting_divs:
if skipped < start_offset:
if self.is_right_aligned_div(body_elmt):
skipped += 1
continue
if (self.is_right_aligned_div(body_elmt)):
# this is the second line of an operation entry, displaying the amount
operation.amount = clean_amount(self.extract_text(body_elmt))
@ -255,6 +265,9 @@ class AccountsList(CragrBasePage):
operation.label = u'Unknown'
else:
for i in range(0, len(interesting_divs)/3):
if skipped < start_offset:
skipped += 1
continue
operation = Operation(index)
index += 1
# amount