CrAgr: optimized operations retrieval for CA Centre by adding a start_offset parameter to AccountsList.get_history

This commit is contained in:
Xavier G 2011-09-11 16:25:45 +02:00 committed by Romain Bignon
commit 43d562a338
2 changed files with 35 additions and 19 deletions

View file

@ -131,26 +131,29 @@ class Cragr(BaseBrowser):
return None return None
def get_history(self, account): def get_history(self, account):
page_url = account.link_id history_url = account.link_id
operations_count = 0 operations_count = 0
while (page_url):
# 1st, go on the account page
self.logger.debug('going on: %s' % page_url)
self.location('https://%s%s' % (self.DOMAIN, page_url))
# then, expand all history # 1st, go on the account page
# (it's not a next page, but more operation on one page) self.logger.debug('going on: %s' % history_url)
# tested on CA centre self.location('https://%s%s' % (self.DOMAIN, history_url))
while True:
history_url = self.page.expand_history_page_url()
if not history_url :
break
self.location(history_url)
for page_operation in self.page.get_history(operations_count): # Some regions have a "Show more" (well, actually "Voir les 25
# suivants") link we have to use to get all the operations.
# However, it does not show only the 25 next results, it *adds* them
# to the current view. Therefore, we have to parse each new page using
# an offset, in order to ignore all already-fetched operations.
# This especially occurs on CA Centre.
use_expand_url = bool(self.page.expand_history_page_url())
while (history_url):
# we skip "operations_count" operations on each page if we are in the case described above
operations_offset = operations_count if use_expand_url else 0
for page_operation in self.page.get_history(operations_count, operations_offset):
operations_count += 1 operations_count += 1
yield page_operation yield page_operation
page_url = self.page.next_page_url() history_url = self.page.expand_history_page_url() if use_expand_url else self.page.next_page_url()
self.logger.debug('going on: %s' % history_url)
self.location('https://%s%s' % (self.DOMAIN, history_url))
def dict_find_value(self, dictionary, value): def dict_find_value(self, dictionary, value):
""" """

View file

@ -173,10 +173,12 @@ class AccountsList(CragrBasePage):
data = re.sub(' +', ' ', data.replace("\n", ' ').strip()) data = re.sub(' +', ' ', data.replace("\n", ' ').strip())
return data return data
def get_history(self, start_index = 0): def get_history(self, start_index = 0, start_offset = 0):
""" """
Returns the history of a specific account. Note that this function Returns the history of a specific account. Note that this function
expects the current page page to be the one dedicated to this history. expects the current page to be the one dedicated to this history.
start_index is the id used for the first created operation.
start_offset allows ignoring the `n' first Operations on the page.
""" """
# tested on CA Lorraine, Paris, Toulouse # tested on CA Lorraine, Paris, Toulouse
# avoid parsing the page as an account-dedicated page if it is not the case # avoid parsing the page as an account-dedicated page if it is not the case
@ -185,6 +187,7 @@ class AccountsList(CragrBasePage):
index = start_index index = start_index
operation = False operation = False
skipped = 0
body_elmt_list = self.document.xpath('/html/body/*') body_elmt_list = self.document.xpath('/html/body/*')
@ -229,6 +232,9 @@ class AccountsList(CragrBasePage):
if table_layout: if table_layout:
lines = self.document.xpath('id("operationsContent")//table[@class="tb"]/tr') lines = self.document.xpath('id("operationsContent")//table[@class="tb"]/tr')
for line in lines: for line in lines:
if skipped < start_offset:
skipped += 1
continue
operation = Operation(index) operation = Operation(index)
index += 1 index += 1
operation.date = self.extract_text(line[0]) operation.date = self.extract_text(line[0])
@ -237,6 +243,10 @@ class AccountsList(CragrBasePage):
yield operation yield operation
elif (not alternate_layout): elif (not alternate_layout):
for body_elmt in interesting_divs: for body_elmt in interesting_divs:
if skipped < start_offset:
if self.is_right_aligned_div(body_elmt):
skipped += 1
continue
if (self.is_right_aligned_div(body_elmt)): if (self.is_right_aligned_div(body_elmt)):
# this is the second line of an operation entry, displaying the amount # this is the second line of an operation entry, displaying the amount
operation.amount = clean_amount(self.extract_text(body_elmt)) operation.amount = clean_amount(self.extract_text(body_elmt))
@ -255,6 +265,9 @@ class AccountsList(CragrBasePage):
operation.label = u'Unknown' operation.label = u'Unknown'
else: else:
for i in range(0, len(interesting_divs)/3): for i in range(0, len(interesting_divs)/3):
if skipped < start_offset:
skipped += 1
continue
operation = Operation(index) operation = Operation(index)
index += 1 index += 1
# amount # amount