From 43d562a3385c8c2ac175f289430cc5bf33022a87 Mon Sep 17 00:00:00 2001 From: Xavier G Date: Sun, 11 Sep 2011 16:25:45 +0200 Subject: [PATCH] CrAgr: optimized operations retrieval for CA Centre by adding a start_offset parameter to AccountsList.get_history --- weboob/backends/cragr/browser.py | 33 +++++++++++--------- weboob/backends/cragr/pages/accounts_list.py | 17 ++++++++-- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/weboob/backends/cragr/browser.py b/weboob/backends/cragr/browser.py index 38326cd3..e1a2c35b 100644 --- a/weboob/backends/cragr/browser.py +++ b/weboob/backends/cragr/browser.py @@ -131,26 +131,29 @@ class Cragr(BaseBrowser): return None def get_history(self, account): - page_url = account.link_id + history_url = account.link_id operations_count = 0 - while (page_url): - # 1st, go on the account page - self.logger.debug('going on: %s' % page_url) - self.location('https://%s%s' % (self.DOMAIN, page_url)) - # then, expand all history - # (it's not a next page, but more operation on one page) - # tested on CA centre - while True: - history_url = self.page.expand_history_page_url() - if not history_url : - break - self.location(history_url) + # 1st, go on the account page + self.logger.debug('going on: %s' % history_url) + self.location('https://%s%s' % (self.DOMAIN, history_url)) - for page_operation in self.page.get_history(operations_count): + # Some regions have a "Show more" (well, actually "Voir les 25 + # suivants") link we have to use to get all the operations. + # However, it does not show only the 25 next results, it *adds* them + # to the current view. Therefore, we have to parse each new page using + # an offset, in order to ignore all already-fetched operations. + # This especially occurs on CA Centre. + use_expand_url = bool(self.page.expand_history_page_url()) + while (history_url): + # we skip "operations_count" operations on each page if we are in the case described above + operations_offset = operations_count if use_expand_url else 0 + for page_operation in self.page.get_history(operations_count, operations_offset): operations_count += 1 yield page_operation - page_url = self.page.next_page_url() + history_url = self.page.expand_history_page_url() if use_expand_url else self.page.next_page_url() + self.logger.debug('going on: %s' % history_url) + self.location('https://%s%s' % (self.DOMAIN, history_url)) def dict_find_value(self, dictionary, value): """ diff --git a/weboob/backends/cragr/pages/accounts_list.py b/weboob/backends/cragr/pages/accounts_list.py index 50abbc0e..65df9ffc 100644 --- a/weboob/backends/cragr/pages/accounts_list.py +++ b/weboob/backends/cragr/pages/accounts_list.py @@ -173,10 +173,12 @@ class AccountsList(CragrBasePage): data = re.sub(' +', ' ', data.replace("\n", ' ').strip()) return data - def get_history(self, start_index = 0): + def get_history(self, start_index = 0, start_offset = 0): """ Returns the history of a specific account. Note that this function - expects the current page page to be the one dedicated to this history. + expects the current page to be the one dedicated to this history. + start_index is the id used for the first created operation. + start_offset allows ignoring the `n' first Operations on the page. """ # tested on CA Lorraine, Paris, Toulouse # avoir parsing the page as an account-dedicated page if it is not the case @@ -185,6 +187,7 @@ class AccountsList(CragrBasePage): index = start_index operation = False + skipped = 0 body_elmt_list = self.document.xpath('/html/body/*') @@ -229,6 +232,9 @@ class AccountsList(CragrBasePage): if table_layout: lines = self.document.xpath('id("operationsContent")//table[@class="tb"]/tr') for line in lines: + if skipped < start_offset: + skipped += 1 + continue operation = Operation(index) index += 1 operation.date = self.extract_text(line[0]) @@ -237,6 +243,10 @@ class AccountsList(CragrBasePage): yield operation elif (not alternate_layout): for body_elmt in interesting_divs: + if skipped < start_offset: + if self.is_right_aligned_div(body_elmt): + skipped += 1 + continue if (self.is_right_aligned_div(body_elmt)): # this is the second line of an operation entry, displaying the amount operation.amount = clean_amount(self.extract_text(body_elmt)) @@ -255,6 +265,9 @@ class AccountsList(CragrBasePage): operation.label = u'Unknown' else: for i in range(0, len(interesting_divs)/3): + if skipped < start_offset: + skipped += 1 + continue operation = Operation(index) index += 1 # amount