From 43d562a3385c8c2ac175f289430cc5bf33022a87 Mon Sep 17 00:00:00 2001
From: Xavier G <xavier@tuxfamily.org>
Date: Sun, 11 Sep 2011 16:25:45 +0200
Subject: [PATCH] CrAgr: optimized operations retrieval for CA Centre by adding
 a start_offset parameter to AccountsList.get_history

---
 weboob/backends/cragr/browser.py             | 33 +++++++++++---------
 weboob/backends/cragr/pages/accounts_list.py | 17 ++++++++--
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/weboob/backends/cragr/browser.py b/weboob/backends/cragr/browser.py
index 38326cd3..e1a2c35b 100644
--- a/weboob/backends/cragr/browser.py
+++ b/weboob/backends/cragr/browser.py
@@ -131,26 +131,29 @@ class Cragr(BaseBrowser):
         return None
 
     def get_history(self, account):
-        page_url = account.link_id
+        history_url = account.link_id
         operations_count = 0
-        while (page_url):
-            # 1st, go on the account page
-            self.logger.debug('going on: %s' % page_url)
-            self.location('https://%s%s' % (self.DOMAIN, page_url))
 
-            # then, expand all history
-            # (it's not a next page, but more operation on one page)
-            # tested on CA centre
-            while True:
-                history_url = self.page.expand_history_page_url()
-                if not history_url :
-                    break
-                self.location(history_url)
+        # 1st, go on the account page
+        self.logger.debug('going on: %s' % history_url)
+        self.location('https://%s%s' % (self.DOMAIN, history_url))
 
-            for page_operation in self.page.get_history(operations_count):
+        # Some regions have a "Show more" (well, actually "Voir les 25
+        # suivants") link we have to use to get all the operations.
+        # However, it does not show only the 25 next results, it *adds* them
+        # to the current view. Therefore, we have to parse each new page using
+        # an offset, in order to ignore all already-fetched operations.
+        # This especially occurs on CA Centre.
+        use_expand_url = bool(self.page.expand_history_page_url())
+        while (history_url):
+            operations_offset = operations_count if use_expand_url else 0  # skip operations already yielded (case described above)
+            for page_operation in self.page.get_history(operations_count, operations_offset):
                 operations_count += 1
                 yield page_operation
-            page_url = self.page.next_page_url()
+            history_url = self.page.expand_history_page_url() if use_expand_url else self.page.next_page_url()
+            if history_url:  # falsy once there is no further page: never request a URL built from it
+                self.logger.debug('going on: %s' % history_url)
+                self.location('https://%s%s' % (self.DOMAIN, history_url))
 
 
     def dict_find_value(self, dictionary, value):
         """
diff --git a/weboob/backends/cragr/pages/accounts_list.py b/weboob/backends/cragr/pages/accounts_list.py
index 50abbc0e..65df9ffc 100644
--- a/weboob/backends/cragr/pages/accounts_list.py
+++ b/weboob/backends/cragr/pages/accounts_list.py
@@ -173,10 +173,12 @@ class AccountsList(CragrBasePage):
         data = re.sub(' +', ' ', data.replace("\n", ' ').strip())
         return data
 
-    def get_history(self, start_index = 0):
+    def get_history(self, start_index = 0, start_offset = 0):
         """
             Returns the history of a specific account. Note that this function
-            expects the current page page to be the one dedicated to this history.
+            expects the current page to be the one dedicated to this history.
+            start_index is the id used for the first created operation.
+            start_offset is the number of operations to skip at the beginning of the page.
         """
         # tested on CA Lorraine, Paris, Toulouse
         # avoir parsing the page as an account-dedicated page if it is not the case
@@ -185,6 +187,7 @@ class AccountsList(CragrBasePage):
 
         index = start_index
         operation = False
+        skipped = 0
 
         body_elmt_list = self.document.xpath('/html/body/*')
 
@@ -229,6 +232,9 @@ class AccountsList(CragrBasePage):
         if table_layout:
             lines = self.document.xpath('id("operationsContent")//table[@class="tb"]/tr')
             for line in lines:
+                if skipped < start_offset:
+                    skipped += 1
+                    continue
                 operation = Operation(index)
                 index += 1
                 operation.date = self.extract_text(line[0])
@@ -237,6 +243,10 @@ class AccountsList(CragrBasePage):
                 yield operation
         elif (not alternate_layout):
             for body_elmt in interesting_divs:
+                if skipped < start_offset:
+                    if self.is_right_aligned_div(body_elmt):
+                        skipped += 1 
+                    continue
                 if (self.is_right_aligned_div(body_elmt)):
                     # this is the second line of an operation entry, displaying the amount
                     operation.amount = clean_amount(self.extract_text(body_elmt))
@@ -255,6 +265,9 @@ class AccountsList(CragrBasePage):
                         operation.label = u'Unknown'
         else:
             for i in range(0, len(interesting_divs)/3):
+                if skipped < start_offset:
+                    skipped += 1
+                    continue
                 operation = Operation(index)
                 index += 1
                 # amount