From a0559e539e0780cab4f15a1330a1208a3edb0b94 Mon Sep 17 00:00:00 2001
From: Laurent Bachelier <laurent@bachelier.name>
Date: Thu, 31 Jul 2014 13:47:29 +0200
Subject: [PATCH] CleanText improvements

* \t is always in \s so no need to add it
* handle the non-breaking space thanks to the re.UNICODE flag
* add an option to keep (but normalize) newlines
* more tests
---
 weboob/tools/browser2/filters/standard.py | 32 +++++++++++++++++------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/weboob/tools/browser2/filters/standard.py b/weboob/tools/browser2/filters/standard.py
index 7f93102a..f4fcc44e 100644
--- a/weboob/tools/browser2/filters/standard.py
+++ b/weboob/tools/browser2/filters/standard.py
@@ -218,41 +218,51 @@ class CleanText(Filter):
     """
     Get a cleaned text from an element.
 
-    It first replaces all tabs and multiple spaces to one space and strip the result
-    string.
-    Second, it replaces all symbols given in second argument.
+    It first replaces every run of whitespace, tabs included
+    (and newlines too if ``newlines`` is True; else they are normalized),
+    with a single space and strips the resulting string.
+    Then it replaces all symbols given in the ``symbols`` argument.
 
     >>> CleanText().filter('coucou ')
     u'coucou'
-    >>> CleanText().filter(u'coucou\xc2\xa0\t\\ncoucou')
+    >>> CleanText().filter(u'coucou\xa0coucou')
     u'coucou coucou'
+    >>> CleanText(newlines=True).filter(u'coucou\\r\\n coucou ')
+    u'coucou coucou'
+    >>> CleanText(newlines=False).filter(u'coucou\\r\\n coucou ')
+    u'coucou\\ncoucou'
     """
 
-    def __init__(self, selector=None, symbols='', replace=[], childs=True, **kwargs):
+    def __init__(self, selector=None, symbols='', replace=[], childs=True, newlines=True, **kwargs):
         super(CleanText, self).__init__(selector, **kwargs)
         self.symbols = symbols
         self.toreplace = replace
         self.childs = childs
+        self.newlines = newlines
 
     def filter(self, txt):
         if isinstance(txt, (tuple, list)):
             txt = u' '.join([self.clean(item, childs=self.childs) for item in txt])
 
-        txt = self.clean(txt, childs=self.childs)
+        txt = self.clean(txt, childs=self.childs, newlines=self.newlines)
         txt = self.remove(txt, self.symbols)
         txt = self.replace(txt, self.toreplace)
         # lxml under Python 2 returns str instead of unicode if it is pure ASCII
         return unicode(txt)
 
     @classmethod
-    def clean(cls, txt, childs=True):
+    def clean(cls, txt, childs=True, newlines=True):
         if not isinstance(txt, basestring):
             if childs:
                 txt = [t.strip() for t in txt.itertext()]
             else:
                 txt = [txt.text.strip()]
             txt = u' '.join(txt)                 # 'foo   bar'
-        txt = re.sub(u'[\\s\xa0\t]+', u' ', txt)   # 'foo bar'
+        if newlines:
+            txt = re.sub(u'\s+', u' ', txt, flags=re.UNICODE)   # 'foo bar'
+        else:
+            # normalize newlines and clean what is inside
+            txt = '\n'.join([cls.clean(l) for l in txt.splitlines()])
         return txt.strip()
 
     @classmethod
@@ -482,3 +492,9 @@ class Join(Filter):
             res += self.pattern % self.textCleaner.clean(li)
 
         return res
+
+
+def test():
+    # These checks work poorly as doctests, or would be hard to read there
+    assert CleanText().filter(u' coucou  \n\théhé') == u'coucou héhé'
+    assert CleanText().filter('coucou\xa0coucou') == CleanText().filter(u'coucou\xa0coucou') == u'coucou coucou'