From fd3484e063841415b16fc5564cc4ed675654fb00 Mon Sep 17 00:00:00 2001 From: Laurent Bachelier Date: Wed, 15 Oct 2014 11:41:34 +0200 Subject: [PATCH] filters: Add Unicode normalization to CleanText With a default to 'NFC'. This is inspired by python-ftfy, which has a nice explanation of the modes: https://github.com/LuminosoInsight/python-ftfy I think K is too strong to be a default, however C has zero visual differences and should be the most common way to represent Unicode characters. --- weboob/browser/filters/standard.py | 34 ++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/weboob/browser/filters/standard.py b/weboob/browser/filters/standard.py index 73474d97..f84ac703 100644 --- a/weboob/browser/filters/standard.py +++ b/weboob/browser/filters/standard.py @@ -21,6 +21,7 @@ from __future__ import absolute_import import datetime import re +import unicodedata from decimal import Decimal, InvalidOperation from itertools import islice @@ -299,6 +300,10 @@ class CleanText(Filter): It first replaces all tabs and multiple spaces (including newlines if ``newlines`` is True) to one space and strips the result string. + + The result is coerced into unicode, and optionally normalized + according to the ``normalize`` argument. + Then it replaces all symbols given in the ``symbols`` argument. >>> CleanText().filter('coucou ') @@ -311,26 +316,27 @@ class CleanText(Filter): u'coucou\\ncoucou' """ - def __init__(self, selector=None, symbols='', replace=[], childs=True, newlines=True, **kwargs): + def __init__(self, selector=None, symbols='', replace=[], childs=True, newlines=True, normalize='NFC', **kwargs): super(CleanText, self).__init__(selector, **kwargs) self.symbols = symbols self.toreplace = replace self.childs = childs self.newlines = newlines + self.normalize = normalize @debug() def filter(self, txt): if isinstance(txt, (tuple, list)): txt = u' '.join([self.clean(item, childs=self.childs) for item in txt]) - txt = self.clean(txt, childs=self.childs, newlines=self.newlines) + txt = self.clean(txt, self.childs, self.newlines, self.normalize) txt = self.remove(txt, self.symbols) txt = self.replace(txt, self.toreplace) - # lxml under Python 2 returns str instead of unicode if it is pure ASCII + # ensure it didn't become str by mistake return unicode(txt) @classmethod - def clean(cls, txt, childs=True, newlines=True): + def clean(cls, txt, childs=True, newlines=True, normalize='NFC'): if not isinstance(txt, basestring): if childs: txt = [t.strip() for t in txt.itertext()] @@ -342,7 +348,13 @@ class CleanText(Filter): else: # normalize newlines and clean what is inside txt = '\n'.join([cls.clean(l) for l in txt.splitlines()]) - return txt.strip() + txt = txt.strip() + # lxml under Python 2 returns str instead of unicode if it is pure ASCII + txt = unicode(txt) + # normalize to a standard Unicode form + if normalize: + txt = unicodedata.normalize(normalize, txt) + return txt @classmethod def remove(cls, txt, symbols): @@ -672,7 +684,17 @@ class Join(Filter): return res -def test(): +def test_CleanText(): # This test works poorly under a doctest, or would be hard to read assert CleanText().filter(u' coucou  \n\théhé') == u'coucou héhé' assert CleanText().filter('coucou\xa0coucou') == CleanText().filter(u'coucou\xa0coucou') == u'coucou coucou' + + # Unicode normalization + assert CleanText().filter(u'Éçã') == u'Éçã' + assert CleanText(normalize='NFKC').filter(u'…') == u'...' + assert CleanText().filter(u'…') == u'…' + # Diacritical mark (dakuten) + assert CleanText().filter(u'\u3053\u3099') == u'\u3054' + assert CleanText(normalize='NFD').filter(u'\u3053\u3099') == u'\u3053\u3099' + assert CleanText(normalize='NFD').filter(u'\u3054') == u'\u3053\u3099' + assert CleanText(normalize=False).filter(u'\u3053\u3099') == u'\u3053\u3099'