diff --git a/weboob/tools/parsers/__init__.py b/weboob/tools/parsers/__init__.py
index 5c47fe93..96ba3976 100644
--- a/weboob/tools/parsers/__init__.py
+++ b/weboob/tools/parsers/__init__.py
@@ -38,6 +38,11 @@ def load_lxmlsoup():
     return LxmlSoupParser
 
 
+def load_xml():
+    from .lxmlparser import LxmlXmlParser
+    return LxmlXmlParser
+
+
 def load_html5lib():
     from .html5libparser import Html5libParser
     return Html5libParser
@@ -82,6 +87,7 @@ def get_parser(preference_order=('lxml', 'lxmlsoup')):
     """
     if not isinstance(preference_order, (tuple, list)):
         preference_order = [preference_order]
+
     for kind in preference_order:
         if not 'load_%s' % kind in globals():
             continue
diff --git a/weboob/tools/parsers/lxmlparser.py b/weboob/tools/parsers/lxmlparser.py
index 58856cbf..9e0e54fa 100644
--- a/weboob/tools/parsers/lxmlparser.py
+++ b/weboob/tools/parsers/lxmlparser.py
@@ -19,31 +19,40 @@
 
 
 import re
-import lxml.html
+import lxml.html as html
+import lxml.etree as etree
 
 from .iparser import IParser
 from ..browser import BrokenPageError
 
 
-__all__ = ['LxmlHtmlParser']
+__all__ = ['LxmlHtmlParser', 'LxmlXmlParser']
 
 
-class LxmlHtmlParser(IParser):
+class LxmlParser(IParser):
     """
     Parser using lxml.
 
     Note that it is not available on every systems.
     """
 
+    def get_parser(self, encoding=None):
+        """
+        Build the lxml parser object used by parse() for this encoding.
+
+        Subclasses must override this and set self.module.
+        """
+        raise NotImplementedError()
+
     def parse(self, data, encoding=None):
         if encoding is None:
             parser = None
         else:
-            parser = lxml.html.HTMLParser(encoding=encoding)
-        return lxml.html.parse(data, parser)
+            parser = self.get_parser(encoding=encoding)
+        return self.module.parse(data, parser)
 
     def tostring(self, element):
-        return lxml.html.tostring(element, encoding=unicode)
+        return self.module.tostring(element, encoding=unicode)
 
     def tocleanstring(self, element):
         txt = [txt.strip() for txt in element.itertext()]
@@ -52,7 +61,7 @@ class LxmlHtmlParser(IParser):
         return txt.strip()
 
     def strip(self, s):
-        doc = lxml.html.fromstring(s) # parse html string
+        doc = self.module.fromstring(s) # parse html/xml string
         return self.tocleanstring(doc)
 
     @classmethod
@@ -97,3 +106,29 @@ class LxmlHtmlParser(IParser):
             return results[0] if nb == 1 else results
         else:
             raise Exception('Unhandled value for kwarg "nb": %s' % nb)
+
+
+class LxmlHtmlParser(LxmlParser):
+    """
+    HTML parser using lxml (backed by the lxml.html module).
+
+    Note that it is not available on every system.
+    """
+    def __init__(self, *args, **kwargs):
+        self.module = html
+
+    def get_parser(self, encoding=None):
+        return html.HTMLParser(encoding=encoding)
+
+
+class LxmlXmlParser(LxmlParser):
+    """
+    XML parser using lxml (backed by the lxml.etree module).
+
+    Note that it is not available on every system.
+    """
+    def __init__(self, *args, **kwargs):
+        self.module = etree
+
+    def get_parser(self, encoding=None):
+        return etree.XMLParser(encoding=encoding, strip_cdata=False)