diff --git a/weboob/tools/parsers/__init__.py b/weboob/tools/parsers/__init__.py
index 86db1006..7aa712c3 100644
--- a/weboob/tools/parsers/__init__.py
+++ b/weboob/tools/parsers/__init__.py
@@ -30,6 +30,10 @@ def load_lxml():
     from .lxmlparser import LxmlHtmlParser
     return LxmlHtmlParser
 
+def load_lxmlsoup():
+    from .lxmlsoupparser import LxmlSoupParser
+    return LxmlSoupParser
+
 def load_html5lib():
     from .html5libparser import Html5libParser
     return Html5libParser
@@ -42,7 +46,7 @@ def load_builtin():
     from .htmlparser import HTMLParser
     return HTMLParser
 
-def get_parser(preference_order=('lxml', 'html5lib', 'elementtidy', 'builtin')):
+def get_parser(preference_order=('lxml', 'lxmlsoup', 'html5lib', 'elementtidy', 'builtin')):
     """
     Get a parser from a preference order list.
     This allows Weboob to run on systems without lxml, which is the default parser.
diff --git a/weboob/tools/parsers/lxmlsoupparser.py b/weboob/tools/parsers/lxmlsoupparser.py
new file mode 100644
index 00000000..0df80e38
--- /dev/null
+++ b/weboob/tools/parsers/lxmlsoupparser.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010 Christophe Benz
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+import lxml.html
+import lxml.html.soupparser
+
+from .iparser import IParser
+
+
+__all__ = ['LxmlSoupParser']
+
+
+class LxmlSoupParser(IParser):
+    """
+    Parser using lxml elementsoup.
+
+    Note that it is not available on every system.
+    """
+
+    def parse(self, data, encoding=None):
+        """
+        Parse data with lxml.html.soupparser.parse and return its result.
+
+        The encoding argument is accepted but is not forwarded to the
+        underlying parser.
+        """
+        return lxml.html.soupparser.parse(data)
+
+    def tostring(self, element):
+        """
+        Serialize element with lxml.html.tostring, passing encoding=unicode.
+        """
+        return lxml.html.tostring(element, encoding=unicode)