From da612c58c4736dcd5cadd9560ba86b2780018d00 Mon Sep 17 00:00:00 2001 From: Christophe Benz Date: Tue, 13 Apr 2010 15:58:19 +0200 Subject: [PATCH] provide many parsers --- weboob/tools/parser/__init__.py | 23 ++++++ weboob/tools/parser/elementtidyparser.py | 44 ++++++++++++ weboob/tools/parser/lxmlparser.py | 26 +++++++ weboob/tools/parser/standardparser.py | 91 ++++++++++++++++++++++++ 4 files changed, 184 insertions(+) create mode 100644 weboob/tools/parser/__init__.py create mode 100644 weboob/tools/parser/elementtidyparser.py create mode 100644 weboob/tools/parser/lxmlparser.py create mode 100644 weboob/tools/parser/standardparser.py diff --git a/weboob/tools/parser/__init__.py b/weboob/tools/parser/__init__.py new file mode 100644 index 00000000..0c1006bc --- /dev/null +++ b/weboob/tools/parser/__init__.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- + +""" +Copyright(C) 2010 Christophe Benz + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ +""" + +from .standardparser import StandardParser +from .elementtidyparser import ElementTidyParser +from .lxmlparser import LxmlHtmlParser diff --git a/weboob/tools/parser/elementtidyparser.py b/weboob/tools/parser/elementtidyparser.py new file mode 100644 index 00000000..1ecd1f22 --- /dev/null +++ b/weboob/tools/parser/elementtidyparser.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +""" +Copyright(C) 2010 Romain Bignon + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +""" + +# XXX Currently, elementtidy segfaults when there are no errors, because of +# the behavior of libtidy. +# A patch has been sent to Debian: +# http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=576343 +# +# As it is not integrated in Debian yet, and as this problem persists on other +# systems, elementtidy should be avoided for now. 
class ElementTidyParser(object):
    """HTML parser backed by elementtidy's ``TidyHTMLTreeBuilder``."""

    def parse(self, data, encoding=None):
        """Build an element tree from ``data`` and return it.

        ``data`` is handed to ``ElementTree.parse`` as the source and
        ``encoding`` is forwarded to the tidy tree builder.  Every tag that
        starts with a ``{...}`` qualifier has that qualifier removed before
        the tree is returned.
        """
        # Point elementtidy at the ElementTree implementation imported by
        # this module before any builder is created.
        TidyHTMLTreeBuilder.ElementTree = ElementTree
        builder = TidyHTMLTreeBuilder.TidyHTMLTreeBuilder(encoding)
        document = ElementTree.parse(data, builder)
        for node in document.getiterator():
            name = node.tag
            if not name.startswith('{'):
                continue
            # Keep only what follows the first closing brace.
            node.tag = name[name.find('}') + 1:]
        return document
class LxmlHtmlParser(object):
    """HTML parser backed by ``lxml.html``."""

    def parse(self, data, encoding=None):
        """Parse ``data`` with lxml and return what ``lxml.html.parse`` yields.

        ``encoding`` is given to the ``lxml.html.HTMLParser`` instance that
        performs the parsing.
        """
        html_parser = lxml.html.HTMLParser(encoding=encoding)
        document = lxml.html.parse(data, html_parser)
        return document
__all__ = ['StandardParser', 'tostring']

import codecs
try:
    from HTMLParser import HTMLParser
    import htmlentitydefs
except ImportError:
    # Python 3 moved both modules into the ``html`` package.
    from html.parser import HTMLParser
    from html import entities as htmlentitydefs
try:
    from xml.etree import cElementTree as ElementTree
except ImportError:
    from xml.etree import ElementTree

try:
    _unichr = unichr
except NameError:
    # Python 3: chr() already returns a text character.
    _unichr = chr


class HTMLTreeBuilder(HTMLParser):
    """Feed HTML parser events into an ``ElementTree.TreeBuilder``.

    :param encoding: optional codec name.  When given (and known to
        ``codecs``), byte chunks passed to :meth:`feed` are decoded with it
        before being parsed; text chunks and chunks fed without an encoding
        are passed through unchanged.
    """

    def __init__(self, encoding=None):
        try:
            # Keep character/entity references routed through the
            # handle_charref/handle_entityref methods below on Python 3.
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Older HTMLParser versions do not accept that keyword.
            HTMLParser.__init__(self)
        self._target = ElementTree.TreeBuilder()
        self._input_decoder = None
        if encoding:
            try:
                # Incremental so multi-byte sequences split across two
                # feed() calls are decoded correctly; 'replace' so bad
                # bytes never abort a best-effort HTML parse.
                factory = codecs.getincrementaldecoder(encoding)
                self._input_decoder = factory('replace')
            except LookupError:
                # Unknown codec name: fall back to feeding data untouched.
                self._input_decoder = None

    def feed(self, data):
        """Decode ``data`` if needed, then hand it to the HTML parser."""
        if self._input_decoder is not None and isinstance(data, bytes):
            data = self._input_decoder.decode(data)
        HTMLParser.feed(self, data)

    def doctype(self, name, pubid, system):
        """Ignore doctype notifications."""
        pass

    def close(self):
        """Flush any pending decoded input and return the built root."""
        if self._input_decoder is not None:
            leftover = self._input_decoder.decode(b'', True)
            if leftover:
                HTMLParser.feed(self, leftover)
        tree = self._target.close()
        return tree

    def handle_starttag(self, tag, attrs):
        self._target.start(tag, dict(attrs))

    def handle_startendtag(self, tag, attrs):
        # Self-closing tag: open and immediately close the element.
        self._target.start(tag, dict(attrs))
        self._target.end(tag)

    def handle_charref(self, name):
        """Emit the character for ``&#NNN;`` or ``&#xHHH;``.

        ``name`` is the reference without ``&#`` and ``;``; hexadecimal
        references arrive with a leading ``x``/``X``.  References that do
        not map to a character are emitted literally as ``'&#' + name``.
        """
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = _unichr(codepoint)
        except (ValueError, OverflowError):
            text = '&#' + name
        self._target.data(text)

    def handle_entityref(self, name):
        """Emit the character for ``&name;``, or ``'&' + name`` if unknown."""
        try:
            self._target.data(_unichr(htmlentitydefs.name2codepoint[name]))
        except KeyError:
            self._target.data('&' + name)

    def handle_data(self, data):
        self._target.data(data)

    def handle_endtag(self, tag):
        """Close ``tag``, tolerating stray or mismatched end tags."""
        try:
            self._target.end(tag)
        except (AssertionError, IndexError):
            # The tree builder signals a mismatched end tag with an
            # assertion and an end tag with nothing open with IndexError.
            # Broken HTML is expected here, so both are ignored.
            pass


class StandardParser(object):
    """HTML parser that relies only on the standard library."""

    def parse(self, data, encoding=None):
        """Parse ``data`` and return the resulting ``ElementTree``.

        ``data`` is passed to ``ElementTree.parse`` as the source and
        ``encoding`` to :class:`HTMLTreeBuilder`.  Every tag that starts
        with a ``{...}`` qualifier has that qualifier removed.
        """
        parser = HTMLTreeBuilder(encoding)
        tree = ElementTree.parse(data, parser)
        # getiterator() is gone from recent ElementTree versions, iter() is
        # missing from the oldest ones: use whichever exists.
        walk = getattr(tree, 'iter', None) or tree.getiterator
        for elem in walk():
            tag = elem.tag
            if tag.startswith('{'):
                elem.tag = tag[tag.find('}') + 1:]
        return tree


def tostring(element):
    """Serialize the content of ``element`` wrapped in a ``<body>`` element.

    The text, tail and children of ``element`` are attached to a fresh
    ``body`` element which is serialized as UTF-8, then as ISO-8859-1 if
    that raises ``UnicodeError``.  The bytes are decoded with the encoding
    that produced them, so the result is always text; an empty string is
    returned when both attempts fail.
    """
    e = ElementTree.Element('body')
    e.text = element.text
    e.tail = element.tail
    for sub in element:
        e.append(sub)
    for encoding in ('utf-8', 'ISO-8859-1'):
        try:
            return ElementTree.tostring(e, encoding).decode(encoding)
        except UnicodeError:
            continue
    return u''