From afb3bc412fb5fadf6010dbf107c016b9c95ee152 Mon Sep 17 00:00:00 2001 From: Laurent Bachelier Date: Fri, 21 Mar 2014 18:48:14 +0100 Subject: [PATCH] Remove unused/obsolete parsers lxmlsoup is currently used by one module. --- weboob/tools/parsers/__init__.py | 16 ---- weboob/tools/parsers/elementtidyparser.py | 67 ---------------- weboob/tools/parsers/html5libparser.py | 53 ------------- weboob/tools/parsers/htmlparser.py | 96 ----------------------- 4 files changed, 232 deletions(-) delete mode 100644 weboob/tools/parsers/elementtidyparser.py delete mode 100644 weboob/tools/parsers/html5libparser.py delete mode 100644 weboob/tools/parsers/htmlparser.py diff --git a/weboob/tools/parsers/__init__.py b/weboob/tools/parsers/__init__.py index 96ba3976..a78f224e 100644 --- a/weboob/tools/parsers/__init__.py +++ b/weboob/tools/parsers/__init__.py @@ -43,21 +43,6 @@ def load_xml(): return LxmlXmlParser -def load_html5lib(): - from .html5libparser import Html5libParser - return Html5libParser - - -def load_elementtidy(): - from .elementtidyparser import ElementTidyParser - return ElementTidyParser - - -def load_builtin(): - from .htmlparser import HTMLParser - return HTMLParser - - def load_json(): # This parser doesn't read HTML, don't include it in the # preference_order default value below. @@ -82,7 +67,6 @@ def load_raw(): def get_parser(preference_order=('lxml', 'lxmlsoup')): """ Get a parser from a preference order list. - This allows Weboob to run on systems without lxml, which is the default parser. Return a parser implementing IParser. """ if not isinstance(preference_order, (tuple, list)): diff --git a/weboob/tools/parsers/elementtidyparser.py b/weboob/tools/parsers/elementtidyparser.py deleted file mode 100644 index 162ae237..00000000 --- a/weboob/tools/parsers/elementtidyparser.py +++ /dev/null @@ -1,67 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright(C) 2010-2011 Romain Bignon -# -# This file is part of weboob. -# -# weboob is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# weboob is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with weboob. If not, see . - -# XXX Currently, elementtidy segfaults when there are no error, because of -# the behavior of libtidy. -# A patch has been sent to Debian: -# http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=576343 -# -# As it is not integrated in Debian yet, and as this problem persists on other -# systems, using elementtidy is for now to avoid. - - -from elementtidy import TidyHTMLTreeBuilder -try: - from xml.etree import cElementTree as ElementTree -except ImportError: - from xml.etree import ElementTree - -from .iparser import IParser - - -__all__ = ['ElementTidyParser'] - - -class ElementTidyParser(IParser): - def parse(self, data, encoding=None): - TidyHTMLTreeBuilder.ElementTree = ElementTree - HTMLTreeBuilder = TidyHTMLTreeBuilder.TidyHTMLTreeBuilder - parser = HTMLTreeBuilder(encoding) - tree = ElementTree.parse(data, parser) - for elem in tree.getiterator(): - if elem.tag.startswith('{'): - elem.tag = elem.tag[elem.tag.find('}')+1:] - return tree - - def tostring(self, element): - e = ElementTree.Element('body') - e.text = element.text - e.tail = element.tail - for sub in element.getchildren(): - e.append(sub) - s = '' - # XXX OK if it doesn't work with utf-8, the result will be fucking ugly. - for encoding in ('utf-8', 'ISO-8859-1'): - try: - s = ElementTree.tostring(e, encoding) - except UnicodeError: - continue - else: - break - return unicode(s) diff --git a/weboob/tools/parsers/html5libparser.py b/weboob/tools/parsers/html5libparser.py deleted file mode 100644 index ec9e3fe6..00000000 --- a/weboob/tools/parsers/html5libparser.py +++ /dev/null @@ -1,53 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright(C) 2010-2011 Romain Bignon, Christophe Benz -# -# This file is part of weboob. -# -# weboob is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# weboob is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with weboob. If not, see . - - -from html5lib import treebuilders, HTMLParser -try: - from xml.etree import cElementTree as ElementTree -except ImportError: - from xml.etree import ElementTree - -from .iparser import IParser - - -__all__ = ['Html5libParser'] - - -class Html5libParser(HTMLParser, IParser): - """ - Parser using html5lib. - - Note that it is not available on every systems. - """ - - # Default implementation for each type of API. - defaults = {'etree': ElementTree} - - def __init__(self, api='etree'): - # if no default implementation is defined for this api, set it to None - # to let getTreeBuilder() using the corresponding implementation. - implementation = self.defaults.get(api, None) - HTMLParser.__init__(self, tree=treebuilders.getTreeBuilder(api, implementation)) - - def parse(self, data, encoding): - return HTMLParser.parse(self, data, encoding=encoding) - - def tostring(self, element): - return element.toxml() diff --git a/weboob/tools/parsers/htmlparser.py b/weboob/tools/parsers/htmlparser.py deleted file mode 100644 index 4983050e..00000000 --- a/weboob/tools/parsers/htmlparser.py +++ /dev/null @@ -1,96 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright(C) 2010-2011 Romain Bignon -# -# This file is part of weboob. -# -# weboob is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# weboob is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with weboob. If not, see . - - -from HTMLParser import HTMLParser as _HTMLParser -import htmlentitydefs -try: - from xml.etree import cElementTree as ElementTree -except ImportError: - from xml.etree import ElementTree - -from .iparser import IParser - - -__all__ = ['HTMLParser'] - - -class HTMLTreeBuilder(_HTMLParser): - def __init__(self, encoding=None): - _HTMLParser.__init__(self) - self._target = ElementTree.TreeBuilder() - - def doctype(self, name, pubid, system): - pass - - def close(self): - tree = self._target.close() - return tree - - def handle_starttag(self, tag, attrs): - self._target.start(tag, dict(attrs)) - - def handle_startendtag(self, tag, attrs): - self._target.start(tag, dict(attrs)) - self._target.end(tag) - - def handle_charref(self, name): - self._target.data(unichr(int(name))) - - def handle_entityref(self, name): - try: - self._target.data(unichr(htmlentitydefs.name2codepoint[name])) - except KeyError: - self._target.data('&' + name) - - def handle_data(self, data): - self._target.data(data) - - def handle_endtag(self, tag): - try: - self._target.end(tag) - except: - pass - - -class HTMLParser(IParser): - def parse(self, data, encoding=None): - parser = HTMLTreeBuilder(encoding) - tree = ElementTree.parse(data, parser) - for elem in tree.getiterator(): - if elem.tag.startswith('{'): - elem.tag = elem.tag[elem.tag.find('}')+1:] - return tree - - def tostring(self, element): - e = ElementTree.Element('body') - e.text = element.text - e.tail = element.tail - for sub in element.getchildren(): - e.append(sub) - s = '' - # XXX OK if it doesn't work with utf-8, the result will be fucking ugly. - for encoding in ('utf-8', 'ISO-8859-1'): - try: - s = ElementTree.tostring(e, encoding) - except UnicodeError: - continue - else: - break - return unicode(s)