From 3ebdea9faf4980d71dbffab71e2a2d2009af9523 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Wed, 14 Apr 2010 09:49:10 +0200 Subject: [PATCH] several fixes --- weboob/backends/bnporc/browser.py | 8 +-- weboob/backends/dlfp/browser.py | 2 +- weboob/tools/parser.py | 106 ------------------------------ weboob/tools/parser/__init__.py | 2 +- 4 files changed, 6 insertions(+), 112 deletions(-) delete mode 100644 weboob/tools/parser.py diff --git a/weboob/backends/bnporc/browser.py b/weboob/backends/bnporc/browser.py index f0e8db09..3e299dd0 100644 --- a/weboob/backends/bnporc/browser.py +++ b/weboob/backends/bnporc/browser.py @@ -21,16 +21,16 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. from cStringIO import StringIO from weboob.tools.browser import Browser, BrowserIncorrectPassword -from weboob.tools.parser import StandardParser +from weboob.tools.parser import ElementTidyParser from weboob.backends.bnporc import pages # Parser -class BNParser(StandardParser): +class BNParser(ElementTidyParser): def parse(self, data, encoding): s = data.read() s = s.replace('', '') data = StringIO(s) - return StandardParser.parse(self, data, encoding) + return ElementTidyParser.parse(self, data, encoding) # Browser class BNPorc(Browser): @@ -49,7 +49,7 @@ class BNPorc(Browser): is_logging = False def __init__(self, *args, **kwargs): - kwargs['parser'] = BNParser + kwargs['parser'] = BNParser() Browser.__init__(self, *args, **kwargs) def home(self): diff --git a/weboob/backends/dlfp/browser.py b/weboob/backends/dlfp/browser.py index c69c6225..923913a4 100644 --- a/weboob/backends/dlfp/browser.py +++ b/weboob/backends/dlfp/browser.py @@ -48,7 +48,7 @@ class DLFP(Browser): } def __init__(self, *args, **kwargs): - kwargs['parser'] = DLFParser + kwargs['parser'] = DLFParser() Browser.__init__(self, *args, **kwargs) def home(self): diff --git a/weboob/tools/parser.py b/weboob/tools/parser.py deleted file mode 100644 index cf35d4c8..00000000 --- a/weboob/tools/parser.py +++ /dev/null @@ -1,106 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Copyright(C) 2010 Romain Bignon - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, version 3 of the License. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -""" - -try: - from xml.etree import cElementTree as ElementTree -except ImportError: - from xml.etree import ElementTree - -try: - # XXX Currently, elementtidy segfaults when there are no error, because of - # the behavior of libtidy. - # A patch has been sent to Debian: - # http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=576343 - # - # As it is not integrated in Debian yet, and as this problem persists on other - # systems, using elementtidy is for now disabled. - raise ImportError - - from elementtidy import TidyHTMLTreeBuilder - TidyHTMLTreeBuilder.ElementTree = ElementTree # force cElementTree if using it. - HTMLTreeBuilder = TidyHTMLTreeBuilder.TidyHTMLTreeBuilder -except ImportError: - from HTMLParser import HTMLParser - import htmlentitydefs - - class HTMLTreeBuilder(HTMLParser): - def __init__(self, encoding=None): - HTMLParser.__init__(self) - self._target = ElementTree.TreeBuilder() - - def doctype(self, name, pubid, system): - pass - - def close(self): - tree = self._target.close() - return tree - - def handle_starttag(self, tag, attrs): - self._target.start(tag, dict(attrs)) - - def handle_startendtag(self, tag, attrs): - self._target.start(tag, dict(attrs)) - self._target.end(tag) - - def handle_charref(self, name): - self._target.data(unichr(int(name))) - - def handle_entityref(self, name): - try: - self._target.data(unichr(htmlentitydefs.name2codepoint[name])) - except KeyError: - self._target.data('&' + name) - - def handle_data(self, data): - self._target.data(data) - - def handle_endtag(self, tag): - try: - self._target.end(tag) - except: - pass - -class StandardParser(object): - def parse(self, data, encoding=None): - parser = HTMLTreeBuilder(encoding) - tree = ElementTree.parse(data, parser) - - for elem in tree.getiterator(): - if elem.tag.startswith('{'): - elem.tag = elem.tag[elem.tag.find('}')+1:] - return tree - -def tostring(element): - e = ElementTree.Element('body') - e.text = element.text - e.tail = element.tail - for sub in element.getchildren(): - e.append(sub) - - s = '' - # XXX OK if it doesn't work with utf-8, the result will be fucking ugly. - for encoding in ('utf-8', 'ISO-8859-1'): - try: - s = ElementTree.tostring(e, encoding) - except UnicodeError: - continue - else: - break - return unicode(s) diff --git a/weboob/tools/parser/__init__.py b/weboob/tools/parser/__init__.py index ee44b4b4..f30cfb22 100644 --- a/weboob/tools/parser/__init__.py +++ b/weboob/tools/parser/__init__.py @@ -21,4 +21,4 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. from .elementtidyparser import ElementTidyParser from .html5libparser import Html5libParser from .lxmlparser import LxmlHtmlParser -from .standardparser import StandardParser +from .standardparser import StandardParser, tostring