use LxmlHtmlParser as default parser

2010-04-16 14:06:28 +02:00 · 2010-04-16 14:06:28 +02:00 · 3703adb44e
commit 3703adb44e
parent 2d2b26b311
10 changed files with 130 additions and 45 deletions
--- a/weboob/tools/parser/__init__.py
+++ b/weboob/tools/parser/__init__.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-

 """
-Copyright(C) 2010  Christophe Benz
+Copyright(C) 2010  Christophe Benz, Romain Bignon

 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -18,17 +18,28 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

 """

-from .standardparser import StandardParser, tostring
-
+# Low performance
+# v
+# v
 try:
-    from .elementtidyparser import ElementTidyParser
+    from .elementtidyparser import ElementTidyParser, ElementTidyParser as StandardParser
 except ImportError:
    pass
+# v
 try:
-    from .html5libparser import Html5libParser
+    from .htmlparser import HTMLParser, HTMLParser as StandardParser
 except ImportError:
    pass
+# v
 try:
-    from .lxmlparser import LxmlHtmlParser
+    from .html5libparser import Html5libParser, Html5libParser as StandardParser
 except ImportError:
    pass
+# v
+try:
+    from .lxmlparser import LxmlHtmlParser, LxmlHtmlParser as StandardParser
+except ImportError:
+    pass
+# v
+# v
+# High performance
--- a/weboob/tools/parser/elementtidyparser.py
+++ b/weboob/tools/parser/elementtidyparser.py
@@ -32,7 +32,9 @@ try:
 except ImportError:
    from xml.etree import ElementTree

-class ElementTidyParser(object):
+from .iparser import IParser
+
+class ElementTidyParser(IParser):
    def parse(self, data, encoding=None):
        TidyHTMLTreeBuilder.ElementTree = ElementTree
        HTMLTreeBuilder = TidyHTMLTreeBuilder.TidyHTMLTreeBuilder
@@ -42,3 +44,20 @@ class ElementTidyParser(object):
            if elem.tag.startswith('{'):
                elem.tag = elem.tag[elem.tag.find('}')+1:]
        return tree
+
+    def dump(self, element):
+        e = ElementTree.Element('body')
+        e.text = element.text
+        e.tail = element.tail
+        for sub in element.getchildren():
+            e.append(sub)
+        s = ''
+        # XXX If serializing as utf-8 fails we fall back to ISO-8859-1, and the result may be ugly.
+        for encoding in ('utf-8', 'ISO-8859-1'):
+            try:
+                s = ElementTree.tostring(e, encoding)
+            except UnicodeError:
+                continue
+            else:
+                break
+        return unicode(s)
--- a/weboob/tools/parser/html5libparser.py
+++ b/weboob/tools/parser/html5libparser.py
@@ -24,7 +24,9 @@ try:
 except ImportError:
    from xml.etree import ElementTree

-class Html5libParser(HTMLParser):
+from .iparser import IParser
+
+class Html5libParser(HTMLParser, IParser):
    """
    Parser using html5lib.

@@ -42,3 +44,7 @@ class Html5libParser(HTMLParser):

    def parse(self, data, encoding):
        return HTMLParser.parse(self, data, encoding=encoding)
+
+    def dump(self, elem):
+        # TODO
+        raise NotImplementedError()
--- a/weboob/tools/parser/standardparser.py
+++ b/weboob/tools/parser/standardparser.py
@@ -18,18 +18,20 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

 """

-__all__ = ['StandardParser', 'tostring']
+__all__ = ['HTMLParser']

-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser as _HTMLParser
 import htmlentitydefs
 try:
    from xml.etree import cElementTree as ElementTree
 except ImportError:
    from xml.etree import ElementTree

-class HTMLTreeBuilder(HTMLParser):
+from .iparser import IParser
+
+class HTMLTreeBuilder(_HTMLParser):
    def __init__(self, encoding=None):
-        HTMLParser.__init__(self)
+        _HTMLParser.__init__(self)
        self._target = ElementTree.TreeBuilder()

    def doctype(self, name, pubid, system):
@@ -64,7 +66,7 @@ class HTMLTreeBuilder(HTMLParser):
        except:
            pass

-class StandardParser(object):
+class HTMLParser(IParser):
    def parse(self, data, encoding=None):
        parser = HTMLTreeBuilder(encoding)
        tree = ElementTree.parse(data, parser)
@@ -73,19 +75,19 @@ class StandardParser(object):
                elem.tag = elem.tag[elem.tag.find('}')+1:]
        return tree

-def tostring(element):
-    e = ElementTree.Element('body')
-    e.text = element.text
-    e.tail = element.tail
-    for sub in element.getchildren():
-        e.append(sub)
-    s = ''
-    # XXX OK if it doesn't work with utf-8, the result will be fucking ugly.
-    for encoding in ('utf-8', 'ISO-8859-1'):
-        try:
-            s = ElementTree.tostring(e, encoding)
-        except UnicodeError:
-            continue
-        else:
-            break
-    return unicode(s)
+    def dump(self, element):
+        e = ElementTree.Element('body')
+        e.text = element.text
+        e.tail = element.tail
+        for sub in element.getchildren():
+            e.append(sub)
+        s = ''
+        # XXX If serializing as utf-8 fails we fall back to ISO-8859-1, and the result may be ugly.
+        for encoding in ('utf-8', 'ISO-8859-1'):
+            try:
+                s = ElementTree.tostring(e, encoding)
+            except UnicodeError:
+                continue
+            else:
+                break
+        return unicode(s)
--- a/weboob/tools/parser/iparser.py
+++ b/weboob/tools/parser/iparser.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+
+"""
+Copyright(C) 2010  Romain Bignon
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+"""
+
+class IParser(object):
+    def dump(self, elem):
+        """
+        Get HTML string from an element.
+        """
+        raise NotImplementedError()
+
+    def parse(self, data, encoding=None):
+        """
+        Parse an HTML document with a specific encoding to get a tree.
+
+        @param data  [str] HTML document
+        @param encoding  [str] encoding to use
+        @return  an object with the structured document
+        """
+        raise NotImplementedError()
--- a/weboob/tools/parser/lxmlparser.py
+++ b/weboob/tools/parser/lxmlparser.py
@@ -19,8 +19,12 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """

 import lxml.html
+from .iparser import IParser

-class LxmlHtmlParser(object):
+class LxmlHtmlParser(IParser):
    def parse(self, data, encoding=None):
        parser = lxml.html.HTMLParser(encoding=encoding)
        return lxml.html.parse(data, parser)
+
+    def dump(self, element):
+        return lxml.html.tostring(element, encoding=unicode)