rename parser module to parsers, add get_parser() with preference_order

2010-04-16 18:00:44 +02:00 · 2010-04-16 18:00:44 +02:00 · 8638024756
commit 8638024756
parent 54cc3b0a4a
13 changed files with 104 additions and 70 deletions
--- a/weboob/backends/aum/browser.py
+++ b/weboob/backends/aum/browser.py
@ -22,7 +22,7 @@ import time
 from logging import warning

 from weboob.tools.browser import BaseBrowser
-from weboob.tools.parser import Html5libParser
+from weboob.tools.parsers.html5libparser import Html5libParser

 from weboob.backends.aum.exceptions import AdopteWait

--- a/weboob/backends/bnporc/browser.py
+++ b/weboob/backends/bnporc/browser.py
@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 from cStringIO import StringIO

 from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword
-from weboob.tools.parser import ElementTidyParser
+from weboob.tools.parsers.elementtidyparser import ElementTidyParser
 from weboob.backends.bnporc import pages

 # Parser
--- a/weboob/backends/dlfp/browser.py
+++ b/weboob/backends/dlfp/browser.py
@ -26,15 +26,15 @@ from .pages.index import IndexPage, LoginPage
 from .pages.news import ContentPage
 from .tools import id2url, id2threadid, id2contenttype

-from weboob.tools.parser import StandardParser
+from weboob.tools.parsers.htmlparser import HTMLParser

 # Parser
-class DLFParser(StandardParser):
+class DLFParser(HTMLParser):
    def parse(self, data, encoding):
        s = data.read()
        s = s.replace('<<', '<')  # collapse doubled '<' before parsing; presumably a workaround for malformed markup (TODO confirm)
        data = StringIO(s)  # re-wrap the patched text as a file-like object for the parent parser
-        return StandardParser.parse(self, data, encoding)
+        return HTMLParser.parse(self, data, encoding)

 # Browser
 class DLFP(BaseBrowser):
--- a/weboob/backends/youjizz/browser.py
+++ b/weboob/backends/youjizz/browser.py
@ -22,15 +22,10 @@ from logging import error
 import re

 from weboob.tools.browser import BaseBrowser
-from weboob.tools.parser import LxmlHtmlParser

 class YoujizzBrowser(BaseBrowser):
    video_file_regex = re.compile(r'"(http://media[^ ,]+\.flv)"')

-    def __init__(self, *args, **kwargs):
-        kwargs['parser'] = LxmlHtmlParser()
-        Browser.__init__(self, *args, **kwargs)
-
    def iter_page_urls(self, mozaic_url):
        raise NotImplementedError()

--- a/weboob/backends/youtube/browser.py
+++ b/weboob/backends/youtube/browser.py
@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 import re

 from weboob.tools.browser import BaseBrowser
-from weboob.tools.parser import LxmlHtmlParser
+from weboob.tools.parsers.lxmlhtmlparser import LxmlHtmlParser

 from .pages import VideoPage

--- a/weboob/tools/browser.py
+++ b/weboob/tools/browser.py
@ -26,7 +26,7 @@ import time
 from logging import warning, error, debug
 from copy import copy

-from weboob.tools.parser import StandardParser
+from weboob.tools.parsers import get_parser

 # Try to load cookies
 try:
@ -110,7 +110,7 @@ class BaseBrowser(mechanize.Browser):

    # ------ Browser methods ---------------------------------------

-    def __init__(self, username=None, password=None, firefox_cookies=None, parser=StandardParser(), history=NoHistory()):
+    def __init__(self, username=None, password=None, firefox_cookies=None, parser=get_parser(), history=NoHistory()):
        mechanize.Browser.__init__(self, history=history)
        self.addheaders = [
                ['User-agent', self.USER_AGENT]
--- a/weboob/tools/parser/init.py
+++ b/weboob/tools/parser/init.py
@ -1,45 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
-Copyright(C) 2010  Christophe Benz, Romain Bignon
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, version 3 of the License.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
-"""
-
-# Low performances
-# v
-# v
-try:
-    from .elementtidyparser import ElementTidyParser, ElementTidyParser as StandardParser
-except ImportError:
-    pass
-# v
-try:
-    from .htmlparser import HTMLParser, HTMLParser as StandardParser
-except ImportError:
-    pass
-# v
-try:
-    from .html5libparser import Html5libParser, Html5libParser as StandardParser
-except ImportError:
-    pass
-# v
-try:
-    from .lxmlparser import LxmlHtmlParser, LxmlHtmlParser as StandardParser
-except ImportError:
-    pass
-# v
-# v
-# High performances
--- a/weboob/tools/parsers/init.py
+++ b/weboob/tools/parsers/init.py
@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+
+"""
+Copyright(C) 2010  Christophe Benz, Romain Bignon
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, version 3 of the License.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+"""
+
+import logging
+
+
+__all__ = ['get_parser', 'NoParserFound']
+
+
+class NoParserFound(Exception): pass  # raised by get_parser() when no requested parser can be imported
+
+
+def get_parser(preference_order=['lxml', 'html5lib', 'elementtidy', 'builtin'], *args, **kwargs):
+    """
+    Return the first parser named in preference_order that can be imported.
+    lxml is tried first by default; the fallbacks let Weboob run on systems without it.
+    The result implements IParser; NoParserFound is raised when no parser is available.
+    """
+    if not isinstance(preference_order, (tuple, list)):
+        preference_order = [preference_order]  # wrap a single value (e.g. one parser name) in a list
+    for kind in preference_order:  # names other than the four handled below are skipped silently
+        if kind == 'lxml':
+            try:
+                from .lxmlparser import LxmlHtmlParser
+                return LxmlHtmlParser()
+            except ImportError:
+                logging.debug('%s is not installed.' % kind)
+        elif kind == 'html5lib':
+            try:
+                from .html5libparser import Html5libParser
+                return Html5libParser(*args, **kwargs)  # the only parser that receives *args/**kwargs
+            except ImportError:
+                logging.debug('%s is not installed.' % kind)
+        elif kind == 'elementtidy':
+            try:
+                from .elementtidyparser import ElementTidyParser
+                return ElementTidyParser()
+            except ImportError:
+                logging.debug('%s is not installed.' % kind)
+        elif kind == 'builtin':
+            try:
+                from .htmlparser import HTMLParser
+                return HTMLParser()
+            except ImportError:
+                logging.debug('%s is not installed.' % kind)
+    raise NoParserFound()  # every entry of preference_order was unknown or failed to import
--- a/weboob/tools/parsers/elementtidyparser.py
+++ b/weboob/tools/parsers/elementtidyparser.py
@ -34,6 +34,10 @@ except ImportError:

 from .iparser import IParser

+
+__all__ = ['ElementTidyParser']
+
+
 class ElementTidyParser(IParser):
    def parse(self, data, encoding=None):
        TidyHTMLTreeBuilder.ElementTree = ElementTree
@ -45,7 +49,7 @@ class ElementTidyParser(IParser):
                elem.tag = elem.tag[elem.tag.find('}')+1:]
        return tree

-    def dump(self, element):
+    def tostring(self, element):
        e = ElementTree.Element('body')
        e.text = element.text
        e.tail = element.tail
--- a/weboob/tools/parsers/html5libparser.py
+++ b/weboob/tools/parsers/html5libparser.py
@ -26,6 +26,10 @@ except ImportError:

 from .iparser import IParser

+
+__all__ = ['Html5libParser']
+
+
 class Html5libParser(HTMLParser, IParser):
    """
    Parser using html5lib.
@ -45,6 +49,6 @@ class Html5libParser(HTMLParser, IParser):
    def parse(self, data, encoding):
        return HTMLParser.parse(self, data, encoding=encoding)

-    def dump(self, elem):
+    def tostring(self, elem):
        # TODO
        raise NotImplementedError()
--- a/weboob/tools/parsers/htmlparser.py
+++ b/weboob/tools/parsers/htmlparser.py
@ -18,8 +18,6 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

 """

-__all__ = ['HTMLParser']
-
 from HTMLParser import HTMLParser as _HTMLParser
 import htmlentitydefs
 try:
@ -29,6 +27,10 @@ except ImportError:

 from .iparser import IParser

+
+__all__ = ['HTMLParser']
+
+
 class HTMLTreeBuilder(_HTMLParser):
    def __init__(self, encoding=None):
        _HTMLParser.__init__(self)
@ -75,7 +77,7 @@ class HTMLParser(IParser):
                elem.tag = elem.tag[elem.tag.find('}')+1:]
        return tree

-    def dump(self, element):
+    def tostring(self, element):
        e = ElementTree.Element('body')
        e.text = element.text
        e.tail = element.tail
--- a/weboob/tools/parsers/iparser.py
+++ b/weboob/tools/parsers/iparser.py
@ -19,12 +19,6 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """

 class IParser(object):
-    def dump(self, elem):
-        """
-        Get HTML string from an element.
-        """
-        raise NotImplementedError()
-
    def parse(self, data, encoding=None):
        """
        Parse a HTML document with a specific encoding to get a tree.
@ -34,3 +28,9 @@ class IParser(object):
        @return  an object with the structured document
        """
        raise NotImplementedError()
+
+    def tostring(self, elem):
+        """
+        Get HTML string from an element.
+        """
+        raise NotImplementedError()
--- a/weboob/tools/parsers/lxmlparser.py
+++ b/weboob/tools/parsers/lxmlparser.py
@ -19,12 +19,23 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """

 import lxml.html
+
 from .iparser import IParser

+
+__all__ = ['LxmlHtmlParser']
+
+
 class LxmlHtmlParser(IParser):
+    """
+    Parser using lxml.
+
+    Note that it is not available on every system.
+    """
+
    def parse(self, data, encoding=None):
        parser = lxml.html.HTMLParser(encoding=encoding)
        return lxml.html.parse(data, parser)

-    def dump(self, element):
+    def tostring(self, element):
        return lxml.html.tostring(element, encoding=unicode)  # encoding=unicode asks lxml for a unicode string instead of bytes