rename parser module to parsers, add get_parser() with preference_order

2010-04-16 18:00:44 +02:00 · 2010-04-16 18:00:44 +02:00 · 8638024756
commit 8638024756
parent 54cc3b0a4a
13 changed files with 104 additions and 70 deletions
--- a/weboob/backends/aum/browser.py
+++ b/weboob/backends/aum/browser.py
@ -22,7 +22,7 @@ import time
 from logging import warning
 from weboob.tools.browser import BaseBrowser
-from weboob.tools.parser import Html5libParser
+from weboob.tools.parsers.html5libparser import Html5libParser
 from weboob.backends.aum.exceptions import AdopteWait
--- a/weboob/backends/bnporc/browser.py
+++ b/weboob/backends/bnporc/browser.py
@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 from cStringIO import StringIO
 from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword
-from weboob.tools.parser import ElementTidyParser
+from weboob.tools.parsers.elementtidyparser import ElementTidyParser
 from weboob.backends.bnporc import pages
 # Parser
--- a/weboob/backends/dlfp/browser.py
+++ b/weboob/backends/dlfp/browser.py
@ -26,15 +26,15 @@ from .pages.index import IndexPage, LoginPage
 from .pages.news import ContentPage
 from .tools import id2url, id2threadid, id2contenttype
-from weboob.tools.parser import StandardParser
+from weboob.tools.parsers.htmlparser import HTMLParser
 # Parser
-class DLFParser(StandardParser):
+class DLFParser(HTMLParser):
    def parse(self, data, encoding):
        s = data.read()
        s = s.replace('<<', '<')
        data = StringIO(s)
-        return StandardParser.parse(self, data, encoding)
+        return HTMLParser.parse(self, data, encoding)
 # Browser
 class DLFP(BaseBrowser):
--- a/weboob/backends/youjizz/browser.py
+++ b/weboob/backends/youjizz/browser.py
@ -22,15 +22,10 @@ from logging import error
 import re
 from weboob.tools.browser import BaseBrowser
 from weboob.tools.parser import LxmlHtmlParser
 class YoujizzBrowser(BaseBrowser):
    video_file_regex = re.compile(r'"(http://media[^ ,]+\.flv)"')
    def __init__(self, *args, **kwargs):
        kwargs['parser'] = LxmlHtmlParser()
        Browser.__init__(self, *args, **kwargs)
    def iter_page_urls(self, mozaic_url):
        raise NotImplementedError()
--- a/weboob/backends/youtube/browser.py
+++ b/weboob/backends/youtube/browser.py
@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 import re
 from weboob.tools.browser import BaseBrowser
-from weboob.tools.parser import LxmlHtmlParser
+from weboob.tools.parsers.lxmlhtmlparser import LxmlHtmlParser
 from .pages import VideoPage
--- a/weboob/tools/browser.py
+++ b/weboob/tools/browser.py
@ -26,7 +26,7 @@ import time
 from logging import warning, error, debug
 from copy import copy
-from weboob.tools.parser import StandardParser
+from weboob.tools.parsers import get_parser
 # Try to load cookies
 try:
@ -110,7 +110,7 @@ class BaseBrowser(mechanize.Browser):
    # ------ Browser methods ---------------------------------------
-    def __init__(self, username=None, password=None, firefox_cookies=None, parser=StandardParser(), history=NoHistory()):
+    def __init__(self, username=None, password=None, firefox_cookies=None, parser=get_parser(), history=NoHistory()):
        mechanize.Browser.__init__(self, history=history)
        self.addheaders = [
                ['User-agent', self.USER_AGENT]
--- a/weboob/tools/parser/init.py
+++ b/weboob/tools/parser/init.py
@ -1,45 +0,0 @@
 # -*- coding: utf-8 -*-
 """
 Copyright(C) 2010  Christophe Benz, Romain Bignon
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, version 3 of the License.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """
 # Low performances
 # v
 # v
 try:
    from .elementtidyparser import ElementTidyParser, ElementTidyParser as StandardParser
 except ImportError:
    pass
 # v
 try:
    from .htmlparser import HTMLParser, HTMLParser as StandardParser
 except ImportError:
    pass
 # v
 try:
    from .html5libparser import Html5libParser, Html5libParser as StandardParser
 except ImportError:
    pass
 # v
 try:
    from .lxmlparser import LxmlHtmlParser, LxmlHtmlParser as StandardParser
 except ImportError:
    pass
 # v
 # v
 # High performances
--- a/weboob/tools/parsers/init.py
+++ b/weboob/tools/parsers/init.py
@ -0,0 +1,63 @@
 # -*- coding: utf-8 -*-
 """
 Copyright(C) 2010  Christophe Benz, Romain Bignon
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, version 3 of the License.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """
 import logging
 __all__ = ['get_parser', 'NoParserFound']
 class NoParserFound(Exception):
    """
    Raised by get_parser() when none of the requested parsers can be loaded.
    """


 def get_parser(preference_order=('lxml', 'html5lib', 'elementtidy', 'builtin'), *args, **kwargs):
    """
    Get a parser from a preference order list.
    This allows Weboob to run on systems without lxml, which is the default parser.

    @param preference_order  a parser kind, or a tuple/list of kinds tried in
                             order; known kinds are 'lxml', 'html5lib',
                             'elementtidy' and 'builtin'
    @param args, kwargs      forwarded to the Html5libParser constructor only;
                             the other parsers are built without arguments
    @return  the first parser of the list which can be imported and built
    @raise NoParserFound  if no kind of the list yields a parser
    """
    # A single kind (e.g. 'lxml') is accepted as a shortcut for a one-item list.
    if not isinstance(preference_order, (tuple, list)):
        preference_order = [preference_order]
    for kind in preference_order:
        # The imports are done lazily so that a missing optional dependency
        # only disables the corresponding parser. The constructor call stays
        # inside the try block on purpose: an ImportError raised while
        # building a parser also falls back to the next kind.
        try:
            if kind == 'lxml':
                from .lxmlparser import LxmlHtmlParser
                return LxmlHtmlParser()
            elif kind == 'html5lib':
                from .html5libparser import Html5libParser
                return Html5libParser(*args, **kwargs)
            elif kind == 'elementtidy':
                from .elementtidyparser import ElementTidyParser
                return ElementTidyParser()
            elif kind == 'builtin':
                from .htmlparser import HTMLParser
                return HTMLParser()
            else:
                # A typo in the preference list must not go unnoticed.
                logging.debug('%r is not a known parser kind, ignored.', kind)
        except ImportError:
            logging.debug('%s is not installed.', kind)
    raise NoParserFound('No parser available among %r' % (list(preference_order),))
--- a/weboob/tools/parsers/elementtidyparser.py
+++ b/weboob/tools/parsers/elementtidyparser.py
@ -34,6 +34,10 @@ except ImportError:
 from .iparser import IParser
 __all__ = ['ElementTidyParser']
 class ElementTidyParser(IParser):
    def parse(self, data, encoding=None):
        TidyHTMLTreeBuilder.ElementTree = ElementTree
@ -45,7 +49,7 @@ class ElementTidyParser(IParser):
                elem.tag = elem.tag[elem.tag.find('}')+1:]
        return tree
-    def dump(self, element):
+    def tostring(self, element):
        e = ElementTree.Element('body')
        e.text = element.text
        e.tail = element.tail
--- a/weboob/tools/parsers/html5libparser.py
+++ b/weboob/tools/parsers/html5libparser.py
@ -26,6 +26,10 @@ except ImportError:
 from .iparser import IParser
 __all__ = ['Html5libParser']
 class Html5libParser(HTMLParser, IParser):
    """
    Parser using html5lib.
@ -45,6 +49,6 @@ class Html5libParser(HTMLParser, IParser):
    def parse(self, data, encoding):
        return HTMLParser.parse(self, data, encoding=encoding)
-    def dump(self, elem):
+    def tostring(self, elem):
        # TODO
        raise NotImplementedError()
--- a/weboob/tools/parsers/htmlparser.py
+++ b/weboob/tools/parsers/htmlparser.py
@ -18,8 +18,6 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """
 __all__ = ['HTMLParser']
 from HTMLParser import HTMLParser as _HTMLParser
 import htmlentitydefs
 try:
@ -29,6 +27,10 @@ except ImportError:
 from .iparser import IParser
 __all__ = ['HTMLParser']
 class HTMLTreeBuilder(_HTMLParser):
    def __init__(self, encoding=None):
        _HTMLParser.__init__(self)
@ -75,7 +77,7 @@ class HTMLParser(IParser):
                elem.tag = elem.tag[elem.tag.find('}')+1:]
        return tree
-    def dump(self, element):
+    def tostring(self, element):
        e = ElementTree.Element('body')
        e.text = element.text
        e.tail = element.tail
--- a/weboob/tools/parsers/iparser.py
+++ b/weboob/tools/parsers/iparser.py
@ -19,12 +19,6 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """
 class IParser(object):
    def dump(self, elem):
        """
        Get HTML string from an element.
        """
        raise NotImplementedError()
    def parse(self, data, encoding=None):
        """
        Parse a HTML document with a specific encoding to get a tree.
@ -34,3 +28,9 @@ class IParser(object):
        @return  an object with the structured document
        """
        raise NotImplementedError()
    def tostring(self, elem):
        """
        Get HTML string from an element.
        """
        raise NotImplementedError()
--- a/weboob/tools/parsers/lxmlparser.py
+++ b/weboob/tools/parsers/lxmlparser.py
@ -19,12 +19,23 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """
 import lxml.html
 from .iparser import IParser
 __all__ = ['LxmlHtmlParser']
 class LxmlHtmlParser(IParser):
    """
    Parser using lxml.
    Note that it is not available on every systems.
    """
    def parse(self, data, encoding=None):
        parser = lxml.html.HTMLParser(encoding=encoding)
        return lxml.html.parse(data, parser)
-    def dump(self, element):
+    def tostring(self, element):
        return lxml.html.tostring(element, encoding=unicode)