diff --git a/weboob/backends/aum/browser.py b/weboob/backends/aum/browser.py index d8865ae1..5e34c88d 100644 --- a/weboob/backends/aum/browser.py +++ b/weboob/backends/aum/browser.py @@ -22,7 +22,7 @@ import time from logging import warning from weboob.tools.browser import BaseBrowser -from weboob.tools.parser import Html5libParser +from weboob.tools.parsers.html5libparser import Html5libParser from weboob.backends.aum.exceptions import AdopteWait diff --git a/weboob/backends/bnporc/browser.py b/weboob/backends/bnporc/browser.py index d13020db..0dae383d 100644 --- a/weboob/backends/bnporc/browser.py +++ b/weboob/backends/bnporc/browser.py @@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. from cStringIO import StringIO from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword -from weboob.tools.parser import ElementTidyParser +from weboob.tools.parsers.elementtidyparser import ElementTidyParser from weboob.backends.bnporc import pages # Parser diff --git a/weboob/backends/dlfp/browser.py b/weboob/backends/dlfp/browser.py index 1679c6b0..80a60dec 100644 --- a/weboob/backends/dlfp/browser.py +++ b/weboob/backends/dlfp/browser.py @@ -26,15 +26,15 @@ from .pages.index import IndexPage, LoginPage from .pages.news import ContentPage from .tools import id2url, id2threadid, id2contenttype -from weboob.tools.parser import StandardParser +from weboob.tools.parsers.htmlparser import HTMLParser # Parser -class DLFParser(StandardParser): +class DLFParser(HTMLParser): def parse(self, data, encoding): s = data.read() s = s.replace('<<', '<') data = StringIO(s) - return StandardParser.parse(self, data, encoding) + return HTMLParser.parse(self, data, encoding) # Browser class DLFP(BaseBrowser): diff --git a/weboob/backends/youjizz/browser.py b/weboob/backends/youjizz/browser.py index ace6171f..466d787c 100644 --- a/weboob/backends/youjizz/browser.py +++ b/weboob/backends/youjizz/browser.py @@ -22,15 +22,10 @@ from logging 
import error import re from weboob.tools.browser import BaseBrowser -from weboob.tools.parser import LxmlHtmlParser class YoujizzBrowser(BaseBrowser): video_file_regex = re.compile(r'"(http://media[^ ,]+\.flv)"') - def __init__(self, *args, **kwargs): - kwargs['parser'] = LxmlHtmlParser() - Browser.__init__(self, *args, **kwargs) - def iter_page_urls(self, mozaic_url): raise NotImplementedError() diff --git a/weboob/backends/youtube/browser.py b/weboob/backends/youtube/browser.py index 68bc36fb..b970033e 100644 --- a/weboob/backends/youtube/browser.py +++ b/weboob/backends/youtube/browser.py @@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. import re from weboob.tools.browser import BaseBrowser -from weboob.tools.parser import LxmlHtmlParser +from weboob.tools.parsers.lxmlparser import LxmlHtmlParser from .pages import VideoPage diff --git a/weboob/tools/browser.py b/weboob/tools/browser.py index e770e144..4d4d702f 100644 --- a/weboob/tools/browser.py +++ b/weboob/tools/browser.py @@ -26,7 +26,7 @@ import time from logging import warning, error, debug from copy import copy -from weboob.tools.parser import StandardParser +from weboob.tools.parsers import get_parser # Try to load cookies try: @@ -110,7 +110,7 @@ class BaseBrowser(mechanize.Browser): # ------ Browser methods --------------------------------------- - def __init__(self, username=None, password=None, firefox_cookies=None, parser=StandardParser(), history=NoHistory()): + def __init__(self, username=None, password=None, firefox_cookies=None, parser=get_parser(), history=NoHistory()): mechanize.Browser.__init__(self, history=history) self.addheaders = [ ['User-agent', self.USER_AGENT] diff --git a/weboob/tools/parser/__init__.py b/weboob/tools/parser/__init__.py deleted file mode 100644 index a1d6f651..00000000 --- a/weboob/tools/parser/__init__.py +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Copyright(C) 2010 Christophe Benz, Romain Bignon - 
-This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, version 3 of the License. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -""" - -# Low performances -# v -# v -try: - from .elementtidyparser import ElementTidyParser, ElementTidyParser as StandardParser -except ImportError: - pass -# v -try: - from .htmlparser import HTMLParser, HTMLParser as StandardParser -except ImportError: - pass -# v -try: - from .html5libparser import Html5libParser, Html5libParser as StandardParser -except ImportError: - pass -# v -try: - from .lxmlparser import LxmlHtmlParser, LxmlHtmlParser as StandardParser -except ImportError: - pass -# v -# v -# High performances diff --git a/weboob/tools/parsers/__init__.py b/weboob/tools/parsers/__init__.py new file mode 100644 index 00000000..aed4269d --- /dev/null +++ b/weboob/tools/parsers/__init__.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- + +""" +Copyright(C) 2010 Christophe Benz, Romain Bignon + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +""" + +import logging + + +__all__ = ['get_parser', 'NoParserFound'] + + +class NoParserFound(Exception): pass + + +def get_parser(preference_order=('lxml', 'html5lib', 'elementtidy', 'builtin'), *args, **kwargs): + """ + Get a parser from a preference order list. + This allows Weboob to run on systems without lxml, which is the default parser. + Return a parser implementing IParser. + """ + if not isinstance(preference_order, (tuple, list)): + preference_order = [preference_order] + for kind in preference_order: + if kind == 'lxml': + try: + from .lxmlparser import LxmlHtmlParser + return LxmlHtmlParser() + except ImportError: + logging.debug('%s is not installed.', kind) + elif kind == 'html5lib': + try: + from .html5libparser import Html5libParser + return Html5libParser(*args, **kwargs) + except ImportError: + logging.debug('%s is not installed.', kind) + elif kind == 'elementtidy': + try: + from .elementtidyparser import ElementTidyParser + return ElementTidyParser() + except ImportError: + logging.debug('%s is not installed.', kind) + elif kind == 'builtin': + try: + from .htmlparser import HTMLParser + return HTMLParser() + except ImportError: + logging.debug('%s is not installed.',
kind) + raise NoParserFound() diff --git a/weboob/tools/parser/elementtidyparser.py b/weboob/tools/parsers/elementtidyparser.py similarity index 96% rename from weboob/tools/parser/elementtidyparser.py rename to weboob/tools/parsers/elementtidyparser.py index 10e4e94e..73f08f46 100644 --- a/weboob/tools/parser/elementtidyparser.py +++ b/weboob/tools/parsers/elementtidyparser.py @@ -34,6 +34,10 @@ except ImportError: from .iparser import IParser + +__all__ = ['ElementTidyParser'] + + class ElementTidyParser(IParser): def parse(self, data, encoding=None): TidyHTMLTreeBuilder.ElementTree = ElementTree @@ -45,7 +49,7 @@ class ElementTidyParser(IParser): elem.tag = elem.tag[elem.tag.find('}')+1:] return tree - def dump(self, element): + def tostring(self, element): e = ElementTree.Element('body') e.text = element.text e.tail = element.tail diff --git a/weboob/tools/parser/html5libparser.py b/weboob/tools/parsers/html5libparser.py similarity index 96% rename from weboob/tools/parser/html5libparser.py rename to weboob/tools/parsers/html5libparser.py index 81ec20f9..03df71e5 100644 --- a/weboob/tools/parser/html5libparser.py +++ b/weboob/tools/parsers/html5libparser.py @@ -26,6 +26,10 @@ except ImportError: from .iparser import IParser + +__all__ = ['Html5libParser'] + + class Html5libParser(HTMLParser, IParser): """ Parser using html5lib. @@ -45,6 +49,6 @@ class Html5libParser(HTMLParser, IParser): def parse(self, data, encoding): return HTMLParser.parse(self, data, encoding=encoding) - def dump(self, elem): + def tostring(self, elem): # TODO raise NotImplementedError() diff --git a/weboob/tools/parser/htmlparser.py b/weboob/tools/parsers/htmlparser.py similarity index 98% rename from weboob/tools/parser/htmlparser.py rename to weboob/tools/parsers/htmlparser.py index 3239dabc..b8160107 100644 --- a/weboob/tools/parser/htmlparser.py +++ b/weboob/tools/parsers/htmlparser.py @@ -18,8 +18,6 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
""" -__all__ = ['HTMLParser'] - from HTMLParser import HTMLParser as _HTMLParser import htmlentitydefs try: @@ -29,6 +27,10 @@ except ImportError: from .iparser import IParser + +__all__ = ['HTMLParser'] + + class HTMLTreeBuilder(_HTMLParser): def __init__(self, encoding=None): _HTMLParser.__init__(self) @@ -75,7 +77,7 @@ class HTMLParser(IParser): elem.tag = elem.tag[elem.tag.find('}')+1:] return tree - def dump(self, element): + def tostring(self, element): e = ElementTree.Element('body') e.text = element.text e.tail = element.tail diff --git a/weboob/tools/parser/iparser.py b/weboob/tools/parsers/iparser.py similarity index 97% rename from weboob/tools/parser/iparser.py rename to weboob/tools/parsers/iparser.py index 2e2db4cc..60dbf386 100644 --- a/weboob/tools/parser/iparser.py +++ b/weboob/tools/parsers/iparser.py @@ -19,12 +19,6 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. """ class IParser(object): - def dump(self, elem): - """ - Get HTML string from an element. - """ - raise NotImplementedError() - def parse(self, data, encoding=None): """ Parse a HTML document with a specific encoding to get a tree. @@ -34,3 +28,9 @@ class IParser(object): @return an object with the structured document """ raise NotImplementedError() + + def tostring(self, elem): + """ + Get HTML string from an element. + """ + raise NotImplementedError() diff --git a/weboob/tools/parser/lxmlparser.py b/weboob/tools/parsers/lxmlparser.py similarity index 86% rename from weboob/tools/parser/lxmlparser.py rename to weboob/tools/parsers/lxmlparser.py index 057ecca0..0d07260e 100644 --- a/weboob/tools/parser/lxmlparser.py +++ b/weboob/tools/parsers/lxmlparser.py @@ -19,12 +19,23 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. """ import lxml.html + from .iparser import IParser + +__all__ = ['LxmlHtmlParser'] + + class LxmlHtmlParser(IParser): + """ + Parser using lxml. + + Note that it is not available on every systems. 
+ """ + def parse(self, data, encoding=None): parser = lxml.html.HTMLParser(encoding=encoding) return lxml.html.parse(data, parser) - def dump(self, element): + def tostring(self, element): return lxml.html.tostring(element, encoding=unicode)