rename parser module to parsers, add get_parser() with preference_order

This commit is contained in:
Christophe Benz 2010-04-16 18:00:44 +02:00
commit 8638024756
13 changed files with 104 additions and 70 deletions

View file

@ -22,7 +22,7 @@ import time
from logging import warning from logging import warning
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from weboob.tools.parser import Html5libParser from weboob.tools.parsers.html5libparser import Html5libParser
from weboob.backends.aum.exceptions import AdopteWait from weboob.backends.aum.exceptions import AdopteWait

View file

@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from cStringIO import StringIO from cStringIO import StringIO
from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword
from weboob.tools.parser import ElementTidyParser from weboob.tools.parsers.elementtidyparser import ElementTidyParser
from weboob.backends.bnporc import pages from weboob.backends.bnporc import pages
# Parser # Parser

View file

@ -26,15 +26,15 @@ from .pages.index import IndexPage, LoginPage
from .pages.news import ContentPage from .pages.news import ContentPage
from .tools import id2url, id2threadid, id2contenttype from .tools import id2url, id2threadid, id2contenttype
from weboob.tools.parser import StandardParser from weboob.tools.parsers.htmlparser import HTMLParser
# Parser # Parser
# Parser that patches the raw page text before delegating to the base parser.
class DLFParser(StandardParser): class DLFParser(HTMLParser):
def parse(self, data, encoding): def parse(self, data, encoding):
# Read the whole stream so the markup can be patched as a plain string.
s = data.read() s = data.read()
# Collapse every doubled '<<' into a single '<'
# (presumably a workaround for malformed markup served by the site -- TODO confirm).
s = s.replace('<<', '<') s = s.replace('<<', '<')
# Wrap the patched text in a file-like object again before handing it to the base class parse().
data = StringIO(s) data = StringIO(s)
return StandardParser.parse(self, data, encoding) return HTMLParser.parse(self, data, encoding)
# Browser # Browser
class DLFP(BaseBrowser): class DLFP(BaseBrowser):

View file

@ -22,15 +22,10 @@ from logging import error
import re import re
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from weboob.tools.parser import LxmlHtmlParser
class YoujizzBrowser(BaseBrowser): class YoujizzBrowser(BaseBrowser):
video_file_regex = re.compile(r'"(http://media[^ ,]+\.flv)"') video_file_regex = re.compile(r'"(http://media[^ ,]+\.flv)"')
def __init__(self, *args, **kwargs):
kwargs['parser'] = LxmlHtmlParser()
Browser.__init__(self, *args, **kwargs)
def iter_page_urls(self, mozaic_url): def iter_page_urls(self, mozaic_url):
raise NotImplementedError() raise NotImplementedError()

View file

@ -21,7 +21,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import re import re
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from weboob.tools.parser import LxmlHtmlParser from weboob.tools.parsers.lxmlhtmlparser import LxmlHtmlParser
from .pages import VideoPage from .pages import VideoPage

View file

@ -26,7 +26,7 @@ import time
from logging import warning, error, debug from logging import warning, error, debug
from copy import copy from copy import copy
from weboob.tools.parser import StandardParser from weboob.tools.parsers import get_parser
# Try to load cookies # Try to load cookies
try: try:
@ -110,7 +110,7 @@ class BaseBrowser(mechanize.Browser):
# ------ Browser methods --------------------------------------- # ------ Browser methods ---------------------------------------
def __init__(self, username=None, password=None, firefox_cookies=None, parser=StandardParser(), history=NoHistory()): def __init__(self, username=None, password=None, firefox_cookies=None, parser=get_parser(), history=NoHistory()):
mechanize.Browser.__init__(self, history=history) mechanize.Browser.__init__(self, history=history)
self.addheaders = [ self.addheaders = [
['User-agent', self.USER_AGENT] ['User-agent', self.USER_AGENT]

View file

@ -1,45 +0,0 @@
# -*- coding: utf-8 -*-
"""
Copyright(C) 2010 Christophe Benz, Romain Bignon
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
# The parsers below are imported from the lowest-performance one to the
# highest-performance one. Every import that succeeds rebinds the
# StandardParser alias, so the last parser that could be imported ends up
# being the default one. A parser whose import fails is skipped silently;
# if every import fails, StandardParser is never defined.
# Low performances
# v
# v
try:
from .elementtidyparser import ElementTidyParser, ElementTidyParser as StandardParser
except ImportError:
pass
# v
try:
from .htmlparser import HTMLParser, HTMLParser as StandardParser
except ImportError:
pass
# v
try:
from .html5libparser import Html5libParser, Html5libParser as StandardParser
except ImportError:
pass
# v
try:
from .lxmlparser import LxmlHtmlParser, LxmlHtmlParser as StandardParser
except ImportError:
pass
# v
# v
# High performances

View file

@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
"""
Copyright(C) 2010 Christophe Benz, Romain Bignon
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
import logging
__all__ = ['get_parser', 'NoParserFound']
class NoParserFound(Exception):
    """Raised by get_parser() when none of the requested parsers can be loaded."""


def _load_lxml(*args, **kwargs):
    """Instantiate the lxml-based parser (extra arguments are not forwarded)."""
    from .lxmlparser import LxmlHtmlParser
    return LxmlHtmlParser()


def _load_html5lib(*args, **kwargs):
    """Instantiate the html5lib-based parser, forwarding the extra arguments."""
    from .html5libparser import Html5libParser
    return Html5libParser(*args, **kwargs)


def _load_elementtidy(*args, **kwargs):
    """Instantiate the elementtidy-based parser (extra arguments are not forwarded)."""
    from .elementtidyparser import ElementTidyParser
    return ElementTidyParser()


def _load_builtin(*args, **kwargs):
    """Instantiate the parser built on the standard library (extra arguments are not forwarded)."""
    from .htmlparser import HTMLParser
    return HTMLParser()


# Parser kind -> loader. Kept as a sequence of pairs (not a dict) so that
# looking up a kind only relies on ==, exactly like the former if/elif chain.
_PARSER_LOADERS = (
    ('lxml', _load_lxml),
    ('html5lib', _load_html5lib),
    ('elementtidy', _load_elementtidy),
    ('builtin', _load_builtin),
)


def _find_loader(kind):
    """Return the loader registered for `kind`, or None when the kind is unknown."""
    for name, loader in _PARSER_LOADERS:
        if name == kind:
            return loader
    return None


def get_parser(preference_order=('lxml', 'html5lib', 'elementtidy', 'builtin'), *args, **kwargs):
    """
    Get a parser from a preference order list.

    This allows Weboob to run on systems without lxml, which is the default parser.

    @param preference_order  a parser kind, or a list/tuple of parser kinds,
                             tried in order; known kinds are 'lxml',
                             'html5lib', 'elementtidy' and 'builtin'.
                             Unknown kinds are skipped.
    @param args, kwargs  extra arguments, forwarded only to the html5lib parser
    @return the first parser of the list which can be loaded
    @raise NoParserFound  when no parser of the list can be loaded
    """
    # Accept a single kind as a convenience.
    if not isinstance(preference_order, (tuple, list)):
        preference_order = [preference_order]

    for kind in preference_order:
        loader = _find_loader(kind)
        if loader is None:
            logging.debug('Unknown parser kind %r, skipping.', kind)
            continue
        try:
            return loader(*args, **kwargs)
        except ImportError:
            # The parser module or its third-party dependency is missing:
            # fall back on the next kind of the list.
            logging.debug('%s is not installed.', kind)

    raise NoParserFound('No parser available among: %s'
                        % ', '.join([str(kind) for kind in preference_order]))

View file

@ -34,6 +34,10 @@ except ImportError:
from .iparser import IParser from .iparser import IParser
__all__ = ['ElementTidyParser']
class ElementTidyParser(IParser): class ElementTidyParser(IParser):
def parse(self, data, encoding=None): def parse(self, data, encoding=None):
TidyHTMLTreeBuilder.ElementTree = ElementTree TidyHTMLTreeBuilder.ElementTree = ElementTree
@ -45,7 +49,7 @@ class ElementTidyParser(IParser):
elem.tag = elem.tag[elem.tag.find('}')+1:] elem.tag = elem.tag[elem.tag.find('}')+1:]
return tree return tree
def dump(self, element): def tostring(self, element):
e = ElementTree.Element('body') e = ElementTree.Element('body')
e.text = element.text e.text = element.text
e.tail = element.tail e.tail = element.tail

View file

@ -26,6 +26,10 @@ except ImportError:
from .iparser import IParser from .iparser import IParser
__all__ = ['Html5libParser']
class Html5libParser(HTMLParser, IParser): class Html5libParser(HTMLParser, IParser):
""" """
Parser using html5lib. Parser using html5lib.
@ -45,6 +49,6 @@ class Html5libParser(HTMLParser, IParser):
def parse(self, data, encoding): def parse(self, data, encoding):
return HTMLParser.parse(self, data, encoding=encoding) return HTMLParser.parse(self, data, encoding=encoding)
def dump(self, elem): def tostring(self, elem):
# TODO # TODO
raise NotImplementedError() raise NotImplementedError()

View file

@ -18,8 +18,6 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
""" """
__all__ = ['HTMLParser']
from HTMLParser import HTMLParser as _HTMLParser from HTMLParser import HTMLParser as _HTMLParser
import htmlentitydefs import htmlentitydefs
try: try:
@ -29,6 +27,10 @@ except ImportError:
from .iparser import IParser from .iparser import IParser
__all__ = ['HTMLParser']
class HTMLTreeBuilder(_HTMLParser): class HTMLTreeBuilder(_HTMLParser):
def __init__(self, encoding=None): def __init__(self, encoding=None):
_HTMLParser.__init__(self) _HTMLParser.__init__(self)
@ -75,7 +77,7 @@ class HTMLParser(IParser):
elem.tag = elem.tag[elem.tag.find('}')+1:] elem.tag = elem.tag[elem.tag.find('}')+1:]
return tree return tree
def dump(self, element): def tostring(self, element):
e = ElementTree.Element('body') e = ElementTree.Element('body')
e.text = element.text e.text = element.text
e.tail = element.tail e.tail = element.tail

View file

@ -19,12 +19,6 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
""" """
class IParser(object): class IParser(object):
def dump(self, elem):
"""
Get HTML string from an element.
"""
raise NotImplementedError()
def parse(self, data, encoding=None): def parse(self, data, encoding=None):
""" """
Parse a HTML document with a specific encoding to get a tree. Parse a HTML document with a specific encoding to get a tree.
@ -34,3 +28,9 @@ class IParser(object):
@return an object with the structured document @return an object with the structured document
""" """
raise NotImplementedError() raise NotImplementedError()
def tostring(self, elem):
"""
Get HTML string from an element.
"""
raise NotImplementedError()

View file

@ -19,12 +19,23 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
""" """
import lxml.html import lxml.html
from .iparser import IParser from .iparser import IParser
__all__ = ['LxmlHtmlParser']
class LxmlHtmlParser(IParser): class LxmlHtmlParser(IParser):
"""
Parser using lxml.
Note that lxml is not available on every system.
"""
def parse(self, data, encoding=None): def parse(self, data, encoding=None):
# Build an lxml HTML parser set up with the requested encoding, then parse the input with it.
parser = lxml.html.HTMLParser(encoding=encoding) parser = lxml.html.HTMLParser(encoding=encoding)
return lxml.html.parse(data, parser) return lxml.html.parse(data, parser)
def dump(self, element): def tostring(self, element):
# encoding=unicode asks lxml for a unicode string rather than encoded bytes.
return lxml.html.tostring(element, encoding=unicode) return lxml.html.tostring(element, encoding=unicode)