add an xml parser

This commit is contained in:
Bezleputh 2014-02-24 20:07:49 +01:00
commit 5ea0307b6a
2 changed files with 43 additions and 7 deletions

View file

@ -38,6 +38,11 @@ def load_lxmlsoup():
return LxmlSoupParser
def load_xml():
    """
    Import the lxml-based XML parser and return its class.

    The import statement lives inside the function, so the parser module
    is only imported when this loader is called.
    """
    from .lxmlparser import LxmlXmlParser as parser_class
    return parser_class
def load_html5lib():
    """
    Import the html5lib-based parser and return its class.

    The import statement lives inside the function, so the parser module
    is only imported when this loader is called.
    """
    from .html5libparser import Html5libParser as parser_class
    return parser_class
@ -82,6 +87,7 @@ def get_parser(preference_order=('lxml', 'lxmlsoup')):
"""
if not isinstance(preference_order, (tuple, list)):
preference_order = [preference_order]
for kind in preference_order:
if not 'load_%s' % kind in globals():
continue

View file

@ -19,31 +19,35 @@
import re
import lxml.html
import lxml.html as html
import lxml.etree as etree
from .iparser import IParser
from ..browser import BrokenPageError
__all__ = ['LxmlHtmlParser']
__all__ = ['LxmlHtmlParser', 'LxmlXmlParser']
class LxmlHtmlParser(IParser):
class LxmlParser(IParser):
"""
Parser using lxml.
Note that it is not available on every systems.
"""
def get_parser(self, encoding=None):
    """
    Return the lxml parser object to use for the given encoding.

    This base implementation returns None. The concrete parsers in this
    file (LxmlHtmlParser, LxmlXmlParser) override it to build a parser
    from their own lxml module.

    :param encoding: character encoding requested by the caller, or None.
    :returns: None in this base implementation.
    """
    # ``self`` is declared explicitly: this is invoked as
    # ``self.get_parser(encoding=encoding)``, and without it the instance
    # would be bound to ``encoding`` and the keyword argument would raise
    # TypeError (multiple values for 'encoding').
    return None
def parse(self, data, encoding=None):
if encoding is None:
parser = None
else:
parser = lxml.html.HTMLParser(encoding=encoding)
return lxml.html.parse(data, parser)
parser = self.get_parser(encoding=encoding)
return self.module.parse(data, parser)
def tostring(self, element):
return lxml.html.tostring(element, encoding=unicode)
return self.module.tostring(element, encoding=unicode)
def tocleanstring(self, element):
txt = [txt.strip() for txt in element.itertext()]
@ -52,7 +56,7 @@ class LxmlHtmlParser(IParser):
return txt.strip()
def strip(self, s):
doc = lxml.html.fromstring(s) # parse html string
doc = self.module.fromstring(s) # parse html/xml string
return self.tocleanstring(doc)
@classmethod
@ -97,3 +101,29 @@ class LxmlHtmlParser(IParser):
return results[0] if nb == 1 else results
else:
raise Exception('Unhandled value for kwarg "nb": %s' % nb)
class LxmlHtmlParser(LxmlParser):
    """
    HTML flavour of the lxml-backed parser.

    Stores the ``html`` module on the instance as ``self.module`` and
    builds an ``html.HTMLParser`` in :meth:`get_parser`.
    Note that lxml is not available on every system.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but not used here.
        self.module = html

    def get_parser(self, encoding=None):
        """Return a new ``html.HTMLParser`` created with *encoding*."""
        html_parser = html.HTMLParser(encoding=encoding)
        return html_parser
class LxmlXmlParser(LxmlParser):
    """
    XML flavour of the lxml-backed parser.

    Stores the ``etree`` module on the instance as ``self.module`` and
    builds an ``etree.XMLParser`` in :meth:`get_parser`.
    Note that lxml is not available on every system.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but not used here.
        self.module = etree

    def get_parser(self, encoding=None):
        """
        Return a new ``etree.XMLParser`` created with *encoding*.

        The parser is created with ``strip_cdata=False``; per the lxml
        documentation this keeps CDATA sections rather than replacing
        them with plain text content.
        """
        xml_parser = etree.XMLParser(encoding=encoding, strip_cdata=False)
        return xml_parser