add an xml parser
This commit is contained in:
parent
057901f796
commit
5ea0307b6a
2 changed files with 43 additions and 7 deletions
|
|
@ -38,6 +38,11 @@ def load_lxmlsoup():
|
||||||
return LxmlSoupParser
|
return LxmlSoupParser
|
||||||
|
|
||||||
|
|
||||||
|
def load_xml():
    """
    Return the XML parser class defined in the ``lxmlparser`` module.

    The import is done inside the function body, so ``lxmlparser`` is only
    loaded when this loader is actually called.
    """
    from .lxmlparser import LxmlXmlParser as parser_class
    return parser_class
|
||||||
|
|
||||||
|
|
||||||
def load_html5lib():
|
def load_html5lib():
|
||||||
from .html5libparser import Html5libParser
|
from .html5libparser import Html5libParser
|
||||||
return Html5libParser
|
return Html5libParser
|
||||||
|
|
@ -82,6 +87,7 @@ def get_parser(preference_order=('lxml', 'lxmlsoup')):
|
||||||
"""
|
"""
|
||||||
if not isinstance(preference_order, (tuple, list)):
|
if not isinstance(preference_order, (tuple, list)):
|
||||||
preference_order = [preference_order]
|
preference_order = [preference_order]
|
||||||
|
|
||||||
for kind in preference_order:
|
for kind in preference_order:
|
||||||
if not 'load_%s' % kind in globals():
|
if not 'load_%s' % kind in globals():
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
|
|
@ -19,31 +19,35 @@
|
||||||
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import lxml.html
|
import lxml.html as html
|
||||||
|
import lxml.etree as etree
|
||||||
|
|
||||||
from .iparser import IParser
|
from .iparser import IParser
|
||||||
from ..browser import BrokenPageError
|
from ..browser import BrokenPageError
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['LxmlHtmlParser']
|
__all__ = ['LxmlHtmlParser', 'LxmlXmlParser']
|
||||||
|
|
||||||
|
|
||||||
class LxmlHtmlParser(IParser):
|
class LxmlParser(IParser):
|
||||||
"""
|
"""
|
||||||
Parser using lxml.
|
Parser using lxml.
|
||||||
|
|
||||||
Note that it is not available on every system.
|
Note that it is not available on every system.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def get_parser(self, encoding=None):
    """
    Return the parser object to use for the given encoding.

    This base implementation is a placeholder that returns None.
    ``parse`` calls it as ``self.get_parser(encoding=encoding)``, so the
    method must accept ``self``; without it, that call fails with a
    TypeError because ``encoding`` would be supplied twice.

    :param encoding: encoding name requested by the caller, or None
    :return: None in this base implementation
    """
    return None
|
||||||
|
|
||||||
def parse(self, data, encoding=None):
|
def parse(self, data, encoding=None):
|
||||||
if encoding is None:
|
if encoding is None:
|
||||||
parser = None
|
parser = None
|
||||||
else:
|
else:
|
||||||
parser = lxml.html.HTMLParser(encoding=encoding)
|
parser = self.get_parser(encoding=encoding)
|
||||||
return lxml.html.parse(data, parser)
|
return self.module.parse(data, parser)
|
||||||
|
|
||||||
def tostring(self, element):
|
def tostring(self, element):
|
||||||
return lxml.html.tostring(element, encoding=unicode)
|
return self.module.tostring(element, encoding=unicode)
|
||||||
|
|
||||||
def tocleanstring(self, element):
|
def tocleanstring(self, element):
|
||||||
txt = [txt.strip() for txt in element.itertext()]
|
txt = [txt.strip() for txt in element.itertext()]
|
||||||
|
|
@ -52,7 +56,7 @@ class LxmlHtmlParser(IParser):
|
||||||
return txt.strip()
|
return txt.strip()
|
||||||
|
|
||||||
def strip(self, s):
|
def strip(self, s):
|
||||||
doc = lxml.html.fromstring(s) # parse html string
|
doc = self.module.fromstring(s) # parse html/xml string
|
||||||
return self.tocleanstring(doc)
|
return self.tocleanstring(doc)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
@ -97,3 +101,29 @@ class LxmlHtmlParser(IParser):
|
||||||
return results[0] if nb == 1 else results
|
return results[0] if nb == 1 else results
|
||||||
else:
|
else:
|
||||||
raise Exception('Unhandled value for kwarg "nb": %s' % nb)
|
raise Exception('Unhandled value for kwarg "nb": %s' % nb)
|
||||||
|
|
||||||
|
|
||||||
|
class LxmlHtmlParser(LxmlParser):
    """
    HTML flavour of the lxml parser.

    The inherited methods work through ``self.module``, which this class
    sets to the ``html`` module. Note that lxml is not available on every
    system.
    """

    def __init__(self, *args, **kwargs):
        # Arguments are accepted but not used; only the backing module is set.
        self.module = html

    def get_parser(self, encoding=None):
        """Build an ``HTMLParser`` configured with *encoding*."""
        parser = html.HTMLParser(encoding=encoding)
        return parser
|
||||||
|
|
||||||
|
|
||||||
|
class LxmlXmlParser(LxmlParser):
    """
    XML flavour of the lxml parser.

    The inherited methods work through ``self.module``, which this class
    sets to the ``etree`` module. Note that lxml is not available on every
    system.
    """

    def __init__(self, *args, **kwargs):
        # Arguments are accepted but not used; only the backing module is set.
        self.module = etree

    def get_parser(self, encoding=None):
        """Build an ``XMLParser`` for *encoding* with ``strip_cdata`` disabled."""
        options = {'encoding': encoding, 'strip_cdata': False}
        return etree.XMLParser(**options)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue