use LxmlHtmlParser as default parser

This commit is contained in:
Romain Bignon 2010-04-16 14:06:28 +02:00
commit 3703adb44e
10 changed files with 130 additions and 45 deletions

View file

@ -20,14 +20,14 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from datetime import datetime from datetime import datetime
from weboob.tools.parser import tostring
from weboob.tools.misc import local2utc from weboob.tools.misc import local2utc
from weboob.backends.dlfp.tools import url2id from weboob.backends.dlfp.tools import url2id
from .index import DLFPPage from .index import DLFPPage
class Comment(object): class Comment(object):
def __init__(self, div, reply_id): def __init__(self, browser, div, reply_id):
self.browser = browser
self.id = '' self.id = ''
self.reply_id = reply_id self.reply_id = reply_id
self.title = u'' self.title = u''
@ -46,9 +46,9 @@ class Comment(object):
self.author = sub.find('a').text self.author = sub.find('a').text
self.date = self.parse_date(sub.find('i').tail) self.date = self.parse_date(sub.find('i').tail)
self.score = int(sub.findall('i')[1].find('span').text) self.score = int(sub.findall('i')[1].find('span').text)
self.body = tostring(sub.find('p')) self.body = self.browser.tostring(sub.find('p'))
elif sub.attrib.get('class', '') == 'commentsul': elif sub.attrib.get('class', '') == 'commentsul':
comment = Comment(sub.find('li'), self.id) comment = Comment(self.browser, sub.find('li'), self.id)
self.comments.append(comment) self.comments.append(comment)
def parse_date(self, date_s): def parse_date(self, date_s):
@ -64,7 +64,8 @@ class Comment(object):
return u"<Comment id='%s' author='%s' title='%s'>" % (self.id, self.author, self.title) return u"<Comment id='%s' author='%s' title='%s'>" % (self.id, self.author, self.title)
class Article(object): class Article(object):
def __init__(self, _id, tree): def __init__(self, browser, _id, tree):
self.browser = browser
self.id = _id self.id = _id
self.title = u'' self.title = u''
self.author = u'' self.author = u''
@ -87,7 +88,7 @@ class Article(object):
date_s = unicode(div.find('i').tail) date_s = unicode(div.find('i').tail)
#print date_s #print date_s
if div.attrib.get('class', '').startswith('bodydiv '): if div.attrib.get('class', '').startswith('bodydiv '):
self.body = tostring(div) self.body = self.browser.tostring(div)
def append_comment(self, comment): def append_comment(self, comment):
self.comments.append(comment) self.comments.append(comment)
@ -99,7 +100,7 @@ class Article(object):
yield c yield c
def parse_part2(self, div): def parse_part2(self, div):
self.part2 = tostring(div) self.part2 = self.browser.tostring(div)
class ContentPage(DLFPPage): class ContentPage(DLFPPage):
def loaded(self): def loaded(self):
@ -112,11 +113,11 @@ class ContentPage(DLFPPage):
def parse_div(self, div):
    """
    Handle one div of the page according to its 'class' attribute.

    - 'newsdiv' or 'centraldiv': create the Article for this page.
    - 'articlediv': hand the div to the article as its second part.
    - 'comments': build a top-level Comment (reply id 0) and attach it
      to the article.

    Any other class (or no class at all) is ignored.
    """
    css_class = div.attrib.get('class', '')
    if css_class in ('newsdiv', 'centraldiv'):
        self.article = Article(self.browser, url2id(self.url), div)
    elif css_class == 'articlediv':
        self.article.parse_part2(div)
    elif css_class == 'comments':
        self.article.append_comment(Comment(self.browser, div, 0))
def get_article(self): def get_article(self):

View file

@ -39,7 +39,7 @@ class Boobank(ConsoleApplication):
@ConsoleApplication.command('List every available accounts') @ConsoleApplication.command('List every available accounts')
def command_list(self): def command_list(self):
accounts = [] accounts = []
for backend, in self.weboob.iter_backends(): for backend in self.weboob.iter_backends():
try: try:
for account in backend.iter_accounts(): for account in backend.iter_accounts():
accounts.append('%17s %-20s %11.2f %11.2f' % ( accounts.append('%17s %-20s %11.2f %11.2f' % (

View file

@ -35,7 +35,7 @@ class Travel(ConsoleApplication):
print '| ID | Name |' print '| ID | Name |'
print '+--------------------------------+---------------------------------------------+' print '+--------------------------------+---------------------------------------------+'
count = 0 count = 0
for backend, in self.weboob.iter_backends(): for backend in self.weboob.iter_backends():
for station in backend.iter_station_search(pattern): for station in backend.iter_station_search(pattern):
print '| %-31s| %-44s|' % (station.id, station.name) print '| %-31s| %-44s|' % (station.id, station.name)
count += 1 count += 1
@ -49,7 +49,7 @@ class Travel(ConsoleApplication):
print "| ID | Type | Time | Arrival | Late | Info | Plateform |" print "| ID | Type | Time | Arrival | Late | Info | Plateform |"
print "+-----+-----------+-------+-----------------------+-------+--------------------+-----------+" print "+-----+-----------+-------+-----------------------+-------+--------------------+-----------+"
count = 0 count = 0
for backend, in self.weboob.iter_backends(): for backend in self.weboob.iter_backends():
for departure in backend.iter_station_departures(station, arrival): for departure in backend.iter_station_departures(station, arrival):
print u"|%4d | %-10s|%6s | %-22s|%6s | %-19s| %-10s|" % (departure.id, print u"|%4d | %-10s|%6s | %-22s|%6s | %-19s| %-10s|" % (departure.id,
departure.type, departure.type,
@ -59,6 +59,6 @@ class Travel(ConsoleApplication):
departure.information, departure.information,
departure.plateform) departure.plateform)
count += 1 count += 1
print "+-----'-----------'-------'-----------------------'-------'--------------------+" print "+-----'-----------'-------'-----------------------'-------'--------------------'-----------+"
print "| %3d departures listed |" % count print "| %3d departures listed |" % count
print "'------------------------------------------------------------------------------'" print "'------------------------------------------------------------------------------------------'"

View file

@ -245,6 +245,12 @@ class Browser(mechanize.Browser):
if self.__cookie: if self.__cookie:
self.__cookie.save() self.__cookie.save()
def tostring(self, elem):
    """
    Get HTML string from an element.

    Serialization is delegated to the dump() method of the parser held
    by this browser.
    """
    html_parser = self.__parser
    return html_parser.dump(elem)
def str(self, s): def str(self, s):
if isinstance(s, unicode): if isinstance(s, unicode):
s = s.encode('iso-8859-15', 'replace') s = s.encode('iso-8859-15', 'replace')

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Copyright(C) 2010 Christophe Benz Copyright(C) 2010 Christophe Benz, Romain Bignon
This program is free software; you can redistribute it and/or modify This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -18,17 +18,28 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
""" """
from .standardparser import StandardParser, tostring # Low performance
# v
# v
try: try:
from .elementtidyparser import ElementTidyParser from .elementtidyparser import ElementTidyParser, ElementTidyParser as StandardParser
except ImportError: except ImportError:
pass pass
# v
try: try:
from .html5libparser import Html5libParser from .htmlparser import HTMLParser, HTMLParser as StandardParser
except ImportError: except ImportError:
pass pass
# v
try: try:
from .lxmlparser import LxmlHtmlParser from .html5libparser import Html5libParser, Html5libParser as StandardParser
except ImportError: except ImportError:
pass pass
# v
try:
from .lxmlparser import LxmlHtmlParser, LxmlHtmlParser as StandardParser
except ImportError:
pass
# v
# v
# High performance

View file

@ -32,7 +32,9 @@ try:
except ImportError: except ImportError:
from xml.etree import ElementTree from xml.etree import ElementTree
class ElementTidyParser(object): from .iparser import IParser
class ElementTidyParser(IParser):
def parse(self, data, encoding=None): def parse(self, data, encoding=None):
TidyHTMLTreeBuilder.ElementTree = ElementTree TidyHTMLTreeBuilder.ElementTree = ElementTree
HTMLTreeBuilder = TidyHTMLTreeBuilder.TidyHTMLTreeBuilder HTMLTreeBuilder = TidyHTMLTreeBuilder.TidyHTMLTreeBuilder
@ -42,3 +44,20 @@ class ElementTidyParser(object):
if elem.tag.startswith('{'): if elem.tag.startswith('{'):
elem.tag = elem.tag[elem.tag.find('}')+1:] elem.tag = elem.tag[elem.tag.find('}')+1:]
return tree return tree
def dump(self, element):
    """
    Get HTML string from an element.

    The element's text, tail and children are placed under a fresh
    'body' element which is then serialized, so the result is wrapped
    in <body>...</body> rather than in the element's own tag.

    @param element  element of a parsed tree
    @return [unicode]  serialized markup, or an empty string if the
                       element cannot be serialized with any of the
                       candidate encodings
    """
    wrapper = ElementTree.Element('body')
    wrapper.text = element.text
    wrapper.tail = element.tail
    # Children are attached as-is (not copied) to the temporary wrapper.
    for child in list(element):
        wrapper.append(child)

    # Try utf-8 first and fall back to ISO-8859-1.
    for encoding in ('utf-8', 'ISO-8859-1'):
        try:
            # Decode with the encoding used to serialize: converting the
            # byte string without naming a codec falls back to ASCII and
            # fails on the first non-ASCII character.
            return ElementTree.tostring(wrapper, encoding).decode(encoding)
        except UnicodeError:
            continue
    return u''

View file

@ -24,7 +24,9 @@ try:
except ImportError: except ImportError:
from xml.etree import ElementTree from xml.etree import ElementTree
class Html5libParser(HTMLParser): from .iparser import IParser
class Html5libParser(HTMLParser, IParser):
""" """
Parser using html5lib. Parser using html5lib.
@ -42,3 +44,7 @@ class Html5libParser(HTMLParser):
def parse(self, data, encoding): def parse(self, data, encoding):
return HTMLParser.parse(self, data, encoding=encoding) return HTMLParser.parse(self, data, encoding=encoding)
def dump(self, elem):
    """
    Get HTML string from an element.

    Not implemented yet for this parser: always raises
    NotImplementedError.
    """
    # TODO: serialize elem to an HTML string.
    raise NotImplementedError()

View file

@ -18,18 +18,20 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
""" """
__all__ = ['StandardParser', 'tostring'] __all__ = ['HTMLParser']
from HTMLParser import HTMLParser from HTMLParser import HTMLParser as _HTMLParser
import htmlentitydefs import htmlentitydefs
try: try:
from xml.etree import cElementTree as ElementTree from xml.etree import cElementTree as ElementTree
except ImportError: except ImportError:
from xml.etree import ElementTree from xml.etree import ElementTree
class HTMLTreeBuilder(HTMLParser): from .iparser import IParser
class HTMLTreeBuilder(_HTMLParser):
def __init__(self, encoding=None): def __init__(self, encoding=None):
HTMLParser.__init__(self) _HTMLParser.__init__(self)
self._target = ElementTree.TreeBuilder() self._target = ElementTree.TreeBuilder()
def doctype(self, name, pubid, system): def doctype(self, name, pubid, system):
@ -64,7 +66,7 @@ class HTMLTreeBuilder(HTMLParser):
except: except:
pass pass
class StandardParser(object): class HTMLParser(IParser):
def parse(self, data, encoding=None): def parse(self, data, encoding=None):
parser = HTMLTreeBuilder(encoding) parser = HTMLTreeBuilder(encoding)
tree = ElementTree.parse(data, parser) tree = ElementTree.parse(data, parser)
@ -73,19 +75,19 @@ class StandardParser(object):
elem.tag = elem.tag[elem.tag.find('}')+1:] elem.tag = elem.tag[elem.tag.find('}')+1:]
return tree return tree
def dump(self, element):
    """
    Get HTML string from an element.

    The element's text, tail and children are placed under a fresh
    'body' element which is then serialized, so the result is wrapped
    in <body>...</body> rather than in the element's own tag.

    @param element  element of a parsed tree
    @return [unicode]  serialized markup, or an empty string if the
                       element cannot be serialized with any of the
                       candidate encodings
    """
    wrapper = ElementTree.Element('body')
    wrapper.text = element.text
    wrapper.tail = element.tail
    # Children are attached as-is (not copied) to the temporary wrapper.
    for child in list(element):
        wrapper.append(child)

    # Try utf-8 first and fall back to ISO-8859-1.
    for encoding in ('utf-8', 'ISO-8859-1'):
        try:
            # Decode with the encoding used to serialize: converting the
            # byte string without naming a codec falls back to ASCII and
            # fails on the first non-ASCII character.
            return ElementTree.tostring(wrapper, encoding).decode(encoding)
        except UnicodeError:
            continue
    return u''

View file

@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
"""
Copyright(C) 2010 Romain Bignon
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
class IParser(object):
    """
    Interface for HTML parsers.

    Both methods raise NotImplementedError here; subclasses are
    expected to override them.
    """

    def parse(self, data, encoding=None):
        """
        Parse a HTML document with a specific encoding to get a tree.

        @param data [str]  HTML document
        @param encoding [str]  encoding to use
        @return  an object with the structured document
        """
        raise NotImplementedError()

    def dump(self, elem):
        """
        Get HTML string from an element.
        """
        raise NotImplementedError()

View file

@ -19,8 +19,12 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
""" """
import lxml.html import lxml.html
from .iparser import IParser
class LxmlHtmlParser(IParser):
    """
    Parser using lxml.html.
    """

    def parse(self, data, encoding=None):
        """
        Parse data with an lxml.html.HTMLParser created for the given
        encoding and return what lxml.html.parse() produces.
        """
        return lxml.html.parse(data, lxml.html.HTMLParser(encoding=encoding))

    def dump(self, element):
        """
        Get HTML string from an element, as unicode, using
        lxml.html.tostring().
        """
        html = lxml.html.tostring(element, encoding=unicode)
        return html