From 3703adb44eff325ebe8a8c4568c415702270d3f3 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Fri, 16 Apr 2010 14:06:28 +0200 Subject: [PATCH] use LxmlHtmlParser as default parser --- weboob/backends/dlfp/pages/news.py | 19 ++++---- weboob/frontends/boobank/boobank.py | 2 +- weboob/frontends/travel/application.py | 10 ++--- weboob/tools/browser.py | 6 +++ weboob/tools/parser/__init__.py | 23 +++++++--- weboob/tools/parser/elementtidyparser.py | 21 ++++++++- weboob/tools/parser/html5libparser.py | 8 +++- .../{standardparser.py => htmlparser.py} | 44 ++++++++++--------- weboob/tools/parser/iparser.py | 36 +++++++++++++++ weboob/tools/parser/lxmlparser.py | 6 ++- 10 files changed, 130 insertions(+), 45 deletions(-) rename weboob/tools/parser/{standardparser.py => htmlparser.py} (73%) create mode 100644 weboob/tools/parser/iparser.py diff --git a/weboob/backends/dlfp/pages/news.py b/weboob/backends/dlfp/pages/news.py index d36693ff..a7287800 100644 --- a/weboob/backends/dlfp/pages/news.py +++ b/weboob/backends/dlfp/pages/news.py @@ -20,14 +20,14 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. from datetime import datetime -from weboob.tools.parser import tostring from weboob.tools.misc import local2utc from weboob.backends.dlfp.tools import url2id from .index import DLFPPage class Comment(object): - def __init__(self, div, reply_id): + def __init__(self, browser, div, reply_id): + self.browser = browser self.id = '' self.reply_id = reply_id self.title = u'' @@ -46,9 +46,9 @@ class Comment(object): self.author = sub.find('a').text self.date = self.parse_date(sub.find('i').tail) self.score = int(sub.findall('i')[1].find('span').text) - self.body = tostring(sub.find('p')) + self.body = self.browser.tostring(sub.find('p')) elif sub.attrib.get('class', '') == 'commentsul': - comment = Comment(sub.find('li'), self.id) + comment = Comment(self.browser, sub.find('li'), self.id) self.comments.append(comment) def parse_date(self, date_s): @@ -64,7 +64,8 @@ class Comment(object): return u"" % (self.id, self.author, self.title) class Article(object): - def __init__(self, _id, tree): + def __init__(self, browser, _id, tree): + self.browser = browser self.id = _id self.title = u'' self.author = u'' @@ -87,7 +88,7 @@ class Article(object): date_s = unicode(div.find('i').tail) #print date_s if div.attrib.get('class', '').startswith('bodydiv '): - self.body = tostring(div) + self.body = self.browser.tostring(div) def append_comment(self, comment): self.comments.append(comment) @@ -99,7 +100,7 @@ class Article(object): yield c def parse_part2(self, div): - self.part2 = tostring(div) + self.part2 = self.browser.tostring(div) class ContentPage(DLFPPage): def loaded(self): @@ -112,11 +113,11 @@ class ContentPage(DLFPPage): def parse_div(self, div): if div.attrib.get('class', '') in ('newsdiv', 'centraldiv'): - self.article = Article(url2id(self.url), div) + self.article = Article(self.browser, url2id(self.url), div) if div.attrib.get('class', '') == 'articlediv': self.article.parse_part2(div) if div.attrib.get('class', '') == 'comments': - comment = Comment(div, 0) + comment = Comment(self.browser, div, 0) self.article.append_comment(comment) def get_article(self): diff --git a/weboob/frontends/boobank/boobank.py b/weboob/frontends/boobank/boobank.py index 5fe3194e..7adbcb95 100644 --- a/weboob/frontends/boobank/boobank.py +++ b/weboob/frontends/boobank/boobank.py @@ -39,7 +39,7 @@ class Boobank(ConsoleApplication): @ConsoleApplication.command('List every available accounts') def command_list(self): accounts = [] - for backend, in self.weboob.iter_backends(): + for backend in self.weboob.iter_backends(): try: for account in backend.iter_accounts(): accounts.append('%17s %-20s %11.2f %11.2f' % ( diff --git a/weboob/frontends/travel/application.py b/weboob/frontends/travel/application.py index 7226f212..533af093 100644 --- a/weboob/frontends/travel/application.py +++ b/weboob/frontends/travel/application.py @@ -35,7 +35,7 @@ class Travel(ConsoleApplication): print '| ID | Name |' print '+--------------------------------+---------------------------------------------+' count = 0 - for backend, in self.weboob.iter_backends(): + for backend in self.weboob.iter_backends(): for station in backend.iter_station_search(pattern): print '| %-31s| %-44s|' % (station.id, station.name) count += 1 @@ -49,7 +49,7 @@ class Travel(ConsoleApplication): print "| ID | Type | Time | Arrival | Late | Info | Plateform |" print "+-----+-----------+-------+-----------------------+-------+--------------------+-----------+" count = 0 - for backend, in self.weboob.iter_backends(): + for backend in self.weboob.iter_backends(): for departure in backend.iter_station_departures(station, arrival): print u"|%4d | %-10s|%6s | %-22s|%6s | %-19s| %-10s|" % (departure.id, departure.type, @@ -59,6 +59,6 @@ class Travel(ConsoleApplication): departure.information, departure.plateform) count += 1 - print "+-----'-----------'-------'-----------------------'-------'--------------------+" - print "| %3d departures listed |" % count - print "'------------------------------------------------------------------------------'" + print "+-----'-----------'-------'-----------------------'-------'--------------------'-----------+" + print "| %3d departures listed |" % count + print "'------------------------------------------------------------------------------------------'" diff --git a/weboob/tools/browser.py b/weboob/tools/browser.py index 9775a949..e0a5b090 100644 --- a/weboob/tools/browser.py +++ b/weboob/tools/browser.py @@ -245,6 +245,12 @@ class Browser(mechanize.Browser): if self.__cookie: self.__cookie.save() + def tostring(self, elem): + """ + Get HTML string from document. + """ + return self.__parser.dump(elem) + def str(self, s): if isinstance(s, unicode): s = s.encode('iso-8859-15', 'replace') diff --git a/weboob/tools/parser/__init__.py b/weboob/tools/parser/__init__.py index 5915b6da..a1d6f651 100644 --- a/weboob/tools/parser/__init__.py +++ b/weboob/tools/parser/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ -Copyright(C) 2010 Christophe Benz +Copyright(C) 2010 Christophe Benz, Romain Bignon This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,17 +18,28 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. """ -from .standardparser import StandardParser, tostring - +# Low performances +# v +# v try: - from .elementtidyparser import ElementTidyParser + from .elementtidyparser import ElementTidyParser, ElementTidyParser as StandardParser except ImportError: pass +# v try: - from .html5libparser import Html5libParser + from .htmlparser import HTMLParser, HTMLParser as StandardParser except ImportError: pass +# v try: - from .lxmlparser import LxmlHtmlParser + from .html5libparser import Html5libParser, Html5libParser as StandardParser except ImportError: pass +# v +try: + from .lxmlparser import LxmlHtmlParser, LxmlHtmlParser as StandardParser +except ImportError: + pass +# v +# v +# High performances diff --git a/weboob/tools/parser/elementtidyparser.py b/weboob/tools/parser/elementtidyparser.py index 1ecd1f22..10e4e94e 100644 --- a/weboob/tools/parser/elementtidyparser.py +++ b/weboob/tools/parser/elementtidyparser.py @@ -32,7 +32,9 @@ try: except ImportError: from xml.etree import ElementTree -class ElementTidyParser(object): +from .iparser import IParser + +class ElementTidyParser(IParser): def parse(self, data, encoding=None): TidyHTMLTreeBuilder.ElementTree = ElementTree HTMLTreeBuilder = TidyHTMLTreeBuilder.TidyHTMLTreeBuilder @@ -42,3 +44,20 @@ class ElementTidyParser(object): if elem.tag.startswith('{'): elem.tag = elem.tag[elem.tag.find('}')+1:] return tree + + def dump(self, element): + e = ElementTree.Element('body') + e.text = element.text + e.tail = element.tail + for sub in element.getchildren(): + e.append(sub) + s = '' + # XXX OK if it doesn't work with utf-8, the result will be fucking ugly. + for encoding in ('utf-8', 'ISO-8859-1'): + try: + s = ElementTree.tostring(e, encoding) + except UnicodeError: + continue + else: + break + return unicode(s) diff --git a/weboob/tools/parser/html5libparser.py b/weboob/tools/parser/html5libparser.py index 8592fad5..81ec20f9 100644 --- a/weboob/tools/parser/html5libparser.py +++ b/weboob/tools/parser/html5libparser.py @@ -24,7 +24,9 @@ try: except ImportError: from xml.etree import ElementTree -class Html5libParser(HTMLParser): +from .iparser import IParser + +class Html5libParser(HTMLParser, IParser): """ Parser using html5lib. @@ -42,3 +44,7 @@ class Html5libParser(HTMLParser): def parse(self, data, encoding): return HTMLParser.parse(self, data, encoding=encoding) + + def dump(self, elem): + # TODO + raise NotImplementedError() diff --git a/weboob/tools/parser/standardparser.py b/weboob/tools/parser/htmlparser.py similarity index 73% rename from weboob/tools/parser/standardparser.py rename to weboob/tools/parser/htmlparser.py index 28d1cfde..3239dabc 100644 --- a/weboob/tools/parser/standardparser.py +++ b/weboob/tools/parser/htmlparser.py @@ -18,18 +18,20 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. """ -__all__ = ['StandardParser', 'tostring'] +__all__ = ['HTMLParser'] -from HTMLParser import HTMLParser +from HTMLParser import HTMLParser as _HTMLParser import htmlentitydefs try: from xml.etree import cElementTree as ElementTree except ImportError: from xml.etree import ElementTree -class HTMLTreeBuilder(HTMLParser): +from .iparser import IParser + +class HTMLTreeBuilder(_HTMLParser): def __init__(self, encoding=None): - HTMLParser.__init__(self) + _HTMLParser.__init__(self) self._target = ElementTree.TreeBuilder() def doctype(self, name, pubid, system): @@ -64,7 +66,7 @@ class HTMLTreeBuilder(HTMLParser): except: pass -class StandardParser(object): +class HTMLParser(IParser): def parse(self, data, encoding=None): parser = HTMLTreeBuilder(encoding) tree = ElementTree.parse(data, parser) @@ -73,19 +75,19 @@ class StandardParser(object): elem.tag = elem.tag[elem.tag.find('}')+1:] return tree -def tostring(element): - e = ElementTree.Element('body') - e.text = element.text - e.tail = element.tail - for sub in element.getchildren(): - e.append(sub) - s = '' - # XXX OK if it doesn't work with utf-8, the result will be fucking ugly. - for encoding in ('utf-8', 'ISO-8859-1'): - try: - s = ElementTree.tostring(e, encoding) - except UnicodeError: - continue - else: - break - return unicode(s) + def dump(self, element): + e = ElementTree.Element('body') + e.text = element.text + e.tail = element.tail + for sub in element.getchildren(): + e.append(sub) + s = '' + # XXX OK if it doesn't work with utf-8, the result will be fucking ugly. + for encoding in ('utf-8', 'ISO-8859-1'): + try: + s = ElementTree.tostring(e, encoding) + except UnicodeError: + continue + else: + break + return unicode(s) diff --git a/weboob/tools/parser/iparser.py b/weboob/tools/parser/iparser.py new file mode 100644 index 00000000..2e2db4cc --- /dev/null +++ b/weboob/tools/parser/iparser.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +""" +Copyright(C) 2010 Romain Bignon + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, version 3 of the License. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +""" + +class IParser(object): + def dump(self, elem): + """ + Get HTML string from an element. + """ + raise NotImplementedError() + + def parse(self, data, encoding=None): + """ + Parse a HTML document with a specific encoding to get a tree. + + @param data [str] HTML document + @param encoding [str] encoding to use + @return an object with the structured document + """ + raise NotImplementedError() diff --git a/weboob/tools/parser/lxmlparser.py b/weboob/tools/parser/lxmlparser.py index 8f865357..057ecca0 100644 --- a/weboob/tools/parser/lxmlparser.py +++ b/weboob/tools/parser/lxmlparser.py @@ -19,8 +19,12 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. """ import lxml.html +from .iparser import IParser -class LxmlHtmlParser(object): +class LxmlHtmlParser(IParser): def parse(self, data, encoding=None): parser = lxml.html.HTMLParser(encoding=encoding) return lxml.html.parse(data, parser) + + def dump(self, element): + return lxml.html.tostring(element, encoding=unicode)