From 937e8ca370b2390bdc41e643794d5aa98515a91f Mon Sep 17 00:00:00 2001
From: Romain Bignon
Date: Tue, 11 May 2010 15:08:40 +0200
Subject: [PATCH] fix parsing of crappy DLFP pages

---
 weboob/backends/dlfp/browser.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/weboob/backends/dlfp/browser.py b/weboob/backends/dlfp/browser.py
index d4845857..5d54a024 100644
--- a/weboob/backends/dlfp/browser.py
+++ b/weboob/backends/dlfp/browser.py
@@ -19,12 +19,21 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """
 
 import urllib
+from cStringIO import StringIO
 
 from weboob.tools.browser import BaseBrowser
+from weboob.tools.parsers.elementtidyparser import ElementTidyParser
+
 from .pages.index import IndexPage, LoginPage
 from .pages.news import ContentPage
 from .tools import id2url, id2threadid, id2contenttype
 
+class Parser(ElementTidyParser):  # ElementTidyParser variant that pre-cleans the page text before parsing
+    def parse(self, data, encoding=None):  # data: object with .read(); encoding is passed through unchanged
+        # HACK: DLFP pages contain stray '<<' sequences (the original note blames Templeet); collapse each to '<' first
+        data = StringIO(data.read().replace('<<', '<'))  # re-wrap the cleaned text so the parent still gets a file-like object
+        return ElementTidyParser.parse(self, data, encoding)
+
 # Browser
 class DLFP(BaseBrowser):
     DOMAIN = 'linuxfr.org'
@@ -36,6 +45,10 @@ class DLFP(BaseBrowser):
              'https://linuxfr.org/.*/\d+.html': ContentPage
             }
 
+    def __init__(self, *args, **kwargs):  # accepts and forwards whatever BaseBrowser.__init__ accepts
+        kwargs['parser'] = Parser()  # always install the cleaning parser; overrides any caller-supplied 'parser' kwarg
+        BaseBrowser.__init__(self, *args, **kwargs)
+
     def home(self):
         return self.location('https://linuxfr.org')
 