Fix parsing of malformed DLFP pages

This commit is contained in:
Romain Bignon 2010-05-11 15:08:40 +02:00
commit 937e8ca370

View file

@ -19,12 +19,21 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
import urllib
from cStringIO import StringIO
from weboob.tools.browser import BaseBrowser
from weboob.tools.parsers.elementtidyparser import ElementTidyParser
from .pages.index import IndexPage, LoginPage
from .pages.news import ContentPage
from .tools import id2url, id2threadid, id2contenttype
class Parser(ElementTidyParser):
    """ElementTidyParser that pre-cleans the page source before parsing.

    Every doubled opening bracket ('<<') in the input is collapsed to a
    single '<' before the document is handed to the base parser.
    """

    def parse(self, data, encoding=None):
        """Sanitise *data* and parse it with ``ElementTidyParser.parse``.

        data     -- file-like object; its whole content is consumed with
                    ``read()``.
        encoding -- forwarded unchanged to the base parser.

        Returns whatever ``ElementTidyParser.parse`` returns.
        """
        # NOTE(review): the stray '<<' sequences are presumably emitted by
        # the site's Templeet templates -- confirm against a live page.
        raw_markup = data.read()
        cleaned_markup = raw_markup.replace('<<', '<')
        # The base parser is given a fresh file-like object wrapping the
        # cleaned text, mirroring the stream interface it received here.
        return ElementTidyParser.parse(self, StringIO(cleaned_markup), encoding)
# Browser
class DLFP(BaseBrowser):
DOMAIN = 'linuxfr.org'
@ -36,6 +45,10 @@ class DLFP(BaseBrowser):
'https://linuxfr.org/.*/\d+.html': ContentPage
}
def __init__(self, *args, **kwargs):
    """Initialise the browser with this module's ``Parser`` installed.

    All positional and keyword arguments are forwarded to
    ``BaseBrowser.__init__``; a caller-supplied ``parser`` keyword is
    overwritten with a new ``Parser`` instance.
    """
    # Force the sanitising parser regardless of what the caller passed.
    kwargs.update(parser=Parser())
    BaseBrowser.__init__(self, *args, **kwargs)
def home(self):
    """Open the site root and return the result of ``self.location``."""
    root_url = 'https://linuxfr.org'
    return self.location(root_url)