fix parsing of crappy DLFP pages
This commit is contained in:
parent
eb9118133b
commit
937e8ca370
1 changed file with 13 additions and 0 deletions
|
|
@ -19,12 +19,21 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|||
"""
|
||||
|
||||
import urllib
|
||||
from cStringIO import StringIO
|
||||
|
||||
from weboob.tools.browser import BaseBrowser
|
||||
from weboob.tools.parsers.elementtidyparser import ElementTidyParser
|
||||
|
||||
from .pages.index import IndexPage, LoginPage
|
||||
from .pages.news import ContentPage
|
||||
from .tools import id2url, id2threadid, id2contenttype
|
||||
|
||||
class Parser(ElementTidyParser):
    """ElementTidyParser that cleans up doubled '<' characters before parsing."""

    def parse(self, data, encoding=None):
        """Read `data`, collapse each '<<' into '<', and parse the result.

        `data` must provide a ``read()`` method; its whole content is read,
        patched, wrapped in a new StringIO and handed to
        ElementTidyParser.parse together with `encoding`.
        """
        # The fetched markup contains stray '<<' sequences (the original
        # comment blames the "templeet coders" for them), so they are
        # rewritten to a single '<' before the parent parser sees them.
        raw = data.read()
        cleaned = raw.replace('<<', '<')
        buf = StringIO(cleaned)
        return ElementTidyParser.parse(self, buf, encoding)
|
||||
|
||||
# Browser
|
||||
class DLFP(BaseBrowser):
|
||||
DOMAIN = 'linuxfr.org'
|
||||
|
|
@ -36,6 +45,10 @@ class DLFP(BaseBrowser):
|
|||
'https://linuxfr.org/.*/\d+.html': ContentPage
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
    """Initialise the browser, forcing the markup-repairing Parser.

    All positional and keyword arguments are forwarded to
    BaseBrowser.__init__; any 'parser' keyword supplied by the caller is
    replaced by a fresh Parser instance.
    """
    kwargs.update(parser=Parser())
    BaseBrowser.__init__(self, *args, **kwargs)
|
||||
|
||||
def home(self):
    """Open the site's front page and return whatever self.location returns."""
    front_page = 'https://linuxfr.org'
    return self.location(front_page)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue