fix parsing of crappy DLFP pages
This commit is contained in:
parent
eb9118133b
commit
937e8ca370
1 changed files with 13 additions and 0 deletions
|
|
@ -19,12 +19,21 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import urllib
|
import urllib
|
||||||
|
from cStringIO import StringIO
|
||||||
|
|
||||||
from weboob.tools.browser import BaseBrowser
|
from weboob.tools.browser import BaseBrowser
|
||||||
|
from weboob.tools.parsers.elementtidyparser import ElementTidyParser
|
||||||
|
|
||||||
from .pages.index import IndexPage, LoginPage
|
from .pages.index import IndexPage, LoginPage
|
||||||
from .pages.news import ContentPage
|
from .pages.news import ContentPage
|
||||||
from .tools import id2url, id2threadid, id2contenttype
|
from .tools import id2url, id2threadid, id2contenttype
|
||||||
|
|
||||||
|
class Parser(ElementTidyParser):
    """ElementTidyParser variant that pre-cleans the markup before parsing."""

    def parse(self, data, encoding=None):
        """Parse *data* after collapsing each ``<<`` into a single ``<``.

        ``data`` is a file-like object; it is read in full, the doubled
        opening angle brackets are replaced, and the cleaned text is handed
        to ``ElementTidyParser.parse`` wrapped in a new ``StringIO``.
        ``encoding`` is forwarded unchanged. Returns whatever the base
        parser returns.
        """
        # The served pages contain doubled '<' characters (presumably
        # emitted by the site's template engine), so strip them up front.
        raw_markup = data.read()
        cleaned_markup = raw_markup.replace('<<', '<')
        return ElementTidyParser.parse(self, StringIO(cleaned_markup), encoding)
|
||||||
|
|
||||||
# Browser
|
# Browser
|
||||||
class DLFP(BaseBrowser):
|
class DLFP(BaseBrowser):
|
||||||
DOMAIN = 'linuxfr.org'
|
DOMAIN = 'linuxfr.org'
|
||||||
|
|
@ -36,6 +45,10 @@ class DLFP(BaseBrowser):
|
||||||
'https://linuxfr.org/.*/\d+.html': ContentPage
|
'https://linuxfr.org/.*/\d+.html': ContentPage
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
    """Initialise the browser with a fresh ``Parser`` instance.

    All positional and keyword arguments are forwarded to
    ``BaseBrowser.__init__``; any ``parser`` keyword supplied by the
    caller is overwritten.
    """
    kwargs.update(parser=Parser())
    BaseBrowser.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
def home(self):
    """Open the site's front page and return the result of ``self.location``."""
    front_page_url = 'https://linuxfr.org'
    return self.location(front_page_url)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue