From 3710bb87ae9db41911d4322b18550a54e71b9093 Mon Sep 17 00:00:00 2001
From: Laurent Bachelier
Date: Sun, 5 May 2013 21:23:30 +0200
Subject: [PATCH] boobot: Support servers with no HEAD

When the HEAD request fails with "HTTP Error 501", retry by opening the
URL directly and reuse that response for the title lookup, so the page is
not opened a second time. Any other BrowserHTTPError is re-raised instead
of being swallowed, which would leave `r` and `body` unbound.

---
Review note: the `else: raise` branch was added during review; the
post-image blob id on the "index" line below was not regenerated.

 contrib/boobot.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/contrib/boobot.py b/contrib/boobot.py
index 93a02ac5..4c09a68b 100755
--- a/contrib/boobot.py
+++ b/contrib/boobot.py
@@ -34,7 +34,7 @@ from mechanize import _headersutil as headersutil
 from mechanize._html import EncodingFinder
 
 from weboob.core import Weboob
-from weboob.tools.browser import StandardBrowser, BrowserUnavailable
+from weboob.tools.browser import StandardBrowser, BrowserUnavailable, BrowserHTTPError
 from weboob.tools.misc import get_backtrace
 from weboob.tools.misc import to_unicode
 from weboob.tools.storage import StandardStorage
@@ -88,7 +88,16 @@ class BoobotBrowser(StandardBrowser):
     ENCODING = None
 
     def urlinfo(self, url):
-        r = self.openurl(HeadRequest(url))
+        try:
+            r = self.openurl(HeadRequest(url))
+            body = False
+        except BrowserHTTPError as e:
+            if 'HTTP Error 501' in unicode(e):
+                r = self.openurl(url)
+                body = True
+            else:
+                # Not a "HEAD unsupported" failure: do not leave r/body unbound
+                raise
         headers = r.info()
         content_type = headers.get('Content-Type')
         try:
@@ -100,7 +109,8 @@ class BoobotBrowser(StandardBrowser):
         is_html = headersutil.is_html([content_type], url, True)
         title = None
         if is_html:
-            r = self.openurl(url)
+            if not body:
+                r = self.openurl(url)
             encoding = EncodingFinder('windows-1252').encoding(r).lower().replace('iso-8859-1', 'windows-1252')
             h = self.get_document(r, parser='lxml', encoding=encoding)
             for title in h.xpath('//head/title'):