boobot: Better handling of zero/invalid HTML

This commit is contained in:
Laurent Bachelier 2013-06-23 19:43:59 +02:00
commit 321a7114cd

View file

@ -114,9 +114,14 @@ class BoobotBrowser(StandardBrowser):
if is_html: if is_html:
if not body: if not body:
r = self.openurl(url, _tries=2, _delay=0.2) r = self.openurl(url, _tries=2, _delay=0.2)
# update size as we might not have it from headers
size = len(r.read())
hsize = self.human_size(size)
r.seek(0)
encoding = EncodingFinder('windows-1252').encoding(r).lower() encoding = EncodingFinder('windows-1252').encoding(r).lower()
if encoding == 'iso-8859-1': if encoding == 'iso-8859-1':
encoding = 'windows-1252' encoding = 'windows-1252'
try:
h = self.get_document(r, parser='lxml', encoding=encoding) h = self.get_document(r, parser='lxml', encoding=encoding)
for title in h.xpath('//head/title'): for title in h.xpath('//head/title'):
title = to_unicode(title.text_content()).strip() title = to_unicode(title.text_content()).strip()
@ -125,6 +130,10 @@ class BoobotBrowser(StandardBrowser):
for title in h.getroot().cssselect('.permalink-tweet .tweet-text'): for title in h.getroot().cssselect('.permalink-tweet .tweet-text'):
title = to_unicode(title.text_content()).strip() title = to_unicode(title.text_content()).strip()
title = ' '.join(title.splitlines()) title = ' '.join(title.splitlines())
except AssertionError as e:
# invalid HTML
print e
return content_type, hsize, title return content_type, hsize, title
def human_size(self, size): def human_size(self, size):