boobot: More encoding fixes

This commit is contained in:
Laurent Bachelier 2013-06-25 00:31:49 +02:00
commit 70442b3044

View file

@@ -23,6 +23,7 @@ import logging
import re import re
import os import os
import sys import sys
import codecs
from threading import Thread, Event from threading import Thread, Event
from math import log from math import log
import urlparse import urlparse
@@ -116,17 +117,29 @@ class BoobotBrowser(StandardBrowser):
size = len(r.read()) size = len(r.read())
hsize = self.human_size(size) hsize = self.human_size(size)
r.seek(0) r.seek(0)
encoding = EncodingFinder('windows-1252').encoding(r).lower() encoding = EncodingFinder('windows-1252').encoding(r).lower()
try: try:
h = self.get_document(r, parser='lxml', encoding=encoding) h = self.get_document(r, parser='lxml', encoding=encoding)
for meta in h.xpath('//head/meta'): for meta in h.xpath('//head/meta'):
# meta http-equiv=content-type content=...
if meta.attrib.get('http-equiv', '').lower() == 'content-type':
for k, v in headersutil.split_header_words([meta.attrib.get('content', '')]):
if k == 'charset':
encoding = v
# meta charset=...
encoding = meta.attrib.get('charset', encoding).lower() encoding = meta.attrib.get('charset', encoding).lower()
except Exception as e: except Exception as e:
print e print e
finally: finally:
r.seek(0) r.seek(0)
if encoding == 'iso-8859-1': if encoding == 'iso-8859-1' or not encoding:
encoding = 'windows-1252' encoding = 'windows-1252'
try:
codecs.lookup(encoding)
except LookupError:
encoding = 'windows-1252'
try: try:
h = self.get_document(r, parser='lxml', encoding=encoding) h = self.get_document(r, parser='lxml', encoding=encoding)
for title in h.xpath('//head/title'): for title in h.xpath('//head/title'):