boobot: More encoding fixes

2013-06-25 00:31:49 +02:00 · 2013-06-25 00:31:49 +02:00 · 70442b3044
commit 70442b3044
parent a488edbd69
1 changed file with 14 additions and 1 deletion
--- a/contrib/boobot.py
+++ b/contrib/boobot.py
@@ -23,6 +23,7 @@ import logging
 import re
 import os
 import sys
+import codecs
 from threading import Thread, Event
 from math import log
 import urlparse
@@ -116,17 +117,29 @@ class BoobotBrowser(StandardBrowser):
            size = len(r.read())
            hsize = self.human_size(size)
            r.seek(0)
+
            encoding = EncodingFinder('windows-1252').encoding(r).lower()
            try:
                h = self.get_document(r, parser='lxml', encoding=encoding)
                for meta in h.xpath('//head/meta'):
+                    # meta http-equiv=content-type content=...
+                    if meta.attrib.get('http-equiv', '').lower() == 'content-type':
+                        for k, v in headersutil.split_header_words([meta.attrib.get('content', '')]):
+                            if k == 'charset':
+                                encoding = v
+                    # meta charset=...
                    encoding = meta.attrib.get('charset', encoding).lower()
            except Exception as e:
                print e
            finally:
                r.seek(0)
-            if encoding == 'iso-8859-1':
+            if encoding == 'iso-8859-1' or not encoding:
                encoding = 'windows-1252'
+            try:
+                codecs.lookup(encoding)
+            except LookupError:
+                encoding = 'windows-1252'
+
            try:
                h = self.get_document(r, parser='lxml', encoding=encoding)
                for title in h.xpath('//head/title'):