From b6021ec90d4b3e2b3bcc933784daa438186214e2 Mon Sep 17 00:00:00 2001 From: Laurent Bachelier Date: Sun, 23 Jun 2013 20:26:12 +0200 Subject: [PATCH] boobot: Try to use HTML encoding --- contrib/boobot.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/contrib/boobot.py b/contrib/boobot.py index e90a69f7..219a7613 100755 --- a/contrib/boobot.py +++ b/contrib/boobot.py @@ -116,6 +116,14 @@ class BoobotBrowser(StandardBrowser): hsize = self.human_size(size) r.seek(0) encoding = EncodingFinder('windows-1252').encoding(r).lower() + try: + h = self.get_document(r, parser='lxml', encoding=encoding) + for meta in h.xpath('//head/meta'): + encoding = meta.attrib.get('charset', encoding).lower() + except Exception as e: + print e + finally: + r.seek(0) if encoding == 'iso-8859-1': encoding = 'windows-1252' try: