From ab7287747a422d34d91e6d3a69f81eb1d330e4e0 Mon Sep 17 00:00:00 2001
From: Laurent Bachelier <laurent@bachelier.name>
Date: Sun, 5 May 2013 20:40:51 +0200
Subject: [PATCH] boobot: Guess page encoding

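Determine the page encoding with mechanize's EncodingFinder (constructed
with 'windows-1252' as its default) and pass it to the lxml parser via
get_document(), instead of feeding the raw page to lxml.html.fromstring().
Also reuse the current browser instance for the HEAD request rather than
creating a new StandardBrowser.

This seems to work with most pages.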
---
 contrib/boobot.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/contrib/boobot.py b/contrib/boobot.py
index eb0d8818..f4e7b62c 100755
--- a/contrib/boobot.py
+++ b/contrib/boobot.py
@@ -31,7 +31,7 @@ import urllib
 from irc.bot import SingleServerIRCBot
 import mechanize
 from mechanize import _headersutil as headersutil
-import lxml.html
+from mechanize._html import EncodingFinder
 
 from weboob.core import Weboob
 from weboob.tools.browser import StandardBrowser, BrowserUnavailable
@@ -85,9 +85,10 @@ class HeadRequest(mechanize.Request):
 
 
 class BoobotBrowser(StandardBrowser):
+    ENCODING = None
+
     def urlinfo(self, url):
-        b = StandardBrowser()
-        r = b.openurl(HeadRequest(url))
+        r = self.openurl(HeadRequest(url))
         headers = r.info()
         content_type = headers.get('Content-Type')
         try:
@@ -99,7 +100,9 @@ class BoobotBrowser(StandardBrowser):
         is_html = headersutil.is_html([content_type], url, True)
         title = None
         if is_html:
-            h = lxml.html.fromstring(self.readurl(url))
+            r = self.openurl(url)
+            encoding = EncodingFinder('windows-1252').encoding(r)
+            h = self.get_document(r, parser='lxml', encoding=encoding)
             for title in h.xpath('//head/title'):
                 title = to_unicode(title.text_content())
         return content_type, hsize, title