use the new browser

This commit is contained in:
Romain Bignon 2014-10-08 14:14:03 +02:00
commit 3ed6103740

View file

@ -25,7 +25,6 @@ import logging
import re
import os
import sys
import codecs
from threading import Thread, Event
from math import log
import urlparse
@ -33,12 +32,12 @@ import urllib
from random import randint, choice
import itertools
from irc.bot import SingleServerIRCBot
import mechanize
from mechanize import _headersutil as headersutil
from mechanize._html import EncodingFinder
from weboob.core import Weboob
from weboob.deprecated.browser import StandardBrowser, BrowserUnavailable
from weboob.exceptions import BrowserUnavailable, BrowserHTTPError
from weboob.browser import Browser
from weboob.browser.exceptions import HTTPNotFound
from weboob.browser.pages import HTMLPage
from weboob.tools.misc import get_backtrace
from weboob.tools.misc import to_unicode
from weboob.tools.storage import StandardStorage
@ -46,7 +45,7 @@ from weboob.tools.application.base import ApplicationStorage
IRC_CHANNELS = os.getenv('BOOBOT_CHANNELS', '#weboob').split(',')
IRC_NICKNAME = os.getenv('BOOBOT_NICKNAME', 'boobot')
IRC_SERVER = os.getenv('BOOBOT_SERVER', 'chat.freenode.net')
IRC_SERVER = os.getenv('BOOBOT_SERVER', 'dickson.freenode.net')
IRC_IGNORE = [re.compile(i) for i in os.getenv('BOOBOT_IGNORE', '!~?irker@').split(',')]
STORAGE_FILE = os.getenv('BOOBOT_STORAGE', 'boobot.storage')
@ -85,82 +84,50 @@ def fixurl(url):
return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
class HeadRequest(mechanize.Request):
def get_method(self):
return "HEAD"
class BoobotBrowser(StandardBrowser):
ENCODING = None
DEFAULT_TIMEOUT = 3
class BoobotBrowser(Browser):
TIMEOUT = 3.0
def urlinfo(self, url, maxback=2):
if urlparse.urlsplit(url).netloc == 'mobile.twitter.com':
url = url.replace('mobile.twitter.com', 'twitter.com', 1)
try:
r = self.openurl(HeadRequest(url), _tries=2, _delay=0.2)
r = self.open(url, method='HEAD')
body = False
except BrowserUnavailable as e:
if u'HTTP Error 501' in unicode(e) or u'HTTP Error 405' in unicode(e):
r = self.openurl(url, _tries=2, _delay=0.2)
body = True
elif u'HTTP Error 404' in unicode(e) \
and maxback and not url[-1].isalnum():
except HTTPNotFound as e:
if maxback and not url[-1].isalnum():
return self.urlinfo(url[:-1], maxback-1)
raise e
except BrowserHTTPError as e:
if e.response.status_code in (501, 405):
r = self.open(url)
body = True
else:
raise e
headers = r.info()
content_type = headers.get('Content-Type')
content_type = r.headers.get('Content-Type')
try:
size = int(headers.get('Content-Length'))
size = int(r.headers.get('Content-Length'))
hsize = self.human_size(size)
except TypeError:
size = None
hsize = None
is_html = headersutil.is_html([content_type], url, True)
is_html = ('html' in content_type) if content_type else re.match(r'\.x?html?$', url)
title = None
if is_html:
if not body:
r = self.openurl(url, _tries=2, _delay=0.2)
r = self.open(url)
# update size as we might not have it from headers
size = len(r.read())
size = len(r.content)
hsize = self.human_size(size)
r.seek(0)
encoding = EncodingFinder('windows-1252').encoding(r).lower()
try:
h = self.get_document(r, parser='lxml', encoding=encoding)
for meta in h.xpath('//head/meta'):
# meta http-equiv=content-type content=...
if meta.attrib.get('http-equiv', '').lower() == 'content-type':
for k, v in headersutil.split_header_words([meta.attrib.get('content', '')]):
if k == 'charset':
encoding = v
# meta charset=...
encoding = meta.attrib.get('charset', encoding).lower()
except Exception as e:
print(e)
finally:
r.seek(0)
if encoding == 'iso-8859-1' or not encoding:
encoding = 'windows-1252'
try:
codecs.lookup(encoding)
except LookupError:
encoding = 'windows-1252'
page = HTMLPage(self, r)
try:
h = self.get_document(r, parser='lxml', encoding=encoding)
for title in h.xpath('//head/title'):
for title in page.doc.xpath('//head/title'):
title = to_unicode(title.text_content()).strip()
title = ' '.join(title.split())
if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
for title in page.doc.getroot().cssselect('.permalink-tweet .tweet-text'):
title = to_unicode(title.text_content()).strip()
title = ' '.join(title.split())
if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
for title in h.getroot().cssselect('.permalink-tweet .tweet-text'):
title = to_unicode(title.text_content()).strip()
title = ' '.join(title.splitlines())
except AssertionError as e:
# invalid HTML
print(e)
title = ' '.join(title.splitlines())
return content_type, hsize, title
@ -198,8 +165,7 @@ class MyThread(Thread):
'weboob', 'videoob', 'havesex', 'havedate', 'monboob', 'boobmsg',
'flatboob', 'boobill', 'pastoob', 'radioob', 'translaboob', 'traveloob', 'handjoob',
'boobathon', 'boobank', 'boobtracker', 'comparoob', 'wetboobs',
'webcontentedit', 'weboorrents', u'sàt', u'salut à toi', 'assnet',
'budget insight', 'budget-insight', 'budgetinsight', 'budgea']:
'webcontentedit', 'weboorrents', 'assnet', 'budget insight', 'budget-insight', 'budgetinsight', 'budgea']:
if word in text.lower():
return word
return None
@ -315,7 +281,7 @@ class Boobot(SingleServerIRCBot):
quotes.append({'author': nick, 'timestamp': datetime.now(), 'text': text})
self.storage.set(channel, 'quotes', quotes)
self.storage.save()
self.send_message('Quote #%s added' % len(quotes) - 1, channel)
self.send_message('Quote #%s added' % (len(quotes) - 1), channel)
def cmd_delquote(self, nick, channel, text):
quotes = self.storage.get(channel, 'quotes', default=[])
@ -372,7 +338,7 @@ class Boobot(SingleServerIRCBot):
if backend_name in self.weboob.backend_instances:
backend = self.weboob.backend_instances[backend_name]
for cap in backend.iter_caps():
func = 'obj_info_%s' % cap.__name__[4:].lower()
func = 'obj_info_%s' % cap.__name__[3:].lower()
if hasattr(self, func):
try:
for msg in getattr(self, func)(backend, _id):