use the new browser

This commit is contained in:
Romain Bignon 2014-10-08 14:14:03 +02:00
commit 3ed6103740

View file

@ -25,7 +25,6 @@ import logging
import re import re
import os import os
import sys import sys
import codecs
from threading import Thread, Event from threading import Thread, Event
from math import log from math import log
import urlparse import urlparse
@ -33,12 +32,12 @@ import urllib
from random import randint, choice from random import randint, choice
import itertools import itertools
from irc.bot import SingleServerIRCBot from irc.bot import SingleServerIRCBot
import mechanize
from mechanize import _headersutil as headersutil
from mechanize._html import EncodingFinder
from weboob.core import Weboob from weboob.core import Weboob
from weboob.deprecated.browser import StandardBrowser, BrowserUnavailable from weboob.exceptions import BrowserUnavailable, BrowserHTTPError
from weboob.browser import Browser
from weboob.browser.exceptions import HTTPNotFound
from weboob.browser.pages import HTMLPage
from weboob.tools.misc import get_backtrace from weboob.tools.misc import get_backtrace
from weboob.tools.misc import to_unicode from weboob.tools.misc import to_unicode
from weboob.tools.storage import StandardStorage from weboob.tools.storage import StandardStorage
@ -46,7 +45,7 @@ from weboob.tools.application.base import ApplicationStorage
IRC_CHANNELS = os.getenv('BOOBOT_CHANNELS', '#weboob').split(',') IRC_CHANNELS = os.getenv('BOOBOT_CHANNELS', '#weboob').split(',')
IRC_NICKNAME = os.getenv('BOOBOT_NICKNAME', 'boobot') IRC_NICKNAME = os.getenv('BOOBOT_NICKNAME', 'boobot')
IRC_SERVER = os.getenv('BOOBOT_SERVER', 'chat.freenode.net') IRC_SERVER = os.getenv('BOOBOT_SERVER', 'dickson.freenode.net')
IRC_IGNORE = [re.compile(i) for i in os.getenv('BOOBOT_IGNORE', '!~?irker@').split(',')] IRC_IGNORE = [re.compile(i) for i in os.getenv('BOOBOT_IGNORE', '!~?irker@').split(',')]
STORAGE_FILE = os.getenv('BOOBOT_STORAGE', 'boobot.storage') STORAGE_FILE = os.getenv('BOOBOT_STORAGE', 'boobot.storage')
@ -85,82 +84,50 @@ def fixurl(url):
return urlparse.urlunsplit((scheme, netloc, path, query, fragment)) return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
class HeadRequest(mechanize.Request): class BoobotBrowser(Browser):
def get_method(self): TIMEOUT = 3.0
return "HEAD"
class BoobotBrowser(StandardBrowser):
ENCODING = None
DEFAULT_TIMEOUT = 3
def urlinfo(self, url, maxback=2): def urlinfo(self, url, maxback=2):
if urlparse.urlsplit(url).netloc == 'mobile.twitter.com': if urlparse.urlsplit(url).netloc == 'mobile.twitter.com':
url = url.replace('mobile.twitter.com', 'twitter.com', 1) url = url.replace('mobile.twitter.com', 'twitter.com', 1)
try: try:
r = self.openurl(HeadRequest(url), _tries=2, _delay=0.2) r = self.open(url, method='HEAD')
body = False body = False
except BrowserUnavailable as e: except HTTPNotFound as e:
if u'HTTP Error 501' in unicode(e) or u'HTTP Error 405' in unicode(e): if maxback and not url[-1].isalnum():
r = self.openurl(url, _tries=2, _delay=0.2)
body = True
elif u'HTTP Error 404' in unicode(e) \
and maxback and not url[-1].isalnum():
return self.urlinfo(url[:-1], maxback-1) return self.urlinfo(url[:-1], maxback-1)
raise e
except BrowserHTTPError as e:
if e.response.status_code in (501, 405):
r = self.open(url)
body = True
else: else:
raise e raise e
headers = r.info() content_type = r.headers.get('Content-Type')
content_type = headers.get('Content-Type')
try: try:
size = int(headers.get('Content-Length')) size = int(r.headers.get('Content-Length'))
hsize = self.human_size(size) hsize = self.human_size(size)
except TypeError: except TypeError:
size = None size = None
hsize = None hsize = None
is_html = headersutil.is_html([content_type], url, True) is_html = ('html' in content_type) if content_type else re.match(r'\.x?html?$', url)
title = None title = None
if is_html: if is_html:
if not body: if not body:
r = self.openurl(url, _tries=2, _delay=0.2) r = self.open(url)
# update size has we might not have it from headers # update size has we might not have it from headers
size = len(r.read()) size = len(r.content)
hsize = self.human_size(size) hsize = self.human_size(size)
r.seek(0)
encoding = EncodingFinder('windows-1252').encoding(r).lower() page = HTMLPage(self, r)
try:
h = self.get_document(r, parser='lxml', encoding=encoding)
for meta in h.xpath('//head/meta'):
# meta http-equiv=content-type content=...
if meta.attrib.get('http-equiv', '').lower() == 'content-type':
for k, v in headersutil.split_header_words([meta.attrib.get('content', '')]):
if k == 'charset':
encoding = v
# meta charset=...
encoding = meta.attrib.get('charset', encoding).lower()
except Exception as e:
print(e)
finally:
r.seek(0)
if encoding == 'iso-8859-1' or not encoding:
encoding = 'windows-1252'
try:
codecs.lookup(encoding)
except LookupError:
encoding = 'windows-1252'
try: for title in page.doc.xpath('//head/title'):
h = self.get_document(r, parser='lxml', encoding=encoding) title = to_unicode(title.text_content()).strip()
for title in h.xpath('//head/title'): title = ' '.join(title.split())
if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
for title in page.doc.getroot().cssselect('.permalink-tweet .tweet-text'):
title = to_unicode(title.text_content()).strip() title = to_unicode(title.text_content()).strip()
title = ' '.join(title.split()) title = ' '.join(title.splitlines())
if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
for title in h.getroot().cssselect('.permalink-tweet .tweet-text'):
title = to_unicode(title.text_content()).strip()
title = ' '.join(title.splitlines())
except AssertionError as e:
# invalid HTML
print(e)
return content_type, hsize, title return content_type, hsize, title
@ -198,8 +165,7 @@ class MyThread(Thread):
'weboob', 'videoob', 'havesex', 'havedate', 'monboob', 'boobmsg', 'weboob', 'videoob', 'havesex', 'havedate', 'monboob', 'boobmsg',
'flatboob', 'boobill', 'pastoob', 'radioob', 'translaboob', 'traveloob', 'handjoob', 'flatboob', 'boobill', 'pastoob', 'radioob', 'translaboob', 'traveloob', 'handjoob',
'boobathon', 'boobank', 'boobtracker', 'comparoob', 'wetboobs', 'boobathon', 'boobank', 'boobtracker', 'comparoob', 'wetboobs',
'webcontentedit', 'weboorrents', u'sàt', u'salut à toi', 'assnet', 'webcontentedit', 'weboorrents', 'assnet', 'budget insight', 'budget-insight', 'budgetinsight', 'budgea']:
'budget insight', 'budget-insight', 'budgetinsight', 'budgea']:
if word in text.lower(): if word in text.lower():
return word return word
return None return None
@ -315,7 +281,7 @@ class Boobot(SingleServerIRCBot):
quotes.append({'author': nick, 'timestamp': datetime.now(), 'text': text}) quotes.append({'author': nick, 'timestamp': datetime.now(), 'text': text})
self.storage.set(channel, 'quotes', quotes) self.storage.set(channel, 'quotes', quotes)
self.storage.save() self.storage.save()
self.send_message('Quote #%s added' % len(quotes) - 1, channel) self.send_message('Quote #%s added' % (len(quotes) - 1), channel)
def cmd_delquote(self, nick, channel, text): def cmd_delquote(self, nick, channel, text):
quotes = self.storage.get(channel, 'quotes', default=[]) quotes = self.storage.get(channel, 'quotes', default=[])
@ -372,7 +338,7 @@ class Boobot(SingleServerIRCBot):
if backend_name in self.weboob.backend_instances: if backend_name in self.weboob.backend_instances:
backend = self.weboob.backend_instances[backend_name] backend = self.weboob.backend_instances[backend_name]
for cap in backend.iter_caps(): for cap in backend.iter_caps():
func = 'obj_info_%s' % cap.__name__[4:].lower() func = 'obj_info_%s' % cap.__name__[3:].lower()
if hasattr(self, func): if hasattr(self, func):
try: try:
for msg in getattr(self, func)(backend, _id): for msg in getattr(self, func)(backend, _id):