diff --git a/contrib/boobot.py b/contrib/boobot.py index b6e48dec..4ee0f302 100755 --- a/contrib/boobot.py +++ b/contrib/boobot.py @@ -25,7 +25,6 @@ import logging import re import os import sys -import codecs from threading import Thread, Event from math import log import urlparse @@ -33,12 +32,12 @@ import urllib from random import randint, choice import itertools from irc.bot import SingleServerIRCBot -import mechanize -from mechanize import _headersutil as headersutil -from mechanize._html import EncodingFinder from weboob.core import Weboob -from weboob.deprecated.browser import StandardBrowser, BrowserUnavailable +from weboob.exceptions import BrowserUnavailable, BrowserHTTPError +from weboob.browser import Browser +from weboob.browser.exceptions import HTTPNotFound +from weboob.browser.pages import HTMLPage from weboob.tools.misc import get_backtrace from weboob.tools.misc import to_unicode from weboob.tools.storage import StandardStorage @@ -46,7 +45,7 @@ from weboob.tools.application.base import ApplicationStorage IRC_CHANNELS = os.getenv('BOOBOT_CHANNELS', '#weboob').split(',') IRC_NICKNAME = os.getenv('BOOBOT_NICKNAME', 'boobot') -IRC_SERVER = os.getenv('BOOBOT_SERVER', 'chat.freenode.net') +IRC_SERVER = os.getenv('BOOBOT_SERVER', 'dickson.freenode.net') IRC_IGNORE = [re.compile(i) for i in os.getenv('BOOBOT_IGNORE', '!~?irker@').split(',')] STORAGE_FILE = os.getenv('BOOBOT_STORAGE', 'boobot.storage') @@ -85,82 +84,50 @@ def fixurl(url): return urlparse.urlunsplit((scheme, netloc, path, query, fragment)) -class HeadRequest(mechanize.Request): - def get_method(self): - return "HEAD" - - -class BoobotBrowser(StandardBrowser): - ENCODING = None - DEFAULT_TIMEOUT = 3 +class BoobotBrowser(Browser): + TIMEOUT = 3.0 def urlinfo(self, url, maxback=2): if urlparse.urlsplit(url).netloc == 'mobile.twitter.com': url = url.replace('mobile.twitter.com', 'twitter.com', 1) try: - r = self.openurl(HeadRequest(url), _tries=2, _delay=0.2) + r = self.open(url, 
method='HEAD') body = False - except BrowserUnavailable as e: - if u'HTTP Error 501' in unicode(e) or u'HTTP Error 405' in unicode(e): - r = self.openurl(url, _tries=2, _delay=0.2) - body = True - elif u'HTTP Error 404' in unicode(e) \ - and maxback and not url[-1].isalnum(): + except HTTPNotFound as e: + if maxback and not url[-1].isalnum(): return self.urlinfo(url[:-1], maxback-1) + raise e + except BrowserHTTPError as e: + if e.response.status_code in (501, 405): + r = self.open(url) + body = True else: raise e - headers = r.info() - content_type = headers.get('Content-Type') + content_type = r.headers.get('Content-Type') try: - size = int(headers.get('Content-Length')) + size = int(r.headers.get('Content-Length')) hsize = self.human_size(size) except TypeError: size = None hsize = None - is_html = headersutil.is_html([content_type], url, True) + is_html = ('html' in content_type) if content_type else re.search(r'\.x?html?$', url) title = None if is_html: if not body: - r = self.openurl(url, _tries=2, _delay=0.2) + r = self.open(url) # update size has we might not have it from headers - size = len(r.read()) + size = len(r.content) hsize = self.human_size(size) - r.seek(0) - encoding = EncodingFinder('windows-1252').encoding(r).lower() - try: - h = self.get_document(r, parser='lxml', encoding=encoding) - for meta in h.xpath('//head/meta'): - # meta http-equiv=content-type content=... - if meta.attrib.get('http-equiv', '').lower() == 'content-type': - for k, v in headersutil.split_header_words([meta.attrib.get('content', '')]): - if k == 'charset': - encoding = v - # meta charset=... 
- encoding = meta.attrib.get('charset', encoding).lower() - except Exception as e: - print(e) - finally: - r.seek(0) - if encoding == 'iso-8859-1' or not encoding: - encoding = 'windows-1252' - try: - codecs.lookup(encoding) - except LookupError: - encoding = 'windows-1252' + page = HTMLPage(self, r) - try: - h = self.get_document(r, parser='lxml', encoding=encoding) - for title in h.xpath('//head/title'): + for title in page.doc.xpath('//head/title'): + title = to_unicode(title.text_content()).strip() + title = ' '.join(title.split()) + if urlparse.urlsplit(url).netloc.endswith('twitter.com'): + for title in page.doc.getroot().cssselect('.permalink-tweet .tweet-text'): title = to_unicode(title.text_content()).strip() - title = ' '.join(title.split()) - if urlparse.urlsplit(url).netloc.endswith('twitter.com'): - for title in h.getroot().cssselect('.permalink-tweet .tweet-text'): - title = to_unicode(title.text_content()).strip() - title = ' '.join(title.splitlines()) - except AssertionError as e: - # invalid HTML - print(e) + title = ' '.join(title.splitlines()) return content_type, hsize, title @@ -198,8 +165,7 @@ class MyThread(Thread): 'weboob', 'videoob', 'havesex', 'havedate', 'monboob', 'boobmsg', 'flatboob', 'boobill', 'pastoob', 'radioob', 'translaboob', 'traveloob', 'handjoob', 'boobathon', 'boobank', 'boobtracker', 'comparoob', 'wetboobs', - 'webcontentedit', 'weboorrents', u'sàt', u'salut à toi', 'assnet', - 'budget insight', 'budget-insight', 'budgetinsight', 'budgea']: + 'webcontentedit', 'weboorrents', 'assnet', 'budget insight', 'budget-insight', 'budgetinsight', 'budgea']: if word in text.lower(): return word return None @@ -315,7 +281,7 @@ class Boobot(SingleServerIRCBot): quotes.append({'author': nick, 'timestamp': datetime.now(), 'text': text}) self.storage.set(channel, 'quotes', quotes) self.storage.save() - self.send_message('Quote #%s added' % len(quotes) - 1, channel) + self.send_message('Quote #%s added' % (len(quotes) - 1), channel) def 
cmd_delquote(self, nick, channel, text): quotes = self.storage.get(channel, 'quotes', default=[]) @@ -372,7 +338,7 @@ class Boobot(SingleServerIRCBot): if backend_name in self.weboob.backend_instances: backend = self.weboob.backend_instances[backend_name] for cap in backend.iter_caps(): - func = 'obj_info_%s' % cap.__name__[4:].lower() + func = 'obj_info_%s' % cap.__name__[3:].lower() if hasattr(self, func): try: for msg in getattr(self, func)(backend, _id):