use the new browser
This commit is contained in:
parent
65ece349db
commit
3ed6103740
1 changed file with 32 additions and 66 deletions
|
|
@@ -25,7 +25,6 @@ import logging
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import codecs
|
|
||||||
from threading import Thread, Event
|
from threading import Thread, Event
|
||||||
from math import log
|
from math import log
|
||||||
import urlparse
|
import urlparse
|
||||||
|
|
@@ -33,12 +32,12 @@ import urllib
|
||||||
from random import randint, choice
|
from random import randint, choice
|
||||||
import itertools
|
import itertools
|
||||||
from irc.bot import SingleServerIRCBot
|
from irc.bot import SingleServerIRCBot
|
||||||
import mechanize
|
|
||||||
from mechanize import _headersutil as headersutil
|
|
||||||
from mechanize._html import EncodingFinder
|
|
||||||
|
|
||||||
from weboob.core import Weboob
|
from weboob.core import Weboob
|
||||||
from weboob.deprecated.browser import StandardBrowser, BrowserUnavailable
|
from weboob.exceptions import BrowserUnavailable, BrowserHTTPError
|
||||||
|
from weboob.browser import Browser
|
||||||
|
from weboob.browser.exceptions import HTTPNotFound
|
||||||
|
from weboob.browser.pages import HTMLPage
|
||||||
from weboob.tools.misc import get_backtrace
|
from weboob.tools.misc import get_backtrace
|
||||||
from weboob.tools.misc import to_unicode
|
from weboob.tools.misc import to_unicode
|
||||||
from weboob.tools.storage import StandardStorage
|
from weboob.tools.storage import StandardStorage
|
||||||
|
|
@@ -46,7 +45,7 @@ from weboob.tools.application.base import ApplicationStorage
|
||||||
|
|
||||||
IRC_CHANNELS = os.getenv('BOOBOT_CHANNELS', '#weboob').split(',')
|
IRC_CHANNELS = os.getenv('BOOBOT_CHANNELS', '#weboob').split(',')
|
||||||
IRC_NICKNAME = os.getenv('BOOBOT_NICKNAME', 'boobot')
|
IRC_NICKNAME = os.getenv('BOOBOT_NICKNAME', 'boobot')
|
||||||
IRC_SERVER = os.getenv('BOOBOT_SERVER', 'chat.freenode.net')
|
IRC_SERVER = os.getenv('BOOBOT_SERVER', 'dickson.freenode.net')
|
||||||
IRC_IGNORE = [re.compile(i) for i in os.getenv('BOOBOT_IGNORE', '!~?irker@').split(',')]
|
IRC_IGNORE = [re.compile(i) for i in os.getenv('BOOBOT_IGNORE', '!~?irker@').split(',')]
|
||||||
STORAGE_FILE = os.getenv('BOOBOT_STORAGE', 'boobot.storage')
|
STORAGE_FILE = os.getenv('BOOBOT_STORAGE', 'boobot.storage')
|
||||||
|
|
||||||
|
|
@@ -85,82 +84,50 @@ def fixurl(url):
|
||||||
return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
|
return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
|
||||||
|
|
||||||
|
|
||||||
class HeadRequest(mechanize.Request):
|
class BoobotBrowser(Browser):
|
||||||
def get_method(self):
|
TIMEOUT = 3.0
|
||||||
return "HEAD"
|
|
||||||
|
|
||||||
|
|
||||||
class BoobotBrowser(StandardBrowser):
|
|
||||||
ENCODING = None
|
|
||||||
DEFAULT_TIMEOUT = 3
|
|
||||||
|
|
||||||
def urlinfo(self, url, maxback=2):
|
def urlinfo(self, url, maxback=2):
|
||||||
if urlparse.urlsplit(url).netloc == 'mobile.twitter.com':
|
if urlparse.urlsplit(url).netloc == 'mobile.twitter.com':
|
||||||
url = url.replace('mobile.twitter.com', 'twitter.com', 1)
|
url = url.replace('mobile.twitter.com', 'twitter.com', 1)
|
||||||
try:
|
try:
|
||||||
r = self.openurl(HeadRequest(url), _tries=2, _delay=0.2)
|
r = self.open(url, method='HEAD')
|
||||||
body = False
|
body = False
|
||||||
except BrowserUnavailable as e:
|
except HTTPNotFound as e:
|
||||||
if u'HTTP Error 501' in unicode(e) or u'HTTP Error 405' in unicode(e):
|
if maxback and not url[-1].isalnum():
|
||||||
r = self.openurl(url, _tries=2, _delay=0.2)
|
|
||||||
body = True
|
|
||||||
elif u'HTTP Error 404' in unicode(e) \
|
|
||||||
and maxback and not url[-1].isalnum():
|
|
||||||
return self.urlinfo(url[:-1], maxback-1)
|
return self.urlinfo(url[:-1], maxback-1)
|
||||||
|
raise e
|
||||||
|
except BrowserHTTPError as e:
|
||||||
|
if e.response.status_code in (501, 405):
|
||||||
|
r = self.open(url)
|
||||||
|
body = True
|
||||||
else:
|
else:
|
||||||
raise e
|
raise e
|
||||||
headers = r.info()
|
content_type = r.headers.get('Content-Type')
|
||||||
content_type = headers.get('Content-Type')
|
|
||||||
try:
|
try:
|
||||||
size = int(headers.get('Content-Length'))
|
size = int(r.headers.get('Content-Length'))
|
||||||
hsize = self.human_size(size)
|
hsize = self.human_size(size)
|
||||||
except TypeError:
|
except TypeError:
|
||||||
size = None
|
size = None
|
||||||
hsize = None
|
hsize = None
|
||||||
is_html = headersutil.is_html([content_type], url, True)
|
is_html = ('html' in content_type) if content_type else re.match(r'\.x?html?$', url)
|
||||||
title = None
|
title = None
|
||||||
if is_html:
|
if is_html:
|
||||||
if not body:
|
if not body:
|
||||||
r = self.openurl(url, _tries=2, _delay=0.2)
|
r = self.open(url)
|
||||||
# update size has we might not have it from headers
|
# update size has we might not have it from headers
|
||||||
size = len(r.read())
|
size = len(r.content)
|
||||||
hsize = self.human_size(size)
|
hsize = self.human_size(size)
|
||||||
r.seek(0)
|
|
||||||
|
|
||||||
encoding = EncodingFinder('windows-1252').encoding(r).lower()
|
page = HTMLPage(self, r)
|
||||||
try:
|
|
||||||
h = self.get_document(r, parser='lxml', encoding=encoding)
|
|
||||||
for meta in h.xpath('//head/meta'):
|
|
||||||
# meta http-equiv=content-type content=...
|
|
||||||
if meta.attrib.get('http-equiv', '').lower() == 'content-type':
|
|
||||||
for k, v in headersutil.split_header_words([meta.attrib.get('content', '')]):
|
|
||||||
if k == 'charset':
|
|
||||||
encoding = v
|
|
||||||
# meta charset=...
|
|
||||||
encoding = meta.attrib.get('charset', encoding).lower()
|
|
||||||
except Exception as e:
|
|
||||||
print(e)
|
|
||||||
finally:
|
|
||||||
r.seek(0)
|
|
||||||
if encoding == 'iso-8859-1' or not encoding:
|
|
||||||
encoding = 'windows-1252'
|
|
||||||
try:
|
|
||||||
codecs.lookup(encoding)
|
|
||||||
except LookupError:
|
|
||||||
encoding = 'windows-1252'
|
|
||||||
|
|
||||||
try:
|
for title in page.doc.xpath('//head/title'):
|
||||||
h = self.get_document(r, parser='lxml', encoding=encoding)
|
title = to_unicode(title.text_content()).strip()
|
||||||
for title in h.xpath('//head/title'):
|
title = ' '.join(title.split())
|
||||||
|
if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
|
||||||
|
for title in page.doc.getroot().cssselect('.permalink-tweet .tweet-text'):
|
||||||
title = to_unicode(title.text_content()).strip()
|
title = to_unicode(title.text_content()).strip()
|
||||||
title = ' '.join(title.split())
|
title = ' '.join(title.splitlines())
|
||||||
if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
|
|
||||||
for title in h.getroot().cssselect('.permalink-tweet .tweet-text'):
|
|
||||||
title = to_unicode(title.text_content()).strip()
|
|
||||||
title = ' '.join(title.splitlines())
|
|
||||||
except AssertionError as e:
|
|
||||||
# invalid HTML
|
|
||||||
print(e)
|
|
||||||
|
|
||||||
return content_type, hsize, title
|
return content_type, hsize, title
|
||||||
|
|
||||||
|
|
@@ -198,8 +165,7 @@ class MyThread(Thread):
|
||||||
'weboob', 'videoob', 'havesex', 'havedate', 'monboob', 'boobmsg',
|
'weboob', 'videoob', 'havesex', 'havedate', 'monboob', 'boobmsg',
|
||||||
'flatboob', 'boobill', 'pastoob', 'radioob', 'translaboob', 'traveloob', 'handjoob',
|
'flatboob', 'boobill', 'pastoob', 'radioob', 'translaboob', 'traveloob', 'handjoob',
|
||||||
'boobathon', 'boobank', 'boobtracker', 'comparoob', 'wetboobs',
|
'boobathon', 'boobank', 'boobtracker', 'comparoob', 'wetboobs',
|
||||||
'webcontentedit', 'weboorrents', u'sàt', u'salut à toi', 'assnet',
|
'webcontentedit', 'weboorrents', 'assnet', 'budget insight', 'budget-insight', 'budgetinsight', 'budgea']:
|
||||||
'budget insight', 'budget-insight', 'budgetinsight', 'budgea']:
|
|
||||||
if word in text.lower():
|
if word in text.lower():
|
||||||
return word
|
return word
|
||||||
return None
|
return None
|
||||||
|
|
@@ -315,7 +281,7 @@ class Boobot(SingleServerIRCBot):
|
||||||
quotes.append({'author': nick, 'timestamp': datetime.now(), 'text': text})
|
quotes.append({'author': nick, 'timestamp': datetime.now(), 'text': text})
|
||||||
self.storage.set(channel, 'quotes', quotes)
|
self.storage.set(channel, 'quotes', quotes)
|
||||||
self.storage.save()
|
self.storage.save()
|
||||||
self.send_message('Quote #%s added' % len(quotes) - 1, channel)
|
self.send_message('Quote #%s added' % (len(quotes) - 1), channel)
|
||||||
|
|
||||||
def cmd_delquote(self, nick, channel, text):
|
def cmd_delquote(self, nick, channel, text):
|
||||||
quotes = self.storage.get(channel, 'quotes', default=[])
|
quotes = self.storage.get(channel, 'quotes', default=[])
|
||||||
|
|
@@ -372,7 +338,7 @@ class Boobot(SingleServerIRCBot):
|
||||||
if backend_name in self.weboob.backend_instances:
|
if backend_name in self.weboob.backend_instances:
|
||||||
backend = self.weboob.backend_instances[backend_name]
|
backend = self.weboob.backend_instances[backend_name]
|
||||||
for cap in backend.iter_caps():
|
for cap in backend.iter_caps():
|
||||||
func = 'obj_info_%s' % cap.__name__[4:].lower()
|
func = 'obj_info_%s' % cap.__name__[3:].lower()
|
||||||
if hasattr(self, func):
|
if hasattr(self, func):
|
||||||
try:
|
try:
|
||||||
for msg in getattr(self, func)(backend, _id):
|
for msg in getattr(self, func)(backend, _id):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue