boobot: Fix weird/unicode URLs

http://stackoverflow.com/questions/1916684/cant-open-unicode-url-with-python
This commit is contained in:
Laurent Bachelier 2013-05-05 20:05:45 +02:00
commit 45151afa3e

View file

@ -25,6 +25,8 @@ import os
import sys import sys
from threading import Thread, Event from threading import Thread, Event
from math import log from math import log
import urlparse
import urllib
from irc.bot import SingleServerIRCBot from irc.bot import SingleServerIRCBot
import mechanize import mechanize
@ -43,6 +45,40 @@ IRC_SERVER = os.getenv('BOOBOT_SERVER', 'chat.freenode.net')
STORAGE_FILE = 'boobot.storage' STORAGE_FILE = 'boobot.storage'
def fixurl(url):
    """Return *url* rewritten as a plain-ASCII URL.

    The URL is split into its components and each one is made ASCII-safe:

    * the host name is IDNA-encoded (falling back to percent-encoding when
      the IDNA codec rejects it, e.g. for an empty or over-long label);
    * user, password, port, path, query and fragment are percent-encoded as
      UTF-8.

    Percent-escapes that are already present (``%XX``) are kept untouched, so
    an already-encoded URL is not encoded a second time and escaped
    delimiters such as ``%26`` or ``%2F`` keep their meaning.  A ``%`` that is
    not followed by two hex digits is escaped as ``%25``.

    :param url: the URL, either as text or as UTF-8 encoded bytes.
    :returns: the ASCII-only URL as a native ``str``.
    :raises UnicodeDecodeError: if *url* is bytes that are not valid UTF-8.
    """
    import re
    # Function-scope imports so the helper resolves its dependencies on both
    # the Python 2 (urllib / urlparse) and Python 3 (urllib.parse) layouts.
    try:
        from urllib import quote
        from urlparse import urlsplit, urlunsplit
    except ImportError:
        from urllib.parse import quote, urlsplit, urlunsplit

    # Characters RFC 3986 allows unescaped in each component ("sub-delims"
    # plus the component-specific extras).
    sub_delims = "!$&'()*+,;="
    userinfo_safe = sub_delims
    path_safe = "/:@" + sub_delims
    query_safe = "/?:@" + sub_delims

    def to_native(raw):
        # ``bytes`` is ``str`` on Python 2; only Python 3 needs the decode.
        return raw if isinstance(raw, str) else raw.decode('ascii')

    def requote(text, safe):
        # '%' is treated as safe so existing escapes survive; '~' is
        # unreserved but not in every interpreter's default safe set.
        quoted = quote(text.encode('utf8'), safe + '%~')
        # Escape any '%' that does not start a valid %XX sequence.
        return re.sub(r'%(?![0-9A-Fa-f]{2})', '%25', quoted)

    # Work on text so that components can be encoded one by one.
    if not isinstance(url, type(u'')):
        url = url.decode('utf8')
    parsed = urlsplit(url)

    # Divide the netloc further: [user[:password]@]host[:port]
    userpass, at, hostport = parsed.netloc.rpartition('@')
    user, colon1, pass_ = userpass.partition(':')
    # An IPv6 literal ("[::1]:80") is split at its first colon here, but it
    # is pure ASCII and is reassembled unchanged below.
    host, colon2, port = hostport.partition(':')

    scheme = to_native(parsed.scheme.encode('utf8'))
    user = requote(user, userinfo_safe)
    pass_ = requote(pass_, userinfo_safe)
    try:
        host = to_native(host.encode('idna'))
    except UnicodeError:
        # The IDNA codec refuses empty or over-long labels; keep the URL
        # ASCII instead of raising out of the function.
        host = requote(host, '')
    port = requote(port, ':[]')
    path = requote(parsed.path, path_safe)
    query = requote(parsed.query, query_safe)
    fragment = requote(parsed.fragment, query_safe)

    # Put it back together, re-inserting only the separators that were there.
    netloc = ''.join((
        user,
        ':' if colon1 else '',
        pass_,
        '@' if at else '',
        host,
        ':' if colon2 else '',
        port,
    ))
    return urlunsplit((scheme, netloc, path, query, fragment))
class HeadRequest(mechanize.Request): class HeadRequest(mechanize.Request):
def get_method(self): def get_method(self):
return "HEAD" return "HEAD"
@ -70,7 +106,8 @@ class BoobotBrowser(StandardBrowser):
def human_size(self, size): def human_size(self, size):
if size: if size:
units = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB') units = ('B', 'KiB', 'MiB', 'GiB',
'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
exponent = int(log(size, 1024)) exponent = int(log(size, 1024))
return "%.1f %s" % (float(size) / pow(1024, exponent), units[exponent]) return "%.1f %s" % (float(size) / pow(1024, exponent), units[exponent])
return '0 B' return '0 B'
@ -94,11 +131,12 @@ class MyThread(Thread):
self.weboob.loop() self.weboob.loop()
def find_keywords(self, text): def find_keywords(self, text):
for word in ['weboob', 'videoob', 'havesex', 'havedate', 'monboob', 'boobmsg', for word in [
'flatboob', 'boobill', 'pastoob', 'radioob', 'translaboob', 'traveloob', 'weboob', 'videoob', 'havesex', 'havedate', 'monboob', 'boobmsg',
'boobathon', 'boobank', 'boobtracker', 'comparoob', 'wetboobs', 'flatboob', 'boobill', 'pastoob', 'radioob', 'translaboob', 'traveloob',
'webcontentedit', 'weboorrents', u'sàt', u'salut à toi', 'ass2m', 'boobathon', 'boobank', 'boobtracker', 'comparoob', 'wetboobs',
'budget insight', 'budget-insight', 'budgetinsight', 'budgea']: 'webcontentedit', 'weboorrents', u'sàt', u'salut à toi', 'ass2m',
'budget insight', 'budget-insight', 'budgetinsight', 'budgea']:
if word in text.lower(): if word in text.lower():
return word return word
return None return None
@ -108,7 +146,8 @@ class MyThread(Thread):
word = self.find_keywords(msg.content) word = self.find_keywords(msg.content)
if word is not None: if word is not None:
url = msg.signature[msg.signature.find('https://linuxfr'):] url = msg.signature[msg.signature.find('https://linuxfr'):]
self.bot.send_message('[DLFP] %s talks about %s: %s' % (msg.sender, word, url)) self.bot.send_message('[DLFP] %s talks about %s: %s' % (
msg.sender, word, url))
backend.set_message_read(msg) backend.set_message_read(msg)
def check_board(self): def check_board(self):
@ -129,7 +168,7 @@ class MyThread(Thread):
class Boobot(SingleServerIRCBot): class Boobot(SingleServerIRCBot):
def __init__(self, channels, nickname, server, port=6667): def __init__(self, channels, nickname, server, port=6667):
SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname) SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname)
#self.connection.add_global_handler('pubmsg', self.on_pubmsg) # self.connection.add_global_handler('pubmsg', self.on_pubmsg)
self.connection.add_global_handler('join', self.on_join) self.connection.add_global_handler('join', self.on_join)
self.connection.add_global_handler('welcome', self.on_welcome) self.connection.add_global_handler('welcome', self.on_welcome)
@ -185,6 +224,7 @@ class Boobot(SingleServerIRCBot):
break break
def on_url(self, url): def on_url(self, url):
url = fixurl(url)
try: try:
content_type, hsize, title = BoobotBrowser().urlinfo(url) content_type, hsize, title = BoobotBrowser().urlinfo(url)
if title: if title: