From b55812c5a2706df39ff948752d12ce818f4372cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Revol?= Date: Sun, 24 Mar 2013 01:35:46 +0100 Subject: [PATCH] gdcvault: Fix various weird cases * comment out or remove some debug prints * try to cope with a non-UTF-8 title being passed (1013483). Couldn't get it to be recoded yet. * handle bogus URL to xml config file (http:/...) (1015020) * handle iframes with a page name other than 'player.html' (1013798) * handle xml config filenames with spaces (1441) * catch xml configs with 'smil' as hostname, which means speakervideo points to a smil file describing the streams, and take the file with the highest bitrate * account for xml configs with only a valid slidevideo (1016627) and use it as fallback * handle configs with 'Invalid Date' as date text (1016634) We can now dump the URLs of all gdcvault video files, with a few exceptions (403 HTTP errors) and many /mediaProxy.php URLs failing to redirect, which are still to be investigated. 
--- modules/gdcvault/pages.py | 127 ++++++++++++++++++++++++++++++-------- 1 file changed, 101 insertions(+), 26 deletions(-) diff --git a/modules/gdcvault/pages.py b/modules/gdcvault/pages.py index 712117ef..a73f9e51 100644 --- a/modules/gdcvault/pages.py +++ b/modules/gdcvault/pages.py @@ -23,6 +23,7 @@ ControlNotFoundError = ClientForm.ControlNotFoundError from weboob.tools.browser import BasePage +import urllib import re import datetime from dateutil.parser import parse as parse_dt @@ -38,6 +39,7 @@ from .video import GDCVaultVideo #import lxml.etree +# TODO: check title on 1439 __all__ = ['IndexPage', 'SearchPage', 'VideoPage'] @@ -46,11 +48,11 @@ class IndexPage(BasePage): def iter_videos(self): for a in self.parser.select(self.document.getroot(), 'section.conference ul.media_items li.featured a.session_item'): href = a.attrib.get('href', '') - print href + # print href m = re.match('/play/(\d+)/.*', href) if not m: continue - print m.group(1) + # print m.group(1) video = GDCVaultVideo(m.group(1)) # get title @@ -107,12 +109,33 @@ class VideoPage(BasePage): # the config file has it too, but in CDATA and only for type 4 obj = self.parser.select(self.document.getroot(), 'title') + title = None if len(obj) > 0: - title = obj[0].text.strip() + try: + title = unicode(obj[0].text) + except UnicodeDecodeError, e: + title = None + + + if title is None: + obj = self.parser.select(self.document.getroot(), 'meta[name=title]') + if len(obj) > 0: + if 'content' in obj[0].attrib: + try: + # FIXME: 1013483 has buggus title (latin1) + # for now we just pass it as-is + title = obj[0].attrib['content'] + except UnicodeDecodeError, e: + # XXX: this doesn't even works!? 
+ title = obj[0].attrib['content'].decode('iso-5589-15') + + + if title is not None: + title = title.strip() m = re.match('GDC Vault\s+-\s+(.*)', title) if m: title = m.group(1) - video.title = unicode(title) + video.title = title #TODO: POST back the title to /search.php and filter == id to get # cleaner (JSON) data... (though it'd be much slower) @@ -144,10 +167,11 @@ class VideoPage(BasePage): try: req = self.browser.open_novisit(video.url) headers = req.info() - if headers.get('Content-Type', '') == 'text/html' and headers.get('Content-Length', '') == '2': - print 'BUG' + # if headers.get('Content-Type', '') == 'text/html' and headers.get('Content-Length', '') == '2': + # print 'BUG' + - print req.code + #print req.code except HTTPError, e: #print e.getcode() if e.getcode() == 302 and hasattr(e, 'hdrs'): @@ -167,7 +191,19 @@ class VideoPage(BasePage): # type 3 or 4 (iframe) # get the config file for the rest iframe_url = obj.attrib['src'] - m = re.match('(http:.*/)[^/]*player\.html\?.*xmlURL=([^&]+).*\&token=([^&]+)', iframe_url) + + # 1015020 has a boggus url + m = re.match('http:/event(.+)', iframe_url) + if m: + iframe_url = 'http://event' + m.group(1) + + # print iframe_url + # 1013798 has player169.html + # 1012186 has player16x9.html + # some other have /somethingplayer.html... + # 1441 has a space in the xml filename, which we must not strip + m = re.match('(http:.*/)[^/]*player[0-9a-z]*\.html\?.*xmlURL=([^&]+\.xml).*\&token=([^& ]+)', iframe_url) + if not m: m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url) if m is None: @@ -178,7 +214,7 @@ class VideoPage(BasePage): video.url = "http://gdcvault.com%s" % (unicode(iframe_url)) # HACK: we use mechanize directly here for now... 
FIXME - print "asking for redirect on '%s'" % (video.url) + # print "asking for redirect on '%s'" % (video.url) self.browser.set_handle_redirect(False) try: req = self.browser.open_novisit(video.url) @@ -193,8 +229,13 @@ class VideoPage(BasePage): # type 4 (dual screen video) # token doesn't actually seem required - config_url = m.group(1) + m.group(2) + '?token=' + m.group(3) + # 1441 has a space in the xml filename + xml_filename = urllib.quote(m.group(2)) + config_url = m.group(1) + xml_filename + '?token=' + m.group(3) + # self.browser.addheaders += [['Referer', 'http://gdcvault.com/play/%s' % self.group_dict['id']]] + # print self.browser.addheaders + # TODO: fix for 1015021 & others (forbidden) #config = self.browser.openurl(config_url).read() config = self.browser.get_document(self.browser.openurl(config_url)) @@ -203,24 +244,62 @@ class VideoPage(BasePage): if host is None: raise BrokenPageError('Missing tag in xml config file') - # for id 1373 host is missing '/ondemand' - # only add it when only a domain is specified without path - m = re.match('^[^\/]+$', host) - if m: - host += "/ondemand" + if host == "smil": + # the rtmp URL is described in a smil file, + # with several available bitrates + obj = self.parser.select(config.getroot(), 'speakervideo', 1) + smil = self.browser.get_document(self.browser.openurl(obj.text)) + obj = self.parser.select(smil.getroot(), 'meta', 1) + # TODO: error checking + base = obj.attrib.get('base', '') + best_bitrate = 0 + path = None + obj = self.parser.select(smil.getroot(), 'video') + # choose the best bitrate + for o in obj: + rate = int(o.attrib.get('system-bitrate', 0)) + if rate > best_bitrate: + path = o.attrib.get('src', '') + video.url = unicode(base + '/' + path) - videos = {} + else: + # not smil, the rtmp url is directly here as host + path + # for id 1373 host is missing '/ondemand' + # only add it when only a domain is specified without path + m = re.match('^[^\/]+$', host) + if m: + host += "/ondemand" - obj 
= self.parser.select(config.getroot(), 'speakervideo', 1) - videos['speaker'] = 'rtmp://' + host + '/' + obj.text + videos = {} - obj = self.parser.select(config.getroot(), 'slidevideo', 1) - videos['slides'] = 'rtmp://' + host + '/' + obj.text + obj = self.parser.select(config.getroot(), 'speakervideo', 1) + if obj.text is not None: + videos['speaker'] = 'rtmp://' + host + '/' + urllib.quote(obj.text) - #print videos + obj = self.parser.select(config.getroot(), 'slidevideo', 1) + if obj.text is not None: + videos['slides'] = 'rtmp://' + host + '/' + urllib.quote(obj.text) + + # print videos + # XXX + if 'speaker' in videos: + video.url = unicode(videos['speaker']) + elif 'slides' in videos: + # 1016627 only has slides, so fallback to them + video.url = unicode(videos['slides']) + + if want_slides: + if 'slides' in videos: + video.url = unicode(videos['slides']) + # if video.url is none: raise ? XXX obj = self.parser.select(config.getroot(), 'date', 1) - video.date = parse_dt(obj.text) + if obj.text is not None: + # 1016634 has "Invalid Date" + try: + video.date = parse_dt(obj.text) + except ValueError, e: + video.date = NotAvailable obj = self.parser.select(config.getroot(), 'duration', 1) m = re.match('(\d\d):(\d\d):(\d\d)', obj.text) @@ -232,10 +311,6 @@ class VideoPage(BasePage): obj = self.parser.select(config.getroot(), 'speaker', 1) #print obj.text_content() - #XXX - video.url = unicode(videos['speaker']) - if want_slides: - video.url = unicode(videos['slides']) #self.set_details(video) video.set_empty_fields(NotAvailable)