gdcvault: Fix various weird cases
* comment out or remove some debug prints
* try to cope with a non-UTF-8 title being passed (1013483); couldn't get it to be recoded yet
* handle a bogus URL to the xml config file (http:/...) (1015020)
* handle iframes with a page name other than 'player.html' (1013798)
* handle xml config filenames with spaces (1441)
* catch xml configs with 'smil' as hostname, which means speakervideo points to a smil file describing the streams, and take the file with the highest bitrate
* account for xml configs with only a valid slidevideo (1016627) and use it as a fallback
* handle configs with 'Invalid Date' as date text (1016634)

We can now dump the URLs of all the gdcvault video files, with a few exceptions (403 HTTP errors) and many /mediaProxy.php URLs failing to redirect, which are still to be investigated.
This commit is contained in:
parent
6f089c795e
commit
b55812c5a2
1 changed files with 101 additions and 26 deletions
|
|
@ -23,6 +23,7 @@ ControlNotFoundError = ClientForm.ControlNotFoundError
|
||||||
|
|
||||||
from weboob.tools.browser import BasePage
|
from weboob.tools.browser import BasePage
|
||||||
|
|
||||||
|
import urllib
|
||||||
import re
|
import re
|
||||||
import datetime
|
import datetime
|
||||||
from dateutil.parser import parse as parse_dt
|
from dateutil.parser import parse as parse_dt
|
||||||
|
|
@ -38,6 +39,7 @@ from .video import GDCVaultVideo
|
||||||
|
|
||||||
#import lxml.etree
|
#import lxml.etree
|
||||||
|
|
||||||
|
# TODO: check title on 1439
|
||||||
|
|
||||||
__all__ = ['IndexPage', 'SearchPage', 'VideoPage']
|
__all__ = ['IndexPage', 'SearchPage', 'VideoPage']
|
||||||
|
|
||||||
|
|
@ -46,11 +48,11 @@ class IndexPage(BasePage):
|
||||||
def iter_videos(self):
|
def iter_videos(self):
|
||||||
for a in self.parser.select(self.document.getroot(), 'section.conference ul.media_items li.featured a.session_item'):
|
for a in self.parser.select(self.document.getroot(), 'section.conference ul.media_items li.featured a.session_item'):
|
||||||
href = a.attrib.get('href', '')
|
href = a.attrib.get('href', '')
|
||||||
print href
|
# print href
|
||||||
m = re.match('/play/(\d+)/.*', href)
|
m = re.match('/play/(\d+)/.*', href)
|
||||||
if not m:
|
if not m:
|
||||||
continue
|
continue
|
||||||
print m.group(1)
|
# print m.group(1)
|
||||||
video = GDCVaultVideo(m.group(1))
|
video = GDCVaultVideo(m.group(1))
|
||||||
|
|
||||||
# get title
|
# get title
|
||||||
|
|
@ -107,12 +109,33 @@ class VideoPage(BasePage):
|
||||||
|
|
||||||
# the config file has it too, but in CDATA and only for type 4
|
# the config file has it too, but in CDATA and only for type 4
|
||||||
obj = self.parser.select(self.document.getroot(), 'title')
|
obj = self.parser.select(self.document.getroot(), 'title')
|
||||||
|
title = None
|
||||||
if len(obj) > 0:
|
if len(obj) > 0:
|
||||||
title = obj[0].text.strip()
|
try:
|
||||||
|
title = unicode(obj[0].text)
|
||||||
|
except UnicodeDecodeError, e:
|
||||||
|
title = None
|
||||||
|
|
||||||
|
|
||||||
|
if title is None:
|
||||||
|
obj = self.parser.select(self.document.getroot(), 'meta[name=title]')
|
||||||
|
if len(obj) > 0:
|
||||||
|
if 'content' in obj[0].attrib:
|
||||||
|
try:
|
||||||
|
# FIXME: 1013483 has a bogus title (latin1)
|
||||||
|
# for now we just pass it as-is
|
||||||
|
title = obj[0].attrib['content']
|
||||||
|
except UnicodeDecodeError, e:
|
||||||
|
# XXX: this doesn't even work!?
|
||||||
|
title = obj[0].attrib['content'].decode('iso-5589-15')
|
||||||
|
|
||||||
|
|
||||||
|
if title is not None:
|
||||||
|
title = title.strip()
|
||||||
m = re.match('GDC Vault\s+-\s+(.*)', title)
|
m = re.match('GDC Vault\s+-\s+(.*)', title)
|
||||||
if m:
|
if m:
|
||||||
title = m.group(1)
|
title = m.group(1)
|
||||||
video.title = unicode(title)
|
video.title = title
|
||||||
|
|
||||||
#TODO: POST back the title to /search.php and filter == id to get
|
#TODO: POST back the title to /search.php and filter == id to get
|
||||||
# cleaner (JSON) data... (though it'd be much slower)
|
# cleaner (JSON) data... (though it'd be much slower)
|
||||||
|
|
@ -144,10 +167,11 @@ class VideoPage(BasePage):
|
||||||
try:
|
try:
|
||||||
req = self.browser.open_novisit(video.url)
|
req = self.browser.open_novisit(video.url)
|
||||||
headers = req.info()
|
headers = req.info()
|
||||||
if headers.get('Content-Type', '') == 'text/html' and headers.get('Content-Length', '') == '2':
|
# if headers.get('Content-Type', '') == 'text/html' and headers.get('Content-Length', '') == '2':
|
||||||
print 'BUG'
|
# print 'BUG'
|
||||||
|
|
||||||
print req.code
|
|
||||||
|
#print req.code
|
||||||
except HTTPError, e:
|
except HTTPError, e:
|
||||||
#print e.getcode()
|
#print e.getcode()
|
||||||
if e.getcode() == 302 and hasattr(e, 'hdrs'):
|
if e.getcode() == 302 and hasattr(e, 'hdrs'):
|
||||||
|
|
@ -167,7 +191,19 @@ class VideoPage(BasePage):
|
||||||
# type 3 or 4 (iframe)
|
# type 3 or 4 (iframe)
|
||||||
# get the config file for the rest
|
# get the config file for the rest
|
||||||
iframe_url = obj.attrib['src']
|
iframe_url = obj.attrib['src']
|
||||||
m = re.match('(http:.*/)[^/]*player\.html\?.*xmlURL=([^&]+).*\&token=([^&]+)', iframe_url)
|
|
||||||
|
# 1015020 has a bogus url
|
||||||
|
m = re.match('http:/event(.+)', iframe_url)
|
||||||
|
if m:
|
||||||
|
iframe_url = 'http://event' + m.group(1)
|
||||||
|
|
||||||
|
# print iframe_url
|
||||||
|
# 1013798 has player169.html
|
||||||
|
# 1012186 has player16x9.html
|
||||||
|
# some others have /somethingplayer.html...
|
||||||
|
# 1441 has a space in the xml filename, which we must not strip
|
||||||
|
m = re.match('(http:.*/)[^/]*player[0-9a-z]*\.html\?.*xmlURL=([^&]+\.xml).*\&token=([^& ]+)', iframe_url)
|
||||||
|
|
||||||
if not m:
|
if not m:
|
||||||
m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url)
|
m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url)
|
||||||
if m is None:
|
if m is None:
|
||||||
|
|
@ -178,7 +214,7 @@ class VideoPage(BasePage):
|
||||||
video.url = "http://gdcvault.com%s" % (unicode(iframe_url))
|
video.url = "http://gdcvault.com%s" % (unicode(iframe_url))
|
||||||
|
|
||||||
# HACK: we use mechanize directly here for now... FIXME
|
# HACK: we use mechanize directly here for now... FIXME
|
||||||
print "asking for redirect on '%s'" % (video.url)
|
# print "asking for redirect on '%s'" % (video.url)
|
||||||
self.browser.set_handle_redirect(False)
|
self.browser.set_handle_redirect(False)
|
||||||
try:
|
try:
|
||||||
req = self.browser.open_novisit(video.url)
|
req = self.browser.open_novisit(video.url)
|
||||||
|
|
@ -193,8 +229,13 @@ class VideoPage(BasePage):
|
||||||
# type 4 (dual screen video)
|
# type 4 (dual screen video)
|
||||||
|
|
||||||
# token doesn't actually seem required
|
# token doesn't actually seem required
|
||||||
config_url = m.group(1) + m.group(2) + '?token=' + m.group(3)
|
# 1441 has a space in the xml filename
|
||||||
|
xml_filename = urllib.quote(m.group(2))
|
||||||
|
config_url = m.group(1) + xml_filename + '?token=' + m.group(3)
|
||||||
|
|
||||||
|
# self.browser.addheaders += [['Referer', 'http://gdcvault.com/play/%s' % self.group_dict['id']]]
|
||||||
|
# print self.browser.addheaders
|
||||||
|
# TODO: fix for 1015021 & others (forbidden)
|
||||||
#config = self.browser.openurl(config_url).read()
|
#config = self.browser.openurl(config_url).read()
|
||||||
config = self.browser.get_document(self.browser.openurl(config_url))
|
config = self.browser.get_document(self.browser.openurl(config_url))
|
||||||
|
|
||||||
|
|
@ -203,24 +244,62 @@ class VideoPage(BasePage):
|
||||||
if host is None:
|
if host is None:
|
||||||
raise BrokenPageError('Missing tag in xml config file')
|
raise BrokenPageError('Missing tag in xml config file')
|
||||||
|
|
||||||
# for id 1373 host is missing '/ondemand'
|
if host == "smil":
|
||||||
# only add it when only a domain is specified without path
|
# the rtmp URL is described in a smil file,
|
||||||
m = re.match('^[^\/]+$', host)
|
# with several available bitrates
|
||||||
if m:
|
obj = self.parser.select(config.getroot(), 'speakervideo', 1)
|
||||||
host += "/ondemand"
|
smil = self.browser.get_document(self.browser.openurl(obj.text))
|
||||||
|
obj = self.parser.select(smil.getroot(), 'meta', 1)
|
||||||
|
# TODO: error checking
|
||||||
|
base = obj.attrib.get('base', '')
|
||||||
|
best_bitrate = 0
|
||||||
|
path = None
|
||||||
|
obj = self.parser.select(smil.getroot(), 'video')
|
||||||
|
# choose the best bitrate
|
||||||
|
for o in obj:
|
||||||
|
rate = int(o.attrib.get('system-bitrate', 0))
|
||||||
|
if rate > best_bitrate:
|
||||||
|
path = o.attrib.get('src', '')
|
||||||
|
video.url = unicode(base + '/' + path)
|
||||||
|
|
||||||
videos = {}
|
else:
|
||||||
|
# not smil, the rtmp url is directly here as host + path
|
||||||
|
# for id 1373 host is missing '/ondemand'
|
||||||
|
# only add it when only a domain is specified without path
|
||||||
|
m = re.match('^[^\/]+$', host)
|
||||||
|
if m:
|
||||||
|
host += "/ondemand"
|
||||||
|
|
||||||
obj = self.parser.select(config.getroot(), 'speakervideo', 1)
|
videos = {}
|
||||||
videos['speaker'] = 'rtmp://' + host + '/' + obj.text
|
|
||||||
|
|
||||||
obj = self.parser.select(config.getroot(), 'slidevideo', 1)
|
obj = self.parser.select(config.getroot(), 'speakervideo', 1)
|
||||||
videos['slides'] = 'rtmp://' + host + '/' + obj.text
|
if obj.text is not None:
|
||||||
|
videos['speaker'] = 'rtmp://' + host + '/' + urllib.quote(obj.text)
|
||||||
|
|
||||||
#print videos
|
obj = self.parser.select(config.getroot(), 'slidevideo', 1)
|
||||||
|
if obj.text is not None:
|
||||||
|
videos['slides'] = 'rtmp://' + host + '/' + urllib.quote(obj.text)
|
||||||
|
|
||||||
|
# print videos
|
||||||
|
# XXX
|
||||||
|
if 'speaker' in videos:
|
||||||
|
video.url = unicode(videos['speaker'])
|
||||||
|
elif 'slides' in videos:
|
||||||
|
# 1016627 only has slides, so fallback to them
|
||||||
|
video.url = unicode(videos['slides'])
|
||||||
|
|
||||||
|
if want_slides:
|
||||||
|
if 'slides' in videos:
|
||||||
|
video.url = unicode(videos['slides'])
|
||||||
|
# if video.url is none: raise ? XXX
|
||||||
|
|
||||||
obj = self.parser.select(config.getroot(), 'date', 1)
|
obj = self.parser.select(config.getroot(), 'date', 1)
|
||||||
video.date = parse_dt(obj.text)
|
if obj.text is not None:
|
||||||
|
# 1016634 has "Invalid Date"
|
||||||
|
try:
|
||||||
|
video.date = parse_dt(obj.text)
|
||||||
|
except ValueError, e:
|
||||||
|
video.date = NotAvailable
|
||||||
|
|
||||||
obj = self.parser.select(config.getroot(), 'duration', 1)
|
obj = self.parser.select(config.getroot(), 'duration', 1)
|
||||||
m = re.match('(\d\d):(\d\d):(\d\d)', obj.text)
|
m = re.match('(\d\d):(\d\d):(\d\d)', obj.text)
|
||||||
|
|
@ -232,10 +311,6 @@ class VideoPage(BasePage):
|
||||||
obj = self.parser.select(config.getroot(), 'speaker', 1)
|
obj = self.parser.select(config.getroot(), 'speaker', 1)
|
||||||
#print obj.text_content()
|
#print obj.text_content()
|
||||||
|
|
||||||
#XXX
|
|
||||||
video.url = unicode(videos['speaker'])
|
|
||||||
if want_slides:
|
|
||||||
video.url = unicode(videos['slides'])
|
|
||||||
#self.set_details(video)
|
#self.set_details(video)
|
||||||
|
|
||||||
video.set_empty_fields(NotAvailable)
|
video.set_empty_fields(NotAvailable)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue