gdcvault: Implement searching; partial download fix
* Implement search functionality, using POST to get JSON data. * Fix download for most items, even non-free ones. For now, the only items still missing seem to be non-free mp3 files (like 769): /mediaProxy.php returns a useless 2-byte HTML response.
This commit is contained in:
parent
c8685b8e3b
commit
38b80491f1
4 changed files with 177 additions and 23 deletions
|
|
@ -66,9 +66,9 @@ class GDCVaultBackend(BaseBackend, ICapVideo, ICapCollection):
|
||||||
|
|
||||||
SORTBY = ['relevance', 'rating', 'views', 'time']
|
SORTBY = ['relevance', 'rating', 'views', 'time']
|
||||||
|
|
||||||
# def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, max_results=None):
|
def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, max_results=None):
|
||||||
# with self.browser:
|
with self.browser:
|
||||||
# return self.browser.search_videos(pattern, self.SORTBY[sortby])
|
return self.browser.search_videos(pattern, self.SORTBY[sortby])
|
||||||
|
|
||||||
def fill_video(self, video, fields):
|
def fill_video(self, video, fields):
|
||||||
if fields != ['thumbnail']:
|
if fields != ['thumbnail']:
|
||||||
|
|
|
||||||
|
|
@ -25,7 +25,7 @@ from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword, BrowserU
|
||||||
from weboob.tools.browser.decorators import id2url
|
from weboob.tools.browser.decorators import id2url
|
||||||
|
|
||||||
#from .pages.index import IndexPage
|
#from .pages.index import IndexPage
|
||||||
from .pages import VideoPage, IndexPage
|
from .pages import VideoPage, IndexPage, SearchPage
|
||||||
from .video import GDCVaultVideo
|
from .video import GDCVaultVideo
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -36,7 +36,8 @@ class GDCVaultBrowser(BaseBrowser):
|
||||||
DOMAIN = 'gdcvault.com'
|
DOMAIN = 'gdcvault.com'
|
||||||
ENCODING = 'utf-8'
|
ENCODING = 'utf-8'
|
||||||
PAGES = {r'http://[w\.]*gdcvault.com/play/(?P<id>[\d]+)/?.*': VideoPage,
|
PAGES = {r'http://[w\.]*gdcvault.com/play/(?P<id>[\d]+)/?.*': VideoPage,
|
||||||
r'http://[w\.]*gdcvault.com/': IndexPage,
|
r'http://[w\.]*gdcvault.com/search\.php.*': (SearchPage, "json"),
|
||||||
|
r'http://[w\.]*gdcvault.com/.*': IndexPage,
|
||||||
}
|
}
|
||||||
|
|
||||||
def is_logged(self):
|
def is_logged(self):
|
||||||
|
|
@ -63,9 +64,9 @@ class GDCVaultBrowser(BaseBrowser):
|
||||||
|
|
||||||
data = self.readurl('http://gdcvault.com/api/login.php',
|
data = self.readurl('http://gdcvault.com/api/login.php',
|
||||||
urllib.urlencode(params))
|
urllib.urlencode(params))
|
||||||
# data is returned as JSON, not sure yet if it's useful
|
# some data returned as JSON, not sure yet if it's useful
|
||||||
|
#print data
|
||||||
|
|
||||||
print data
|
|
||||||
if data is None:
|
if data is None:
|
||||||
raise BrowserBanned('Too many open sessions?')
|
raise BrowserBanned('Too many open sessions?')
|
||||||
|
|
||||||
|
|
@ -75,7 +76,7 @@ class GDCVaultBrowser(BaseBrowser):
|
||||||
raise BrowserIncorrectPassword()
|
raise BrowserIncorrectPassword()
|
||||||
|
|
||||||
def close_session(self):
|
def close_session(self):
|
||||||
print "logging out..."
|
# XXX: only if is_logged? or was used?
|
||||||
self.openurl('/logout', '')
|
self.openurl('/logout', '')
|
||||||
|
|
||||||
@id2url(GDCVaultVideo.id2url)
|
@id2url(GDCVaultVideo.id2url)
|
||||||
|
|
@ -86,13 +87,27 @@ class GDCVaultBrowser(BaseBrowser):
|
||||||
raise BrowserUnavailable('Requires account')
|
raise BrowserUnavailable('Requires account')
|
||||||
return self.page.get_video(video)
|
return self.page.get_video(video)
|
||||||
|
|
||||||
# def search_videos(self, pattern, sortby):
|
def search_videos(self, pattern, sortby):
|
||||||
# return None
|
post_data = {"firstfocus" : "",
|
||||||
# self.location(self.buildurl('http://gdcvault.com/en/search%s' % sortby, query=pattern.encode('utf-8')))
|
"category" : "free",
|
||||||
# assert self.is_on_page(IndexPage)
|
"keyword" : pattern.encode('utf-8'),
|
||||||
# return self.page.iter_videos()
|
"conference_id" : "", }
|
||||||
|
post_data = urllib.urlencode(post_data)
|
||||||
|
# probably not required
|
||||||
|
self.addheaders = [('Referer', 'http://gdcvault.com/'),
|
||||||
|
("Content-Type" , 'application/x-www-form-urlencoded') ]
|
||||||
|
|
||||||
# def latest_videos(self):
|
#print post_data
|
||||||
# self.home()
|
# is_logged assumes html page
|
||||||
# assert self.is_on_page(IndexPage)
|
self.location('http://gdcvault.com/search.php',
|
||||||
# return self.page.iter_videos()
|
data=post_data, no_login=True)
|
||||||
|
|
||||||
|
assert self.is_on_page(SearchPage)
|
||||||
|
return self.page.iter_videos()
|
||||||
|
|
||||||
|
def latest_videos(self):
    """Iterate over the videos listed on the '/free' page.

    Navigates the browser to '/free', checks that the resulting page is
    an IndexPage and returns whatever that page's ``iter_videos()``
    returns.

    No debug output is written to stdout: this is library code and the
    other debug prints in this class are disabled as well.
    """
    self.location('/free')
    # Same page-type check style as search_videos().
    assert self.is_on_page(IndexPage)
    return self.page.iter_videos()
|
||||||
|
|
|
||||||
|
|
@ -28,25 +28,69 @@ import datetime
|
||||||
from dateutil.parser import parse as parse_dt
|
from dateutil.parser import parse as parse_dt
|
||||||
|
|
||||||
from weboob.capabilities.base import NotAvailable
|
from weboob.capabilities.base import NotAvailable
|
||||||
|
from weboob.tools.capabilities.thumbnail import Thumbnail
|
||||||
from weboob.tools.browser import BrokenPageError
|
from weboob.tools.browser import BrokenPageError
|
||||||
|
|
||||||
|
#HACK
|
||||||
|
from urllib2 import HTTPError
|
||||||
|
|
||||||
from .video import GDCVaultVideo
|
from .video import GDCVaultVideo
|
||||||
|
|
||||||
#import lxml.etree
|
#import lxml.etree
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['IndexPage', 'VideoPage']
|
__all__ = ['IndexPage', 'SearchPage', 'VideoPage']
|
||||||
|
|
||||||
|
|
||||||
class IndexPage(BasePage):
|
class IndexPage(BasePage):
|
||||||
def iter_videos(self):
|
def iter_videos(self):
|
||||||
for a in self.parser.select(self.document.getroot(), 'section.conference ul.media_items li.featured a.session_item'):
|
for a in self.parser.select(self.document.getroot(), 'section.conference ul.media_items li.featured a.session_item'):
|
||||||
print a
|
href = a.attrib.get('href', '')
|
||||||
|
print href
|
||||||
|
m = re.match('/play/(\d+)/.*', href)
|
||||||
|
if not m:
|
||||||
|
continue
|
||||||
|
print m.group(1)
|
||||||
|
video = GDCVaultVideo(m.group(1))
|
||||||
|
|
||||||
|
# get title
|
||||||
|
try:
|
||||||
|
video.title = unicode(self.parser.select(a, 'div.conference_info p strong', 1).text)
|
||||||
|
except IndexError:
|
||||||
|
video.title = NotAvailable
|
||||||
|
|
||||||
|
# get description
|
||||||
|
try:
|
||||||
|
video.description = unicode(self.parser.select(a, 'div.conference_info p', 1).text)
|
||||||
|
except IndexError:
|
||||||
|
video.description = NotAvailable
|
||||||
|
|
||||||
|
# get thumbnail
|
||||||
|
img = self.parser.select(a, 'div.featured_image img', 1)
|
||||||
|
if img is not None:
|
||||||
|
video.thumbnail = Thumbnail(unicode(img.attrib['src']))
|
||||||
|
else:
|
||||||
|
video.thumbnail = NotAvailable
|
||||||
|
|
||||||
|
|
||||||
#m = re.match('id-(\d+)', a.attrib.get('class', ''))
|
#m = re.match('id-(\d+)', a.attrib.get('class', ''))
|
||||||
#if not m:
|
#if not m:
|
||||||
# continue
|
# continue
|
||||||
# FIXME
|
# FIXME
|
||||||
yield None
|
yield video
|
||||||
|
|
||||||
|
# the search page class uses a JSON parser,
|
||||||
|
# since it's what search.php returns when POSTed (from Ajax)
|
||||||
|
class SearchPage(BasePage):
    """Result page of a POST to search.php, parsed as JSON.

    ``self.document`` is expected to be the decoded JSON document, with
    the list of results stored under its 'data' key.
    """

    def iter_videos(self):
        """Yield one video per usable entry of the JSON 'data' list.

        Entries for which ``GDCVaultVideo.get_video_from_json`` returns
        None are skipped.

        Raises BrokenPageError when there is no document, when it is not
        a mapping, or when its 'data' entry is missing or None.
        """
        try:
            entries = self.document['data']
        except (KeyError, TypeError):
            # Missing 'data' key, or the document is None / not a
            # mapping: report it as a broken page rather than leaking
            # a bare KeyError/TypeError to the caller.
            entries = None
        if entries is None:
            raise BrokenPageError('Unable to find JSON data')

        for data in entries:
            video = GDCVaultVideo.get_video_from_json(data)
            # TODO: split type 4 videos into id and id#slides
            if video is None:
                continue
            yield video
|
||||||
|
|
||||||
class VideoPage(BasePage):
|
class VideoPage(BasePage):
|
||||||
def get_video(self, video=None):
|
def get_video(self, video=None):
|
||||||
|
|
@ -86,8 +130,34 @@ class VideoPage(BasePage):
|
||||||
m = re.match(".*new SWFObject.*addVariable\(\"file\", encodeURIComponent\(\"(.*)\"\)\).*", unicode(script.text), re.DOTALL)
|
m = re.match(".*new SWFObject.*addVariable\(\"file\", encodeURIComponent\(\"(.*)\"\)\).*", unicode(script.text), re.DOTALL)
|
||||||
if m:
|
if m:
|
||||||
video.url = "http://gdcvault.com%s" % (m.group(1))
|
video.url = "http://gdcvault.com%s" % (m.group(1))
|
||||||
|
# TODO: for non-free (like 769),
|
||||||
|
# must be logged to use /mediaProxy.php
|
||||||
|
|
||||||
|
# FIXME: doesn't seem to work yet, we get 2 bytes as html
|
||||||
|
# 769 should give:
|
||||||
|
# http://twvideo01.ubm-us.net/o1/gdcradio-net/2007/gdc/GDC07-4889.mp3
|
||||||
|
# HACK: we use mechanize directly here for now... FIXME
|
||||||
|
#print "asking for redirect on '%s'" % (video.url)
|
||||||
|
#self.browser.addheaders += [['Referer', 'http://gdcvault.com/play/%s' % self.group_dict['id']]]
|
||||||
|
#print self.browser.addheaders
|
||||||
|
self.browser.set_handle_redirect(False)
|
||||||
|
try:
|
||||||
|
req = self.browser.open_novisit(video.url)
|
||||||
|
headers = req.info()
|
||||||
|
if headers.get('Content-Type', '') == 'text/html' and headers.get('Content-Length', '') == '2':
|
||||||
|
print 'BUG'
|
||||||
|
|
||||||
|
print req.code
|
||||||
|
except HTTPError, e:
|
||||||
|
#print e.getcode()
|
||||||
|
if e.getcode() == 302 and hasattr(e, 'hdrs'):
|
||||||
|
#print e.hdrs['Location']
|
||||||
|
video.url = unicode(e.hdrs['Location'])
|
||||||
|
self.browser.set_handle_redirect(True)
|
||||||
|
|
||||||
video.set_empty_fields(NotAvailable)
|
video.set_empty_fields(NotAvailable)
|
||||||
return video
|
return video
|
||||||
|
|
||||||
#XXX: raise error?
|
#XXX: raise error?
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -97,19 +167,33 @@ class VideoPage(BasePage):
|
||||||
# type 3 or 4 (iframe)
|
# type 3 or 4 (iframe)
|
||||||
# get the config file for the rest
|
# get the config file for the rest
|
||||||
iframe_url = obj.attrib['src']
|
iframe_url = obj.attrib['src']
|
||||||
m = re.match('(http:.*)player\.html\?.*xmlURL=([^&]+)\&token=([^&]+)', iframe_url)
|
m = re.match('(http:.*/)[^/]*player\.html\?.*xmlURL=([^&]+).*\&token=([^&]+)', iframe_url)
|
||||||
if not m:
|
if not m:
|
||||||
m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url)
|
m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url)
|
||||||
if m is None:
|
if m is None:
|
||||||
return None
|
return None
|
||||||
|
# TODO: must be logged to use /mediaProxy.php
|
||||||
# type 3 (pdf slides)
|
# type 3 (pdf slides)
|
||||||
video.ext = u'pdf'
|
video.ext = u'pdf'
|
||||||
video.url = "http://gdcvault.com%s" % (unicode(iframe_url))
|
video.url = "http://gdcvault.com%s" % (unicode(iframe_url))
|
||||||
|
|
||||||
|
# HACK: we use mechanize directly here for now... FIXME
|
||||||
|
print "asking for redirect on '%s'" % (video.url)
|
||||||
|
self.browser.set_handle_redirect(False)
|
||||||
|
try:
|
||||||
|
req = self.browser.open_novisit(video.url)
|
||||||
|
except HTTPError, e:
|
||||||
|
if e.getcode() == 302 and hasattr(e, 'hdrs'):
|
||||||
|
video.url = unicode(e.hdrs['Location'])
|
||||||
|
self.browser.set_handle_redirect(True)
|
||||||
|
|
||||||
video.set_empty_fields(NotAvailable)
|
video.set_empty_fields(NotAvailable)
|
||||||
return video
|
return video
|
||||||
|
|
||||||
# type 4 (dual screen video)
|
# type 4 (dual screen video)
|
||||||
config_url = m.group(1) + m.group(2)
|
|
||||||
|
# token doesn't actually seem required
|
||||||
|
config_url = m.group(1) + m.group(2) + '?token=' + m.group(3)
|
||||||
|
|
||||||
#config = self.browser.openurl(config_url).read()
|
#config = self.browser.openurl(config_url).read()
|
||||||
config = self.browser.get_document(self.browser.openurl(config_url))
|
config = self.browser.get_document(self.browser.openurl(config_url))
|
||||||
|
|
@ -119,6 +203,12 @@ class VideoPage(BasePage):
|
||||||
if host is None:
|
if host is None:
|
||||||
raise BrokenPageError('Missing tag in xml config file')
|
raise BrokenPageError('Missing tag in xml config file')
|
||||||
|
|
||||||
|
# for id 1373 host is missing '/ondemand'
|
||||||
|
# only add it when only a domain is specified without path
|
||||||
|
m = re.match('^[^\/]+$', host)
|
||||||
|
if m:
|
||||||
|
host += "/ondemand"
|
||||||
|
|
||||||
videos = {}
|
videos = {}
|
||||||
|
|
||||||
obj = self.parser.select(config.getroot(), 'speakervideo', 1)
|
obj = self.parser.select(config.getroot(), 'speakervideo', 1)
|
||||||
|
|
|
||||||
|
|
@ -19,8 +19,11 @@
|
||||||
|
|
||||||
|
|
||||||
from weboob.capabilities.video import BaseVideo
|
from weboob.capabilities.video import BaseVideo
|
||||||
|
from weboob.capabilities.base import NotAvailable
|
||||||
|
from weboob.tools.capabilities.thumbnail import Thumbnail
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
from dateutil.parser import parse as parse_dt
|
||||||
|
|
||||||
__all__ = ['GDCVaultVideo']
|
__all__ = ['GDCVaultVideo']
|
||||||
|
|
||||||
|
|
@ -41,3 +44,49 @@ class GDCVaultVideo(BaseVideo):
|
||||||
if m:
|
if m:
|
||||||
return u'http://www.gdcvault.com/play/%s#slides' % _id
|
return u'http://www.gdcvault.com/play/%s#slides' % _id
|
||||||
return u'http://www.gdcvault.com/play/%s' % _id
|
return u'http://www.gdcvault.com/play/%s' % _id
|
||||||
|
|
||||||
|
@classmethod
def get_video_from_json(cls, data):
    """Build a video object from one entry of the search JSON results.

    ``data`` is a mapping describing a single result. Returns None when
    it has no 'vault_media_id' key; otherwise returns a video whose id
    is that value converted to int, with title, description, thumbnail,
    author, date and rating filled from the keys that are present. All
    fields still empty at the end are set to NotAvailable.

    Notes on the identifiers found in the JSON:
      - session_id is unique per talk
      - vault_media_id is unique per page
        (but can refer to 2 video files for dual screen)
      - solr_id is "${vault_media_id}.${conference_id}.${session_id}.${vault_media_type_id}"
    """
    # XXX: do we filter them or let people know about them?
    # Entries whose data['anchor']['href'] is '#' point to files that
    # will not be accessible (not free and not logged in); they are
    # currently returned like any other entry.

    if 'vault_media_id' not in data:
        return None
    media_id = int(data['vault_media_id'])
    video = cls(media_id)

    # 1013679 has \n in title...
    video.title = unicode(data.get('session_name', '').replace('\n', ''))

    # TODO: strip out <p>, <br> and other html...
    # XXX: 1013422 has all 3 and !=
    # Preference order for the description: overview, spell, description.
    if 'overview' in data:
        video.description = unicode(data['overview'])
    elif 'spell' in data:
        video.description = unicode(data['spell'])
    else:
        video.description = unicode(data.get('description', ''))

    if 'image' in data:
        video.thumbnail = Thumbnail(unicode(data['image']))

    if 'speakers_name' in data:
        video.author = unicode(", ".join(data['speakers_name']))

    if 'start_date' in data:
        video.date = parse_dt(data['start_date'])

    if 'score' in data:
        video.rating = data['score']

    video.set_empty_fields(NotAvailable)

    return video
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue