gdcvault: Implement searching; partial download fix

* Implement search functionality, using POST to get JSON data
* Fix download for most items, even non-free ones.
For now, the only items still failing seem to be non-free mp3 files (like 769):
for those, /mediaProxy.php returns a 2-byte HTML response instead of the media.
This commit is contained in:
François Revol 2013-03-22 06:32:37 +01:00 committed by Romain Bignon
commit 38b80491f1
4 changed files with 177 additions and 23 deletions

View file

@ -66,9 +66,9 @@ class GDCVaultBackend(BaseBackend, ICapVideo, ICapCollection):
SORTBY = ['relevance', 'rating', 'views', 'time'] SORTBY = ['relevance', 'rating', 'views', 'time']
# def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, max_results=None): def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, max_results=None):
# with self.browser: with self.browser:
# return self.browser.search_videos(pattern, self.SORTBY[sortby]) return self.browser.search_videos(pattern, self.SORTBY[sortby])
def fill_video(self, video, fields): def fill_video(self, video, fields):
if fields != ['thumbnail']: if fields != ['thumbnail']:

View file

@ -25,7 +25,7 @@ from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword, BrowserU
from weboob.tools.browser.decorators import id2url from weboob.tools.browser.decorators import id2url
#from .pages.index import IndexPage #from .pages.index import IndexPage
from .pages import VideoPage, IndexPage from .pages import VideoPage, IndexPage, SearchPage
from .video import GDCVaultVideo from .video import GDCVaultVideo
@ -36,7 +36,8 @@ class GDCVaultBrowser(BaseBrowser):
DOMAIN = 'gdcvault.com' DOMAIN = 'gdcvault.com'
ENCODING = 'utf-8' ENCODING = 'utf-8'
PAGES = {r'http://[w\.]*gdcvault.com/play/(?P<id>[\d]+)/?.*': VideoPage, PAGES = {r'http://[w\.]*gdcvault.com/play/(?P<id>[\d]+)/?.*': VideoPage,
r'http://[w\.]*gdcvault.com/': IndexPage, r'http://[w\.]*gdcvault.com/search\.php.*': (SearchPage, "json"),
r'http://[w\.]*gdcvault.com/.*': IndexPage,
} }
def is_logged(self): def is_logged(self):
@ -63,9 +64,9 @@ class GDCVaultBrowser(BaseBrowser):
data = self.readurl('http://gdcvault.com/api/login.php', data = self.readurl('http://gdcvault.com/api/login.php',
urllib.urlencode(params)) urllib.urlencode(params))
# data is returned as JSON, not sure yet if it's useful # some data returned as JSON, not sure yet if it's useful
#print data
print data
if data is None: if data is None:
raise BrowserBanned('Too many open sessions?') raise BrowserBanned('Too many open sessions?')
@ -75,7 +76,7 @@ class GDCVaultBrowser(BaseBrowser):
raise BrowserIncorrectPassword() raise BrowserIncorrectPassword()
def close_session(self): def close_session(self):
print "logging out..." # XXX: only if is_logged? or was used?
self.openurl('/logout', '') self.openurl('/logout', '')
@id2url(GDCVaultVideo.id2url) @id2url(GDCVaultVideo.id2url)
@ -86,13 +87,27 @@ class GDCVaultBrowser(BaseBrowser):
raise BrowserUnavailable('Requires account') raise BrowserUnavailable('Requires account')
return self.page.get_video(video) return self.page.get_video(video)
# def search_videos(self, pattern, sortby): def search_videos(self, pattern, sortby):
# return None post_data = {"firstfocus" : "",
# self.location(self.buildurl('http://gdcvault.com/en/search%s' % sortby, query=pattern.encode('utf-8'))) "category" : "free",
# assert self.is_on_page(IndexPage) "keyword" : pattern.encode('utf-8'),
# return self.page.iter_videos() "conference_id" : "", }
post_data = urllib.urlencode(post_data)
# probably not required
self.addheaders = [('Referer', 'http://gdcvault.com/'),
("Content-Type" , 'application/x-www-form-urlencoded') ]
# def latest_videos(self): #print post_data
# self.home() # is_logged assumes html page
# assert self.is_on_page(IndexPage) self.location('http://gdcvault.com/search.php',
# return self.page.iter_videos() data=post_data, no_login=True)
assert self.is_on_page(SearchPage)
return self.page.iter_videos()
def latest_videos(self):
    """Iterate over the videos listed on the '/free' page.

    Navigates to '/free', checks that the resulting page is an IndexPage
    and returns whatever that page's iter_videos() returns.
    """
    # The listing is read from '/free' rather than from the home page
    # (the former self.home() call is intentionally not used).
    self.location('/free')
    assert self.is_on_page(IndexPage)
    return self.page.iter_videos()

View file

@ -28,25 +28,69 @@ import datetime
from dateutil.parser import parse as parse_dt from dateutil.parser import parse as parse_dt
from weboob.capabilities.base import NotAvailable from weboob.capabilities.base import NotAvailable
from weboob.tools.capabilities.thumbnail import Thumbnail
from weboob.tools.browser import BrokenPageError from weboob.tools.browser import BrokenPageError
#HACK
from urllib2 import HTTPError
from .video import GDCVaultVideo from .video import GDCVaultVideo
#import lxml.etree #import lxml.etree
__all__ = ['IndexPage', 'VideoPage'] __all__ = ['IndexPage', 'SearchPage', 'VideoPage']
class IndexPage(BasePage): class IndexPage(BasePage):
def iter_videos(self): def iter_videos(self):
for a in self.parser.select(self.document.getroot(), 'section.conference ul.media_items li.featured a.session_item'): for a in self.parser.select(self.document.getroot(), 'section.conference ul.media_items li.featured a.session_item'):
print a href = a.attrib.get('href', '')
print href
m = re.match('/play/(\d+)/.*', href)
if not m:
continue
print m.group(1)
video = GDCVaultVideo(m.group(1))
# get title
try:
video.title = unicode(self.parser.select(a, 'div.conference_info p strong', 1).text)
except IndexError:
video.title = NotAvailable
# get description
try:
video.description = unicode(self.parser.select(a, 'div.conference_info p', 1).text)
except IndexError:
video.description = NotAvailable
# get thumbnail
img = self.parser.select(a, 'div.featured_image img', 1)
if img is not None:
video.thumbnail = Thumbnail(unicode(img.attrib['src']))
else:
video.thumbnail = NotAvailable
#m = re.match('id-(\d+)', a.attrib.get('class', '')) #m = re.match('id-(\d+)', a.attrib.get('class', ''))
#if not m: #if not m:
# continue # continue
# FIXME # FIXME
yield None yield video
# the search page class uses a JSON parser,
# since it's what search.php returns when POSTed (from Ajax)
class SearchPage(BasePage):
    """Result page of search.php, whose document is parsed as JSON."""

    def iter_videos(self):
        """Yield one video per usable entry of the JSON 'data' list.

        Entries for which GDCVaultVideo.get_video_from_json() returns
        None are skipped.

        :raises BrokenPageError: when the document is missing, has no
                                 'data' key, or its 'data' value is None
        """
        # A missing document (None) raises TypeError on subscripting and a
        # reply without the key raises KeyError: report both the same way
        # as an explicit null 'data' value.
        try:
            entries = self.document['data']
        except (KeyError, TypeError):
            entries = None
        if entries is None:
            raise BrokenPageError('Unable to find JSON data')

        for data in entries:
            video = GDCVaultVideo.get_video_from_json(data)
            # TODO: split type 4 videos into id and id#slides
            if video is None:
                continue
            yield video
class VideoPage(BasePage): class VideoPage(BasePage):
def get_video(self, video=None): def get_video(self, video=None):
@ -86,8 +130,34 @@ class VideoPage(BasePage):
m = re.match(".*new SWFObject.*addVariable\(\"file\", encodeURIComponent\(\"(.*)\"\)\).*", unicode(script.text), re.DOTALL) m = re.match(".*new SWFObject.*addVariable\(\"file\", encodeURIComponent\(\"(.*)\"\)\).*", unicode(script.text), re.DOTALL)
if m: if m:
video.url = "http://gdcvault.com%s" % (m.group(1)) video.url = "http://gdcvault.com%s" % (m.group(1))
# TODO: for non-free (like 769),
# must be logged to use /mediaProxy.php
# FIXME: doesn't seem to work yet, we get 2 bytes as html
# 769 should give:
# http://twvideo01.ubm-us.net/o1/gdcradio-net/2007/gdc/GDC07-4889.mp3
# HACK: we use mechanize directly here for now... FIXME
#print "asking for redirect on '%s'" % (video.url)
#self.browser.addheaders += [['Referer', 'http://gdcvault.com/play/%s' % self.group_dict['id']]]
#print self.browser.addheaders
self.browser.set_handle_redirect(False)
try:
req = self.browser.open_novisit(video.url)
headers = req.info()
if headers.get('Content-Type', '') == 'text/html' and headers.get('Content-Length', '') == '2':
print 'BUG'
print req.code
except HTTPError, e:
#print e.getcode()
if e.getcode() == 302 and hasattr(e, 'hdrs'):
#print e.hdrs['Location']
video.url = unicode(e.hdrs['Location'])
self.browser.set_handle_redirect(True)
video.set_empty_fields(NotAvailable) video.set_empty_fields(NotAvailable)
return video return video
#XXX: raise error? #XXX: raise error?
return None return None
@ -97,19 +167,33 @@ class VideoPage(BasePage):
# type 3 or 4 (iframe) # type 3 or 4 (iframe)
# get the config file for the rest # get the config file for the rest
iframe_url = obj.attrib['src'] iframe_url = obj.attrib['src']
m = re.match('(http:.*)player\.html\?.*xmlURL=([^&]+)\&token=([^&]+)', iframe_url) m = re.match('(http:.*/)[^/]*player\.html\?.*xmlURL=([^&]+).*\&token=([^&]+)', iframe_url)
if not m: if not m:
m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url) m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url)
if m is None: if m is None:
return None return None
# TODO: must be logged to use /mediaProxy.php
# type 3 (pdf slides) # type 3 (pdf slides)
video.ext = u'pdf' video.ext = u'pdf'
video.url = "http://gdcvault.com%s" % (unicode(iframe_url)) video.url = "http://gdcvault.com%s" % (unicode(iframe_url))
# HACK: we use mechanize directly here for now... FIXME
print "asking for redirect on '%s'" % (video.url)
self.browser.set_handle_redirect(False)
try:
req = self.browser.open_novisit(video.url)
except HTTPError, e:
if e.getcode() == 302 and hasattr(e, 'hdrs'):
video.url = unicode(e.hdrs['Location'])
self.browser.set_handle_redirect(True)
video.set_empty_fields(NotAvailable) video.set_empty_fields(NotAvailable)
return video return video
# type 4 (dual screen video) # type 4 (dual screen video)
config_url = m.group(1) + m.group(2)
# token doesn't actually seem required
config_url = m.group(1) + m.group(2) + '?token=' + m.group(3)
#config = self.browser.openurl(config_url).read() #config = self.browser.openurl(config_url).read()
config = self.browser.get_document(self.browser.openurl(config_url)) config = self.browser.get_document(self.browser.openurl(config_url))
@ -119,6 +203,12 @@ class VideoPage(BasePage):
if host is None: if host is None:
raise BrokenPageError('Missing tag in xml config file') raise BrokenPageError('Missing tag in xml config file')
# for id 1373 host is missing '/ondemand'
# only add it when only a domain is specified without path
m = re.match('^[^\/]+$', host)
if m:
host += "/ondemand"
videos = {} videos = {}
obj = self.parser.select(config.getroot(), 'speakervideo', 1) obj = self.parser.select(config.getroot(), 'speakervideo', 1)

View file

@ -19,8 +19,11 @@
from weboob.capabilities.video import BaseVideo from weboob.capabilities.video import BaseVideo
from weboob.capabilities.base import NotAvailable
from weboob.tools.capabilities.thumbnail import Thumbnail
import re import re
from dateutil.parser import parse as parse_dt
__all__ = ['GDCVaultVideo'] __all__ = ['GDCVaultVideo']
@ -41,3 +44,49 @@ class GDCVaultVideo(BaseVideo):
if m: if m:
return u'http://www.gdcvault.com/play/%s#slides' % _id return u'http://www.gdcvault.com/play/%s#slides' % _id
return u'http://www.gdcvault.com/play/%s' % _id return u'http://www.gdcvault.com/play/%s' % _id
@classmethod
def get_video_from_json(cls, data):
    """Build a video from one entry of the search results.

    :param data: mapping describing a single result; presumably one item
                 of the JSON 'data' list returned by search.php
                 (TODO confirm against the caller)
    :returns: a new instance of this class, or None when the entry has
              no 'vault_media_id'
    """
    # session_id is unique per talk
    # vault_media_id is unique per page
    # (but can refer to 2 video files for dual screen)
    # solr_id is "${vault_media_id}.${conference_id}.${session_id}.${vault_media_type_id}"

    # XXX: do we filter them or let people know about them?
    # Entries whose data['anchor']['href'] is '#' will not be accessible
    # (not free and not logged in); they are currently NOT filtered out.

    if 'vault_media_id' not in data:
        return None

    media_id = int(data['vault_media_id'])
    video = cls(media_id)

    # 1013679 has \n in title...
    video.title = unicode(data.get('session_name', '').replace('\n', ''))

    # TODO: strip out <p>, <br> and other html...
    # XXX: 1013422 has all 3 and !=
    # Preference order: 'overview', then 'spell', then 'description'.
    if 'overview' in data:
        video.description = unicode(data['overview'])
    elif 'spell' in data:
        video.description = unicode(data['spell'])
    else:
        video.description = unicode(data.get('description', ''))

    if 'image' in data:
        video.thumbnail = Thumbnail(unicode(data['image']))

    if 'speakers_name' in data:
        video.author = unicode(", ".join(data['speakers_name']))

    if 'start_date' in data:
        video.date = parse_dt(data['start_date'])

    if 'score' in data:
        video.rating = data['score']

    # Anything not provided by the entry is explicitly marked NotAvailable.
    video.set_empty_fields(NotAvailable)

    return video