From 38b80491f1854f2d7bc0f0e896da85a65b7f51c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Revol?= Date: Fri, 22 Mar 2013 06:32:37 +0100 Subject: [PATCH] gdcvault: Implement searching; partial download fix * Implement search functionality, using POST to get JSON data * Fix download for most items even for non-free ones. For now only missing are non-free mp3 files it seems (like 769), /mediaProxy.php returns 2bytes html crap. --- modules/gdcvault/backend.py | 6 +-- modules/gdcvault/browser.py | 43 +++++++++++----- modules/gdcvault/pages.py | 100 ++++++++++++++++++++++++++++++++++-- modules/gdcvault/video.py | 49 ++++++++++++++++++ 4 files changed, 176 insertions(+), 22 deletions(-) diff --git a/modules/gdcvault/backend.py b/modules/gdcvault/backend.py index cdb827b5..1b881e57 100644 --- a/modules/gdcvault/backend.py +++ b/modules/gdcvault/backend.py @@ -66,9 +66,9 @@ class GDCVaultBackend(BaseBackend, ICapVideo, ICapCollection): SORTBY = ['relevance', 'rating', 'views', 'time'] - # def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, max_results=None): - # with self.browser: - # return self.browser.search_videos(pattern, self.SORTBY[sortby]) + def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, max_results=None): + with self.browser: + return self.browser.search_videos(pattern, self.SORTBY[sortby]) def fill_video(self, video, fields): if fields != ['thumbnail']: diff --git a/modules/gdcvault/browser.py b/modules/gdcvault/browser.py index d423eae9..d98103f5 100644 --- a/modules/gdcvault/browser.py +++ b/modules/gdcvault/browser.py @@ -25,7 +25,7 @@ from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword, BrowserU from weboob.tools.browser.decorators import id2url #from .pages.index import IndexPage -from .pages import VideoPage, IndexPage +from .pages import VideoPage, IndexPage, SearchPage from .video import GDCVaultVideo @@ -36,7 +36,8 @@ class GDCVaultBrowser(BaseBrowser): DOMAIN = 
'gdcvault.com' ENCODING = 'utf-8' PAGES = {r'http://[w\.]*gdcvault.com/play/(?P[\d]+)/?.*': VideoPage, - r'http://[w\.]*gdcvault.com/': IndexPage, + r'http://[w\.]*gdcvault.com/search\.php.*': (SearchPage, "json"), + r'http://[w\.]*gdcvault.com/.*': IndexPage, } def is_logged(self): @@ -63,9 +64,9 @@ class GDCVaultBrowser(BaseBrowser): data = self.readurl('http://gdcvault.com/api/login.php', urllib.urlencode(params)) - # data is returned as JSON, not sure yet if it's useful + # some data returned as JSON, not sure yet if it's useful + #print data - print data if data is None: raise BrowserBanned('Too many open sessions?') @@ -75,7 +76,7 @@ class GDCVaultBrowser(BaseBrowser): raise BrowserIncorrectPassword() def close_session(self): - print "logging out..." + # XXX: only if is_logged? or was used? self.openurl('/logout', '') @id2url(GDCVaultVideo.id2url) @@ -86,13 +87,27 @@ class GDCVaultBrowser(BaseBrowser): raise BrowserUnavailable('Requires account') return self.page.get_video(video) - # def search_videos(self, pattern, sortby): - # return None - # self.location(self.buildurl('http://gdcvault.com/en/search%s' % sortby, query=pattern.encode('utf-8'))) - # assert self.is_on_page(IndexPage) - # return self.page.iter_videos() + def search_videos(self, pattern, sortby): + post_data = {"firstfocus" : "", + "category" : "free", + "keyword" : pattern.encode('utf-8'), + "conference_id" : "", } + post_data = urllib.urlencode(post_data) + # probably not required + self.addheaders = [('Referer', 'http://gdcvault.com/'), + ("Content-Type" , 'application/x-www-form-urlencoded') ] - # def latest_videos(self): - # self.home() - # assert self.is_on_page(IndexPage) - # return self.page.iter_videos() + #print post_data + # is_logged assumes html page + self.location('http://gdcvault.com/search.php', + data=post_data, no_login=True) + + assert self.is_on_page(SearchPage) + return self.page.iter_videos() + + def latest_videos(self): + print "browser:latest_videos()" + #self.home() + 
self.location('/free') + assert self.is_on_page(IndexPage) + return self.page.iter_videos() diff --git a/modules/gdcvault/pages.py b/modules/gdcvault/pages.py index aa48a6f0..712117ef 100644 --- a/modules/gdcvault/pages.py +++ b/modules/gdcvault/pages.py @@ -28,25 +28,69 @@ import datetime from dateutil.parser import parse as parse_dt from weboob.capabilities.base import NotAvailable +from weboob.tools.capabilities.thumbnail import Thumbnail from weboob.tools.browser import BrokenPageError +#HACK +from urllib2 import HTTPError + from .video import GDCVaultVideo #import lxml.etree -__all__ = ['IndexPage', 'VideoPage'] +__all__ = ['IndexPage', 'SearchPage', 'VideoPage'] class IndexPage(BasePage): def iter_videos(self): for a in self.parser.select(self.document.getroot(), 'section.conference ul.media_items li.featured a.session_item'): - print a + href = a.attrib.get('href', '') + print href + m = re.match('/play/(\d+)/.*', href) + if not m: + continue + print m.group(1) + video = GDCVaultVideo(m.group(1)) + + # get title + try: + video.title = unicode(self.parser.select(a, 'div.conference_info p strong', 1).text) + except IndexError: + video.title = NotAvailable + + # get description + try: + video.description = unicode(self.parser.select(a, 'div.conference_info p', 1).text) + except IndexError: + video.description = NotAvailable + + # get thumbnail + img = self.parser.select(a, 'div.featured_image img', 1) + if img is not None: + video.thumbnail = Thumbnail(unicode(img.attrib['src'])) + else: + video.thumbnail = NotAvailable + + #m = re.match('id-(\d+)', a.attrib.get('class', '')) #if not m: # continue # FIXME - yield None + yield video + +# the search page class uses a JSON parser, +# since it's what search.php returns when POSTed (from Ajax) +class SearchPage(BasePage): + def iter_videos(self): + if self.document is None or self.document['data'] is None: + raise BrokenPageError('Unable to find JSON data') + for data in self.document['data']: + video = 
GDCVaultVideo.get_video_from_json(data) + # TODO: split type 4 videos into id and id#slides + if video is None: + continue + yield video class VideoPage(BasePage): def get_video(self, video=None): @@ -86,8 +130,34 @@ class VideoPage(BasePage): m = re.match(".*new SWFObject.*addVariable\(\"file\", encodeURIComponent\(\"(.*)\"\)\).*", unicode(script.text), re.DOTALL) if m: video.url = "http://gdcvault.com%s" % (m.group(1)) + # TODO: for non-free (like 769), + # must be logged to use /mediaProxy.php + + # FIXME: doesn't seem to work yet, we get 2 bytes as html + # 769 should give: + # http://twvideo01.ubm-us.net/o1/gdcradio-net/2007/gdc/GDC07-4889.mp3 + # HACK: we use mechanize directly here for now... FIXME + #print "asking for redirect on '%s'" % (video.url) + #self.browser.addheaders += [['Referer', 'http://gdcvault.com/play/%s' % self.group_dict['id']]] + #print self.browser.addheaders + self.browser.set_handle_redirect(False) + try: + req = self.browser.open_novisit(video.url) + headers = req.info() + if headers.get('Content-Type', '') == 'text/html' and headers.get('Content-Length', '') == '2': + print 'BUG' + + print req.code + except HTTPError, e: + #print e.getcode() + if e.getcode() == 302 and hasattr(e, 'hdrs'): + #print e.hdrs['Location'] + video.url = unicode(e.hdrs['Location']) + self.browser.set_handle_redirect(True) + video.set_empty_fields(NotAvailable) return video + #XXX: raise error? 
return None @@ -97,19 +167,33 @@ class VideoPage(BasePage): # type 3 or 4 (iframe) # get the config file for the rest iframe_url = obj.attrib['src'] - m = re.match('(http:.*)player\.html\?.*xmlURL=([^&]+)\&token=([^&]+)', iframe_url) + m = re.match('(http:.*/)[^/]*player\.html\?.*xmlURL=([^&]+).*\&token=([^&]+)', iframe_url) if not m: m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url) if m is None: return None + # TODO: must be logged to use /mediaProxy.php # type 3 (pdf slides) video.ext = u'pdf' video.url = "http://gdcvault.com%s" % (unicode(iframe_url)) + + # HACK: we use mechanize directly here for now... FIXME + print "asking for redirect on '%s'" % (video.url) + self.browser.set_handle_redirect(False) + try: + req = self.browser.open_novisit(video.url) + except HTTPError, e: + if e.getcode() == 302 and hasattr(e, 'hdrs'): + video.url = unicode(e.hdrs['Location']) + self.browser.set_handle_redirect(True) + video.set_empty_fields(NotAvailable) return video # type 4 (dual screen video) - config_url = m.group(1) + m.group(2) + + # token doesn't actually seem required + config_url = m.group(1) + m.group(2) + '?token=' + m.group(3) #config = self.browser.openurl(config_url).read() config = self.browser.get_document(self.browser.openurl(config_url)) @@ -119,6 +203,12 @@ class VideoPage(BasePage): if host is None: raise BrokenPageError('Missing tag in xml config file') + # for id 1373 host is missing '/ondemand' + # only add it when only a domain is specified without path + m = re.match('^[^\/]+$', host) + if m: + host += "/ondemand" + videos = {} obj = self.parser.select(config.getroot(), 'speakervideo', 1) diff --git a/modules/gdcvault/video.py b/modules/gdcvault/video.py index 6aab4804..eaa0394a 100644 --- a/modules/gdcvault/video.py +++ b/modules/gdcvault/video.py @@ -19,8 +19,11 @@ from weboob.capabilities.video import BaseVideo +from weboob.capabilities.base import NotAvailable +from weboob.tools.capabilities.thumbnail import Thumbnail import re +from 
dateutil.parser import parse as parse_dt __all__ = ['GDCVaultVideo'] @@ -41,3 +44,49 @@ class GDCVaultVideo(BaseVideo): if m: return u'http://www.gdcvault.com/play/%s#slides' % _id return u'http://www.gdcvault.com/play/%s' % _id + + @classmethod + def get_video_from_json(self, data): + # session_id is unique per talk + # vault_media_id is unique per page + # (but can refer to 2 video files for dual screen) + # solr_id is "${vault_media_id}.${conference_id}.${session_id}.$vault_media_type_id{}" + + # XXX: do we filter them or let people know about them? + #if 'anchor' in data: + # if data['anchor']['href'] == '#': + # # file will not be accessible (not free and not logged in) + # return None + + if not 'vault_media_id' in data: + return None + media_id = int(data['vault_media_id']) + video = GDCVaultVideo(media_id) + + # 1013679 has \n in title... + video.title = unicode(data.get('session_name', '').replace('\n', '')) + + # TODO: strip out

 <p>, <br>
and other html... + # XXX: 1013422 has all 3 and != + if 'overview' in data: + video.description = unicode(data['overview']) + elif 'spell' in data: + video.description = unicode(data['spell']) + else: + video.description = unicode(data.get('description', '')) + + if 'image' in data: + video.thumbnail = Thumbnail(unicode(data['image'])) + + if 'speakers_name' in data: + video.author = unicode(", ".join(data['speakers_name'])) + + if 'start_date' in data: + video.date = parse_dt(data['start_date']) + + if 'score' in data: + video.rating = data['score'] + + video.set_empty_fields(NotAvailable) + + return video