weboob-devel/modules/gdcvault/pages.py
François Revol 38b80491f1 gdcvault: Implement searching; partial download fix
* Implement search functionality, using POST to get JSON data
* Fix download for most items even for non-free ones.
For now the only items still missing seem to be non-free mp3 files (like 769):
/mediaProxy.php returns a 2-byte HTML body for them.
2013-03-22 16:45:45 +01:00

296 lines
11 KiB
Python

# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
# Copyright(C) 2012 François Revol
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.mech import ClientForm
ControlNotFoundError = ClientForm.ControlNotFoundError
from weboob.tools.browser import BasePage
import re
import datetime
from dateutil.parser import parse as parse_dt
from weboob.capabilities.base import NotAvailable
from weboob.tools.capabilities.thumbnail import Thumbnail
from weboob.tools.browser import BrokenPageError
#HACK
from urllib2 import HTTPError
from .video import GDCVaultVideo
#import lxml.etree
__all__ = ['IndexPage', 'SearchPage', 'VideoPage']
class IndexPage(BasePage):
    """Index page of the vault, listing the featured sessions."""

    def iter_videos(self):
        """Yield a GDCVaultVideo for each featured session link on the page.

        Links whose ``href`` does not start with ``/play/<digits>/`` are
        skipped.  Title, description and thumbnail are set to NotAvailable
        when the matching element (or its text / ``src``) is missing, so a
        single incomplete entry does not abort the whole listing.
        """
        root = self.document.getroot()
        selector = 'section.conference ul.media_items li.featured a.session_item'
        for a in self.parser.select(root, selector):
            m = re.match(r'/play/(\d+)/.*', a.attrib.get('href', ''))
            if not m:
                continue

            video = GDCVaultVideo(m.group(1))
            video.title = self._text_or_na(a, 'div.conference_info p strong')
            video.description = self._text_or_na(a, 'div.conference_info p')

            img = self._select_one(a, 'div.featured_image img')
            src = img.attrib.get('src') if img is not None else None
            video.thumbnail = Thumbnail(unicode(src)) if src else NotAvailable

            yield video

    def _select_one(self, element, selector):
        """Return what ``parser.select(element, selector, 1)`` gives, or None on failure."""
        try:
            return self.parser.select(element, selector, 1)
        except (IndexError, BrokenPageError):
            # NOTE(review): weboob's select() is expected to raise
            # BrokenPageError (not IndexError) when the element count does
            # not match -- confirm against the parser; both are handled here.
            return None

    def _text_or_na(self, element, selector):
        """Return the text of the selected element as unicode, or NotAvailable."""
        node = self._select_one(element, selector)
        if node is None or node.text is None:
            # avoid storing the literal string u'None' for an empty element
            return NotAvailable
        return unicode(node.text)
# the search page class uses a JSON parser,
# since it's what search.php returns when POSTed (from Ajax)
class SearchPage(BasePage):
    """Search results page; ``self.document`` is indexed as decoded JSON."""

    def iter_videos(self):
        """Yield one video per entry of the ``data`` list of the JSON answer.

        Entries for which GDCVaultVideo.get_video_from_json() returns None
        are dropped.

        Raises BrokenPageError when the document or its ``data`` member is
        None.
        """
        document = self.document
        if document is None or document['data'] is None:
            raise BrokenPageError('Unable to find JSON data')

        for entry in document['data']:
            video = GDCVaultVideo.get_video_from_json(entry)
            # TODO: split type 4 videos into id and id#slides
            if video is not None:
                yield video
class VideoPage(BasePage):
    """Page of a single vault item.

    The page is handled in one of three ways, depending on its markup:

    * no ``iframe``: a SWF player configured from an inline script
      (types 1 and 2);
    * an ``iframe`` pointing at ``/play/mediaProxy.php``: PDF slides (type 3);
    * an ``iframe`` pointing at ``player.html``: dual screen video described
      by an XML config file (type 4).
    """

    def get_video(self, video=None):
        """Fill *video* (created from the page id when None) and return it.

        Returns None when no media location can be extracted from the page.
        Raises BrokenPageError when the type 4 config file has an empty
        ``akamaihost`` tag.
        """
        # check for slides id variant
        want_slides = '#slides' in self.url
        if want_slides and not self.group_dict['id'].endswith('#slides'):
            # not sure it's safe; at least do not append the suffix twice
            # when get_video() is called again on the same page
            self.group_dict['id'] += '#slides'

        if video is None:
            video = GDCVaultVideo(self.group_dict['id'])

        root = self.document.getroot()

        # the config file has it too, but in CDATA and only for type 4
        title = self._page_title(root)
        if title is not None:
            video.title = title

        #TODO: POST back the title to /search.php and filter == id to get
        # cleaner (JSON) data... (though it'd be much slower)

        # try to find an iframe (type 3 and 4)
        iframes = self.parser.select(root, 'iframe')
        if len(iframes) == 0:
            # type 1 or 2 (swf+js)
            return self._get_swf_video(video, root)

        iframe_url = iframes[0].attrib['src']
        m = re.match(r'(http:.*/)[^/]*player\.html\?.*xmlURL=([^&]+).*\&token=([^&]+)', iframe_url)
        if m:
            # type 4 (dual screen video)
            # token doesn't actually seem required
            config_url = m.group(1) + m.group(2) + '?token=' + m.group(3)
            return self._get_dual_screen_video(video, config_url, want_slides)

        if re.match(r'/play/mediaProxy\.php\?sid=(\d+)', iframe_url) is None:
            return None

        # TODO: must be logged to use /mediaProxy.php
        # type 3 (pdf slides)
        video.ext = u'pdf'
        video.url = self._resolve_redirect("http://gdcvault.com%s" % (unicode(iframe_url)))
        video.set_empty_fields(NotAvailable)
        return video

    def _page_title(self, root):
        """Return the <title> text without its 'GDC Vault - ' prefix, or None."""
        titles = self.parser.select(root, 'title')
        if len(titles) == 0 or titles[0].text is None:
            return None
        title = titles[0].text.strip()
        m = re.match(r'GDC Vault\s+-\s+(.*)', title)
        if m:
            title = m.group(1)
        return unicode(title)

    def _get_swf_video(self, video, root):
        """Fill *video* from the inline script that sets up the SWF player.

        Returns *video* once a script with a ``file`` variable is found, None
        when no script has one.
        """
        # find which script element contains the swf args
        for script in self.parser.select(root, 'script'):
            text = unicode(script.text)

            # non-greedy captures: with DOTALL a greedy group would run up to
            # the last closing quote of the whole script
            m = re.match(r".*new SWFObject.*addVariable\('type', '(.*?)'\).*", text, re.DOTALL)
            if m:
                video.ext = m.group(1)

            m = re.match(r'.*new SWFObject.*addVariable\("file", encodeURIComponent\("(.*?)"\)\).*', text, re.DOTALL)
            if not m:
                continue

            # TODO: for non-free (like 769),
            # must be logged to use /mediaProxy.php
            # FIXME: doesn't seem to work yet, we get 2 bytes as html
            # 769 should give:
            # http://twvideo01.ubm-us.net/o1/gdcradio-net/2007/gdc/GDC07-4889.mp3
            video.url = self._resolve_redirect("http://gdcvault.com%s" % (m.group(1)))
            video.set_empty_fields(NotAvailable)
            return video

        #XXX: raise error?
        return None

    def _resolve_redirect(self, url):
        """Return the Location of the HTTP 302 answered for *url*, else *url*.

        Redirect following is switched off on the browser for the duration
        of the request, so that a 302 surfaces as an HTTPError carrying the
        target in its headers.  It is switched back on in every case, also
        when the request fails with another exception.  Other HTTP errors
        are ignored (best effort) and leave *url* unchanged.
        """
        import logging
        logger = logging.getLogger(__name__)

        # HACK: we use mechanize directly here for now... FIXME
        logger.debug("asking for redirect on '%s'", url)
        self.browser.set_handle_redirect(False)
        try:
            response = self.browser.open_novisit(url)
            headers = response.info()
            if (headers.get('Content-Type', '') == 'text/html'
                    and headers.get('Content-Length', '') == '2'):
                # known failure: the proxy answers a 2 bytes html body
                # instead of redirecting to the media file
                logger.warning("got a 2 bytes html answer (code %s) for '%s'",
                               response.code, url)
        except HTTPError as e:
            if e.getcode() == 302 and hasattr(e, 'hdrs'):
                return unicode(e.hdrs['Location'])
        finally:
            self.browser.set_handle_redirect(True)
        return url

    def _get_dual_screen_video(self, video, config_url, want_slides):
        """Fill *video* from the XML config file found at *config_url*.

        The url is taken from the ``slidevideo`` tag when *want_slides* is
        true, from ``speakervideo`` otherwise.  Raises BrokenPageError when
        the ``akamaihost`` tag is empty.
        """
        #config = self.browser.openurl(config_url).read()
        config = self.browser.get_document(self.browser.openurl(config_url))
        config_root = config.getroot()

        host = self.parser.select(config_root, 'akamaihost', 1).text
        if host is None:
            raise BrokenPageError('Missing tag in xml config file')

        # for id 1373 host is missing '/ondemand'
        # only add it when only a domain is specified without path
        if re.match(r'^[^\/]+$', host):
            host += "/ondemand"

        speaker_path = self.parser.select(config_root, 'speakervideo', 1).text
        slides_path = self.parser.select(config_root, 'slidevideo', 1).text

        video.date = parse_dt(self.parser.select(config_root, 'date', 1).text)

        duration = self.parser.select(config_root, 'duration', 1).text
        m = re.match(r'(\d\d):(\d\d):(\d\d)', duration)
        if m:
            hours, minutes, seconds = [int(group) for group in m.groups()]
            video.duration = datetime.timedelta(hours=hours,
                                                minutes=minutes,
                                                seconds=seconds)

        path = slides_path if want_slides else speaker_path
        video.url = unicode('rtmp://' + host + '/' + path)

        # set_details() is deliberately not called here
        video.set_empty_fields(NotAvailable)
        return video

    def set_details(self, v):
        """Set ``v.date`` and ``v.duration`` from the page markup.

        The date is read from the ``meta[name=available]`` content
        (``DD-MM-YYYY HH:MM``), the duration from the ``span.ep_date`` text
        (``HH:MM / HH:MM - DD-MM-YYYY``) as end time minus start time.
        get_video() does not call this method.

        Raises BrokenPageError when either value does not have the expected
        format.
        """
        root = self.document.getroot()

        obj = self.parser.select(root, 'meta[name=available]', 1)
        if obj is not None:
            value = obj.attrib['content']
            m = re.match(r'(\d\d)-(\d\d)-(\d\d\d\d)\s*(\d\d):(\d\d)', value)
            if not m:
                raise BrokenPageError('Unable to parse datetime: %r' % value)
            day, month, year, hour, minute = [int(group) for group in m.groups()]
            v.date = datetime.datetime(year=year,
                                       month=month,
                                       day=day,
                                       hour=hour,
                                       minute=minute)

        obj = self.parser.select(root, 'span.ep_subtitle', 1)
        if obj is not None:
            span = self.parser.select(obj, 'span.ep_date', 1)
            value = span.text
            m = re.match(r'(\d\d):(\d\d)\s*\/\s*(\d\d):(\d\d)\s*-\s*(\d\d)-(\d\d)-(\d\d\d\d)', value)
            if not m:
                raise BrokenPageError('Unable to parse datetime: %r' % value)
            bhour, bminute, ehour, eminute, day, month, year = \
                [int(group) for group in m.groups()]
            start = datetime.datetime(year=year, month=month, day=day,
                                      hour=bhour, minute=bminute)
            end = datetime.datetime(year=year, month=month, day=day,
                                    hour=ehour, minute=eminute)
            v.duration = end - start