# -*- coding: utf-8 -*-

# Copyright(C) 2010-2011 Romain Bignon
# Copyright(C) 2012 François Revol
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see http://www.gnu.org/licenses/.

from weboob.tools.mech import ClientForm
ControlNotFoundError = ClientForm.ControlNotFoundError

from weboob.tools.browser import BasePage

import re
import datetime
from dateutil.parser import parse as parse_dt

from weboob.capabilities.base import NotAvailable
from weboob.tools.browser import BrokenPageError

from .video import GDCVaultVideo

#import lxml.etree

__all__ = ['IndexPage', 'VideoPage']


class IndexPage(BasePage):
    """Index page; enumerating its videos is still an unfinished stub."""

    def iter_videos(self):
        """Iterate over the featured session links of the page.

        FIXME: not implemented yet. Each matching anchor is only printed
        and ``None`` is yielded in its place instead of a video object.
        """
        anchors = self.parser.select(
            self.document.getroot(),
            'section.conference ul.media_items li.featured a.session_item')
        for a in anchors:
            # Parenthesised form: same output on Python 2, and it no longer
            # prevents the module from being parsed by Python 3.
            print(a)
            # Possible id extraction, not wired in yet:
            #   m = re.match(r'id-(\d+)', a.attrib.get('class', ''))
            #   if not m:
            #       continue
            # FIXME
            yield None


class VideoPage(BasePage):
    """Page of a single session, from which the media location is extracted."""

    def get_video(self, video=None):
        """Fill (or create) a GDCVaultVideo from the current page.

        Four page layouts are handled, numbered as in the inline comments:
        types 1 and 2 embed a SWF player configured from a script element,
        type 3 shows PDF slides in an iframe, and type 4 is a dual-screen
        player whose iframe points at an XML config file.

        :param video: existing video object to fill; when None a new
            GDCVaultVideo is created from ``self.group_dict['id']``.
        :returns: the filled video, or None when the page layout is not
            recognised.
        :raises BrokenPageError: when the type 4 config file has an empty
            ``akamaihost`` element.
        """
        root = self.document.getroot()

        # check for slides id variant
        want_slides = re.match(r'.*#slides', self.url) is not None
        if want_slides and not self.group_dict['id'].endswith('#slides'):
            # not sure it's safe
            # The suffix is appended only once, so calling get_video()
            # again on the same page does not keep growing the id.
            self.group_dict['id'] += '#slides'

        if video is None:
            video = GDCVaultVideo(self.group_dict['id'])

        # the config file has it too, but in CDATA and only for type 4
        titles = self.parser.select(root, 'title')
        if len(titles) > 0 and titles[0].text is not None:
            title = titles[0].text.strip()
            m = re.match(r'GDC Vault\s+-\s+(.*)', title)
            if m:
                title = m.group(1)
            video.title = unicode(title)

        #TODO: POST back the title to /search.php and filter == id to get
        # cleaner (JSON) data... (though it'd be much slower)

        # try to find an iframe (type 3 and 4)
        iframes = self.parser.select(root, 'iframe')
        if len(iframes) == 0:
            # type 1 or 2 (swf+js)
            return self._get_swf_video(video, root)

        iframe = iframes[0]
        if iframe is None:
            return None

        # type 3 or 4 (iframe)
        iframe_url = iframe.attrib['src']
        m = re.match(
            r'(http:.*)player\.html\?.*xmlURL=([^&]+)\&token=([^&]+)',
            iframe_url)
        if m:
            # type 4 (dual screen video)
            return self._get_dual_screen_video(video, m, want_slides)

        if re.match(r'/play/mediaProxy\.php\?sid=(\d+)', iframe_url) is None:
            return None

        # type 3 (pdf slides)
        video.ext = u'pdf'
        video.url = "http://gdcvault.com%s" % (unicode(iframe_url))
        video.set_empty_fields(NotAvailable)
        return video

    def _get_swf_video(self, video, root):
        """Fill *video* from the SWFObject arguments found in a script tag.

        Returns the video as soon as a script declaring the "file" variable
        is found, or None when no script element declares it.
        """
        type_re = r".*new SWFObject.*addVariable\('type', '(.*)'\).*"
        file_re = (r'.*new SWFObject.*addVariable\("file", '
                   r'encodeURIComponent\("(.*)"\)\).*')

        # find which script element contains the swf args
        for script in self.parser.select(root, 'script'):
            text = unicode(script.text)

            m = re.match(type_re, text, re.DOTALL)
            if m:
                video.ext = m.group(1)

            m = re.match(file_re, text, re.DOTALL)
            if m:
                video.url = "http://gdcvault.com%s" % (m.group(1))
                video.set_empty_fields(NotAvailable)
                return video

        #XXX: raise error?
        return None

    def _get_dual_screen_video(self, video, m, want_slides):
        """Fill *video* from the XML config file of a dual-screen player.

        *m* is the match of the player iframe URL: group 1 is the base URL
        and group 2 the config file name. The slides stream is selected
        instead of the speaker stream when *want_slides* is true.
        """
        # get the config file for the rest
        config_url = m.group(1) + m.group(2)
        #config = self.browser.openurl(config_url).read()
        config = self.browser.get_document(self.browser.openurl(config_url))
        config_root = config.getroot()

        host = self.parser.select(config_root, 'akamaihost', 1).text
        if host is None:
            raise BrokenPageError('Missing tag in xml config file')

        base = 'rtmp://' + host + '/'
        videos = {}
        videos['speaker'] = base + self.parser.select(
            config_root, 'speakervideo', 1).text
        videos['slides'] = base + self.parser.select(
            config_root, 'slidevideo', 1).text

        video.date = parse_dt(self.parser.select(config_root, 'date', 1).text)

        duration = self.parser.select(config_root, 'duration', 1).text
        m = re.match(r'(\d\d):(\d\d):(\d\d)', duration)
        if m:
            video.duration = datetime.timedelta(hours=int(m.group(1)),
                                                minutes=int(m.group(2)),
                                                seconds=int(m.group(3)))

        # The result is not used yet. The lookup is kept because select()
        # is asked for exactly one match and presumably fails when the
        # element is missing - TODO confirm, then use or drop it.
        self.parser.select(config_root, 'speaker', 1)
        #print obj.text_content()

        #XXX
        if want_slides:
            video.url = unicode(videos['slides'])
        else:
            video.url = unicode(videos['speaker'])

        #self.set_details(video)

        video.set_empty_fields(NotAvailable)
        return video

    def set_details(self, v):
        """Set ``v.date`` and ``v.duration`` from metadata found in the page.

        The date comes from the ``meta[name=available]`` tag
        ("DD-MM-YYYY HH:MM"); the duration is the difference between the
        begin and end times of ``span.ep_date``
        ("HH:MM / HH:MM - DD-MM-YYYY").

        :raises BrokenPageError: when either value does not have the
            expected format.
        """
        root = self.document.getroot()

        meta = self.parser.select(root, 'meta[name=available]', 1)
        if meta is not None:
            value = meta.attrib['content']
            m = re.match(r'(\d\d)-(\d\d)-(\d\d\d\d)\s*(\d\d):(\d\d)', value)
            if not m:
                raise BrokenPageError('Unable to parse datetime: %r' % value)
            day, month, year, hour, minute = [int(g) for g in m.groups()]
            v.date = datetime.datetime(year=year, month=month, day=day,
                                       hour=hour, minute=minute)

        subtitle = self.parser.select(root, 'span.ep_subtitle', 1)
        if subtitle is not None:
            span = self.parser.select(subtitle, 'span.ep_date', 1)
            value = span.text
            m = re.match(
                r'(\d\d):(\d\d)\s*\/\s*(\d\d):(\d\d)\s*-\s*(\d\d)-(\d\d)-(\d\d\d\d)',
                value)
            if not m:
                raise BrokenPageError('Unable to parse datetime: %r' % value)
            (bhour, bminute, ehour, eminute,
             day, month, year) = [int(g) for g in m.groups()]

            start = datetime.datetime(year=year, month=month, day=day,
                                      hour=bhour, minute=bminute)
            end = datetime.datetime(year=year, month=month, day=day,
                                    hour=ehour, minute=eminute)

            v.duration = end - start