Add a video module for gdcvault.com

For now it only fetches the speaker video, but each page can have both a speaker and a slides video feed. TODO: search. Signed-off-by: François Revol <revol@free.fr> Signed-off-by: Romain Bignon <romain@symlink.me>
2012-08-31 19:23:29 +02:00 · 2012-08-31 19:23:29 +02:00 · d24ca46ef6
commit d24ca46ef6
parent bb9a62b566
8 changed files with 382 additions and 0 deletions
--- a/modules/gdcvault/pages.py
+++ b/modules/gdcvault/pages.py
@ -0,0 +1,159 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Romain Bignon
+# Copyright(C) 2012 François Revol
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+from weboob.tools.mech import ClientForm
+ControlNotFoundError = ClientForm.ControlNotFoundError
+
+from weboob.tools.browser import BasePage
+
+import re
+import datetime
+from dateutil.parser import parse as parse_dt
+
+from weboob.capabilities.base import NotAvailable
+from weboob.tools.browser import BrokenPageError
+
+from .video import GDCVaultVideo
+
+#import lxml.etree
+
+
+
+
+__all__ = ['VideoPage']
+
+class VideoPage(BasePage):
+    def get_video(self, video=None):
+        if video is None:
+            video = GDCVaultVideo(self.group_dict['id'])
+
+        # the config file has it too, but in CDATA
+        obj = self.parser.select(self.document.getroot(), 'title')
+        if len(obj) > 0:
+            title = obj[0].text.strip()
+            m = re.match('GDC Vault\s+-\s+(.*)', title)
+            if m:
+                title = m.group(1)
+        video.title = unicode(title)
+
+        # get the config file for the rest
+        obj = self.parser.select(self.document.getroot(), 'iframe', 1)
+        if obj is None:
+            return None
+        iframe_url = obj.attrib['src']
+        m = re.match('(http:.*)player.html\?.*xmlURL=([^&]+)\&token=([^&]+)', iframe_url)
+        if not m:
+            return None
+        config_url = m.group(1) + m.group(2)
+
+        #config = self.browser.openurl(config_url).read()
+        config = self.browser.get_document(self.browser.openurl(config_url))
+
+        obj = self.parser.select(config.getroot(), 'akamaihost', 1)
+        host = obj.text
+        if host is None:
+            raise BrokenPageError('Missing tag in xml config file')
+
+        videos = {}
+
+        obj = self.parser.select(config.getroot(), 'speakervideo', 1)
+        videos['speaker'] = 'rtmp://' + host + '/' + obj.text
+
+        obj = self.parser.select(config.getroot(), 'slidevideo', 1)
+        videos['slides'] = 'rtmp://' + host + '/' + obj.text
+
+        #print videos
+
+        obj = self.parser.select(config.getroot(), 'date', 1)
+        video.date = parse_dt(obj.text)
+
+        obj = self.parser.select(config.getroot(), 'duration', 1)
+        m = re.match('(\d\d):(\d\d):(\d\d)', obj.text)
+        if m:
+            video.duration = datetime.timedelta(hours = int(m.group(1)),
+                                                minutes = int(m.group(2)),
+                                                seconds = int(m.group(3)))
+
+        obj = self.parser.select(config.getroot(), 'speaker', 1)
+        #print obj.text_content()
+
+        #TODO: speaker as CDATA
+        #video.author = u'European Parliament'
+
+        #XXX
+        video.url = unicode(videos['speaker'])
+        #self.set_details(video)
+
+        video.set_empty_fields(NotAvailable)
+        return video
+
+        obj = self.parser.select(self.document.getroot(), 'title')
+        if len(obj) < 1:
+            return None
+        title = obj[0].text.strip()
+        m = re.match('GDC Vault\s+-\s+(.*)', title)
+        if m:
+            title = m.group(1)
+
+    def set_details(self, v):
+        obj = self.parser.select(self.document.getroot(), 'meta[name=available]', 1)
+        if obj is not None:
+            value = obj.attrib['content']
+            m = re.match('(\d\d)-(\d\d)-(\d\d\d\d)\s*(\d\d):(\d\d)', value)
+            if not m:
+                raise BrokenPageError('Unable to parse datetime: %r' % value)
+            day = m.group(1)
+            month = m.group(2)
+            year = m.group(3)
+            hour = m.group(4)
+            minute = m.group(5)
+            v.date = datetime.datetime(year=int(year),
+                                       month=int(month),
+                                       day=int(day),
+                                       hour=int(hour),
+                                       minute=int(minute))
+            
+        obj = self.parser.select(self.document.getroot(), 'span.ep_subtitle', 1)
+        if obj is not None:
+            span = self.parser.select(obj, 'span.ep_date', 1)
+            value = span.text
+            m = re.match('(\d\d):(\d\d)\s*\/\s*(\d\d):(\d\d)\s*-\s*(\d\d)-(\d\d)-(\d\d\d\d)', value)
+            if not m:
+                raise BrokenPageError('Unable to parse datetime: %r' % value)
+            bhour = m.group(1)
+            bminute = m.group(2)
+            ehour = m.group(3)
+            eminute = m.group(4)
+            day = m.group(5)
+            month = m.group(6)
+            year = m.group(7)
+            
+            start = datetime.datetime(year=int(year),
+                                      month=int(month),
+                                      day=int(day),
+                                      hour=int(bhour),
+                                      minute=int(bminute))
+            end = datetime.datetime(year=int(year),
+                                    month=int(month),
+                                    day=int(day),
+                                    hour=int(ehour),
+                                    minute=int(eminute))
+
+            v.duration = end - start