gdcvault: implement all 4 media types

* We now correctly retrieve all 4 media types: ** Video (plain flv file), ex: http://gdcvault.com/play/29 ** Audio (mp3), ex: http://gdcvault.com/play/22/From-One-off-to-Franchise ** Slides (pdf; not really a video, but they download fine), ex: http://gdcvault.com/play/1015486 ** DS Video (dual screen, two flv files; by default the speaker file is retrieved, but id#slides gets the other (slides) video, although the default filenames then conflict), ex: http://gdcvault.com/play/1015841
2013-03-18 02:26:30 +01:00 · 2013-03-18 02:26:30 +01:00 · c8685b8e3b
commit c8685b8e3b
parent 3a79a21360
1 changed files with 46 additions and 8 deletions
--- a/modules/gdcvault/pages.py
+++ b/modules/gdcvault/pages.py
@ -50,10 +50,18 @@ class IndexPage(BasePage):

 class VideoPage(BasePage):
    def get_video(self, video=None):
+        # check for slides id variant
+        want_slides = False
+        m = re.match('.*#slides', self.url)
+        if m:
+            want_slides = True
+            # not sure it's safe
+            self.group_dict['id'] += '#slides'
+
        if video is None:
            video = GDCVaultVideo(self.group_dict['id'])

-        # the config file has it too, but in CDATA
+        # the config file has it too, but in CDATA and only for type 4
        obj = self.parser.select(self.document.getroot(), 'title')
        if len(obj) > 0:
            title = obj[0].text.strip()
@ -62,14 +70,45 @@ class VideoPage(BasePage):
                title = m.group(1)
        video.title = unicode(title)

-        # get the config file for the rest
-        obj = self.parser.select(self.document.getroot(), 'iframe', 1)
+        #TODO: POST back the title to /search.php and filter == id to get
+        # cleaner (JSON) data... (though it'd be much slower)
+
+        # try to find an iframe (type 3 and 4)
+        obj = self.parser.select(self.document.getroot(), 'iframe')
+        if len(obj) == 0:
+            # type 1 or 2 (swf+js)
+            # find which script element contains the swf args
+            for script in self.parser.select(self.document.getroot(), 'script'):
+                m = re.match(".*new SWFObject.*addVariable\('type', '(.*)'\).*", unicode(script.text), re.DOTALL)
+                if m:
+                    video.ext = m.group(1)
+
+                m = re.match(".*new SWFObject.*addVariable\(\"file\", encodeURIComponent\(\"(.*)\"\)\).*", unicode(script.text), re.DOTALL)
+                if m:
+                    video.url = "http://gdcvault.com%s" % (m.group(1))
+                    video.set_empty_fields(NotAvailable)
+                    return video
+            #XXX: raise error?
+            return None
+
+        obj = obj[0]
        if obj is None:
            return None
+        # type 3 or 4 (iframe)
+        # get the config file for the rest
        iframe_url = obj.attrib['src']
-        m = re.match('(http:.*)player.html\?.*xmlURL=([^&]+)\&token=([^&]+)', iframe_url)
+        m = re.match('(http:.*)player\.html\?.*xmlURL=([^&]+)\&token=([^&]+)', iframe_url)
        if not m:
-            return None
+            m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url)
+            if m is None:
+                return None
+            # type 3 (pdf slides)
+            video.ext = u'pdf'
+            video.url = "http://gdcvault.com%s" % (unicode(iframe_url))
+            video.set_empty_fields(NotAvailable)
+            return video
+
+        # type 4 (dual screen video)
        config_url = m.group(1) + m.group(2)

        #config = self.browser.openurl(config_url).read()
@ -103,11 +142,10 @@ class VideoPage(BasePage):
        obj = self.parser.select(config.getroot(), 'speaker', 1)
        #print obj.text_content()

-        #TODO: speaker as CDATA
-        #video.author = u'European Parliament'
-
        #XXX
        video.url = unicode(videos['speaker'])
+        if want_slides:
+            video.url = unicode(videos['slides'])
        #self.set_details(video)

        video.set_empty_fields(NotAvailable)