gdcvault: implement all 4 media types

* We now retrieve correctly all 4 media types: ** Video (plain flv file) ex: http://gdcvault.com/play/29 ** Audio (mp3) ex: http://gdcvault.com/play/22/From-One-off-to-Franchise ** Slides (pdf, not really a video but they download fine) ex: http://gdcvault.com/play/1015486 ** DS Video (dual screen, two flv files) (by default the speaker file, but id#slides gets the other video (however default filenames conflict) ex: http://gdcvault.com/play/1015841
2013-03-18 02:26:30 +01:00 · 2013-03-18 02:26:30 +01:00 · c8685b8e3b
commit c8685b8e3b
parent 3a79a21360
1 changed files with 46 additions and 8 deletions
--- a/modules/gdcvault/pages.py
+++ b/modules/gdcvault/pages.py
@ -50,10 +50,18 @@ class IndexPage(BasePage):
 class VideoPage(BasePage):
    def get_video(self, video=None):
        # check for slides id variant
        want_slides = False
        m = re.match('.*#slides', self.url)
        if m:
            want_slides = True
            # not sure it's safe
            self.group_dict['id'] += '#slides'
        if video is None:
            video = GDCVaultVideo(self.group_dict['id'])
-        # the config file has it too, but in CDATA
+        # the config file has it too, but in CDATA and only for type 4
        obj = self.parser.select(self.document.getroot(), 'title')
        if len(obj) > 0:
            title = obj[0].text.strip()
@ -62,14 +70,45 @@ class VideoPage(BasePage):
                title = m.group(1)
        video.title = unicode(title)
-        # get the config file for the rest
+        #TODO: POST back the title to /search.php and filter == id to get
-        obj = self.parser.select(self.document.getroot(), 'iframe', 1)
+        # cleaner (JSON) data... (though it'd be much slower)
        # try to find an iframe (type 3 and 4)
        obj = self.parser.select(self.document.getroot(), 'iframe')
        if len(obj) == 0:
            # type 1 or 2 (swf+js)
            # find which script element contains the swf args
            for script in self.parser.select(self.document.getroot(), 'script'):
                m = re.match(".*new SWFObject.*addVariable\('type', '(.*)'\).*", unicode(script.text), re.DOTALL)
                if m:
                    video.ext = m.group(1)
                m = re.match(".*new SWFObject.*addVariable\(\"file\", encodeURIComponent\(\"(.*)\"\)\).*", unicode(script.text), re.DOTALL)
                if m:
                    video.url = "http://gdcvault.com%s" % (m.group(1))
                    video.set_empty_fields(NotAvailable)
                    return video
            #XXX: raise error?
            return None
        obj = obj[0]
        if obj is None:
            return None
        # type 3 or 4 (iframe)
        # get the config file for the rest
        iframe_url = obj.attrib['src']
-        m = re.match('(http:.*)player.html\?.*xmlURL=([^&]+)\&token=([^&]+)', iframe_url)
+        m = re.match('(http:.*)player\.html\?.*xmlURL=([^&]+)\&token=([^&]+)', iframe_url)
        if not m:
            m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url)
            if m is None:
                return None
            # type 3 (pdf slides)
            video.ext = u'pdf'
            video.url = "http://gdcvault.com%s" % (unicode(iframe_url))
            video.set_empty_fields(NotAvailable)
            return video
        # type 4 (dual screen video)
        config_url = m.group(1) + m.group(2)
        #config = self.browser.openurl(config_url).read()
@ -103,11 +142,10 @@ class VideoPage(BasePage):
        obj = self.parser.select(config.getroot(), 'speaker', 1)
        #print obj.text_content()
        #TODO: speaker as CDATA
        #video.author = u'European Parliament'
        #XXX
        video.url = unicode(videos['speaker'])
        if want_slides:
            video.url = unicode(videos['slides'])
        #self.set_details(video)
        video.set_empty_fields(NotAvailable)