From 38b80491f1854f2d7bc0f0e896da85a65b7f51c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Revol?= <revol@free.fr>
Date: Fri, 22 Mar 2013 06:32:37 +0100
Subject: [PATCH] gdcvault: Implement searching; partial download fix

* Implement search functionality, using POST to get JSON data
* Fix download for most items even for non-free ones.
For now, only non-free mp3 files (like 769) still seem to be missing:
/mediaProxy.php returns a 2-byte HTML response instead of the media.
---
 modules/gdcvault/backend.py |   6 +--
 modules/gdcvault/browser.py |  43 +++++++++++-----
 modules/gdcvault/pages.py   | 100 ++++++++++++++++++++++++++++++++++--
 modules/gdcvault/video.py   |  49 ++++++++++++++++++
 4 files changed, 176 insertions(+), 22 deletions(-)
diff --git a/modules/gdcvault/backend.py b/modules/gdcvault/backend.py
index cdb827b5..1b881e57 100644
--- a/modules/gdcvault/backend.py
+++ b/modules/gdcvault/backend.py
@@ -66,9 +66,9 @@ class GDCVaultBackend(BaseBackend, ICapVideo, ICapCollection):
 
     SORTBY = ['relevance', 'rating', 'views', 'time']
 
-    # def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, max_results=None):
-    #     with self.browser:
-    #         return self.browser.search_videos(pattern, self.SORTBY[sortby])
+    def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, max_results=None):
+        with self.browser:
+            return self.browser.search_videos(pattern, self.SORTBY[sortby])
 
     def fill_video(self, video, fields):
         if fields != ['thumbnail']:
diff --git a/modules/gdcvault/browser.py b/modules/gdcvault/browser.py
index d423eae9..d98103f5 100644
--- a/modules/gdcvault/browser.py
+++ b/modules/gdcvault/browser.py
@@ -25,7 +25,7 @@ from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword, BrowserU
 from weboob.tools.browser.decorators import id2url
 
 #from .pages.index import IndexPage
-from .pages import VideoPage, IndexPage
+from .pages import VideoPage, IndexPage, SearchPage
 from .video import GDCVaultVideo
 
 
@@ -36,7 +36,8 @@ class GDCVaultBrowser(BaseBrowser):
     DOMAIN = 'gdcvault.com'
     ENCODING = 'utf-8'
     PAGES = {r'http://[w\.]*gdcvault.com/play/(?P<id>[\d]+)/?.*': VideoPage,
-             r'http://[w\.]*gdcvault.com/': IndexPage,
+             r'http://[w\.]*gdcvault.com/search\.php.*': (SearchPage, "json"),
+             r'http://[w\.]*gdcvault.com/.*': IndexPage,
             }
 
     def is_logged(self):
@@ -63,9 +64,9 @@ class GDCVaultBrowser(BaseBrowser):
 
         data = self.readurl('http://gdcvault.com/api/login.php',
                             urllib.urlencode(params))
-        # data is returned as JSON, not sure yet if it's useful
+        # some data returned as JSON, not sure yet if it's useful
+        #print data
 
-        print data
         if data is None:
             raise BrowserBanned('Too many open sessions?')
 
@@ -75,7 +76,7 @@ class GDCVaultBrowser(BaseBrowser):
             raise BrowserIncorrectPassword()
 
     def close_session(self):
-        print "logging out..."
+        # XXX: only if is_logged? or was used?
         self.openurl('/logout', '')
 
     @id2url(GDCVaultVideo.id2url)
@@ -86,13 +87,27 @@ class GDCVaultBrowser(BaseBrowser):
             raise BrowserUnavailable('Requires account')
         return self.page.get_video(video)
 
-    # def search_videos(self, pattern, sortby):
-    #     return None
-    #     self.location(self.buildurl('http://gdcvault.com/en/search%s' % sortby, query=pattern.encode('utf-8')))
-    #     assert self.is_on_page(IndexPage)
-    #     return self.page.iter_videos()
+    def search_videos(self, pattern, sortby):
+        post_data = {"firstfocus" : "",
+                     "category" : "free",
+                     "keyword" : pattern.encode('utf-8'),
+                     "conference_id" : "", }
+        post_data = urllib.urlencode(post_data)
+        # probably not required
+        self.addheaders = [('Referer', 'http://gdcvault.com/'),
+                           ("Content-Type" , 'application/x-www-form-urlencoded') ]
 
-    # def latest_videos(self):
-    #     self.home()
-    #     assert self.is_on_page(IndexPage)
-    #     return self.page.iter_videos()
+        #print post_data
+        # is_logged assumes html page
+        self.location('http://gdcvault.com/search.php',
+                      data=post_data, no_login=True)
+
+        assert self.is_on_page(SearchPage)
+        return self.page.iter_videos()
+
+    def latest_videos(self):
+        print "browser:latest_videos()"
+        #self.home()
+        self.location('/free')
+        assert self.is_on_page(IndexPage)
+        return self.page.iter_videos()
diff --git a/modules/gdcvault/pages.py b/modules/gdcvault/pages.py
index aa48a6f0..712117ef 100644
--- a/modules/gdcvault/pages.py
+++ b/modules/gdcvault/pages.py
@@ -28,25 +28,69 @@ import datetime
 from dateutil.parser import parse as parse_dt
 
 from weboob.capabilities.base import NotAvailable
+from weboob.tools.capabilities.thumbnail import Thumbnail
 from weboob.tools.browser import BrokenPageError
 
+#HACK
+from urllib2 import HTTPError
+
 from .video import GDCVaultVideo
 
 #import lxml.etree
 
 
-__all__ = ['IndexPage', 'VideoPage']
+__all__ = ['IndexPage', 'SearchPage', 'VideoPage']
 
 
 class IndexPage(BasePage):
     def iter_videos(self):
         for a in self.parser.select(self.document.getroot(), 'section.conference ul.media_items li.featured a.session_item'):
-            print a
+            href = a.attrib.get('href', '')
+            print href
+            m = re.match('/play/(\d+)/.*', href)
+            if not m:
+                continue
+            print m.group(1)
+            video = GDCVaultVideo(m.group(1))
+
+            # get title
+            try:
+                video.title = unicode(self.parser.select(a, 'div.conference_info p strong', 1).text)
+            except IndexError:
+                video.title = NotAvailable
+
+            # get description
+            try:
+                video.description = unicode(self.parser.select(a, 'div.conference_info p', 1).text)
+            except IndexError:
+                video.description = NotAvailable
+
+            # get thumbnail
+            img = self.parser.select(a, 'div.featured_image img', 1)
+            if img is not None:
+                video.thumbnail = Thumbnail(unicode(img.attrib['src']))
+            else:
+                video.thumbnail = NotAvailable
+
+
             #m = re.match('id-(\d+)', a.attrib.get('class', ''))
             #if not m:
             #    continue
             # FIXME
-            yield None
+            yield video
+
+# the search page class uses a JSON parser,
+# since it's what search.php returns when POSTed (from Ajax)
+class SearchPage(BasePage):
+    def iter_videos(self):
+        if self.document is None or self.document['data'] is None:
+            raise BrokenPageError('Unable to find JSON data')
+        for data in self.document['data']:
+            video = GDCVaultVideo.get_video_from_json(data)
+            # TODO: split type 4 videos into id and id#slides
+            if video is None:
+                continue
+            yield video
 
 class VideoPage(BasePage):
     def get_video(self, video=None):
@@ -86,8 +130,34 @@ class VideoPage(BasePage):
                 m = re.match(".*new SWFObject.*addVariable\(\"file\", encodeURIComponent\(\"(.*)\"\)\).*", unicode(script.text), re.DOTALL)
                 if m:
                     video.url = "http://gdcvault.com%s" % (m.group(1))
+                    # TODO: for non-free (like 769),
+                    # must be logged to use /mediaProxy.php
+
+                    # FIXME: doesn't seem to work yet, we get 2 bytes as html
+                    # 769 should give:
+                    # http://twvideo01.ubm-us.net/o1/gdcradio-net/2007/gdc/GDC07-4889.mp3
+                    # HACK: we use mechanize directly here for now... FIXME
+                    #print "asking for redirect on '%s'" % (video.url)
+                    #self.browser.addheaders += [['Referer', 'http://gdcvault.com/play/%s' % self.group_dict['id']]]
+                    #print self.browser.addheaders
+                    self.browser.set_handle_redirect(False)
+                    try:
+                        req = self.browser.open_novisit(video.url)
+                        headers = req.info()
+                        if headers.get('Content-Type', '') == 'text/html' and headers.get('Content-Length', '') == '2':
+                            print 'BUG'
+                        
+                        print req.code
+                    except HTTPError, e:
+                        #print e.getcode()
+                        if e.getcode() == 302 and hasattr(e, 'hdrs'):
+                            #print e.hdrs['Location']
+                            video.url = unicode(e.hdrs['Location'])
+                    self.browser.set_handle_redirect(True)
+
                     video.set_empty_fields(NotAvailable)
                     return video
+
             #XXX: raise error?
             return None
 
@@ -97,19 +167,33 @@ class VideoPage(BasePage):
         # type 3 or 4 (iframe)
         # get the config file for the rest
         iframe_url = obj.attrib['src']
-        m = re.match('(http:.*)player\.html\?.*xmlURL=([^&]+)\&token=([^&]+)', iframe_url)
+        m = re.match('(http:.*/)[^/]*player\.html\?.*xmlURL=([^&]+).*\&token=([^&]+)', iframe_url)
         if not m:
             m = re.match('/play/mediaProxy\.php\?sid=(\d+)', iframe_url)
             if m is None:
                 return None
+            # TODO: must be logged to use /mediaProxy.php
             # type 3 (pdf slides)
             video.ext = u'pdf'
             video.url = "http://gdcvault.com%s" % (unicode(iframe_url))
+
+            # HACK: we use mechanize directly here for now... FIXME
+            print "asking for redirect on '%s'" % (video.url)
+            self.browser.set_handle_redirect(False)
+            try:
+                req = self.browser.open_novisit(video.url)
+            except HTTPError, e:
+                if e.getcode() == 302 and hasattr(e, 'hdrs'):
+                    video.url = unicode(e.hdrs['Location'])
+            self.browser.set_handle_redirect(True)
+
             video.set_empty_fields(NotAvailable)
             return video
 
         # type 4 (dual screen video)
-        config_url = m.group(1) + m.group(2)
+
+        # token doesn't actually seem required
+        config_url = m.group(1) + m.group(2) + '?token=' + m.group(3)
 
         #config = self.browser.openurl(config_url).read()
         config = self.browser.get_document(self.browser.openurl(config_url))
@@ -119,6 +203,12 @@ class VideoPage(BasePage):
         if host is None:
             raise BrokenPageError('Missing tag in xml config file')
 
+        # for id 1373 host is missing '/ondemand'
+        # only add it when only a domain is specified without path
+        m = re.match('^[^\/]+$', host)
+        if m:
+            host += "/ondemand"
+
         videos = {}
 
         obj = self.parser.select(config.getroot(), 'speakervideo', 1)
diff --git a/modules/gdcvault/video.py b/modules/gdcvault/video.py
index 6aab4804..eaa0394a 100644
--- a/modules/gdcvault/video.py
+++ b/modules/gdcvault/video.py
@@ -19,8 +19,11 @@
 
 
 from weboob.capabilities.video import BaseVideo
+from weboob.capabilities.base import NotAvailable
+from weboob.tools.capabilities.thumbnail import Thumbnail
 
 import re
+from dateutil.parser import parse as parse_dt
 
 __all__ = ['GDCVaultVideo']
 
@@ -41,3 +44,49 @@ class GDCVaultVideo(BaseVideo):
         if m:
             return u'http://www.gdcvault.com/play/%s#slides' % _id
         return u'http://www.gdcvault.com/play/%s' % _id
+
+    @classmethod
+    def get_video_from_json(self, data):
+        # session_id is unique per talk
+        # vault_media_id is unique per page
+        # (but can refer to 2 video files for dual screen)
+        # solr_id is "${vault_media_id}.${conference_id}.${session_id}.${vault_media_type_id}"
+
+        # XXX: do we filter them or let people know about them?
+        #if 'anchor' in data:
+        #    if data['anchor']['href'] == '#':
+        #        # file will not be accessible (not free and not logged in)
+        #        return None
+
+        if not 'vault_media_id' in data:
+            return None
+        media_id = int(data['vault_media_id'])
+        video = GDCVaultVideo(media_id)
+
+        # 1013679 has \n in title...
+        video.title = unicode(data.get('session_name', '').replace('\n', ''))
+
+        # TODO: strip out <p>, <br> and other html...
+        # XXX: 1013422 has all 3 and !=
+        if 'overview' in data:
+            video.description = unicode(data['overview'])
+        elif 'spell' in data:
+            video.description = unicode(data['spell'])
+        else:
+            video.description = unicode(data.get('description', ''))
+
+        if 'image' in data:
+            video.thumbnail = Thumbnail(unicode(data['image']))
+
+        if 'speakers_name' in data:
+            video.author = unicode(", ".join(data['speakers_name']))
+
+        if 'start_date' in data:
+            video.date = parse_dt(data['start_date'])
+
+        if 'score' in data:
+            video.rating = data['score']
+
+        video.set_empty_fields(NotAvailable)
+
+        return video