weboob-devel/modules/dailymotion/pages.py

# -*- coding: utf-8 -*-

# Copyright(C) 2011  Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

from weboob.tools.json import json
import datetime
import re
import urllib
import urlparse
import mechanize

from weboob.capabilities import NotAvailable
from weboob.capabilities.image import BaseImage
from weboob.tools.misc import html2text
from weboob.tools.browser import BasePage, BrokenPageError


from .video import DailymotionVideo


__all__ = ['IndexPage', 'VideoPage', 'KidsVideoPage']


class IndexPage(BasePage):
    def iter_videos(self):
        for div in self.parser.select(self.document.getroot(), 'div.dmpi_video_item'):
            _id = div.attrib.get('data-id', None)

            if _id is None:
                self.browser.logger.warning('Unable to find the ID of a video')
                continue

            video = DailymotionVideo(_id)
            video.title = unicode(self.parser.select(div, 'h3 a', 1).text).strip()
            video.author = unicode(self.parser.select(div, 'div.dmpi_user_login', 1).find('a').find('span').text).strip()
            video.description = html2text(self.parser.tostring(self.parser.select(div, 'div.dmpi_video_description', 1))).strip() or unicode()
            try:
                parts = self.parser.select(div, 'div.duration', 1).text.split(':')
            except BrokenPageError:
                # it's probably a live, np.
                video.duration = NotAvailable
            else:
                if len(parts) == 1:
                    seconds = parts[0]
                    hours = minutes = 0
                elif len(parts) == 2:
                    minutes, seconds = parts
                    hours = 0
                elif len(parts) == 3:
                    hours, minutes, seconds = parts
                else:
                    raise BrokenPageError('Unable to parse duration %r' % self.parser.select(div, 'div.duration', 1).text)
                video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))
            url = unicode(self.parser.select(div, 'img.preview', 1).attrib['data-src'])
            # remove the useless anti-caching
            url = re.sub('\?\d+', '', url)
            video.thumbnail = BaseImage(url)
            video.thumbnail.url = video.thumbnail.id

            video.set_empty_fields(NotAvailable, ('url',))
            yield video

    def get_rate(self, div):
        m = re.match('width: *(\d+)px', div.attrib['style'])
        if m:
            return int(m.group(1))
        else:
            self.browser.logger.warning('Unable to parse rating: %s' % div.attrib['style'])
            return 0


class VideoPage(BasePage):
    def get_video(self, video=None):
        if video is None:
            video = DailymotionVideo(self.group_dict['id'])

        self.set_video_metadata(video)
        self.set_video_url(video)

        video.set_empty_fields(NotAvailable)

        return video


    def set_video_metadata(self, video):

        head = self.parser.select(self.document.getroot(), 'head', 1)

        video.title = unicode(self.parser.select(head, 'meta[property="og:title"]', 1).get("content")).strip()
        video.author = unicode(self.parser.select(head, 'meta[name="author"]', 1).get("content")).strip()

        url = unicode(self.parser.select(head, 'meta[property="og:image"]', 1).get("content")).strip()
        # remove the useless anti-caching
        url = re.sub('\?\d+', '', url)
        video.thumbnail = BaseImage(url)
        video.thumbnail.url = video.thumbnail.id

        try:
            parts = self.parser.select(head, 'meta[property="video:duration"]', 1).get("content").strip().split(':')
        except BrokenPageError:
            # it's probably a live, np.
            video.duration = NotAvailable
        else:
            if len(parts) == 1:
                seconds = parts[0]
                hours = minutes = 0
            elif len(parts) == 2:
                minutes, seconds = parts
                hours = 0
            elif len(parts) == 3:
                hours, minutes, seconds = parts
            else:
                raise BrokenPageError('Unable to parse duration %r' % parts)
            video.duration = datetime.timedelta(hours=int(hours), minutes=int(minutes), seconds=int(seconds))

        try:
            video.description = html2text(self.parser.select(head, 'meta[property="og:description"]', 1).get("content")).strip() or unicode()
        except BrokenPageError:
            video.description = u''


    def set_video_url(self, video):

        embed_page = self.browser.readurl('http://www.dailymotion.com/embed/video/%s' % video.id)

        m = re.search('var info = ({.*?}),[^{"]', embed_page)
        if not m:
            raise BrokenPageError('Unable to find information about video')

        info = json.loads(m.group(1))
        for key in ['stream_h264_hd1080_url','stream_h264_hd_url',
                    'stream_h264_hq_url','stream_h264_url',
                    'stream_h264_ld_url']:
            if info.get(key):#key in info and info[key]:
                max_quality = key
                break
        else:
            raise BrokenPageError(u'Unable to extract video URL')

        video.url = unicode(info[max_quality])


class KidsVideoPage(VideoPage):

    CONTROLLER_PAGE = 'http://kids.dailymotion.com/controller/Page_Kids_KidsUserHome?%s'

    def set_video_metadata(self, video):

        # The player html code with all the required information is loaded
        # after the main page using javascript and a special XmlHttpRequest
        # we emulate this behaviour
        from_request = self.group_dict['from']

        query = urllib.urlencode({
            'from_request': from_request,
            'request': '/video/%s?get_video=1' % video.id
            })

        request = mechanize.Request(KidsVideoPage.CONTROLLER_PAGE % query)
        # This header is mandatory to have the correct answer from dailymotion
        request.add_header('X-Requested-With', 'XMLHttpRequest')
        player_html = self.browser.readurl(request)

        try:
            m = re.search('<param name="flashvars" value="(?P<flashvars>.*?)"', player_html)
            flashvars = urlparse.parse_qs(m.group('flashvars'))
            info = json.loads(flashvars['sequence'][0])

            # The video parameters seem to be always located at the same place
            # in the structure: ['sequence'][0]['layerList'][0]['sequenceList']
            #   [0]['layerList'][0]['param']['extraParams'])
            #
            # but to be more tolerant to future changes in the structure, we
            # prefer to look for the parameters everywhere in the structure

            def find_video_params(data):
                if isinstance(data, dict):
                    if 'param' in data and 'extraParams' in data['param']:
                        return data['param']['extraParams']
                    data = data.values()

                if not isinstance(data, list):
                    return None

                for item in data:
                    ret = find_video_params(item)
                    if ret:
                        return ret

                return None

            params = find_video_params(info['sequence'])

            video.title = unicode(params['videoTitle'])
            video.author = unicode(params['videoOwnerLogin'])
            video.description = unicode(params['videoDescription'])
            video.thumbnail = BaseImage(params['videoPreviewURL'])
            video.thumbnail.url = unicode(params['videoPreviewURL'])
            video.duration = datetime.timedelta(seconds=params['mediaDuration'])

        except:
            # If anything goes wrong, we prefer to return normally, this will
            # allow video download to work even if we don't have the metadata
            pass