weboob-devel/modules/cappedtv/browser.py

# -*- coding: utf-8 -*-

# Copyright(C) 2012 Lord
#
# This module is free software. It comes without any warranty, to
# the extent permitted by applicable law. You can redistribute it
# and/or modify it under the terms of the Do What The Fuck You Want
# To Public License, Version 2, as published by Sam Hocevar. See
# http://sam.zoy.org/wtfpl/COPYING for more details.


import urllib
import datetime
from weboob.capabilities.base import NotAvailable
from weboob.tools.misc import to_unicode
from weboob.tools.browser import BasePage
from weboob.tools.browser import BrokenPageError
from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url
from weboob.capabilities.video import BaseVideo
from weboob.tools.capabilities.thumbnail import Thumbnail
from weboob.tools.ordereddict import OrderedDict


__all__ = ['CappedBrowser']


class CappedVideo(BaseVideo):
    """Video object for capped.tv.

    On top of whatever ``BaseVideo`` initialises, every instance gets
    ``nsfw`` set to ``False`` and ``ext`` set to ``u'mp4'``.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``BaseVideo``, then apply the fixed defaults."""
        BaseVideo.__init__(self, *args, **kwargs)
        # Targets are assigned left to right: nsfw first, then ext.
        self.nsfw, self.ext = False, u'mp4'

    @classmethod
    def id2url(cls, _id):
        """Return the capped.tv page URL built from the video id ``_id``."""
        page_url = 'http://capped.tv/%s' % _id
        return page_url


# parser for search pages
class IndexPage(BasePage):
    """Parser for listing pages (home page, sorted listings, search results).

    Each ``div.vidBackdrop`` element of the document is turned into a
    ``CappedVideo`` carrying title, author, thumbnail and duration.
    """

    @staticmethod
    def _parse_duration(text):
        """Convert a ``[[H:]M:]S`` string into a ``datetime.timedelta``.

        :param text: duration token such as ``'42'``, ``'3:07'`` or
            ``'1:02:03'``
        :returns: the matching ``datetime.timedelta``
        :raises BrokenPageError: if there are more than three ``:``-separated
            fields, or if any field is not an integer (including an empty
            token)
        """
        fields = text.split(':')
        if len(fields) > 3:
            raise BrokenPageError('Unable to parse duration %r' % (text,))
        try:
            numbers = [int(field) for field in fields]
        except ValueError:
            # Report malformed markup the same way as an unexpected layout
            # instead of leaking a bare ValueError to the caller.
            raise BrokenPageError('Unable to parse duration %r' % (text,))
        # Left-pad with zeros so that missing hours/minutes default to 0.
        hours, minutes, seconds = [0] * (3 - len(numbers)) + numbers
        return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)

    def iter_videos(self):
        """Yield one ``CappedVideo`` per result listed on the page.

        Nothing is yielded when the search box contains
        ``'No Results Found'``.

        :raises BrokenPageError: if a duration cannot be parsed
        """
        root = self.document.getroot()

        # When no results are found, the website returns random results
        sb = self.parser.select(root, 'div.search form input.searchbox', 1)
        if sb.value == 'No Results Found':
            return

        # Extracting meta data from results page
        for vidbackdrop in self.parser.select(root, 'div.vidBackdrop    '):
            url = self.parser.select(vidbackdrop, 'a', 1).attrib['href']
            # The id is the href minus its first two characters
            # (presumably a './'-like prefix -- TODO confirm against the site).
            _id = url[2:]

            video = CappedVideo(_id)
            # NOTE(review): presumably marks every field except 'url' as
            # NotAvailable; the helper is defined outside this file.
            video.set_empty_fields(NotAvailable, ('url',))

            video.title = to_unicode(self.parser.select(vidbackdrop, 'div.vidTitle a', 1).text)
            video.author = to_unicode(self.parser.select(vidbackdrop, 'div.vidAuthor a', 1).text)

            thumbnail_url = 'http://cdn.capped.tv/pre/%s.png' % _id
            video.thumbnail = Thumbnail(thumbnail_url)

            # The info block starts with a 7-character prefix (tabs and
            # spaces according to the original author -- TODO confirm); the
            # duration is the first space-separated token after it.
            info = self.parser.select(vidbackdrop, 'div.vidInfo', 1)
            # A missing text node becomes '' so that it is reported as a
            # BrokenPageError below rather than a TypeError on slicing.
            info_text = info.text or ''
            duration_text = info_text[7:].split(' ')[0]
            video.duration = self._parse_duration(duration_text)

            yield video


# parser for the video page
class VideoPage(BasePage):
    """Parser for a single video page."""

    def get_video(self, video=None):
        """Fill in and return the video described by this page.

        :param video: existing video object to update; when ``None`` a new
            ``CappedVideo`` is created and its empty fields are set to
            ``NotAvailable``
        :returns: the video, with ``title`` and ``url`` set
        """
        # 'id' comes from self.group_dict (presumably the named group of the
        # URL pattern that matched this page -- TODO confirm).
        video_id = to_unicode(self.group_dict['id'])
        if video is None:
            video = CappedVideo(video_id)
            video.set_empty_fields(NotAvailable)

        root = self.document.getroot()
        title_element = self.parser.select(root, 'title', 1)
        stripped_title = title_element.text.strip()
        video.title = to_unicode(stripped_title)

        # Video pages carry no duration information (only result pages do),
        # so only the stream URL is derived here.
        video.url = u'http://cdn.capped.tv/vhq/%s.mp4' % video_id
        return video


class CappedBrowser(BaseBrowser):
    """Browser for capped.tv: fetch a video, search, or list the home page."""

    DOMAIN = 'capped.tv'
    PROTOCOL = 'http'
    ENCODING = None
    # URL pattern -> page class. Insertion order is kept by OrderedDict; the
    # generic video pattern is listed last (presumably so that the more
    # specific listing patterns are tried first -- TODO confirm in BaseBrowser).
    PAGES = OrderedDict([
        (r'http://capped\.tv/?', IndexPage),
        (r'http://capped\.tv/newest', IndexPage),
        (r'http://capped\.tv/mostviews', IndexPage),
        (r'http://capped\.tv/leastviews', IndexPage),
        (r'http://capped\.tv/monthtop', IndexPage),
        (r'http://capped\.tv/monthbottom', IndexPage),
        (r'http://capped\.tv/alpha', IndexPage),
        (r'http://capped\.tv/ahpla', IndexPage),
        (r'http://capped\.tv/search\?s\=(?P<pattern>.+)', IndexPage),
        (r'http://capped\.tv/(?P<id>.+)', VideoPage),
    ])

    @id2url(CappedVideo.id2url)
    def get_video(self, url, video=None):
        """Open ``url`` and return the video parsed from the resulting page.

        :param url: location to open (the ``id2url`` decorator presumably
            also lets callers pass a bare video id -- TODO confirm)
        :param video: optional existing video object, handed to the page
        """
        self.location(url)
        assert self.is_on_page(VideoPage), 'Should be on video page.'
        page = self.page
        return page.get_video(video)

    def search_videos(self, pattern):
        """Run a search for ``pattern`` and return an iterator over the results.

        :param pattern: search text; it is UTF-8 encoded and URL-quoted
            before being placed in the query string
        """
        encoded_pattern = pattern.encode('utf-8')
        query = urllib.quote_plus(encoded_pattern)
        self.location('/search?s=%s' % query)
        assert self.is_on_page(IndexPage)
        page = self.page
        return page.iter_videos()

    def latest_videos(self):
        """Go to the home page and return an iterator over the videos it lists."""
        self.home()
        assert self.is_on_page(IndexPage)
        page = self.page
        return page.iter_videos()