weboob-devel/modules/cappedtv/browser.py
2012-07-29 15:51:29 +02:00

133 lines
4.8 KiB
Python

# -*- coding: utf-8 -*-
# Copyright(C) 2012 Lord
#
# This module is free software. It comes without any warranty, to
# the extent permitted by applicable law. You can redistribute it
# and/or modify it under the terms of the Do What The Fuck You Want
# To Public License, Version 2, as published by Sam Hocevar. See
# http://sam.zoy.org/wtfpl/COPYING for more details.
import urllib
import datetime
from weboob.capabilities.base import NotAvailable
from weboob.tools.misc import to_unicode
from weboob.tools.browser import BasePage
from weboob.tools.browser import BrokenPageError
from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url
from weboob.capabilities.video import BaseVideo
from weboob.tools.capabilities.thumbnail import Thumbnail
from weboob.tools.ordereddict import OrderedDict
__all__ = ['CappedBrowser']
class CappedVideo(BaseVideo):
    """Video object for capped.tv.

    On construction every instance is flagged as not NSFW and given the
    ``mp4`` extension.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to BaseVideo, then set the fixed attributes."""
        BaseVideo.__init__(self, *args, **kwargs)
        self.ext = u'mp4'
        self.nsfw = False

    @classmethod
    def id2url(cls, _id):
        """Return the capped.tv page URL built from the video id ``_id``."""
        page_url = 'http://capped.tv/%s' % _id
        return page_url
# parser for search pages
class IndexPage(BasePage):
    """Parser for capped.tv listing pages (home, sorted listings, search)."""

    def iter_videos(self):
        """Yield one CappedVideo per ``div.vidBackdrop`` element of the page.

        For every entry the id, title, author, thumbnail and duration are
        filled in; remaining empty fields are handled by
        ``set_empty_fields(NotAvailable, ('url',))``.

        Nothing is yielded when the search box reads 'No Results Found'.

        Raises BrokenPageError when the duration text cannot be parsed.
        """
        root = self.document.getroot()

        # When no results are found, the website returns random results
        searchbox = self.parser.select(root, 'div.search form input.searchbox', 1)
        if searchbox.value == 'No Results Found':
            return

        # Extracting meta data from results page
        for vidbackdrop in self.parser.select(root, 'div.vidBackdrop '):
            url = self.parser.select(vidbackdrop, 'a', 1).attrib['href']
            # The id is the href minus its first two characters
            # (presumably a short path prefix -- confirm against the markup).
            _id = url[2:]

            video = CappedVideo(_id)
            video.set_empty_fields(NotAvailable, ('url',))
            video.title = to_unicode(self.parser.select(vidbackdrop, 'div.vidTitle a', 1).text)
            video.author = to_unicode(self.parser.select(vidbackdrop, 'div.vidAuthor a', 1).text)

            thumbnail_url = 'http://cdn.capped.tv/pre/%s.png' % _id
            video.thumbnail = Thumbnail(thumbnail_url)

            # The description field starts with the duration
            vidinfo = self.parser.select(vidbackdrop, 'div.vidInfo', 1)
            video.duration = self._parse_duration(vidinfo.text)
            yield video

    @staticmethod
    def _parse_duration(info_text):
        """Convert the text of a ``div.vidInfo`` element into a timedelta.

        The duration is expected in ``S``, ``M:S`` or ``H:M:S`` form.

        Raises BrokenPageError if the text is missing, has more than three
        colon-separated parts, or contains a non-integer part.
        """
        if info_text is None:
            raise BrokenPageError('Unable to parse duration: div.vidInfo has no text')

        # Skip the 7-character prefix (leading tabs and spaces according to
        # the original comment -- NOTE(review): confirm against the live
        # markup), then keep only the first space-delimited token: the time.
        token = info_text[7:].split(' ')[0]
        parts = token.split(':')
        if len(parts) > 3:
            # Report the offending text, not the element object
            raise BrokenPageError('Unable to parse duration %r' % info_text)

        try:
            values = [int(part) for part in parts]
        except ValueError:
            # Keep parse failures within the module's own error type
            raise BrokenPageError('Unable to parse duration %r' % info_text)

        # Left-pad with zeros so the list is always [hours, minutes, seconds]
        hours, minutes, seconds = [0] * (3 - len(values)) + values
        return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
# parser for the video page
class VideoPage(BasePage):
    """Parser for a single capped.tv video page."""

    def get_video(self, video=None):
        """Fill in and return the video described by this page.

        The id comes from the ``id`` group of the matched page URL. When
        ``video`` is None a new CappedVideo is created for that id. The title
        is read from the page's ``<title>`` element and the url is built from
        the id.
        """
        video_id = to_unicode(self.group_dict['id'])

        if video is None:
            video = CappedVideo(video_id)
            video.set_empty_fields(NotAvailable)

        title_element = self.parser.select(self.document.getroot(), 'title', 1)
        stripped_title = title_element.text.strip()
        video.title = to_unicode(stripped_title)

        # Video pages don't carry duration information (only results pages do)
        video.url = u'http://cdn.capped.tv/vhq/%s.mp4' % video_id
        return video
class CappedBrowser(BaseBrowser):
    """Browser for capped.tv: fetch one video, search, or list the home page."""

    DOMAIN = 'capped.tv'
    PROTOCOL = 'http'
    ENCODING = None

    # URL pattern -> page parser. The listing and search URLs come first and
    # the catch-all video pattern is last (presumably so the specific URLs
    # are matched before it -- depends on BaseBrowser's lookup).
    PAGES = OrderedDict([
        (r'http://capped\.tv/?', IndexPage),
        (r'http://capped\.tv/newest', IndexPage),
        (r'http://capped\.tv/mostviews', IndexPage),
        (r'http://capped\.tv/leastviews', IndexPage),
        (r'http://capped\.tv/monthtop', IndexPage),
        (r'http://capped\.tv/monthbottom', IndexPage),
        (r'http://capped\.tv/alpha', IndexPage),
        (r'http://capped\.tv/ahpla', IndexPage),
        (r'http://capped\.tv/search\?s\=(?P<pattern>.+)', IndexPage),
        (r'http://capped\.tv/(?P<id>.+)', VideoPage),
    ])

    @id2url(CappedVideo.id2url)
    def get_video(self, url, video=None):
        """Open ``url`` and return the video parsed from the resulting page.

        ``video``, when given, is passed on to the page parser.
        """
        self.location(url)
        assert self.is_on_page(VideoPage), 'Should be on video page.'
        video_page = self.page
        return video_page.get_video(video)

    def search_videos(self, pattern):
        """Run a site search for ``pattern`` and return the videos iterator."""
        encoded_pattern = urllib.quote_plus(pattern.encode('utf-8'))
        self.location('/search?s=%s' % encoded_pattern)
        assert self.is_on_page(IndexPage)
        results_page = self.page
        return results_page.iter_videos()

    def latest_videos(self):
        """Open the home page and return the iterator over its videos."""
        self.home()
        assert self.is_on_page(IndexPage)
        home_page = self.page
        return home_page.iter_videos()