From 2d0df1b37a48373a1329a84287d9b025e2aceeda Mon Sep 17 00:00:00 2001 From: Alexandre Morignot Date: Fri, 6 Feb 2015 13:25:46 +0100 Subject: [PATCH] [youporn] rewrite with Browser 2 --- modules/youporn/browser.py | 34 +++++++------- modules/youporn/module.py | 17 ++----- modules/youporn/pages/base.py | 34 -------------- modules/youporn/pages/index.py | 60 +++++++++---------------- modules/youporn/pages/video.py | 82 +++++++--------------------------- 5 files changed, 58 insertions(+), 169 deletions(-) delete mode 100644 modules/youporn/pages/base.py diff --git a/modules/youporn/browser.py b/modules/youporn/browser.py index ed8578cd..08a75edf 100644 --- a/modules/youporn/browser.py +++ b/modules/youporn/browser.py @@ -18,40 +18,36 @@ # along with weboob. If not, see . -from weboob.deprecated.browser import Browser -from weboob.deprecated.browser.decorators import id2url +from weboob.browser import PagesBrowser, URL from .pages.index import IndexPage from .pages.video import VideoPage -from .video import YoupornVideo __all__ = ['YoupornBrowser'] -class YoupornBrowser(Browser): - DOMAIN = 'www.youporn.com' - ENCODING = None - PAGES = {r'http://[w\.]*youporn\.com/?': IndexPage, - r'http://[w\.]*youporn\.com/search.*': IndexPage, - r'http://[w\.]*youporn\.com/watch/(?P\d+)/?.*': VideoPage, - r'http://[w\.]*youporngay\.com:80/watch/(?P.+)': VideoPage, - } +class YoupornBrowser(PagesBrowser): + BASEURL = 'http://www.youporn.com' - @id2url(YoupornVideo.id2url) - def get_video(self, url, video=None): - self.location(url) - return self.page.get_video(video) + home = URL('/$', IndexPage) + search = URL('/search/\?query=(?P.*)', IndexPage) + video = URL('/watch/(?P[0-9]+)/.*', VideoPage) + + def get_video(self, _id): + self.video.go(id=_id) + assert self.video.is_here() + return self.page.get_video() def search_videos(self, pattern, sortby): if pattern == 'a' or pattern == 'i': raise ValueError('this pattern is not supported'); - self.location(self.buildurl('/search/%s' % 
sortby, query=pattern.encode('utf-8'))) - assert self.is_on_page(IndexPage) + self.search.go(query=pattern) + assert self.search.is_here() return self.page.iter_videos() def latest_videos(self): - self.home() - assert self.is_on_page(IndexPage) + self.home.go() + assert self.home.is_here() return self.page.iter_videos() diff --git a/modules/youporn/module.py b/modules/youporn/module.py index ca1bbbb8..6d9c7471 100644 --- a/modules/youporn/module.py +++ b/modules/youporn/module.py @@ -39,27 +39,18 @@ class YoupornModule(Module, CapVideo, CapCollection): BROWSER = YoupornBrowser def get_video(self, _id): - with self.browser: - return self.browser.get_video(_id) + return self.browser.get_video(_id) SORTBY = ['relevance', 'rating', 'views', 'time'] def search_videos(self, pattern, sortby=CapVideo.SEARCH_RELEVANCE, nsfw=False): if not nsfw: return set() - with self.browser: - return self.browser.search_videos(pattern, self.SORTBY[sortby]) + + return self.browser.search_videos(pattern, self.SORTBY[sortby]) def fill_video(self, video, fields): - if fields != ['thumbnail']: - # if we don't want only the thumbnail, we probably want also every fields - with self.browser: - video = self.browser.get_video(YoupornVideo.id2url(video.id), video) - if 'thumbnail' in fields and video.thumbnail: - with self.browser: - video.thumbnail.data = self.browser.readurl(video.thumbnail.url) - - return video + return self.browser.get_video(video.id) def iter_resources(self, objs, split_path): if BaseVideo in objs: diff --git a/modules/youporn/pages/base.py b/modules/youporn/pages/base.py deleted file mode 100644 index 77db3889..00000000 --- a/modules/youporn/pages/base.py +++ /dev/null @@ -1,34 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright(C) 2010-2011 Romain Bignon -# -# This file is part of weboob. 
-# -# weboob is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# weboob is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with weboob. If not, see <http://www.gnu.org/licenses/>. - -from weboob.deprecated.mech import ClientForm -ControlNotFoundError = ClientForm.ControlNotFoundError - -from mechanize import FormNotFoundError -from weboob.deprecated.browser import Page - - -class PornPage(Page): - def on_loaded(self): - try: - self.browser.select_form(nr=0) - self.browser.submit(name='user_choice') - return False - except (ControlNotFoundError, FormNotFoundError): - return True diff --git a/modules/youporn/pages/index.py b/modules/youporn/pages/index.py index 2074e912..2ea92fd7 100644 --- a/modules/youporn/pages/index.py +++ b/modules/youporn/pages/index.py @@ -18,51 +18,35 @@ # along with weboob. If not, see <http://www.gnu.org/licenses/>. 
-import datetime +from weboob.browser.pages import HTMLPage +from weboob.browser.elements import ItemElement, ListElement, method +from weboob.browser.filters.html import Attr, CSS +from weboob.browser.filters.standard import CleanText, Duration, Regexp, Type from weboob.capabilities.base import NotAvailable from weboob.capabilities.image import BaseImage -from .base import PornPage from ..video import YoupornVideo -class IndexPage(PornPage): - def iter_videos(self): - for li in self.document.getroot().xpath('//ul/li[@class="videoBox"]'): - a = li.find('div').find('a') - if a is None or a.find('img') is None: - continue +class IndexPage(HTMLPage): + @method + class iter_videos(ListElement): + item_xpath = '//div[@id="content"]/div/div/ul/li/div/a' - thumbnail_url = a.find('img').attrib['src'] + class item(ItemElement): + klass = YoupornVideo - a = self.parser.select(li, './/a[@class="videoTitle"]', 1, 'xpath') + def obj_thumbnail(self): + thumbnail_url = Attr('./img', 'src')(self) + thumbnail = BaseImage(thumbnail_url) + thumbnail.url = thumbnail.id + return thumbnail - url = a.attrib['href'] - _id = url[len('/watch/'):] - _id = _id[:_id.find('/')] - - video = YoupornVideo(int(_id)) - video.title = unicode(a.text.strip()) - video.thumbnail = BaseImage(thumbnail_url) - video.thumbnail.url = video.thumbnail.id - - hours = minutes = seconds = 0 - div = li.cssselect('div.duration') - if len(div) > 0: - pack = [int(s) for s in div[0].text.strip().split(':')] - if len(pack) == 3: - hours, minutes, seconds = pack - elif len(pack) == 2: - minutes, seconds = pack - - video.duration = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) - - div = li.cssselect('div.rating') - if div: - video.rating = int(div[0].text.strip('% ')) - video.rating_max = 100 - - video.set_empty_fields(NotAvailable, ('url', 'author')) - - yield video + obj_author = NotAvailable + obj_duration = CSS('span.duration') & CleanText() & Duration() + obj_id = Attr('../..', 'data-video-id') + 
obj_rating = CleanText('./span/i') & Regexp(pattern=r'(..)%') & Type(type=int) + obj_rating_max = 100 + obj_title = CleanText('./p') + obj_url = NotAvailable diff --git a/modules/youporn/pages/video.py b/modules/youporn/pages/video.py index 6d62e296..18ebe0b5 100644 --- a/modules/youporn/pages/video.py +++ b/modules/youporn/pages/video.py @@ -18,74 +18,26 @@ # along with weboob. If not, see . -import re -import datetime -from dateutil.parser import parse as parse_dt - +from weboob.browser.pages import HTMLPage +from weboob.browser.elements import ItemElement, method +from weboob.browser.filters.html import Link +from weboob.browser.filters.standard import CleanText, Regexp, Type from weboob.capabilities.base import NotAvailable -from weboob.deprecated.browser import BrokenPageError -from .base import PornPage from ..video import YoupornVideo -class VideoPage(PornPage): - def get_video(self, video=None): - if not PornPage.on_loaded(self): - return - if video is None: - video = YoupornVideo(self.group_dict['id']) - video.title = self.get_title() - video.url, video.ext = self.get_url() - self.set_details(video) +class VideoPage(HTMLPage): + @method + class get_video(ItemElement): + klass = YoupornVideo - video.set_empty_fields(NotAvailable) - return video - - def get_url(self): - download_div = self.parser.select(self.document.getroot(), 'ul.downloadList li') - if len(download_div) < 1: - raise BrokenPageError('Unable to find file URL') - - a = self.parser.select(download_div[0], 'a', 1) - m = re.match('^(\w+) - .*', a.text) - if m: - ext = m.group(1).lower() - else: - ext = u'flv' - return unicode(a.attrib['href']), unicode(ext) - - def get_title(self): - element = self.parser.select(self.document.getroot(), 'h1', 1) - return element.text.strip().decode('utf-8') - - def set_details(self, v): - for li in self.parser.select(self.document.getroot(), 'ul.spaced li'): - span = li.find('label') - name = span.text.strip() - value = span.tail.strip() - - if name == 
'Duration:': - m = re.match('((\d+)hrs)?\s*((\d+)min)?\s*((\d+)sec)?', value) - if not m: - raise BrokenPageError('Unable to parse datetime: %r' % value) - hours = m.group(2) or 0 - minutes = m.group(4) or 0 - seconds = m.group(6) or 0 - v.duration = datetime.timedelta(hours=int(hours), - minutes=int(minutes), - seconds=int(seconds)) - elif name == 'Submitted:': - author = li.find('i') - if author is None: - author = li.find('a') - if author is None: - v.author = unicode(value) - else: - v.author = unicode(author.text) - elif name == 'Rating:': - value = li.find('span').text - v.rating = int(value.rstrip('%')) - v.rating_max = 100 - elif name == 'Date:': - v.date = parse_dt(value) + obj_author = CleanText('//div[@class="author-block--line"][1]') & Regexp(pattern=r'By: (.*)') + #obj_date = Date('//div[@id="stats-date"]') + obj_duration = NotAvailable + obj_ext = 'mp4' + obj_rating = CleanText('//div[@class="rating-percentage"]') & Regexp(pattern=r'(..)%') & Type(type=int) + obj_rating_max = 100 + obj_thumbnail = NotAvailable + obj_title = CleanText('//h1') + obj_url = Link('//ul[@class="downloadList"]/li[2]/a')