[youporn] rewrite with Browser 2

This commit is contained in:
Alexandre Morignot 2015-02-06 13:25:46 +01:00 committed by Romain Bignon
commit 2d0df1b37a
5 changed files with 59 additions and 170 deletions

View file

@ -18,40 +18,36 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser import Browser from weboob.browser import PagesBrowser, URL
from weboob.deprecated.browser.decorators import id2url
from .pages.index import IndexPage from .pages.index import IndexPage
from .pages.video import VideoPage from .pages.video import VideoPage
from .video import YoupornVideo
__all__ = ['YoupornBrowser'] __all__ = ['YoupornBrowser']
class YoupornBrowser(Browser): class YoupornBrowser(PagesBrowser):
DOMAIN = 'www.youporn.com' BASEURL = 'http://www.youporn.com'
ENCODING = None
PAGES = {r'http://[w\.]*youporn\.com/?': IndexPage,
r'http://[w\.]*youporn\.com/search.*': IndexPage,
r'http://[w\.]*youporn\.com/watch/(?P<id>\d+)/?.*': VideoPage,
r'http://[w\.]*youporngay\.com:80/watch/(?P<id>.+)': VideoPage,
}
@id2url(YoupornVideo.id2url) home = URL('/$', IndexPage)
def get_video(self, url, video=None): search = URL('/search/\?query=(?P<query>.*)', IndexPage)
self.location(url) video = URL('/watch/(?P<id>[0-9]+)/.*', VideoPage)
return self.page.get_video(video)
def get_video(self, _id):
    """Load the watch page for ``_id`` and return what that page parses.

    Calls ``go(id=_id)`` on the browser's ``video`` URL, asserts that
    ``is_here()`` holds for it afterwards, and then returns the result of
    ``get_video()`` on the current page.

    :param _id: identifier substituted into the ``video`` URL pattern
    :raises AssertionError: if ``self.video.is_here()`` is false after
        navigating
    """
    watch_url = self.video
    watch_url.go(id=_id)
    assert watch_url.is_here()

    current_page = self.page
    return current_page.get_video()
def search_videos(self, pattern, sortby): def search_videos(self, pattern, sortby):
if pattern == 'a' or pattern == 'i': if pattern == 'a' or pattern == 'i':
raise ValueError('this pattern is not supported'); raise ValueError('this pattern is not supported');
self.location(self.buildurl('/search/%s' % sortby, query=pattern.encode('utf-8'))) self.search.go(query=pattern)
assert self.is_on_page(IndexPage) assert self.search.is_here()
return self.page.iter_videos() return self.page.iter_videos()
def latest_videos(self): def latest_videos(self):
self.home() self.home.go()
assert self.is_on_page(IndexPage) assert self.home.is_here()
return self.page.iter_videos() return self.page.iter_videos()

View file

@ -39,7 +39,6 @@ class YoupornModule(Module, CapVideo, CapCollection):
BROWSER = YoupornBrowser BROWSER = YoupornBrowser
def get_video(self, _id): def get_video(self, _id):
with self.browser:
return self.browser.get_video(_id) return self.browser.get_video(_id)
SORTBY = ['relevance', 'rating', 'views', 'time'] SORTBY = ['relevance', 'rating', 'views', 'time']
@ -47,19 +46,11 @@ class YoupornModule(Module, CapVideo, CapCollection):
def search_videos(self, pattern, sortby=CapVideo.SEARCH_RELEVANCE, nsfw=False): def search_videos(self, pattern, sortby=CapVideo.SEARCH_RELEVANCE, nsfw=False):
if not nsfw: if not nsfw:
return set() return set()
with self.browser:
return self.browser.search_videos(pattern, self.SORTBY[sortby]) return self.browser.search_videos(pattern, self.SORTBY[sortby])
def fill_video(self, video, fields): def fill_video(self, video, fields):
if fields != ['thumbnail']: return self.browser.get_video(video.id)
# if we don't want only the thumbnail, we probably want also every fields
with self.browser:
video = self.browser.get_video(YoupornVideo.id2url(video.id), video)
if 'thumbnail' in fields and video.thumbnail:
with self.browser:
video.thumbnail.data = self.browser.readurl(video.thumbnail.url)
return video
def iter_resources(self, objs, split_path): def iter_resources(self, objs, split_path):
if BaseVideo in objs: if BaseVideo in objs:

View file

@ -1,34 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.mech import ClientForm
ControlNotFoundError = ClientForm.ControlNotFoundError
from mechanize import FormNotFoundError
from weboob.deprecated.browser import Page
class PornPage(Page):
    """Base page that tries to submit the first form found on load.

    NOTE(review): the form is presumably a consent/age confirmation
    interstitial -- confirm against the site; the code only shows that the
    first form is submitted through its ``user_choice`` control.
    """

    def on_loaded(self):
        """Submit form #0 via its ``user_choice`` control when it exists.

        :returns: ``False`` when the form was selected and submitted,
            ``True`` when the form or the control could not be found.
        """
        try:
            self.browser.select_form(nr=0)
            self.browser.submit(name='user_choice')
        except (ControlNotFoundError, FormNotFoundError):
            # No such form/control on this page: nothing was submitted.
            return True
        else:
            return False

View file

@ -18,51 +18,35 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
from weboob.browser.pages import HTMLPage
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.html import Attr, CSS
from weboob.browser.filters.standard import CleanText, Duration, Regexp, Type
from weboob.capabilities.base import NotAvailable from weboob.capabilities.base import NotAvailable
from weboob.capabilities.image import BaseImage from weboob.capabilities.image import BaseImage
from .base import PornPage
from ..video import YoupornVideo from ..video import YoupornVideo
class IndexPage(PornPage): class IndexPage(HTMLPage):
def iter_videos(self): @method
for li in self.document.getroot().xpath('//ul/li[@class="videoBox"]'): class iter_videos(ListElement):
a = li.find('div').find('a') item_xpath = '//div[@id="content"]/div/div/ul/li/div/a'
if a is None or a.find('img') is None:
continue
thumbnail_url = a.find('img').attrib['src'] class item(ItemElement):
klass = YoupornVideo
a = self.parser.select(li, './/a[@class="videoTitle"]', 1, 'xpath') def obj_thumbnail(self):
thumbnail_url = Attr('./img', 'src')(self)
thumbnail = BaseImage(thumbnail_url)
thumbnail.url = thumbnail.id
return thumbnail
url = a.attrib['href'] obj_author = NotAvailable
_id = url[len('/watch/'):] obj_duration = CSS('span.duration') & CleanText() & Duration()
_id = _id[:_id.find('/')] obj_id = Attr('../..', 'data-video-id')
obj_rating = CleanText('./span/i') & Regexp(pattern=r'(..)%') & Type(type=int)
video = YoupornVideo(int(_id)) obj_rating_max = 100
video.title = unicode(a.text.strip()) obj_title = CleanText('./p')
video.thumbnail = BaseImage(thumbnail_url) obj_url = NotAvailable
video.thumbnail.url = video.thumbnail.id
hours = minutes = seconds = 0
div = li.cssselect('div.duration')
if len(div) > 0:
pack = [int(s) for s in div[0].text.strip().split(':')]
if len(pack) == 3:
hours, minutes, seconds = pack
elif len(pack) == 2:
minutes, seconds = pack
video.duration = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
div = li.cssselect('div.rating')
if div:
video.rating = int(div[0].text.strip('% '))
video.rating_max = 100
video.set_empty_fields(NotAvailable, ('url', 'author'))
yield video

View file

@ -18,74 +18,26 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re from weboob.browser.pages import HTMLPage
import datetime from weboob.browser.elements import ItemElement, method
from dateutil.parser import parse as parse_dt from weboob.browser.filters.html import Link
from weboob.browser.filters.standard import CleanText, Regexp, Type
from weboob.capabilities.base import NotAvailable from weboob.capabilities.base import NotAvailable
from weboob.deprecated.browser import BrokenPageError
from .base import PornPage
from ..video import YoupornVideo from ..video import YoupornVideo
class VideoPage(PornPage): class VideoPage(HTMLPage):
def get_video(self, video=None): @method
if not PornPage.on_loaded(self): class get_video(ItemElement):
return klass = YoupornVideo
if video is None:
video = YoupornVideo(self.group_dict['id'])
video.title = self.get_title()
video.url, video.ext = self.get_url()
self.set_details(video)
video.set_empty_fields(NotAvailable) obj_author = CleanText('//div[@class="author-block--line"][1]') & Regexp(pattern=r'By: (.*)')
return video #obj_date = Date('//div[@id="stats-date"]')
obj_duration = NotAvailable
def get_url(self): obj_ext = 'mp4'
download_div = self.parser.select(self.document.getroot(), 'ul.downloadList li') obj_rating = CleanText('//div[@class="rating-percentage"]') & Regexp(pattern=r'(..)%') & Type(type=int)
if len(download_div) < 1: obj_rating_max = 100
raise BrokenPageError('Unable to find file URL') obj_thumbnail = NotAvailable
obj_title = CleanText('//h1')
a = self.parser.select(download_div[0], 'a', 1) obj_url = Link('//ul[@class="downloadList"]/li[2]/a')
m = re.match('^(\w+) - .*', a.text)
if m:
ext = m.group(1).lower()
else:
ext = u'flv'
return unicode(a.attrib['href']), unicode(ext)
def get_title(self):
    """Return the text of the page's ``h1`` element.

    The element text is stripped of surrounding whitespace and decoded
    from UTF-8 before being returned.
    """
    select = self.parser.select
    root = self.document.getroot()
    # The trailing 1 is forwarded to select() unchanged -- presumably the
    # number of matches expected; confirm against the parser's API.
    heading = select(root, 'h1', 1)
    raw_title = heading.text.strip()
    return raw_title.decode('utf-8')
def set_details(self, v):
    """Copy duration, author, rating and date from the page onto ``v``.

    Walks every element matched by ``ul.spaced li``. Each item is expected
    to hold a ``label`` child whose text names the field and whose tail
    text carries the value. Items with any other label are ignored.

    :param v: object receiving ``duration``, ``author``, ``rating``,
        ``rating_max`` and ``date`` attributes
    :raises BrokenPageError: if the duration pattern does not match
    """
    duration_pattern = r'((\d+)hrs)?\s*((\d+)min)?\s*((\d+)sec)?'

    for li in self.parser.select(self.document.getroot(), 'ul.spaced li'):
        label = li.find('label')
        name = label.text.strip()
        value = label.tail.strip()

        if name == 'Duration:':
            m = re.match(duration_pattern, value)
            # NOTE: every group in the pattern is optional, so it matches
            # any string (possibly with an empty match); the guard is kept
            # unchanged.
            if not m:
                raise BrokenPageError('Unable to parse datetime: %r' % value)
            # Groups 2, 4 and 6 are the bare numbers; a missing unit is 0.
            hours, minutes, seconds = [int(m.group(idx) or 0) for idx in (2, 4, 6)]
            v.duration = datetime.timedelta(hours=hours,
                                            minutes=minutes,
                                            seconds=seconds)
        elif name == 'Submitted:':
            # Use the <i> child, else the <a> child, else the label's tail text.
            author = li.find('i')
            if author is None:
                author = li.find('a')
            if author is not None:
                v.author = unicode(author.text)
            else:
                v.author = unicode(value)
        elif name == 'Rating:':
            percent_text = li.find('span').text
            v.rating = int(percent_text.rstrip('%'))
            v.rating_max = 100
        elif name == 'Date:':
            v.date = parse_dt(value)