[youporn] rewrite with Browser 2

This commit is contained in:
Alexandre Morignot 2015-02-06 13:25:46 +01:00 committed by Romain Bignon
commit 2d0df1b37a
5 changed files with 59 additions and 170 deletions

View file

@ -18,40 +18,36 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser import Browser
from weboob.deprecated.browser.decorators import id2url
from weboob.browser import PagesBrowser, URL
from .pages.index import IndexPage
from .pages.video import VideoPage
from .video import YoupornVideo
__all__ = ['YoupornBrowser']
# NOTE(review): this listing is a commit diff whose +/- markers and
# indentation were lost, so two versions of YoupornBrowser are interleaved
# below: one deriving from `Browser` and one deriving from `PagesBrowser`.
# Which lines survive the commit cannot be read from the listing alone;
# presumably the `PagesBrowser` lines are the new ones -- confirm against the
# repository.

# --- `Browser`-based variant: class header and URL-regex -> page-class table.
class YoupornBrowser(Browser):
DOMAIN = 'www.youporn.com'
ENCODING = None
PAGES = {r'http://[w\.]*youporn\.com/?': IndexPage,
r'http://[w\.]*youporn\.com/search.*': IndexPage,
r'http://[w\.]*youporn\.com/watch/(?P<id>\d+)/?.*': VideoPage,
r'http://[w\.]*youporngay\.com:80/watch/(?P<id>.+)': VideoPage,
}
# --- `PagesBrowser`-based variant: class header and base URL.
class YoupornBrowser(PagesBrowser):
BASEURL = 'http://www.youporn.com'
# `Browser`-based get_video: loads `url` and delegates to the current page,
# forwarding the optional `video` object.
@id2url(YoupornVideo.id2url)
def get_video(self, url, video=None):
self.location(url)
return self.page.get_video(video)
# URL patterns bound to page classes (used by the `PagesBrowser` variant).
# The `video` pattern only accepts numeric ids followed by a '/'.
home = URL('/$', IndexPage)
search = URL('/search/\?query=(?P<query>.*)', IndexPage)
video = URL('/watch/(?P<id>[0-9]+)/.*', VideoPage)
# `PagesBrowser`-based get_video: goes to the `video` URL with id=_id, asserts
# `self.video.is_here()`, then delegates to the current page.
def get_video(self, _id):
self.video.go(id=_id)
assert self.video.is_here()
return self.page.get_video()
# Search for videos matching `pattern` and return the page's video iterator.
# Statements from both variants appear in this body. `sortby` is only used by
# the `self.location(...)` call; the `self.search.go(...)` call passes just
# `query`, so the sort order is not forwarded on that path.
def search_videos(self, pattern, sortby):
if pattern == 'a' or pattern == 'i':
raise ValueError('this pattern is not supported');
self.location(self.buildurl('/search/%s' % sortby, query=pattern.encode('utf-8')))
assert self.is_on_page(IndexPage)
self.search.go(query=pattern)
assert self.search.is_here()
return self.page.iter_videos()
# Return the video iterator of the home page. As above, both the
# `self.home()` / `is_on_page` pair and the `self.home.go()` / `is_here` pair
# are shown.
def latest_videos(self):
self.home()
assert self.is_on_page(IndexPage)
self.home.go()
assert self.home.is_here()
return self.page.iter_videos()

View file

@ -39,27 +39,18 @@ class YoupornModule(Module, CapVideo, CapCollection):
BROWSER = YoupornBrowser
# Fetch a single video by delegating to the browser's get_video with `_id`.
# NOTE(review): the same return statement appears twice, once under
# `with self.browser:` and once without; presumably these are the removed and
# added lines of a diff whose +/- markers were lost -- confirm which one is
# current.
def get_video(self, _id):
with self.browser:
return self.browser.get_video(_id)
return self.browser.get_video(_id)
SORTBY = ['relevance', 'rating', 'views', 'time']
# Search videos matching `pattern`.
# Returns an empty set unless `nsfw` is true; otherwise delegates to the
# browser, translating the `sortby` index into a name through self.SORTBY.
# NOTE(review): the delegating return appears twice, once under
# `with self.browser:` and once without; presumably the removed and added
# lines of a diff whose +/- markers were lost -- confirm which one is current.
def search_videos(self, pattern, sortby=CapVideo.SEARCH_RELEVANCE, nsfw=False):
if not nsfw:
return set()
with self.browser:
return self.browser.search_videos(pattern, self.SORTBY[sortby])
return self.browser.search_videos(pattern, self.SORTBY[sortby])
# Complete the requested `fields` of `video`.
# NOTE(review): two bodies are interleaved here (diff markers and indentation
# were lost). One re-fetches the video through its URL unless only the
# thumbnail is requested, then downloads the thumbnail data and returns
# `video`; the other simply returns `self.browser.get_video(video.id)`.
# Confirm which one is current.
def fill_video(self, video, fields):
if fields != ['thumbnail']:
# if more than just the thumbnail is requested, we probably want every field
with self.browser:
video = self.browser.get_video(YoupornVideo.id2url(video.id), video)
# only download image data when a thumbnail was asked for and one is set
if 'thumbnail' in fields and video.thumbnail:
with self.browser:
video.thumbnail.data = self.browser.readurl(video.thumbnail.url)
return video
return self.browser.get_video(video.id)
def iter_resources(self, objs, split_path):
if BaseVideo in objs:

View file

@ -1,34 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.mech import ClientForm
ControlNotFoundError = ClientForm.ControlNotFoundError
from mechanize import FormNotFoundError
from weboob.deprecated.browser import Page
class PornPage(Page):
    """Base page that tries to submit the first form found on the page.

    NOTE(review): presumably this dismisses a confirmation prompt shown
    before the real content -- confirm against the site.
    """

    def on_loaded(self):
        """Select form number 0 and submit it via its 'user_choice' control.

        Returns False when the form was submitted, and True when either the
        form or the control could not be found.
        """
        try:
            self.browser.select_form(nr=0)
            self.browser.submit(name='user_choice')
        except (ControlNotFoundError, FormNotFoundError):
            # Nothing to submit on this page.
            return True
        return False

View file

@ -18,51 +18,35 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
from weboob.browser.pages import HTMLPage
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.html import Attr, CSS
from weboob.browser.filters.standard import CleanText, Duration, Regexp, Type
from weboob.capabilities.base import NotAvailable
from weboob.capabilities.image import BaseImage
from .base import PornPage
from ..video import YoupornVideo
# NOTE(review): this listing is a commit diff whose +/- markers and
# indentation were lost. Two versions of IndexPage are interleaved line by
# line below: an imperative generator (`def iter_videos`, deriving from
# PornPage) and a declarative one (`class iter_videos(ListElement)`, deriving
# from HTMLPage). Presumably the HTMLPage lines are the new ones -- confirm
# against the repository.

# --- imperative variant: walk every <li class="videoBox"> and skip entries
# that have no link or no image.
class IndexPage(PornPage):
def iter_videos(self):
for li in self.document.getroot().xpath('//ul/li[@class="videoBox"]'):
a = li.find('div').find('a')
if a is None or a.find('img') is None:
continue
# --- declarative variant: one item per element matched by `item_xpath`.
class IndexPage(HTMLPage):
@method
class iter_videos(ListElement):
item_xpath = '//div[@id="content"]/div/div/ul/li/div/a'
# (imperative variant) thumbnail URL taken from the <img> inside the link
thumbnail_url = a.find('img').attrib['src']
class item(ItemElement):
klass = YoupornVideo
# (imperative variant) re-select the title link
a = self.parser.select(li, './/a[@class="videoTitle"]', 1, 'xpath')
# (declarative variant) build the thumbnail from the <img> src; the image id
# doubles as its URL.
def obj_thumbnail(self):
thumbnail_url = Attr('./img', 'src')(self)
thumbnail = BaseImage(thumbnail_url)
thumbnail.url = thumbnail.id
return thumbnail
# (imperative variant) the id is the path segment after '/watch/' up to the
# next '/'.
url = a.attrib['href']
_id = url[len('/watch/'):]
_id = _id[:_id.find('/')]
video = YoupornVideo(int(_id))
video.title = unicode(a.text.strip())
video.thumbnail = BaseImage(thumbnail_url)
video.thumbnail.url = video.thumbnail.id
# duration text is split on ':' -- three parts are h:m:s, two parts are m:s;
# any other shape leaves the duration at zero.
hours = minutes = seconds = 0
div = li.cssselect('div.duration')
if len(div) > 0:
pack = [int(s) for s in div[0].text.strip().split(':')]
if len(pack) == 3:
hours, minutes, seconds = pack
elif len(pack) == 2:
minutes, seconds = pack
video.duration = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
# rating is shown as a percentage, hence rating_max = 100
div = li.cssselect('div.rating')
if div:
video.rating = int(div[0].text.strip('% '))
video.rating_max = 100
video.set_empty_fields(NotAvailable, ('url', 'author'))
yield video
# (declarative variant) per-item field definitions.
# NOTE(review): the rating pattern `(..)%` captures exactly two characters
# before '%', so a value such as '100%' or a single-digit percentage may not
# be extracted as intended -- confirm how Regexp applies the pattern.
obj_author = NotAvailable
obj_duration = CSS('span.duration') & CleanText() & Duration()
obj_id = Attr('../..', 'data-video-id')
obj_rating = CleanText('./span/i') & Regexp(pattern=r'(..)%') & Type(type=int)
obj_rating_max = 100
obj_title = CleanText('./p')
obj_url = NotAvailable

View file

@ -18,74 +18,26 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
import datetime
from dateutil.parser import parse as parse_dt
from weboob.browser.pages import HTMLPage
from weboob.browser.elements import ItemElement, method
from weboob.browser.filters.html import Link
from weboob.browser.filters.standard import CleanText, Regexp, Type
from weboob.capabilities.base import NotAvailable
from weboob.deprecated.browser import BrokenPageError
from .base import PornPage
from ..video import YoupornVideo
# NOTE(review): this listing is a commit diff whose +/- markers and
# indentation were lost. Two versions of VideoPage are interleaved below: an
# imperative one deriving from PornPage (get_video / get_url / get_title /
# set_details) and a declarative one deriving from HTMLPage (an ItemElement
# with obj_* fields). Presumably the HTMLPage lines are the new ones --
# confirm against the repository.

# --- imperative variant.
class VideoPage(PornPage):
# Build (or complete) a YoupornVideo from the page; returns None when
# PornPage.on_loaded() reports False.
def get_video(self, video=None):
if not PornPage.on_loaded(self):
return
if video is None:
video = YoupornVideo(self.group_dict['id'])
video.title = self.get_title()
video.url, video.ext = self.get_url()
self.set_details(video)
# --- declarative variant: class header spliced into the listing here.
class VideoPage(HTMLPage):
@method
class get_video(ItemElement):
klass = YoupornVideo
# (imperative variant) tail of get_video: mark everything still unset as
# NotAvailable.
video.set_empty_fields(NotAvailable)
return video
# Return (url, ext) for the first entry of the download list.
# The extension is the leading word of the link text ("<ext> - ..."),
# lower-cased; falls back to 'flv' when the text has another shape.
# Raises BrokenPageError when the download list is empty.
def get_url(self):
download_div = self.parser.select(self.document.getroot(), 'ul.downloadList li')
if len(download_div) < 1:
raise BrokenPageError('Unable to find file URL')
a = self.parser.select(download_div[0], 'a', 1)
# NOTE(review): this pattern is not a raw string although it contains '\w'.
m = re.match('^(\w+) - .*', a.text)
if m:
ext = m.group(1).lower()
else:
ext = u'flv'
return unicode(a.attrib['href']), unicode(ext)
# Return the stripped text of the page's <h1>.
def get_title(self):
element = self.parser.select(self.document.getroot(), 'h1', 1)
return element.text.strip().decode('utf-8')
# Fill duration, author, rating and date of `v` from the "label: value"
# entries of the 'ul.spaced' list.
def set_details(self, v):
for li in self.parser.select(self.document.getroot(), 'ul.spaced li'):
span = li.find('label')
name = span.text.strip()
value = span.tail.strip()
if name == 'Duration:':
# NOTE(review): every group in this pattern is optional, so re.match also
# succeeds on an empty match and the BrokenPageError branch below looks
# unreachable; unparsable text yields a zero duration instead.
m = re.match('((\d+)hrs)?\s*((\d+)min)?\s*((\d+)sec)?', value)
if not m:
raise BrokenPageError('Unable to parse datetime: %r' % value)
hours = m.group(2) or 0
minutes = m.group(4) or 0
seconds = m.group(6) or 0
v.duration = datetime.timedelta(hours=int(hours),
minutes=int(minutes),
seconds=int(seconds))
elif name == 'Submitted:':
# author name is taken from an <i>, else an <a>, else the raw label tail
author = li.find('i')
if author is None:
author = li.find('a')
if author is None:
v.author = unicode(value)
else:
v.author = unicode(author.text)
elif name == 'Rating:':
value = li.find('span').text
v.rating = int(value.rstrip('%'))
v.rating_max = 100
elif name == 'Date:':
v.date = parse_dt(value)
# --- declarative variant: field definitions of the get_video ItemElement.
# The extension is hard-coded to 'mp4' and the URL is taken from the second
# entry of the download list; duration and thumbnail are not extracted.
# NOTE(review): the rating pattern `(..)%` captures exactly two characters
# before '%', so '100%' or a single-digit percentage may not be extracted as
# intended -- confirm how Regexp applies the pattern.
obj_author = CleanText('//div[@class="author-block--line"][1]') & Regexp(pattern=r'By: (.*)')
#obj_date = Date('//div[@id="stats-date"]')
obj_duration = NotAvailable
obj_ext = 'mp4'
obj_rating = CleanText('//div[@class="rating-percentage"]') & Regexp(pattern=r'(..)%') & Type(type=int)
obj_rating_max = 100
obj_thumbnail = NotAvailable
obj_title = CleanText('//h1')
obj_url = Link('//ul[@class="downloadList"]/li[2]/a')