upgrade to browser2

This commit is contained in:
Romain Bignon 2014-03-09 15:44:28 +01:00
commit af9197fba7
4 changed files with 89 additions and 83 deletions

View file

@ -41,14 +41,12 @@ class YoujizzBackend(BaseBackend, ICapVideo, ICapCollection):
BROWSER = YoujizzBrowser BROWSER = YoujizzBrowser
def get_video(self, _id): def get_video(self, _id):
with self.browser:
video = self.browser.get_video(_id) video = self.browser.get_video(_id)
return video return video
def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False): def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
if not nsfw: if not nsfw:
return set() return set()
with self.browser:
return self.browser.search_videos(pattern) return self.browser.search_videos(pattern)
def fill_video(self, video, fields): def fill_video(self, video, fields):

View file

@ -18,9 +18,7 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import urllib from weboob.tools.browser2 import PagesBrowser, URL
from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url from weboob.tools.browser.decorators import id2url
from .pages.index import IndexPage from .pages.index import IndexPage
@ -31,27 +29,28 @@ from .video import YoujizzVideo
__all__ = ['YoujizzBrowser'] __all__ = ['YoujizzBrowser']
class YoujizzBrowser(BaseBrowser): class YoujizzBrowser(PagesBrowser):
DOMAIN = 'youjizz.com' BASEURL = 'http://www.youjizz.com'
ENCODING = None
PAGES = {r'http://.*youjizz\.com/?': IndexPage, index = URL(r'/?(index.php)?$', IndexPage)
r'http://.*youjizz\.com/index.php': IndexPage, search = URL(r'/search/(?P<pattern>.+)-(?P<pagenum>\d+).html', IndexPage)
r'http://.*youjizz\.com/search/(?P<pattern>.+)\.html': IndexPage, video = URL(r'/videos/(?P<id>.*).html', VideoPage)
r'http://.*youjizz\.com/videos/(?P<id>.+)\.html': VideoPage,
}
@id2url(YoujizzVideo.id2url) @id2url(YoujizzVideo.id2url)
def get_video(self, url, video=None): def get_video(self, url, video=None):
self.location(url) self.location(url)
assert self.is_on_page(VideoPage), 'Should be on video page.' assert self.video.is_here()
return self.page.get_video(video) return self.page.get_video(video)
def search_videos(self, pattern): def search_videos(self, pattern):
self.location('/search/%s-1.html' % (urllib.quote_plus(pattern.encode('utf-8')))) self.search.go(pattern=pattern, pagenum=1)
assert self.is_on_page(IndexPage) assert self.search.is_here()
return self.page.iter_videos()
return self.pagination(lambda: self.page.iter_videos())
def latest_videos(self): def latest_videos(self):
self.home() self.index.go()
assert self.is_on_page(IndexPage) assert self.index.is_here()
return self.page.iter_videos()
return self.pagination(lambda: self.page.iter_videos())

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2010-2012 Roger Philibert # Copyright(C) 2010-2014 Roger Philibert
# #
# This file is part of weboob. # This file is part of weboob.
# #
@ -21,9 +21,10 @@
import datetime import datetime
import re import re
from weboob.tools.browser import BasePage, BrokenPageError from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import ListElement, method, ItemElement
from weboob.tools.browser2.filters import Filter, Link, CleanText
from weboob.capabilities.image import BaseImage from weboob.capabilities.image import BaseImage
from weboob.tools.misc import to_unicode
from ..video import YoujizzVideo from ..video import YoujizzVideo
@ -31,24 +32,23 @@ from ..video import YoujizzVideo
__all__ = ['IndexPage'] __all__ = ['IndexPage']
class IndexPage(BasePage): class IndexPage(HTMLPage):
def iter_videos(self): @method
span_list = self.parser.select(self.document.getroot(), 'span#miniatura') class iter_videos(ListElement):
for span in span_list: item_xpath = '//span[@id="miniatura"]'
a = self.parser.select(span, 'a', 1)
url = a.attrib['href']
_id = re.sub(r'/videos/(.+)\.html', r'\1', url)
video = YoujizzVideo(_id) next_page = Link(u'//a[text()="Next »"]')
video.thumbnail = BaseImage(span.find('.//img').attrib['data-original']) class item(ItemElement):
video.thumbnail.url = video.thumbnail.id klass = YoujizzVideo
title_el = self.parser.select(span, 'span#title1', 1) class Id(Filter):
video.title = to_unicode(title_el.text.strip()) def filter(self, link):
return re.sub(r'/videos/(.+)\.html', r'\1', link)
time_span = self.parser.select(span, 'span.thumbtime span', 1) class Duration(Filter):
time_txt = time_span.text.strip().replace(';', ':') def filter(self, txt):
time_txt = txt.replace(';', ':')
hours, minutes, seconds = 0, 0, 0 hours, minutes, seconds = 0, 0, 0
if ':' in time_txt: if ':' in time_txt:
t = time_txt.split(':') t = time_txt.split(':')
@ -58,8 +58,16 @@ class IndexPage(BasePage):
if len(t) == 3: if len(t) == 3:
hours = int(t[2]) hours = int(t[2])
elif time_txt != 'N/A': elif time_txt != 'N/A':
raise BrokenPageError('Unable to parse the video duration: %s' % time_txt) raise ValueError('Unable to parse the video duration: %s' % time_txt)
video.duration = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
yield video
obj_id = Id(Link('.//a'))
obj_title = CleanText('.//span[@id="title1"]')
obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span'))
def obj_thumbnail(self):
thumbnail = BaseImage(self.xpath('.//img')[0].attrib['data-original'])
thumbnail.url = thumbnail.id
return thumbnail

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Roger Philibert # Copyright(C) 2010-2014 Roger Philibert
# #
# This file is part of weboob. # This file is part of weboob.
# #
@ -19,11 +19,12 @@
import datetime import datetime
import lxml.html
import re import re
from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import method, ItemElement
from weboob.tools.browser2.filters import CleanText, Env
from weboob.capabilities.base import NotAvailable from weboob.capabilities.base import NotAvailable
from weboob.tools.browser import BasePage, BrokenPageError
from weboob.tools.misc import to_unicode from weboob.tools.misc import to_unicode
from ..video import YoujizzVideo from ..video import YoujizzVideo
@ -32,36 +33,36 @@ from ..video import YoujizzVideo
__all__ = ['VideoPage'] __all__ = ['VideoPage']
class VideoPage(BasePage): class VideoPage(HTMLPage):
def get_video(self, video=None): @method
_id = to_unicode(self.group_dict['id']) class get_video(ItemElement):
if video is None: klass = YoujizzVideo
video = YoujizzVideo(_id)
title_el = self.parser.select(self.document.getroot(), 'title', 1)
video.title = to_unicode(title_el.text.strip())
obj_id = Env('id')
obj_title = CleanText('//title')
def obj_duration(self):
# youjizz HTML is crap, we must parse it with regexps # youjizz HTML is crap, we must parse it with regexps
data = lxml.html.tostring(self.document.getroot()) m = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)</div>', self.page.response.text)
m = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)</div>', data)
if m: if m:
txt = m.group(1).strip() txt = m.group(1).strip()
if txt == 'Unknown': if txt == 'Unknown':
video.duration = NotAvailable return NotAvailable
else: else:
minutes, seconds = (int(v) for v in to_unicode(txt).split(':')) minutes, seconds = (int(v) for v in to_unicode(txt).split(':'))
video.duration = datetime.timedelta(minutes=minutes, seconds=seconds) return datetime.timedelta(minutes=minutes, seconds=seconds)
else: else:
raise BrokenPageError('Unable to retrieve video duration') raise ValueError('Unable to retrieve video duration')
real_id = int(_id.split('-')[-1]) def obj_url(self):
data = self.browser.readurl('http://www.youjizz.com/videos/embed/%s' % real_id) real_id = int(self.env['id'].split('-')[-1])
response = self.page.browser.open('http://www.youjizz.com/videos/embed/%s' % real_id)
data = response.text
video_file_urls = re.findall(r'"(http://[^",]+\.youjizz\.com[^",]+\.flv(?:\?[^"]*)?)"', data) video_file_urls = re.findall(r'"(http://[^",]+\.youjizz\.com[^",]+\.flv(?:\?[^"]*)?)"', data)
if len(video_file_urls) == 0: if len(video_file_urls) == 0:
raise BrokenPageError('Video URL not found') raise ValueError('Video URL not found')
elif len(video_file_urls) > 1: elif len(video_file_urls) > 1:
raise BrokenPageError('Many video file URL found') raise ValueError('Many video file URL found')
else: else:
video.url = to_unicode(video_file_urls[0]) return to_unicode(video_file_urls[0])
return video