From af9197fba708add70ace61d2de51cb476ff1604a Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Sun, 9 Mar 2014 15:44:28 +0100 Subject: [PATCH] upgrade to browser2 --- modules/youjizz/backend.py | 6 ++-- modules/youjizz/browser.py | 35 +++++++++--------- modules/youjizz/pages/index.py | 66 +++++++++++++++++++--------------- modules/youjizz/pages/video.py | 65 ++++++++++++++++----------------- 4 files changed, 89 insertions(+), 83 deletions(-) diff --git a/modules/youjizz/backend.py b/modules/youjizz/backend.py index 14e06ed2..21981e4a 100644 --- a/modules/youjizz/backend.py +++ b/modules/youjizz/backend.py @@ -41,15 +41,13 @@ class YoujizzBackend(BaseBackend, ICapVideo, ICapCollection): BROWSER = YoujizzBrowser def get_video(self, _id): - with self.browser: - video = self.browser.get_video(_id) + video = self.browser.get_video(_id) return video def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False): if not nsfw: return set() - with self.browser: - return self.browser.search_videos(pattern) + return self.browser.search_videos(pattern) def fill_video(self, video, fields): if fields != ['thumbnail']: diff --git a/modules/youjizz/browser.py b/modules/youjizz/browser.py index cb6ce374..d2ccebaf 100644 --- a/modules/youjizz/browser.py +++ b/modules/youjizz/browser.py @@ -18,9 +18,7 @@ # along with weboob. If not, see <http://www.gnu.org/licenses/>. 
-import urllib - -from weboob.tools.browser import BaseBrowser +from weboob.tools.browser2 import PagesBrowser, URL from weboob.tools.browser.decorators import id2url from .pages.index import IndexPage @@ -31,27 +29,28 @@ from .video import YoujizzVideo __all__ = ['YoujizzBrowser'] -class YoujizzBrowser(BaseBrowser): - DOMAIN = 'youjizz.com' - ENCODING = None - PAGES = {r'http://.*youjizz\.com/?': IndexPage, - r'http://.*youjizz\.com/index.php': IndexPage, - r'http://.*youjizz\.com/search/(?P<pattern>.+)\.html': IndexPage, - r'http://.*youjizz\.com/videos/(?P<id>.+)\.html': VideoPage, - } +class YoujizzBrowser(PagesBrowser): + BASEURL = 'http://www.youjizz.com' + + index = URL(r'/?(index.php)?$', IndexPage) + search = URL(r'/search/(?P<pattern>.+)-(?P<pagenum>\d+).html', IndexPage) + video = URL(r'/videos/(?P<id>.*).html', VideoPage) @id2url(YoujizzVideo.id2url) def get_video(self, url, video=None): self.location(url) - assert self.is_on_page(VideoPage), 'Should be on video page.' + assert self.video.is_here() + return self.page.get_video(video) def search_videos(self, pattern): - self.location('/search/%s-1.html' % (urllib.quote_plus(pattern.encode('utf-8')))) - assert self.is_on_page(IndexPage) - return self.page.iter_videos() + self.search.go(pattern=pattern, pagenum=1) + assert self.search.is_here() + + return self.pagination(lambda: self.page.iter_videos()) def latest_videos(self): - self.home() - assert self.is_on_page(IndexPage) - return self.page.iter_videos() + self.index.go() + assert self.index.is_here() + + return self.pagination(lambda: self.page.iter_videos()) diff --git a/modules/youjizz/pages/index.py b/modules/youjizz/pages/index.py index 95800c3a..70658c3a 100644 --- a/modules/youjizz/pages/index.py +++ b/modules/youjizz/pages/index.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2010-2012 Roger Philibert +# Copyright(C) 2010-2014 Roger Philibert # # This file is part of weboob. 
# @@ -21,9 +21,10 @@ import datetime import re -from weboob.tools.browser import BasePage, BrokenPageError +from weboob.tools.browser2 import HTMLPage +from weboob.tools.browser2.page import ListElement, method, ItemElement +from weboob.tools.browser2.filters import Filter, Link, CleanText from weboob.capabilities.image import BaseImage -from weboob.tools.misc import to_unicode from ..video import YoujizzVideo @@ -31,35 +32,42 @@ from ..video import YoujizzVideo __all__ = ['IndexPage'] -class IndexPage(BasePage): - def iter_videos(self): - span_list = self.parser.select(self.document.getroot(), 'span#miniatura') - for span in span_list: - a = self.parser.select(span, 'a', 1) - url = a.attrib['href'] - _id = re.sub(r'/videos/(.+)\.html', r'\1', url) +class IndexPage(HTMLPage): + @method + class iter_videos(ListElement): + item_xpath = '//span[@id="miniatura"]' - video = YoujizzVideo(_id) + next_page = Link(u'//a[text()="Next »"]') - video.thumbnail = BaseImage(span.find('.//img').attrib['data-original']) - video.thumbnail.url = video.thumbnail.id + class item(ItemElement): + klass = YoujizzVideo - title_el = self.parser.select(span, 'span#title1', 1) - video.title = to_unicode(title_el.text.strip()) + class Id(Filter): + def filter(self, link): + return re.sub(r'/videos/(.+)\.html', r'\1', link) - time_span = self.parser.select(span, 'span.thumbtime span', 1) - time_txt = time_span.text.strip().replace(';', ':') - hours, minutes, seconds = 0, 0, 0 - if ':' in time_txt: - t = time_txt.split(':') - t.reverse() - seconds = int(t[0]) - minutes = int(t[1]) - if len(t) == 3: - hours = int(t[2]) - elif time_txt != 'N/A': - raise BrokenPageError('Unable to parse the video duration: %s' % time_txt) + class Duration(Filter): + def filter(self, txt): + time_txt = txt.replace(';', ':') + hours, minutes, seconds = 0, 0, 0 + if ':' in time_txt: + t = time_txt.split(':') + t.reverse() + seconds = int(t[0]) + minutes = int(t[1]) + if len(t) == 3: + hours = int(t[2]) + elif 
time_txt != 'N/A': + raise ValueError('Unable to parse the video duration: %s' % time_txt) - video.duration = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) + return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) - yield video + + obj_id = Id(Link('.//a')) + obj_title = CleanText('.//span[@id="title1"]') + obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span')) + + def obj_thumbnail(self): + thumbnail = BaseImage(self.xpath('.//img')[0].attrib['data-original']) + thumbnail.url = thumbnail.id + return thumbnail diff --git a/modules/youjizz/pages/video.py b/modules/youjizz/pages/video.py index e227af25..4ded1ccb 100644 --- a/modules/youjizz/pages/video.py +++ b/modules/youjizz/pages/video.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2010-2011 Roger Philibert +# Copyright(C) 2010-2014 Roger Philibert # # This file is part of weboob. # @@ -19,11 +19,12 @@ import datetime -import lxml.html import re +from weboob.tools.browser2 import HTMLPage +from weboob.tools.browser2.page import method, ItemElement +from weboob.tools.browser2.filters import CleanText, Env from weboob.capabilities.base import NotAvailable -from weboob.tools.browser import BasePage, BrokenPageError from weboob.tools.misc import to_unicode from ..video import YoujizzVideo @@ -32,36 +33,36 @@ from ..video import YoujizzVideo __all__ = ['VideoPage'] -class VideoPage(BasePage): - def get_video(self, video=None): - _id = to_unicode(self.group_dict['id']) - if video is None: - video = YoujizzVideo(_id) - title_el = self.parser.select(self.document.getroot(), 'title', 1) - video.title = to_unicode(title_el.text.strip()) +class VideoPage(HTMLPage): + @method + class get_video(ItemElement): + klass = YoujizzVideo - # youjizz HTML is crap, we must parse it with regexps - data = lxml.html.tostring(self.document.getroot()) - m = re.search(r'.*?Runtime.*? 
(.+?)', data) - if m: - txt = m.group(1).strip() - if txt == 'Unknown': - video.duration = NotAvailable + obj_id = Env('id') + obj_title = CleanText('//title') + + def obj_duration(self): + # youjizz HTML is crap, we must parse it with regexps + m = re.search(r'.*?Runtime.*? (.+?)', self.page.response.text) + if m: + txt = m.group(1).strip() + if txt == 'Unknown': + return NotAvailable + else: + minutes, seconds = (int(v) for v in to_unicode(txt).split(':')) + return datetime.timedelta(minutes=minutes, seconds=seconds) else: - minutes, seconds = (int(v) for v in to_unicode(txt).split(':')) - video.duration = datetime.timedelta(minutes=minutes, seconds=seconds) - else: - raise BrokenPageError('Unable to retrieve video duration') + raise ValueError('Unable to retrieve video duration') - real_id = int(_id.split('-')[-1]) - data = self.browser.readurl('http://www.youjizz.com/videos/embed/%s' % real_id) + def obj_url(self): + real_id = int(self.env['id'].split('-')[-1]) + response = self.page.browser.open('http://www.youjizz.com/videos/embed/%s' % real_id) + data = response.text - video_file_urls = re.findall(r'"(http://[^",]+\.youjizz\.com[^",]+\.flv(?:\?[^"]*)?)"', data) - if len(video_file_urls) == 0: - raise BrokenPageError('Video URL not found') - elif len(video_file_urls) > 1: - raise BrokenPageError('Many video file URL found') - else: - video.url = to_unicode(video_file_urls[0]) - - return video + video_file_urls = re.findall(r'"(http://[^",]+\.youjizz\.com[^",]+\.flv(?:\?[^"]*)?)"', data) + if len(video_file_urls) == 0: + raise ValueError('Video URL not found') + elif len(video_file_urls) > 1: + raise ValueError('Many video file URL found') + else: + return to_unicode(video_file_urls[0])