upgrade to browser2

This commit is contained in:
Romain Bignon 2014-03-09 15:44:28 +01:00
commit af9197fba7
4 changed files with 89 additions and 83 deletions

View file

@@ -41,15 +41,13 @@ class YoujizzBackend(BaseBackend, ICapVideo, ICapCollection):
BROWSER = YoujizzBrowser
def get_video(self, _id):
with self.browser:
video = self.browser.get_video(_id)
video = self.browser.get_video(_id)
return video
def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
if not nsfw:
return set()
with self.browser:
return self.browser.search_videos(pattern)
return self.browser.search_videos(pattern)
def fill_video(self, video, fields):
if fields != ['thumbnail']:

View file

@@ -18,9 +18,7 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import urllib
from weboob.tools.browser import BaseBrowser
from weboob.tools.browser2 import PagesBrowser, URL
from weboob.tools.browser.decorators import id2url
from .pages.index import IndexPage
@@ -31,27 +29,28 @@ from .video import YoujizzVideo
__all__ = ['YoujizzBrowser']
class YoujizzBrowser(BaseBrowser):
DOMAIN = 'youjizz.com'
ENCODING = None
PAGES = {r'http://.*youjizz\.com/?': IndexPage,
r'http://.*youjizz\.com/index.php': IndexPage,
r'http://.*youjizz\.com/search/(?P<pattern>.+)\.html': IndexPage,
r'http://.*youjizz\.com/videos/(?P<id>.+)\.html': VideoPage,
}
class YoujizzBrowser(PagesBrowser):
BASEURL = 'http://www.youjizz.com'
index = URL(r'/?(index.php)?$', IndexPage)
search = URL(r'/search/(?P<pattern>.+)-(?P<pagenum>\d+).html', IndexPage)
video = URL(r'/videos/(?P<id>.*).html', VideoPage)
@id2url(YoujizzVideo.id2url)
def get_video(self, url, video=None):
self.location(url)
assert self.is_on_page(VideoPage), 'Should be on video page.'
assert self.video.is_here()
return self.page.get_video(video)
def search_videos(self, pattern):
self.location('/search/%s-1.html' % (urllib.quote_plus(pattern.encode('utf-8'))))
assert self.is_on_page(IndexPage)
return self.page.iter_videos()
self.search.go(pattern=pattern, pagenum=1)
assert self.search.is_here()
return self.pagination(lambda: self.page.iter_videos())
def latest_videos(self):
self.home()
assert self.is_on_page(IndexPage)
return self.page.iter_videos()
self.index.go()
assert self.index.is_here()
return self.pagination(lambda: self.page.iter_videos())

View file

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2012 Roger Philibert
# Copyright(C) 2010-2014 Roger Philibert
#
# This file is part of weboob.
#
@@ -21,9 +21,10 @@
import datetime
import re
from weboob.tools.browser import BasePage, BrokenPageError
from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import ListElement, method, ItemElement
from weboob.tools.browser2.filters import Filter, Link, CleanText
from weboob.capabilities.image import BaseImage
from weboob.tools.misc import to_unicode
from ..video import YoujizzVideo
@@ -31,35 +32,42 @@ from ..video import YoujizzVideo
__all__ = ['IndexPage']
class IndexPage(BasePage):
def iter_videos(self):
span_list = self.parser.select(self.document.getroot(), 'span#miniatura')
for span in span_list:
a = self.parser.select(span, 'a', 1)
url = a.attrib['href']
_id = re.sub(r'/videos/(.+)\.html', r'\1', url)
class IndexPage(HTMLPage):
@method
class iter_videos(ListElement):
item_xpath = '//span[@id="miniatura"]'
video = YoujizzVideo(_id)
next_page = Link(u'//a[text()="Next »"]')
video.thumbnail = BaseImage(span.find('.//img').attrib['data-original'])
video.thumbnail.url = video.thumbnail.id
class item(ItemElement):
klass = YoujizzVideo
title_el = self.parser.select(span, 'span#title1', 1)
video.title = to_unicode(title_el.text.strip())
class Id(Filter):
def filter(self, link):
return re.sub(r'/videos/(.+)\.html', r'\1', link)
time_span = self.parser.select(span, 'span.thumbtime span', 1)
time_txt = time_span.text.strip().replace(';', ':')
hours, minutes, seconds = 0, 0, 0
if ':' in time_txt:
t = time_txt.split(':')
t.reverse()
seconds = int(t[0])
minutes = int(t[1])
if len(t) == 3:
hours = int(t[2])
elif time_txt != 'N/A':
raise BrokenPageError('Unable to parse the video duration: %s' % time_txt)
class Duration(Filter):
def filter(self, txt):
time_txt = txt.replace(';', ':')
hours, minutes, seconds = 0, 0, 0
if ':' in time_txt:
t = time_txt.split(':')
t.reverse()
seconds = int(t[0])
minutes = int(t[1])
if len(t) == 3:
hours = int(t[2])
elif time_txt != 'N/A':
raise ValueError('Unable to parse the video duration: %s' % time_txt)
video.duration = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
yield video
obj_id = Id(Link('.//a'))
obj_title = CleanText('.//span[@id="title1"]')
obj_duration = Duration(CleanText('.//span[@class="thumbtime"]//span'))
def obj_thumbnail(self):
thumbnail = BaseImage(self.xpath('.//img')[0].attrib['data-original'])
thumbnail.url = thumbnail.id
return thumbnail

View file

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Roger Philibert
# Copyright(C) 2010-2014 Roger Philibert
#
# This file is part of weboob.
#
@@ -19,11 +19,12 @@
import datetime
import lxml.html
import re
from weboob.tools.browser2 import HTMLPage
from weboob.tools.browser2.page import method, ItemElement
from weboob.tools.browser2.filters import CleanText, Env
from weboob.capabilities.base import NotAvailable
from weboob.tools.browser import BasePage, BrokenPageError
from weboob.tools.misc import to_unicode
from ..video import YoujizzVideo
@@ -32,36 +33,36 @@ from ..video import YoujizzVideo
__all__ = ['VideoPage']
class VideoPage(BasePage):
def get_video(self, video=None):
_id = to_unicode(self.group_dict['id'])
if video is None:
video = YoujizzVideo(_id)
title_el = self.parser.select(self.document.getroot(), 'title', 1)
video.title = to_unicode(title_el.text.strip())
class VideoPage(HTMLPage):
@method
class get_video(ItemElement):
klass = YoujizzVideo
# youjizz HTML is crap, we must parse it with regexps
data = lxml.html.tostring(self.document.getroot())
m = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)</div>', data)
if m:
txt = m.group(1).strip()
if txt == 'Unknown':
video.duration = NotAvailable
obj_id = Env('id')
obj_title = CleanText('//title')
def obj_duration(self):
# youjizz HTML is crap, we must parse it with regexps
m = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)</div>', self.page.response.text)
if m:
txt = m.group(1).strip()
if txt == 'Unknown':
return NotAvailable
else:
minutes, seconds = (int(v) for v in to_unicode(txt).split(':'))
return datetime.timedelta(minutes=minutes, seconds=seconds)
else:
minutes, seconds = (int(v) for v in to_unicode(txt).split(':'))
video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
else:
raise BrokenPageError('Unable to retrieve video duration')
raise ValueError('Unable to retrieve video duration')
real_id = int(_id.split('-')[-1])
data = self.browser.readurl('http://www.youjizz.com/videos/embed/%s' % real_id)
def obj_url(self):
real_id = int(self.env['id'].split('-')[-1])
response = self.page.browser.open('http://www.youjizz.com/videos/embed/%s' % real_id)
data = response.text
video_file_urls = re.findall(r'"(http://[^",]+\.youjizz\.com[^",]+\.flv(?:\?[^"]*)?)"', data)
if len(video_file_urls) == 0:
raise BrokenPageError('Video URL not found')
elif len(video_file_urls) > 1:
raise BrokenPageError('Many video file URL found')
else:
video.url = to_unicode(video_file_urls[0])
return video
video_file_urls = re.findall(r'"(http://[^",]+\.youjizz\.com[^",]+\.flv(?:\?[^"]*)?)"', data)
if len(video_file_urls) == 0:
raise ValueError('Video URL not found')
elif len(video_file_urls) > 1:
raise ValueError('Many video file URL found')
else:
return to_unicode(video_file_urls[0])