implement fillobj() browser method

This commit is contained in:
Christophe Benz 2010-07-15 01:21:49 +02:00
commit 3175883351
13 changed files with 164 additions and 129 deletions

View file

@ -42,7 +42,7 @@ class YoujizzBackend(BaseBackend, ICapVideo):
def iter_page_urls(self, mozaic_url):
return self.browser.iter_page_urls(mozaic_url)
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None):
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
if not nsfw:
return
return self.browser.iter_search_results(pattern, required_fields=required_fields)
return self.browser.iter_search_results(pattern)

View file

@ -16,7 +16,6 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import datetime
import logging
import re
import urllib
@ -26,6 +25,7 @@ from weboob.tools.browser.decorators import check_domain, id2url
from weboob.tools.misc import iter_fields, to_unicode
from .pages.index import IndexPage
from .pages.video import VideoPage
from .video import YoujizzVideo
@ -37,57 +37,28 @@ class YoujizzBrowser(BaseBrowser):
ENCODING = None
PAGES = {r'http://.*youjizz\.com/?': IndexPage,
r'http://.*youjizz\.com/index.php': IndexPage,
r'http://.*youjizz\.com/search/.+\.html': IndexPage,
r'http://.*youjizz\.com/search/(?P<pattern>.+)\.html': IndexPage,
r'http://.*youjizz\.com/videos/(?P<id>.+)\.html': VideoPage,
}
def fillobj(self, video, fields):
# ignore the fields param: VideoPage.get_video() returns all the information
self.location(YoujizzVideo.id2url(video.id))
return self.page.get_video(video)
@id2url(YoujizzVideo.id2url)
def get_video(self, url, video=None):
try:
data = self.openurl(url.encode('utf-8')).read()
except BrowserUnavailable:
return None
def _get_url():
video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
if len(video_file_urls) == 0:
return None
else:
if len(video_file_urls) > 1:
logging.warning('Many video file URL found for given URL: %s' % video_file_urls)
return video_file_urls[0]
m = re.search(r'http://.*youjizz\.com/videos/(.+)\.html', url)
_id = unicode(m.group(1)) if m else None
if video is None:
video = YoujizzVideo(_id)
m = re.search(r'<title>(.+)</title>', data)
title = to_unicode(m.group(1)) if m else None
m = re.search(r'<strong>.*Runtime.*</strong>(.+)<br.*>', data)
if m:
minutes, seconds = (int(v) for v in unicode(m.group(1).strip()).split(':'))
else:
minutes = seconds = 0
video.title = title
video.url = _get_url()
video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
return video
def get_video(self, url):
self.location(url)
return self.page.get_video()
@check_domain
def iter_page_urls(self, mozaic_url):
raise NotImplementedError()
def iter_search_results(self, pattern, required_fields=None):
def iter_search_results(self, pattern):
if not pattern:
self.home()
else:
self.location('/search/%s-1.html' % (urllib.quote_plus(pattern)))
assert self.is_on_page(IndexPage)
for video in self.page.iter_videos():
if required_fields is not None:
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
if missing_required_fields:
logging.debug(u'Completing missing required fields: %s' % missing_required_fields)
self.get_video(video.id, video=video)
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
if missing_required_fields:
raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields)
yield video
return self.page.iter_videos()

View file

@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Roger Philibert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import datetime
import lxml.html
import re
from weboob.tools.browser import BasePage
from weboob.tools.misc import to_unicode
from weboob.tools.parsers.lxmlparser import select, SelectElementException
from ..video import YoujizzVideo
__all__ = ['VideoPage']
class VideoPage(BasePage):
    """Page showing a single video; knows how to turn itself into a YoujizzVideo."""

    def get_video(self, video=None):
        """Fill a video object with the title, duration and media URL of this page.

        :param video: an existing video object to complete; when ``None``, a
            new ``YoujizzVideo`` is created from the ``id`` group captured in
            the page URL.
        :returns: the completed video object (the one passed in, if any).
        :raises SelectElementException: when the duration cannot be read, or
            when the page does not contain exactly one ``.flv`` media URL.
        """
        # The id is read unconditionally, even when a video object is supplied.
        video_id = to_unicode(self.group_dict['id'])
        if video is None:
            video = YoujizzVideo(video_id)

        root = self.document.getroot()
        title_element = select(root, 'title', 1)
        video.title = to_unicode(title_element.text.strip())

        # The site's markup is too broken for element selection, so the
        # remaining fields are pulled out of the serialized HTML with regexps.
        html = lxml.html.tostring(root)

        runtime_match = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)<br.*>', html)
        if not runtime_match:
            raise SelectElementException('Could not retrieve video duration')
        try:
            runtime_text = to_unicode(runtime_match.group(1).strip())
            minutes, seconds = [int(part) for part in runtime_text.split(':')]
            video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
        except Exception:
            # Any parsing problem is reported as the same page-level error.
            raise SelectElementException('Could not retrieve video duration')

        media_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', html)
        if not media_urls:
            raise SelectElementException('Video URL not found')
        if len(media_urls) > 1:
            raise SelectElementException('Many video file URL found')
        video.url = media_urls[0]

        return video