implement fillobj() browser method

This commit is contained in:
Christophe Benz 2010-07-15 01:21:49 +02:00
commit 3175883351
13 changed files with 164 additions and 129 deletions

View file

@ -40,6 +40,6 @@ class InaBackend(BaseBackend, ICapVideo):
def get_video(self, _id): def get_video(self, _id):
return self.browser.get_video(_id) return self.browser.get_video(_id)
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None): def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
debug(u'backend ina: iter_search_results is not implemented') debug(u'backend ina: iter_search_results is not implemented')
return set() return set()

View file

@ -42,7 +42,7 @@ class YoujizzBackend(BaseBackend, ICapVideo):
def iter_page_urls(self, mozaic_url): def iter_page_urls(self, mozaic_url):
return self.browser.iter_page_urls(mozaic_url) return self.browser.iter_page_urls(mozaic_url)
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None): def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
if not nsfw: if not nsfw:
return return
return self.browser.iter_search_results(pattern, required_fields=required_fields) return self.browser.iter_search_results(pattern)

View file

@ -16,7 +16,6 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import datetime
import logging import logging
import re import re
import urllib import urllib
@ -26,6 +25,7 @@ from weboob.tools.browser.decorators import check_domain, id2url
from weboob.tools.misc import iter_fields, to_unicode from weboob.tools.misc import iter_fields, to_unicode
from .pages.index import IndexPage from .pages.index import IndexPage
from .pages.video import VideoPage
from .video import YoujizzVideo from .video import YoujizzVideo
@ -37,57 +37,28 @@ class YoujizzBrowser(BaseBrowser):
ENCODING = None ENCODING = None
PAGES = {r'http://.*youjizz\.com/?': IndexPage, PAGES = {r'http://.*youjizz\.com/?': IndexPage,
r'http://.*youjizz\.com/index.php': IndexPage, r'http://.*youjizz\.com/index.php': IndexPage,
r'http://.*youjizz\.com/search/.+\.html': IndexPage, r'http://.*youjizz\.com/search/(?P<pattern>.+)\.html': IndexPage,
r'http://.*youjizz\.com/videos/(?P<id>.+)\.html': VideoPage,
} }
def fillobj(self, video, fields):
    """
    Complete the given video by loading its own page.

    The fields param is ignored: VideoPage.get_video() returns all the
    information.

    @param video [YoujizzVideo] object to complete, looked up by its id
    @param fields  requested fields (unused)
    @return  what the loaded page's get_video() returns for this video
    """
    video_url = YoujizzVideo.id2url(video.id)
    self.location(video_url)
    current_page = self.page
    return current_page.get_video(video)
@id2url(YoujizzVideo.id2url) @id2url(YoujizzVideo.id2url)
def get_video(self, url, video=None): def get_video(self, url):
try: self.location(url)
data = self.openurl(url.encode('utf-8')).read() return self.page.get_video()
except BrowserUnavailable:
return None
def _get_url():
video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
if len(video_file_urls) == 0:
return None
else:
if len(video_file_urls) > 1:
logging.warning('Many video file URL found for given URL: %s' % video_file_urls)
return video_file_urls[0]
m = re.search(r'http://.*youjizz\.com/videos/(.+)\.html', url)
_id = unicode(m.group(1)) if m else None
if video is None:
video = YoujizzVideo(_id)
m = re.search(r'<title>(.+)</title>', data)
title = to_unicode(m.group(1)) if m else None
m = re.search(r'<strong>.*Runtime.*</strong>(.+)<br.*>', data)
if m:
minutes, seconds = (int(v) for v in unicode(m.group(1).strip()).split(':'))
else:
minutes = seconds = 0
video.title = title
video.url = _get_url()
video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
return video
@check_domain @check_domain
def iter_page_urls(self, mozaic_url): def iter_page_urls(self, mozaic_url):
raise NotImplementedError() raise NotImplementedError()
def iter_search_results(self, pattern, required_fields=None): def iter_search_results(self, pattern):
if not pattern: if not pattern:
self.home() self.home()
else: else:
self.location('/search/%s-1.html' % (urllib.quote_plus(pattern))) self.location('/search/%s-1.html' % (urllib.quote_plus(pattern)))
assert self.is_on_page(IndexPage) assert self.is_on_page(IndexPage)
return self.page.iter_videos()
for video in self.page.iter_videos():
if required_fields is not None:
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
if missing_required_fields:
logging.debug(u'Completing missing required fields: %s' % missing_required_fields)
self.get_video(video.id, video=video)
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
if missing_required_fields:
raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields)
yield video

View file

@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Roger Philibert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import datetime
import lxml.html
import re
from weboob.tools.browser import BasePage
from weboob.tools.misc import to_unicode
from weboob.tools.parsers.lxmlparser import select, SelectElementException
from ..video import YoujizzVideo
__all__ = ['VideoPage']
class VideoPage(BasePage):
    """
    Page of a single video; the video id comes from the 'id' URL group.
    """

    def get_video(self, video=None):
        """
        Build a YoujizzVideo from this page, or complete the given one.

        @param video [YoujizzVideo] object to fill; when None, a new one is
                     created from the id found in the page URL
        @return [YoujizzVideo] the video with title, duration and url set
        @raise SelectElementException if the duration cannot be parsed, or if
               zero or several video file URLs are found
        """
        page_id = to_unicode(self.group_dict['id'])
        if video is None:
            video = YoujizzVideo(page_id)

        root = self.document.getroot()
        video.title = to_unicode(select(root, 'title', 1).text.strip())

        # The site markup is too messy for selectors: duration and file URL
        # are extracted from the serialized HTML with regexps.
        html = lxml.html.tostring(root)

        runtime = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)<br.*>', html)
        if runtime is None:
            raise SelectElementException('Could not retrieve video duration')
        try:
            # Expected form is "MM:SS"; anything else ends up in the except.
            raw_duration = to_unicode(runtime.group(1).strip())
            minutes, seconds = [int(part) for part in raw_duration.split(':')]
            video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
        except Exception:
            raise SelectElementException('Could not retrieve video duration')

        # Exactly one media file URL is accepted.
        flv_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', html)
        if not flv_urls:
            raise SelectElementException('Video URL not found')
        if len(flv_urls) > 1:
            raise SelectElementException('Many video file URL found')
        video.url = flv_urls[0]

        return video

View file

@ -39,10 +39,10 @@ class YoupornBackend(BaseBackend, ICapVideo):
return self.browser.get_video(_id) return self.browser.get_video(_id)
SORTBY = ['relevance', 'rating', 'views', 'time'] SORTBY = ['relevance', 'rating', 'views', 'time']
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None): def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
if not nsfw: if not nsfw:
return return
return self.browser.iter_search_results(pattern, self.SORTBY[sortby], required_fields=required_fields) return self.browser.iter_search_results(pattern, self.SORTBY[sortby])
def iter_page_urls(self, mozaic_url): def iter_page_urls(self, mozaic_url):
raise NotImplementedError() raise NotImplementedError()

View file

@ -39,30 +39,20 @@ class YoupornBrowser(BaseBrowser):
r'http://[w\.]*youporngay\.com:80/watch/(?P<id>.+)': VideoPage, r'http://[w\.]*youporngay\.com:80/watch/(?P<id>.+)': VideoPage,
} }
@id2url(YoupornVideo.id2url) def fillobj(self, video, fields):
def get_video(self, url, video=None): # ignore the fields param: VideoPage.get_video() returns all the information
self.location(url) self.location(YoupornVideo.id2url(video.id))
if video is None: return self.page.get_video(video)
return self.page.video
else:
for k, v in iter_fields(self.page.video):
if v and getattr(video, k) != v:
setattr(video, k, v)
return video
def iter_search_results(self, pattern, sortby, required_fields=None): @id2url(YoupornVideo.id2url)
def get_video(self, url):
self.location(url)
return self.page.get_video()
def iter_search_results(self, pattern, sortby):
if not pattern: if not pattern:
self.home() self.home()
else: else:
self.location(self.buildurl('/search/%s' % sortby, query=pattern)) self.location(self.buildurl('/search/%s' % sortby, query=pattern))
assert self.is_on_page(IndexPage) assert self.is_on_page(IndexPage)
for video in self.page.iter_videos(): return self.page.iter_videos()
if required_fields is not None:
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
if missing_required_fields:
logging.debug(u'Completing missing required fields: %s' % missing_required_fields)
self.get_video(video.id, video=video)
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
if missing_required_fields:
raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields)
yield video

View file

@ -27,19 +27,20 @@ from ..video import YoupornVideo
class VideoPage(PornPage): class VideoPage(PornPage):
def on_loaded(self): def get_video(self, video=None):
if not PornPage.on_loaded(self): if not PornPage.on_loaded(self):
return return
self.video = YoupornVideo(self.group_dict['id'], if video is None:
self.get_title(), video = YoupornVideo(self.group_dict['id'])
self.get_url(), video.title = self.get_title()
) video.url = self.get_url()
self.set_details(self.video) self.set_details(video)
return video
def get_url(self): def get_url(self):
el = self.document.getroot().cssselect('div[id=download]') download_div = select(self.document.getroot(), '#download', 1)
if el: a = select(download_div, 'a', 1)
return el[0].cssselect('a')[0].attrib['href'] return a.attrib['href']
def get_title(self): def get_title(self):
element = select(self.document.getroot(), '#videoArea h1', 1) element = select(self.document.getroot(), '#videoArea h1', 1)
@ -49,11 +50,8 @@ class VideoPage(PornPage):
MONTH2I = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] MONTH2I = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
def set_details(self, v): def set_details(self, v):
div = self.document.getroot().cssselect('div[id=details]') details_div = select(self.document.getroot(), '#details', 1)
if not div: for li in details_div.getiterator('li'):
return
for li in div[0].getiterator('li'):
span = li.find('span') span = li.find('span')
name = span.text.strip() name = span.text.strip()
value = span.tail.strip() value = span.tail.strip()

View file

@ -24,7 +24,6 @@ from weboob.tools.backend import BaseBackend
from weboob.tools.misc import iter_fields from weboob.tools.misc import iter_fields
from .browser import YoutubeBrowser from .browser import YoutubeBrowser
from .pages import ForbiddenVideo
from .video import YoutubeVideo from .video import YoutubeVideo
@ -41,23 +40,10 @@ class YoutubeBackend(BaseBackend, ICapVideo):
BROWSER = YoutubeBrowser BROWSER = YoutubeBrowser
def get_video(self, _id, video=None): def get_video(self, _id):
try: return self.browser.get_video(_id)
browser_video = self.browser.get_video(_id)
except ForbiddenVideo:
if video is None:
return None
else:
raise
if video is None:
return browser_video
else:
for k, v in iter_fields(browser_video):
if v and getattr(video, k) != v:
setattr(video, k, v)
return video
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None): def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
import gdata.youtube.service import gdata.youtube.service
yt_service = gdata.youtube.service.YouTubeService() yt_service = gdata.youtube.service.YouTubeService()
query = gdata.youtube.service.YouTubeVideoQuery() query = gdata.youtube.service.YouTubeVideoQuery()
@ -77,19 +63,6 @@ class YoutubeBackend(BaseBackend, ICapVideo):
duration=datetime.timedelta(seconds=int(entry.media.duration.seconds.decode('utf-8').strip())), duration=datetime.timedelta(seconds=int(entry.media.duration.seconds.decode('utf-8').strip())),
thumbnail_url=entry.media.thumbnail[0].url.decode('utf-8').strip(), thumbnail_url=entry.media.thumbnail[0].url.decode('utf-8').strip(),
) )
if required_fields is not None:
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
if missing_required_fields:
logging.debug(u'Completing missing required fields: %s' % missing_required_fields)
try:
self.get_video(video.id, video=video)
except ForbiddenVideo, e:
logging.debug(e)
continue
else:
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
if missing_required_fields:
raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields)
yield video yield video
def iter_page_urls(self, mozaic_url): def iter_page_urls(self, mozaic_url):

View file

@ -16,10 +16,12 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import logging
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url from weboob.tools.browser.decorators import id2url
from .pages import ForbiddenVideoPage, VerifyAgePage, VideoPage from .pages import ForbiddenVideo, ForbiddenVideoPage, VerifyAgePage, VideoPage
from .video import YoutubeVideo from .video import YoutubeVideo
@ -34,10 +36,12 @@ class YoutubeBrowser(BaseBrowser):
r'.*youtube\.com/verify_age\?next_url=(?P<next_url>.+)': VerifyAgePage, r'.*youtube\.com/verify_age\?next_url=(?P<next_url>.+)': VerifyAgePage,
} }
def fillobj(self, video, fields):
    """
    Complete the given video by loading its own page.

    The fields param is ignored: VideoPage.get_video() returns all the
    information.

    @param video [YoutubeVideo] object to complete, looked up by its id
    @param fields  requested fields (unused)
    @return  what the loaded page's get_video() returns for this video
    """
    video_url = YoutubeVideo.id2url(video.id)
    self.location(video_url)
    current_page = self.page
    return current_page.get_video(video)
@id2url(YoutubeVideo.id2url) @id2url(YoutubeVideo.id2url)
def get_video(self, url): def get_video(self, url):
self.location(url) self.location(url)
if hasattr(self.page, 'video'): return self.page.get_video()
return self.page.video
else:
return None

View file

@ -32,26 +32,26 @@ class ForbiddenVideo(Exception):
class ForbiddenVideoPage(BasePage): class ForbiddenVideoPage(BasePage):
def on_loaded(self): def get_video(self, video=None):
element = select(self.document.getroot(), '.yt-alert-content', 1) element = select(self.document.getroot(), '.yt-alert-content', 1)
raise ForbiddenVideo(element.text.strip()) raise ForbiddenVideo(element.text.strip())
class VerifyAgePage(BasePage): class VerifyAgePage(BasePage):
def on_loaded(self): def get_video(self, video=None):
raise ForbiddenVideo('verify age not implemented') raise ForbiddenVideo('verify age not implemented')
class VideoPage(BasePage): class VideoPage(BasePage):
VIDEO_SIGNATURE_REGEX = re.compile(r'&t=([^ ,&]*)') VIDEO_SIGNATURE_REGEX = re.compile(r'&t=([^ ,&]*)')
def on_loaded(self): def get_video(self, video=None):
_id = self.group_dict['id'] if video is None:
self.video = YoutubeVideo(_id, video = YoutubeVideo(self.group_dict['id'])
title=self.get_title(), video.title = self.get_title()
url=self.get_url(_id), video.url = self.get_url(video.id)
author=self.get_author(), video.author = self.get_author()
) return video
def get_author(self): def get_author(self):
element = select(self.document.getroot(), 'a.watch-description-username strong', 1) element = select(self.document.getroot(), 'a.watch-description-username strong', 1)

View file

@ -55,7 +55,7 @@ class ICapVideo(ICap):
SEARCH_VIEWS, SEARCH_VIEWS,
SEARCH_DATE) = range(4) SEARCH_DATE) = range(4)
def iter_search_results(self, pattern=None, sortby=SEARCH_RELEVANCE, nsfw=False, required_fields=None): def iter_search_results(self, pattern=None, sortby=SEARCH_RELEVANCE, nsfw=False):
""" """
Iter results of a search on a pattern. Note that if pattern is None, Iter results of a search on a pattern. Note that if pattern is None,
it get the latest videos. it get the latest videos.
@ -63,7 +63,6 @@ class ICapVideo(ICap):
@param pattern [str] pattern to search on @param pattern [str] pattern to search on
@param sortby [enum] sort by... @param sortby [enum] sort by...
@param nsfw [bool] include non-suitable for work videos if True @param nsfw [bool] include non-suitable for work videos if True
@param required_fields [tuple] fields to load even if it takes many HTTP requests
""" """
raise NotImplementedError() raise NotImplementedError()

View file

@ -270,6 +270,40 @@ class ConsoleApplication(BaseApplication):
logging.error(e) logging.error(e)
def do(self, function, *args, **kwargs): def do(self, function, *args, **kwargs):
if self.selected_fields: """
kwargs['required_fields'] = set(self.selected_fields) - set('*') Call Weboob.do(), after having filled the yielded object, if selected fields are given by user.
return self.weboob.do(function, *args, **kwargs) """
for backend, result in self.weboob.do(function, *args, **kwargs):
fields = set(self.selected_fields) - set('*')
if fields:
try:
backend.browser.fillobj(result, fields)
except Exception, e:
logging.warning(u'Could not retrieve required fields (%s): %s' % (','.join(fields), e))
yield backend, result
def do_caps(self, caps, function, *args, **kwargs):
"""
Call Weboob.do_caps(), after having filled the yielded object, if selected fields are given by user.
"""
for backend, result in self.weboob.do_caps(caps, function, *args, **kwargs):
fields = set(self.selected_fields) - set('*')
if fields:
try:
backend.browser.fillobj(result, fields)
except Exception, e:
logging.warning(u'Could not retrieve required fields (%s): %s' % (','.join(fields), e))
yield backend, result
def do_backends(self, backends, function, *args, **kwargs):
"""
Call Weboob.do_backends(), after having filled the yielded object, if selected fields are given by user.
"""
for backend, result in self.weboob.do_backends(backends, function, *args, **kwargs):
fields = set(self.selected_fields) - set('*')
if fields:
try:
backend.browser.fillobj(result, fields)
except Exception, e:
logging.warning(u'Could not retrieve required fields (%s): %s' % (','.join(fields), e))
yield backend, result

View file

@ -394,3 +394,6 @@ class BaseBrowser(mechanize.Browser):
self[field] = value self[field] = value
except ControlNotFoundError: except ControlNotFoundError:
return return
def fillobj(self, obj, fields):
    """
    Fill the given object with the requested fields.

    Not implemented in the base browser: it always raises
    NotImplementedError, so a browser has to override it to support filling.

    @param obj  object to complete
    @param fields  fields requested for this object
    """
    raise NotImplementedError()