implement fillobj() browser method
This commit is contained in:
parent
90756cddc9
commit
3175883351
13 changed files with 164 additions and 129 deletions
|
|
@ -40,6 +40,6 @@ class InaBackend(BaseBackend, ICapVideo):
|
|||
def get_video(self, _id):
|
||||
return self.browser.get_video(_id)
|
||||
|
||||
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None):
|
||||
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
|
||||
debug(u'backend ina: iter_search_results is not implemented')
|
||||
return set()
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ class YoujizzBackend(BaseBackend, ICapVideo):
|
|||
def iter_page_urls(self, mozaic_url):
|
||||
return self.browser.iter_page_urls(mozaic_url)
|
||||
|
||||
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None):
|
||||
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
|
||||
if not nsfw:
|
||||
return
|
||||
return self.browser.iter_search_results(pattern, required_fields=required_fields)
|
||||
return self.browser.iter_search_results(pattern)
|
||||
|
|
|
|||
|
|
@ -16,7 +16,6 @@
|
|||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import urllib
|
||||
|
|
@ -26,6 +25,7 @@ from weboob.tools.browser.decorators import check_domain, id2url
|
|||
from weboob.tools.misc import iter_fields, to_unicode
|
||||
|
||||
from .pages.index import IndexPage
|
||||
from .pages.video import VideoPage
|
||||
from .video import YoujizzVideo
|
||||
|
||||
|
||||
|
|
@ -37,57 +37,28 @@ class YoujizzBrowser(BaseBrowser):
|
|||
ENCODING = None
|
||||
PAGES = {r'http://.*youjizz\.com/?': IndexPage,
|
||||
r'http://.*youjizz\.com/index.php': IndexPage,
|
||||
r'http://.*youjizz\.com/search/.+\.html': IndexPage,
|
||||
r'http://.*youjizz\.com/search/(?P<pattern>.+)\.html': IndexPage,
|
||||
r'http://.*youjizz\.com/videos/(?P<id>.+)\.html': VideoPage,
|
||||
}
|
||||
|
||||
def fillobj(self, video, fields):
    """
    Complete an existing video object from its own page.

    @param video  video object to fill; its id is turned into the URL to load
    @param fields  names of the fields wanted by the caller (not used, see below)
    @return  whatever get_video(video) of the loaded page returns
    """
    # ignore the fields param: VideoPage.get_video() returns all the information
    page_url = YoujizzVideo.id2url(video.id)
    self.location(page_url)
    return self.page.get_video(video)
|
||||
|
||||
@id2url(YoujizzVideo.id2url)
|
||||
def get_video(self, url, video=None):
|
||||
try:
|
||||
data = self.openurl(url.encode('utf-8')).read()
|
||||
except BrowserUnavailable:
|
||||
return None
|
||||
def _get_url():
|
||||
video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
|
||||
if len(video_file_urls) == 0:
|
||||
return None
|
||||
else:
|
||||
if len(video_file_urls) > 1:
|
||||
logging.warning('Many video file URL found for given URL: %s' % video_file_urls)
|
||||
return video_file_urls[0]
|
||||
m = re.search(r'http://.*youjizz\.com/videos/(.+)\.html', url)
|
||||
_id = unicode(m.group(1)) if m else None
|
||||
if video is None:
|
||||
video = YoujizzVideo(_id)
|
||||
m = re.search(r'<title>(.+)</title>', data)
|
||||
title = to_unicode(m.group(1)) if m else None
|
||||
m = re.search(r'<strong>.*Runtime.*</strong>(.+)<br.*>', data)
|
||||
if m:
|
||||
minutes, seconds = (int(v) for v in unicode(m.group(1).strip()).split(':'))
|
||||
else:
|
||||
minutes = seconds = 0
|
||||
video.title = title
|
||||
video.url = _get_url()
|
||||
video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
|
||||
return video
|
||||
def get_video(self, url):
|
||||
self.location(url)
|
||||
return self.page.get_video()
|
||||
|
||||
@check_domain
|
||||
def iter_page_urls(self, mozaic_url):
|
||||
raise NotImplementedError()
|
||||
|
||||
def iter_search_results(self, pattern, required_fields=None):
|
||||
def iter_search_results(self, pattern):
|
||||
if not pattern:
|
||||
self.home()
|
||||
else:
|
||||
self.location('/search/%s-1.html' % (urllib.quote_plus(pattern)))
|
||||
assert self.is_on_page(IndexPage)
|
||||
|
||||
for video in self.page.iter_videos():
|
||||
if required_fields is not None:
|
||||
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
|
||||
if missing_required_fields:
|
||||
logging.debug(u'Completing missing required fields: %s' % missing_required_fields)
|
||||
self.get_video(video.id, video=video)
|
||||
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
|
||||
if missing_required_fields:
|
||||
raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields)
|
||||
yield video
|
||||
return self.page.iter_videos()
|
||||
|
|
|
|||
63
weboob/backends/youjizz/pages/video.py
Normal file
63
weboob/backends/youjizz/pages/video.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright(C) 2010 Roger Philibert
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3 of the License.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
|
||||
import datetime
|
||||
import lxml.html
|
||||
import re
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.misc import to_unicode
|
||||
from weboob.tools.parsers.lxmlparser import select, SelectElementException
|
||||
|
||||
from ..video import YoujizzVideo
|
||||
|
||||
|
||||
__all__ = ['VideoPage']
|
||||
|
||||
|
||||
class VideoPage(BasePage):
    """
    Page of a single video: extracts its title, duration and file URL.
    """

    def get_video(self, video=None):
        """
        Build a video object from this page, or fill the one given.

        @param video [YoujizzVideo] object to fill; when None, a new one is
                                     created from the id captured in the URL
        @return [YoujizzVideo] the filled video
        @raise SelectElementException if the duration cannot be read, or if
                                      zero or several video file URLs are found
        """
        _id = to_unicode(self.group_dict['id'])
        if video is None:
            video = YoujizzVideo(_id)

        title_el = select(self.document.getroot(), 'title', 1)
        video.title = to_unicode(title_el.text.strip())

        # youjizz HTML is crap, we must parse it with regexps
        data = lxml.html.tostring(self.document.getroot())

        m = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)<br.*>', data)
        if m is None:
            raise SelectElementException('Could not retrieve video duration')
        try:
            runtime = to_unicode(m.group(1).strip())
            # exactly two integer components are expected (minutes:seconds);
            # a bad number or a different count raises ValueError
            minutes, seconds = [int(part) for part in runtime.split(':')]
            video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
        except (ValueError, OverflowError):
            # only parsing problems are reported as a missing duration, so
            # unrelated programming errors are no longer hidden behind it
            raise SelectElementException('Could not retrieve video duration')

        video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
        if not video_file_urls:
            raise SelectElementException('Video URL not found')
        if len(video_file_urls) > 1:
            raise SelectElementException('Many video file URL found')
        video.url = video_file_urls[0]

        return video
|
||||
|
||||
|
|
@ -39,10 +39,10 @@ class YoupornBackend(BaseBackend, ICapVideo):
|
|||
return self.browser.get_video(_id)
|
||||
|
||||
SORTBY = ['relevance', 'rating', 'views', 'time']
|
||||
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None):
|
||||
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
|
||||
if not nsfw:
|
||||
return
|
||||
return self.browser.iter_search_results(pattern, self.SORTBY[sortby], required_fields=required_fields)
|
||||
return self.browser.iter_search_results(pattern, self.SORTBY[sortby])
|
||||
|
||||
def iter_page_urls(self, mozaic_url):
|
||||
raise NotImplementedError()
|
||||
|
|
|
|||
|
|
@ -39,30 +39,20 @@ class YoupornBrowser(BaseBrowser):
|
|||
r'http://[w\.]*youporngay\.com:80/watch/(?P<id>.+)': VideoPage,
|
||||
}
|
||||
|
||||
@id2url(YoupornVideo.id2url)
|
||||
def get_video(self, url, video=None):
|
||||
self.location(url)
|
||||
if video is None:
|
||||
return self.page.video
|
||||
else:
|
||||
for k, v in iter_fields(self.page.video):
|
||||
if v and getattr(video, k) != v:
|
||||
setattr(video, k, v)
|
||||
return video
|
||||
def fillobj(self, video, fields):
    """
    Load the page of the given video and let that page complete the object.

    @param video  video object to fill; its id selects the page to load
    @param fields  names of the fields wanted by the caller (not used, see below)
    @return  the result of get_video(video) on the loaded page
    """
    # ignore the fields param: VideoPage.get_video() returns all the information
    video_url = YoupornVideo.id2url(video.id)
    self.location(video_url)
    filled = self.page.get_video(video)
    return filled
|
||||
|
||||
def iter_search_results(self, pattern, sortby, required_fields=None):
|
||||
@id2url(YoupornVideo.id2url)
|
||||
def get_video(self, url):
|
||||
self.location(url)
|
||||
return self.page.get_video()
|
||||
|
||||
def iter_search_results(self, pattern, sortby):
|
||||
if not pattern:
|
||||
self.home()
|
||||
else:
|
||||
self.location(self.buildurl('/search/%s' % sortby, query=pattern))
|
||||
assert self.is_on_page(IndexPage)
|
||||
for video in self.page.iter_videos():
|
||||
if required_fields is not None:
|
||||
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
|
||||
if missing_required_fields:
|
||||
logging.debug(u'Completing missing required fields: %s' % missing_required_fields)
|
||||
self.get_video(video.id, video=video)
|
||||
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
|
||||
if missing_required_fields:
|
||||
raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields)
|
||||
yield video
|
||||
return self.page.iter_videos()
|
||||
|
|
|
|||
|
|
@ -27,19 +27,20 @@ from ..video import YoupornVideo
|
|||
|
||||
|
||||
class VideoPage(PornPage):
|
||||
def on_loaded(self):
|
||||
def get_video(self, video=None):
|
||||
if not PornPage.on_loaded(self):
|
||||
return
|
||||
self.video = YoupornVideo(self.group_dict['id'],
|
||||
self.get_title(),
|
||||
self.get_url(),
|
||||
)
|
||||
self.set_details(self.video)
|
||||
if video is None:
|
||||
video = YoupornVideo(self.group_dict['id'])
|
||||
video.title = self.get_title()
|
||||
video.url = self.get_url()
|
||||
self.set_details(video)
|
||||
return video
|
||||
|
||||
def get_url(self):
|
||||
el = self.document.getroot().cssselect('div[id=download]')
|
||||
if el:
|
||||
return el[0].cssselect('a')[0].attrib['href']
|
||||
download_div = select(self.document.getroot(), '#download', 1)
|
||||
a = select(download_div, 'a', 1)
|
||||
return a.attrib['href']
|
||||
|
||||
def get_title(self):
|
||||
element = select(self.document.getroot(), '#videoArea h1', 1)
|
||||
|
|
@ -49,11 +50,8 @@ class VideoPage(PornPage):
|
|||
MONTH2I = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||
|
||||
def set_details(self, v):
|
||||
div = self.document.getroot().cssselect('div[id=details]')
|
||||
if not div:
|
||||
return
|
||||
|
||||
for li in div[0].getiterator('li'):
|
||||
details_div = select(self.document.getroot(), '#details', 1)
|
||||
for li in details_div.getiterator('li'):
|
||||
span = li.find('span')
|
||||
name = span.text.strip()
|
||||
value = span.tail.strip()
|
||||
|
|
|
|||
|
|
@ -24,7 +24,6 @@ from weboob.tools.backend import BaseBackend
|
|||
from weboob.tools.misc import iter_fields
|
||||
|
||||
from .browser import YoutubeBrowser
|
||||
from .pages import ForbiddenVideo
|
||||
from .video import YoutubeVideo
|
||||
|
||||
|
||||
|
|
@ -41,23 +40,10 @@ class YoutubeBackend(BaseBackend, ICapVideo):
|
|||
|
||||
BROWSER = YoutubeBrowser
|
||||
|
||||
def get_video(self, _id, video=None):
|
||||
try:
|
||||
browser_video = self.browser.get_video(_id)
|
||||
except ForbiddenVideo:
|
||||
if video is None:
|
||||
return None
|
||||
else:
|
||||
raise
|
||||
if video is None:
|
||||
return browser_video
|
||||
else:
|
||||
for k, v in iter_fields(browser_video):
|
||||
if v and getattr(video, k) != v:
|
||||
setattr(video, k, v)
|
||||
return video
|
||||
def get_video(self, _id):
|
||||
return self.browser.get_video(_id)
|
||||
|
||||
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None):
|
||||
def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False):
|
||||
import gdata.youtube.service
|
||||
yt_service = gdata.youtube.service.YouTubeService()
|
||||
query = gdata.youtube.service.YouTubeVideoQuery()
|
||||
|
|
@ -77,19 +63,6 @@ class YoutubeBackend(BaseBackend, ICapVideo):
|
|||
duration=datetime.timedelta(seconds=int(entry.media.duration.seconds.decode('utf-8').strip())),
|
||||
thumbnail_url=entry.media.thumbnail[0].url.decode('utf-8').strip(),
|
||||
)
|
||||
if required_fields is not None:
|
||||
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
|
||||
if missing_required_fields:
|
||||
logging.debug(u'Completing missing required fields: %s' % missing_required_fields)
|
||||
try:
|
||||
self.get_video(video.id, video=video)
|
||||
except ForbiddenVideo, e:
|
||||
logging.debug(e)
|
||||
continue
|
||||
else:
|
||||
missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v)
|
||||
if missing_required_fields:
|
||||
raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields)
|
||||
yield video
|
||||
|
||||
def iter_page_urls(self, mozaic_url):
|
||||
|
|
|
|||
|
|
@ -16,10 +16,12 @@
|
|||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
|
||||
import logging
|
||||
|
||||
from weboob.tools.browser import BaseBrowser
|
||||
from weboob.tools.browser.decorators import id2url
|
||||
|
||||
from .pages import ForbiddenVideoPage, VerifyAgePage, VideoPage
|
||||
from .pages import ForbiddenVideo, ForbiddenVideoPage, VerifyAgePage, VideoPage
|
||||
from .video import YoutubeVideo
|
||||
|
||||
|
||||
|
|
@ -34,10 +36,12 @@ class YoutubeBrowser(BaseBrowser):
|
|||
r'.*youtube\.com/verify_age\?next_url=(?P<next_url>.+)': VerifyAgePage,
|
||||
}
|
||||
|
||||
def fillobj(self, video, fields):
    """
    Go to the page of the given video and have it fill the object.

    The page reached may be a ForbiddenVideoPage or a VerifyAgePage; their
    get_video() raises ForbiddenVideo, which is not caught here.

    @param video  video object to fill; its id is turned into the URL to load
    @param fields  names of the fields wanted by the caller (not used, see below)
    @return  the result of get_video(video) on the loaded page
    """
    # ignore the fields param: VideoPage.get_video() returns all the information
    watch_url = YoutubeVideo.id2url(video.id)
    self.location(watch_url)
    return self.page.get_video(video)
|
||||
|
||||
@id2url(YoutubeVideo.id2url)
|
||||
def get_video(self, url):
|
||||
self.location(url)
|
||||
if hasattr(self.page, 'video'):
|
||||
return self.page.video
|
||||
else:
|
||||
return None
|
||||
return self.page.get_video()
|
||||
|
|
|
|||
|
|
@ -32,26 +32,26 @@ class ForbiddenVideo(Exception):
|
|||
|
||||
|
||||
class ForbiddenVideoPage(BasePage):
|
||||
def on_loaded(self):
|
||||
def get_video(self, video=None):
|
||||
element = select(self.document.getroot(), '.yt-alert-content', 1)
|
||||
raise ForbiddenVideo(element.text.strip())
|
||||
|
||||
|
||||
class VerifyAgePage(BasePage):
|
||||
def on_loaded(self):
|
||||
def get_video(self, video=None):
|
||||
raise ForbiddenVideo('verify age not implemented')
|
||||
|
||||
|
||||
class VideoPage(BasePage):
|
||||
VIDEO_SIGNATURE_REGEX = re.compile(r'&t=([^ ,&]*)')
|
||||
|
||||
def on_loaded(self):
|
||||
_id = self.group_dict['id']
|
||||
self.video = YoutubeVideo(_id,
|
||||
title=self.get_title(),
|
||||
url=self.get_url(_id),
|
||||
author=self.get_author(),
|
||||
)
|
||||
def get_video(self, video=None):
|
||||
if video is None:
|
||||
video = YoutubeVideo(self.group_dict['id'])
|
||||
video.title = self.get_title()
|
||||
video.url = self.get_url(video.id)
|
||||
video.author = self.get_author()
|
||||
return video
|
||||
|
||||
def get_author(self):
|
||||
element = select(self.document.getroot(), 'a.watch-description-username strong', 1)
|
||||
|
|
|
|||
|
|
@ -55,7 +55,7 @@ class ICapVideo(ICap):
|
|||
SEARCH_VIEWS,
|
||||
SEARCH_DATE) = range(4)
|
||||
|
||||
def iter_search_results(self, pattern=None, sortby=SEARCH_RELEVANCE, nsfw=False, required_fields=None):
|
||||
def iter_search_results(self, pattern=None, sortby=SEARCH_RELEVANCE, nsfw=False):
|
||||
"""
|
||||
Iter results of a search on a pattern. Note that if pattern is None,
|
||||
it get the latest videos.
|
||||
|
|
@ -63,7 +63,6 @@ class ICapVideo(ICap):
|
|||
@param pattern [str] pattern to search on
|
||||
@param sortby [enum] sort by...
|
||||
@param nsfw [bool] include non-suitable for work videos if True
|
||||
@param required_fields [tuple] fields to load even if it takes many HTTP requests
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
|
|
|||
|
|
@ -270,6 +270,40 @@ class ConsoleApplication(BaseApplication):
|
|||
logging.error(e)
|
||||
|
||||
def do(self, function, *args, **kwargs):
|
||||
if self.selected_fields:
|
||||
kwargs['required_fields'] = set(self.selected_fields) - set('*')
|
||||
return self.weboob.do(function, *args, **kwargs)
|
||||
"""
|
||||
Call Weboob.do(), after having filled the yielded object, if selected fields are given by user.
|
||||
"""
|
||||
for backend, result in self.weboob.do(function, *args, **kwargs):
|
||||
fields = set(self.selected_fields) - set('*')
|
||||
if fields:
|
||||
try:
|
||||
backend.browser.fillobj(result, fields)
|
||||
except Exception, e:
|
||||
logging.warning(u'Could not retrieve required fields (%s): %s' % (','.join(fields), e))
|
||||
yield backend, result
|
||||
|
||||
def do_caps(self, caps, function, *args, **kwargs):
|
||||
"""
|
||||
Call Weboob.do_caps(), after having filled the yielded object, if selected fields are given by user.
|
||||
"""
|
||||
for backend, result in self.weboob.do_caps(caps, function, *args, **kwargs):
|
||||
fields = set(self.selected_fields) - set('*')
|
||||
if fields:
|
||||
try:
|
||||
backend.browser.fillobj(result, fields)
|
||||
except Exception, e:
|
||||
logging.warning(u'Could not retrieve required fields (%s): %s' % (','.join(fields), e))
|
||||
yield backend, result
|
||||
|
||||
def do_backends(self, backends, function, *args, **kwargs):
|
||||
"""
|
||||
Call Weboob.do_backends(), after having filled the yielded object, if selected fields are given by user.
|
||||
"""
|
||||
for backend, result in self.weboob.do_backends(backends, function, *args, **kwargs):
|
||||
fields = set(self.selected_fields) - set('*')
|
||||
if fields:
|
||||
try:
|
||||
backend.browser.fillobj(result, fields)
|
||||
except Exception, e:
|
||||
logging.warning(u'Could not retrieve required fields (%s): %s' % (','.join(fields), e))
|
||||
yield backend, result
|
||||
|
|
|
|||
|
|
@ -394,3 +394,6 @@ class BaseBrowser(mechanize.Browser):
|
|||
self[field] = value
|
||||
except ControlNotFoundError:
|
||||
return
|
||||
|
||||
def fillobj(self, obj, fields):
    """
    Fill an object with the requested fields.

    Not implemented at this level: it always raises NotImplementedError.

    @param obj  object to complete
    @param fields  names of the fields to load
    """
    raise NotImplementedError()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue