diff --git a/weboob/backends/ina/backend.py b/weboob/backends/ina/backend.py index 8863b454..4b8694c6 100644 --- a/weboob/backends/ina/backend.py +++ b/weboob/backends/ina/backend.py @@ -40,6 +40,6 @@ class InaBackend(BaseBackend, ICapVideo): def get_video(self, _id): return self.browser.get_video(_id) - def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None): + def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False): debug(u'backend ina: iter_search_results is not implemented') return set() diff --git a/weboob/backends/youjizz/backend.py b/weboob/backends/youjizz/backend.py index 3253f48e..07c1e04a 100644 --- a/weboob/backends/youjizz/backend.py +++ b/weboob/backends/youjizz/backend.py @@ -42,7 +42,7 @@ class YoujizzBackend(BaseBackend, ICapVideo): def iter_page_urls(self, mozaic_url): return self.browser.iter_page_urls(mozaic_url) - def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None): + def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False): if not nsfw: return - return self.browser.iter_search_results(pattern, required_fields=required_fields) + return self.browser.iter_search_results(pattern) diff --git a/weboob/backends/youjizz/browser.py b/weboob/backends/youjizz/browser.py index 8a2f79fb..d3520748 100644 --- a/weboob/backends/youjizz/browser.py +++ b/weboob/backends/youjizz/browser.py @@ -16,7 +16,6 @@ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -import datetime import logging import re import urllib @@ -26,6 +25,7 @@ from weboob.tools.browser.decorators import check_domain, id2url from weboob.tools.misc import iter_fields, to_unicode from .pages.index import IndexPage +from .pages.video import VideoPage from .video import YoujizzVideo @@ -37,57 +37,28 @@ class YoujizzBrowser(BaseBrowser): ENCODING = None PAGES = {r'http://.*youjizz\.com/?': IndexPage, r'http://.*youjizz\.com/index.php': IndexPage, - r'http://.*youjizz\.com/search/.+\.html': IndexPage, + r'http://.*youjizz\.com/search/(?P.+)\.html': IndexPage, + r'http://.*youjizz\.com/videos/(?P.+)\.html': VideoPage, } + def fillobj(self, video, fields): + # ignore the fields param: VideoPage.get_video() returns all the information + self.location(YoujizzVideo.id2url(video.id)) + return self.page.get_video(video) + @id2url(YoujizzVideo.id2url) - def get_video(self, url, video=None): - try: - data = self.openurl(url.encode('utf-8')).read() - except BrowserUnavailable: - return None - def _get_url(): - video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data) - if len(video_file_urls) == 0: - return None - else: - if len(video_file_urls) > 1: - logging.warning('Many video file URL found for given URL: %s' % video_file_urls) - return video_file_urls[0] - m = re.search(r'http://.*youjizz\.com/videos/(.+)\.html', url) - _id = unicode(m.group(1)) if m else None - if video is None: - video = YoujizzVideo(_id) - m = re.search(r'(.+)', data) - title = to_unicode(m.group(1)) if m else None - m = re.search(r'.*Runtime.*(.+)', data) - if m: - minutes, seconds = (int(v) for v in unicode(m.group(1).strip()).split(':')) - else: - minutes = seconds = 0 - video.title = title - video.url = _get_url() - video.duration = datetime.timedelta(minutes=minutes, seconds=seconds) - return video + def get_video(self, url): + self.location(url) + return self.page.get_video() @check_domain def iter_page_urls(self, mozaic_url): raise NotImplementedError() - def iter_search_results(self, pattern, required_fields=None): + def iter_search_results(self, pattern): if not pattern: self.home() else: self.location('/search/%s-1.html' % (urllib.quote_plus(pattern))) assert self.is_on_page(IndexPage) - - for video in self.page.iter_videos(): - if required_fields is not None: - missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v) - if missing_required_fields: - logging.debug(u'Completing missing required fields: %s' % missing_required_fields) - self.get_video(video.id, video=video) - missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v) - if missing_required_fields: - raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields) - yield video + return self.page.iter_videos() diff --git a/weboob/backends/youjizz/pages/video.py b/weboob/backends/youjizz/pages/video.py new file mode 100644 index 00000000..73d8d026 --- /dev/null +++ b/weboob/backends/youjizz/pages/video.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010 Roger Philibert +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + +import datetime +import lxml.html +import re + +from weboob.tools.browser import BasePage +from weboob.tools.misc import to_unicode +from weboob.tools.parsers.lxmlparser import select, SelectElementException + +from ..video import YoujizzVideo + + +__all__ = ['VideoPage'] + + +class VideoPage(BasePage): + + def get_video(self, video=None): + _id = to_unicode(self.group_dict['id']) + if video is None: + video = YoujizzVideo(_id) + title_el = select(self.document.getroot(), 'title', 1) + video.title = to_unicode(title_el.text.strip()) + + # youjizz HTML is crap, we must parse it with regexps + data = lxml.html.tostring(self.document.getroot()) + m = re.search(r'.*?Runtime.*? (.+?)', data) + try: + if m: + minutes, seconds = (int(v) for v in to_unicode(m.group(1).strip()).split(':')) + video.duration = datetime.timedelta(minutes=minutes, seconds=seconds) + else: + raise Exception() + except Exception: + raise SelectElementException('Could not retrieve video duration') + + video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data) + if len(video_file_urls) == 0: + raise SelectElementException('Video URL not found') + elif len(video_file_urls) > 1: + raise SelectElementException('Many video file URL found') + else: + video.url = video_file_urls[0] + + return video + diff --git a/weboob/backends/youporn/backend.py b/weboob/backends/youporn/backend.py index 66660c08..df462e75 100644 --- a/weboob/backends/youporn/backend.py +++ b/weboob/backends/youporn/backend.py @@ -39,10 +39,10 @@ class YoupornBackend(BaseBackend, ICapVideo): return self.browser.get_video(_id) SORTBY = ['relevance', 'rating', 'views', 'time'] - def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None): + def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False): if not nsfw: return - return self.browser.iter_search_results(pattern, self.SORTBY[sortby], required_fields=required_fields) + return self.browser.iter_search_results(pattern, self.SORTBY[sortby]) def iter_page_urls(self, mozaic_url): raise NotImplementedError() diff --git a/weboob/backends/youporn/browser.py b/weboob/backends/youporn/browser.py index 0ec979c0..5202b7a8 100644 --- a/weboob/backends/youporn/browser.py +++ b/weboob/backends/youporn/browser.py @@ -39,30 +39,20 @@ class YoupornBrowser(BaseBrowser): r'http://[w\.]*youporngay\.com:80/watch/(?P.+)': VideoPage, } - @id2url(YoupornVideo.id2url) - def get_video(self, url, video=None): - self.location(url) - if video is None: - return self.page.video - else: - for k, v in iter_fields(self.page.video): - if v and getattr(video, k) != v: - setattr(video, k, v) - return video + def fillobj(self, video, fields): + # ignore the fields param: VideoPage.get_video() returns all the information + self.location(YoupornVideo.id2url(video.id)) + return self.page.get_video(video) - def iter_search_results(self, pattern, sortby, required_fields=None): + @id2url(YoupornVideo.id2url) + def get_video(self, url): + self.location(url) + return self.page.get_video() + + def iter_search_results(self, pattern, sortby): if not pattern: self.home() else: self.location(self.buildurl('/search/%s' % sortby, query=pattern)) assert self.is_on_page(IndexPage) - for video in self.page.iter_videos(): - if required_fields is not None: - missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v) - if missing_required_fields: - logging.debug(u'Completing missing required fields: %s' % missing_required_fields) - self.get_video(video.id, video=video) - missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v) - if missing_required_fields: - raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields) - yield video + return self.page.iter_videos() diff --git a/weboob/backends/youporn/pages/video.py b/weboob/backends/youporn/pages/video.py index 0a7cc21f..c7df0d5a 100644 --- a/weboob/backends/youporn/pages/video.py +++ b/weboob/backends/youporn/pages/video.py @@ -27,19 +27,20 @@ from ..video import YoupornVideo class VideoPage(PornPage): - def on_loaded(self): + def get_video(self, video=None): if not PornPage.on_loaded(self): return - self.video = YoupornVideo(self.group_dict['id'], - self.get_title(), - self.get_url(), - ) - self.set_details(self.video) + if video is None: + video = YoupornVideo(self.group_dict['id']) + video.title = self.get_title() + video.url = self.get_url() + self.set_details(video) + return video def get_url(self): - el = self.document.getroot().cssselect('div[id=download]') - if el: - return el[0].cssselect('a')[0].attrib['href'] + download_div = select(self.document.getroot(), '#download', 1) + a = select(download_div, 'a', 1) + return a.attrib['href'] def get_title(self): element = select(self.document.getroot(), '#videoArea h1', 1) @@ -49,11 +50,8 @@ class VideoPage(PornPage): MONTH2I = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] def set_details(self, v): - div = self.document.getroot().cssselect('div[id=details]') - if not div: - return - - for li in div[0].getiterator('li'): + details_div = select(self.document.getroot(), '#details', 1) + for li in details_div.getiterator('li'): span = li.find('span') name = span.text.strip() value = span.tail.strip() diff --git a/weboob/backends/youtube/backend.py b/weboob/backends/youtube/backend.py index 10140d8c..05eebe2f 100644 --- a/weboob/backends/youtube/backend.py +++ b/weboob/backends/youtube/backend.py @@ -24,7 +24,6 @@ from weboob.tools.backend import BaseBackend from weboob.tools.misc import iter_fields from .browser import YoutubeBrowser -from .pages import ForbiddenVideo from .video import YoutubeVideo @@ -41,23 +40,10 @@ class YoutubeBackend(BaseBackend, ICapVideo): BROWSER = YoutubeBrowser - def get_video(self, _id, video=None): - try: - browser_video = self.browser.get_video(_id) - except ForbiddenVideo: - if video is None: - return None - else: - raise - if video is None: - return browser_video - else: - for k, v in iter_fields(browser_video): - if v and getattr(video, k) != v: - setattr(video, k, v) - return video + def get_video(self, _id): + return self.browser.get_video(_id) - def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, required_fields=None): + def iter_search_results(self, pattern=None, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False): import gdata.youtube.service yt_service = gdata.youtube.service.YouTubeService() query = gdata.youtube.service.YouTubeVideoQuery() @@ -77,19 +63,6 @@ class YoutubeBackend(BaseBackend, ICapVideo): duration=datetime.timedelta(seconds=int(entry.media.duration.seconds.decode('utf-8').strip())), thumbnail_url=entry.media.thumbnail[0].url.decode('utf-8').strip(), ) - if required_fields is not None: - missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v) - if missing_required_fields: - logging.debug(u'Completing missing required fields: %s' % missing_required_fields) - try: - self.get_video(video.id, video=video) - except ForbiddenVideo, e: - logging.debug(e) - continue - else: - missing_required_fields = set(required_fields) - set(k for k, v in iter_fields(video) if v) - if missing_required_fields: - raise Exception(u'Could not load all required fields. Missing: %s' % missing_required_fields) yield video def iter_page_urls(self, mozaic_url): diff --git a/weboob/backends/youtube/browser.py b/weboob/backends/youtube/browser.py index 6eb83a93..0ad0bc67 100644 --- a/weboob/backends/youtube/browser.py +++ b/weboob/backends/youtube/browser.py @@ -16,10 +16,12 @@ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +import logging + from weboob.tools.browser import BaseBrowser from weboob.tools.browser.decorators import id2url -from .pages import ForbiddenVideoPage, VerifyAgePage, VideoPage +from .pages import ForbiddenVideo, ForbiddenVideoPage, VerifyAgePage, VideoPage from .video import YoutubeVideo @@ -34,10 +36,12 @@ class YoutubeBrowser(BaseBrowser): r'.*youtube\.com/verify_age\?next_url=(?P.+)': VerifyAgePage, } + def fillobj(self, video, fields): + # ignore the fields param: VideoPage.get_video() returns all the information + self.location(YoutubeVideo.id2url(video.id)) + return self.page.get_video(video) + @id2url(YoutubeVideo.id2url) def get_video(self, url): self.location(url) - if hasattr(self.page, 'video'): - return self.page.video - else: - return None + return self.page.get_video() diff --git a/weboob/backends/youtube/pages.py b/weboob/backends/youtube/pages.py index 6fc523c4..33b18ddc 100644 --- a/weboob/backends/youtube/pages.py +++ b/weboob/backends/youtube/pages.py @@ -32,26 +32,26 @@ class ForbiddenVideo(Exception): class ForbiddenVideoPage(BasePage): - def on_loaded(self): + def get_video(self, video=None): element = select(self.document.getroot(), '.yt-alert-content', 1) raise ForbiddenVideo(element.text.strip()) class VerifyAgePage(BasePage): - def on_loaded(self): + def get_video(self, video=None): raise ForbiddenVideo('verify age not implemented') class VideoPage(BasePage): VIDEO_SIGNATURE_REGEX = re.compile(r'&t=([^ ,&]*)') - def on_loaded(self): - _id = self.group_dict['id'] - self.video = YoutubeVideo(_id, - title=self.get_title(), - url=self.get_url(_id), - author=self.get_author(), - ) + def get_video(self, video=None): + if video is None: + video = YoutubeVideo(self.group_dict['id']) + video.title = self.get_title() + video.url = self.get_url(video.id) + video.author = self.get_author() + return video def get_author(self): element = select(self.document.getroot(), 'a.watch-description-username strong', 1) diff --git a/weboob/capabilities/video.py b/weboob/capabilities/video.py index 600e8ebf..d004c0a3 100644 --- a/weboob/capabilities/video.py +++ b/weboob/capabilities/video.py @@ -55,7 +55,7 @@ class ICapVideo(ICap): SEARCH_VIEWS, SEARCH_DATE) = range(4) - def iter_search_results(self, pattern=None, sortby=SEARCH_RELEVANCE, nsfw=False, required_fields=None): + def iter_search_results(self, pattern=None, sortby=SEARCH_RELEVANCE, nsfw=False): """ Iter results of a search on a pattern. Note that if pattern is None, it get the latest videos. @@ -63,7 +63,6 @@ class ICapVideo(ICap): @param pattern [str] pattern to search on @param sortby [enum] sort by... @param nsfw [bool] include non-suitable for work videos if True - @param required_fields [tuple] fields to load even if it takes many HTTP requests """ raise NotImplementedError() diff --git a/weboob/tools/application/console.py b/weboob/tools/application/console.py index 0c73d999..9586df53 100644 --- a/weboob/tools/application/console.py +++ b/weboob/tools/application/console.py @@ -270,6 +270,40 @@ class ConsoleApplication(BaseApplication): logging.error(e) def do(self, function, *args, **kwargs): - if self.selected_fields: - kwargs['required_fields'] = set(self.selected_fields) - set('*') - return self.weboob.do(function, *args, **kwargs) + """ + Call Weboob.do(), after having filled the yielded object, if selected fields are given by user. + """ + for backend, result in self.weboob.do(function, *args, **kwargs): + fields = set(self.selected_fields) - set('*') + if fields: + try: + backend.browser.fillobj(result, fields) + except Exception, e: + logging.warning(u'Could not retrieve required fields (%s): %s' % (','.join(fields), e)) + yield backend, result + + def do_caps(self, caps, function, *args, **kwargs): + """ + Call Weboob.do_caps(), after having filled the yielded object, if selected fields are given by user. + """ + for backend, result in self.weboob.do_caps(caps, function, *args, **kwargs): + fields = set(self.selected_fields) - set('*') + if fields: + try: + backend.browser.fillobj(result, fields) + except Exception, e: + logging.warning(u'Could not retrieve required fields (%s): %s' % (','.join(fields), e)) + yield backend, result + + def do_backends(self, backends, function, *args, **kwargs): + """ + Call Weboob.do_backends(), after having filled the yielded object, if selected fields are given by user. + """ + for backend, result in self.weboob.do_backends(backends, function, *args, **kwargs): + fields = set(self.selected_fields) - set('*') + if fields: + try: + backend.browser.fillobj(result, fields) + except Exception, e: + logging.warning(u'Could not retrieve required fields (%s): %s' % (','.join(fields), e)) + yield backend, result diff --git a/weboob/tools/browser/browser.py b/weboob/tools/browser/browser.py index 20a783c0..598fab47 100644 --- a/weboob/tools/browser/browser.py +++ b/weboob/tools/browser/browser.py @@ -394,3 +394,6 @@ class BaseBrowser(mechanize.Browser): self[field] = value except ControlNotFoundError: return + + def fillobj(self, obj, fields): + raise NotImplementedError()