From 4a1e7e7b99e2ad6f6f97f960d70a46b5d83717db Mon Sep 17 00:00:00 2001 From: Christophe Benz Date: Thu, 20 May 2010 01:33:35 +0200 Subject: [PATCH] fix youjizz parser --- weboob/backends/youjizz/browser.py | 59 +++++++++++++++++--------- weboob/backends/youjizz/pages/video.py | 59 -------------------------- 2 files changed, 38 insertions(+), 80 deletions(-) delete mode 100644 weboob/backends/youjizz/pages/video.py diff --git a/weboob/backends/youjizz/browser.py b/weboob/backends/youjizz/browser.py index 71efb156..01745a88 100644 --- a/weboob/backends/youjizz/browser.py +++ b/weboob/backends/youjizz/browser.py @@ -1,29 +1,29 @@ # -*- coding: utf-8 -*- -""" -Copyright(C) 2010 Roger Philibert +# Copyright(C) 2010 Roger Philibert +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, version 3 of the License. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -""" +import lxml +import re import urllib from weboob.tools.browser import BaseBrowser from .pages.index import IndexPage -from .pages.video import VideoPage +from .video import YoujizzVideo __all__ = ['YoujizzBrowser'] @@ -33,13 +33,30 @@ class YoujizzBrowser(BaseBrowser): DOMAIN = 'youjizz.com' PROTOCOL = 'http' PAGES = {r'http://.*youjizz\.com/?': IndexPage, - r'http://.*youjizz\.com/videos/.+\.html': VideoPage, r'http://.*youjizz\.com/search/.+\.html': IndexPage, } - + def get_video(self, url): - self.location(url) - return self.page.video + data = self.openurl(url).read() + def _get_url(): + video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data) + if len(video_file_urls) == 0: + return None + else: + if len(video_file_urls) > 1: + warning('Many video file URL found for given URL: %s' % video_file_urls) + return video_file_urls[0] + m = re.search(r'http://.*youjizz\.com/videos/(.+)\.html', url) + _id = unicode(m.group(1)) if m else None + m = re.search(r'(.+)', data) + title = unicode(m.group(1)) if m else None + m = re.search(r'.*Runtime.*(.+)', data) + if m: + minutes, seconds = (int(v) for v in unicode(m.group(1).strip()).split(':')) + duration = minutes * 60 + seconds + else: + duration = 0 + return YoujizzVideo(_id=u'youjizz:%s' % _id, title=title, url=_get_url(), duration=duration, nsfw=True) def iter_page_urls(self, mozaic_url): raise NotImplementedError() diff --git a/weboob/backends/youjizz/pages/video.py b/weboob/backends/youjizz/pages/video.py deleted file mode 100644 index ef5b5d53..00000000 --- a/weboob/backends/youjizz/pages/video.py +++ /dev/null @@ -1,59 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Copyright(C) 2010 Roger Philibert - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, version 3 of the License. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -""" - -from logging import error, warning -import re - -from weboob.tools.browser import BasePage - -from ..video import YoujizzVideo - -class VideoPage(BasePage): - URL_REGEX = re.compile(r'http://.*youjizz\.com/videos/.+-(\d+)\.html') - VIDEO_FILE_REGEX = re.compile(r'"(http://media[^ ,]+\.flv)"') - - def on_loaded(self): - details = self.get_details() - self.video = YoujizzVideo(_id=self.get_id(), title=details.get('title', u''), url=self.get_url(), - duration=details.get('duration', 0), nsfw=True) - - def get_id(self): - m = self.URL_REGEX.match(self.url) - if m: - return int(m.group(1)) - warning("Unable to parse ID") - return 0 - - def get_url(self): - video_file_urls = re.findall(self.VIDEO_FILE_REGEX, self.browser.parser.tostring(self.document)) - if len(video_file_urls) == 0: - return None - else: - if len(video_file_urls) > 1: - error('Many video file URL found for given URL: %s' % video_file_urls) - return video_file_urls[0] - - def get_details(self): - results = {} - div = self.document.getroot().cssselect('#video_text')[0] - results['title'] = unicode(div.find('h2').text).strip() - minutes, seconds = [int(v) for v in [e for e in div.cssselect('strong') if e.text.startswith('Runtime')][0].tail.split(':')] - results['duration'] = minutes * 60 + seconds - return results