fix youjizz parser

2010-05-20 01:33:35 +02:00 · 2010-05-20 01:33:35 +02:00 · 4a1e7e7b99
commit 4a1e7e7b99
parent 99c64d4849
2 changed files with 40 additions and 82 deletions
--- a/weboob/backends/youjizz/browser.py
+++ b/weboob/backends/youjizz/browser.py
@ -1,29 +1,29 @@
 # -*- coding: utf-8 -*-
-"""
+# Copyright(C) 2010  Roger Philibert
-Copyright(C) 2010  Roger Philibert
+# 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, version 3 of the License.
 # 
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 # 
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, version 3 of the License.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """
 import lxml
 import re
 import urllib
 from weboob.tools.browser import BaseBrowser
 from .pages.index import IndexPage
-from .pages.video import VideoPage
+from .video import YoujizzVideo
 __all__ = ['YoujizzBrowser']
@ -33,13 +33,30 @@ class YoujizzBrowser(BaseBrowser):
    # Site constants for this browser; presumably consumed by BaseBrowser
    # (not visible here) -- TODO confirm how they are used.
    DOMAIN = 'youjizz.com'
    PROTOCOL = 'http'
    # Maps URL regular expressions to the page class handling matching URLs.
    # NOTE(review): the videos entry still names VideoPage, while this diff
    # removes the `from .pages.video import VideoPage` import. If this entry
    # survives the commit, evaluating the class body raises NameError --
    # confirm against the committed file.
    PAGES = {r'http://.*youjizz\.com/?': IndexPage,
             r'http://.*youjizz\.com/videos/.+\.html': VideoPage,
             r'http://.*youjizz\.com/search/.+\.html': IndexPage,
            }
-
+    
    def get_video(self, url):
        # Build a YoujizzVideo describing the video page at `url`.
        # In this diff the removed ('-') lines navigated with self.location()
        # and returned self.page.video; the added lines download the raw page
        # with self.openurl() and scrape it with regular expressions instead.
-        self.location(url)
+        data = self.openurl(url).read()
-        return self.page.video
+        def _get_url():
            # Return the first quoted http://media....flv URL found in the
            # page source, or None when there is none.
            video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
            if len(video_file_urls) == 0:
                return None
            else:
                if len(video_file_urls) > 1:
                    # NOTE(review): no import of `warning` is visible among
                    # this file's imports in the diff; if it is not imported
                    # elsewhere this line raises NameError -- confirm.
                    warning('Many video file URL found for given URL: %s' % video_file_urls)
                return video_file_urls[0]
        # The id is whatever sits between "/videos/" and ".html" in the URL.
        m = re.search(r'http://.*youjizz\.com/videos/(.+)\.html', url)
        # NOTE(review): when the URL does not match, _id is None and the id
        # passed to YoujizzVideo below becomes u'youjizz:None'.
        _id = unicode(m.group(1)) if m else None
        # Title: text between <title> tags; '.' does not match newlines, so
        # the whole element must sit on one line to be found.
        m = re.search(r'<title>(.+)</title>', data)
        title = unicode(m.group(1)) if m else None
        # Duration: the text after the <strong>...Runtime...</strong> label,
        # up to a following <br> tag.
        m = re.search(r'<strong>.*Runtime.*</strong>(.+)<br.*>', data)
        if m:
            # Unpacking requires exactly two ':'-separated integers; any other
            # shape (e.g. h:mm:ss) raises ValueError.
            minutes, seconds = (int(v) for v in unicode(m.group(1).strip()).split(':'))
            duration = minutes * 60 + seconds
        else:
            # No runtime label found: duration is reported as 0.
            duration = 0
        return YoujizzVideo(_id=u'youjizz:%s' % _id, title=title, url=_get_url(), duration=duration, nsfw=True)
    def iter_page_urls(self, mozaic_url):
        """Placeholder: iterating page URLs is not implemented.

        ``mozaic_url`` is ignored; the call always raises
        NotImplementedError.
        """
        raise NotImplementedError()
--- a/weboob/backends/youjizz/pages/video.py
+++ b/weboob/backends/youjizz/pages/video.py
@ -1,59 +0,0 @@
 # -*- coding: utf-8 -*-
 """
 Copyright(C) 2010  Roger Philibert
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, version 3 of the License.
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.
 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 """
 from logging import error, warning
 import re
 from weboob.tools.browser import BasePage
 from ..video import YoujizzVideo
 class VideoPage(BasePage):
    """Page object for a single video page.

    on_loaded() parses the document and stores the resulting YoujizzVideo
    in ``self.video``.
    """
    # Captures the numeric id that ends the page name, just before ".html".
    URL_REGEX = re.compile(r'http://.*youjizz\.com/videos/.+-(\d+)\.html')
    # Captures a double-quoted http://media....flv URL in the page source.
    VIDEO_FILE_REGEX = re.compile(r'"(http://media[^ ,]+\.flv)"')
    def on_loaded(self):
        """Build ``self.video`` from the page URL and document.

        Details that could not be parsed fall back to an empty title and a
        duration of 0.
        """
        details = self.get_details()
        self.video = YoujizzVideo(_id=self.get_id(), title=details.get('title', u''), url=self.get_url(),
                                  duration=details.get('duration', 0), nsfw=True)
    def get_id(self):
        """Return the numeric id found in ``self.url`` as an int.

        Logs a warning and returns 0 when the URL does not match URL_REGEX.
        """
        m = self.URL_REGEX.match(self.url)
        if m:
            return int(m.group(1))
        warning("Unable to parse ID")
        return 0
    def get_url(self):
        """Return the first video file URL found in the document, or None.

        The document is serialized with the browser's parser and searched
        with VIDEO_FILE_REGEX. When several URLs match, an error is logged
        and the first one is still returned.
        """
        video_file_urls = re.findall(self.VIDEO_FILE_REGEX, self.browser.parser.tostring(self.document))
        if not video_file_urls:
            return None
        if len(video_file_urls) > 1:
            error('Many video file URL found for given URL: %s' % video_file_urls)
        return video_file_urls[0]
    def get_details(self):
        """Return a dict with the 'title' and 'duration' that could be parsed.

        A key is left out when its value cannot be extracted, so callers can
        supply their own defaults (on_loaded() does); a warning is logged for
        each missing piece instead of raising.
        """
        results = {}
        containers = self.document.getroot().cssselect('#video_text')
        if not containers:
            warning("Unable to find video details")
            return results
        div = containers[0]
        heading = div.find('h2')
        # find() returns None when there is no <h2>; .text is None when the
        # element holds no text.
        if heading is not None and heading.text:
            results['title'] = unicode(heading.text).strip()
        else:
            warning("Unable to parse title")
        for strong in div.cssselect('strong'):
            # Skip labels without text and labels other than "Runtime...".
            if not (strong.text and strong.text.startswith('Runtime')):
                continue
            try:
                # The runtime follows the label as "minutes:seconds".
                minutes, seconds = [int(v) for v in (strong.tail or '').split(':')]
            except ValueError:
                warning("Unable to parse duration")
            else:
                results['duration'] = minutes * 60 + seconds
            # Only the first Runtime label is considered.
            break
        else:
            warning("Unable to find duration")
        return results