fix youjizz parser
This commit is contained in:
parent
99c64d4849
commit
4a1e7e7b99
2 changed files with 40 additions and 82 deletions
|
|
@ -1,29 +1,29 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Copyright(C) 2010 Roger Philibert
|
||||
# Copyright(C) 2010 Roger Philibert
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, version 3 of the License.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, version 3 of the License.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
"""
|
||||
|
||||
import lxml
|
||||
import re
|
||||
import urllib
|
||||
|
||||
from weboob.tools.browser import BaseBrowser
|
||||
|
||||
from .pages.index import IndexPage
|
||||
from .pages.video import VideoPage
|
||||
from .video import YoujizzVideo
|
||||
|
||||
|
||||
__all__ = ['YoujizzBrowser']
|
||||
|
|
@ -33,13 +33,30 @@ class YoujizzBrowser(BaseBrowser):
|
|||
# Site constants for this browser: host name, URL scheme, and a mapping from
# URL regular expressions to the page class associated with each pattern.
# NOTE(review): this span comes from a rendered diff that interleaves pre- and
# post-commit lines without +/- markers; confirm which PAGES entries (the
# VideoPage one in particular) are still present after the commit.
DOMAIN = 'youjizz.com'
|
||||
PROTOCOL = 'http'
|
||||
PAGES = {r'http://.*youjizz\.com/?': IndexPage,
|
||||
r'http://.*youjizz\.com/videos/.+\.html': VideoPage,
|
||||
r'http://.*youjizz\.com/search/.+\.html': IndexPage,
|
||||
}
|
||||
|
||||
|
||||
def get_video(self, url):
    """Download a video page and scrape it into a YoujizzVideo.

    The raw page body is read through ``self.openurl`` and inspected with
    regular expressions only; no DOM is built.

    :param url: address of the video page
    :returns: a YoujizzVideo whose ``url`` is the first media ``.flv``
              location found in the page (None when there is none), whose
              ``title`` is the ``<title>`` content (None when absent) and
              whose ``duration`` is in seconds (0 when the runtime is
              missing or cannot be parsed)
    """
    # Imported here because none of the module-level imports visible for this
    # file bring ``warning`` into scope; without it the multi-match branch
    # below raises NameError.
    from logging import warning

    data = self.openurl(url).read()

    # Media location: quoted http://media....flv URLs embedded in the page.
    video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
    if len(video_file_urls) > 1:
        warning('Many video file URL found for given URL: %s' % video_file_urls)
    video_url = video_file_urls[0] if video_file_urls else None

    # Identifier: the path component between /videos/ and .html.
    m = re.search(r'http://.*youjizz\.com/videos/(.+)\.html', url)
    _id = unicode(m.group(1)) if m else None

    # Non-greedy so that a second </title> on the same line is not swallowed.
    # NOTE(review): unicode() on a byte string decodes as ASCII under
    # Python 2, so a non-ASCII title would raise -- confirm the page encoding.
    m = re.search(r'<title>(.+?)</title>', data)
    title = unicode(m.group(1)) if m else None

    # Runtime is the text between the "Runtime" <strong> label and the next
    # <br>; non-greedy groups keep the capture from running on to a later tag.
    duration = 0
    m = re.search(r'<strong>.*?Runtime.*?</strong>(.+?)<br.*?>', data)
    if m:
        try:
            fields = [int(v) for v in unicode(m.group(1).strip()).split(':')]
        except ValueError:
            # Same fallback as a missing runtime instead of aborting the scrape.
            warning('Unable to parse runtime: %r' % m.group(1))
        else:
            # Base-60 fold: MM:SS gives minutes * 60 + seconds, and H:MM:SS
            # or a bare seconds count are accepted as well.
            for value in fields:
                duration = duration * 60 + value

    return YoujizzVideo(_id=u'youjizz:%s' % _id, title=title, url=video_url,
                        duration=duration, nsfw=True)
|
||||
|
||||
def iter_page_urls(self, mozaic_url):
    """Placeholder for iterating over page URLs; not implemented.

    :param mozaic_url: unused by the current body
    :raises NotImplementedError: always
    """
    raise NotImplementedError()
|
||||
|
|
|
|||
|
|
@ -1,59 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Copyright(C) 2010 Roger Philibert
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, version 3 of the License.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program; if not, write to the Free Software
|
||||
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
"""
|
||||
|
||||
from logging import error, warning
|
||||
import re
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
|
||||
from ..video import YoujizzVideo
|
||||
|
||||
class VideoPage(BasePage):
    """Page object for a single video page.

    ``on_loaded`` scrapes the document and stores the result in
    ``self.video``.
    """

    # Captures the trailing numeric id of a /videos/<slug>-<id>.html address.
    URL_REGEX = re.compile(r'http://.*youjizz\.com/videos/.+-(\d+)\.html')
    # Captures a quoted http://media....flv location in the serialized page.
    VIDEO_FILE_REGEX = re.compile(r'"(http://media[^ ,]+\.flv)"')

    def on_loaded(self):
        """Build ``self.video`` from the id, media URL and scraped details."""
        details = self.get_details()
        self.video = YoujizzVideo(_id=self.get_id(), title=details.get('title', u''), url=self.get_url(),
                                  duration=details.get('duration', 0), nsfw=True)

    def get_id(self):
        """Return the numeric id parsed from ``self.url``, or 0 with a warning."""
        m = self.URL_REGEX.match(self.url)
        if m:
            return int(m.group(1))
        warning("Unable to parse ID")
        return 0

    def get_url(self):
        """Return the first media file URL found in the document, or None.

        An error is logged when more than one candidate is present; the first
        one is still returned.
        """
        video_file_urls = self.VIDEO_FILE_REGEX.findall(self.browser.parser.tostring(self.document))
        if not video_file_urls:
            return None
        if len(video_file_urls) > 1:
            error('Many video file URL found for given URL: %s' % video_file_urls)
        return video_file_urls[0]

    def get_details(self):
        """Scrape the title and duration from the ``#video_text`` element.

        :returns: a dict that may hold ``'title'`` (stripped unicode) and
                  ``'duration'`` (seconds). A key is left out when its markup
                  is missing or malformed, so the defaults that ``on_loaded``
                  passes to ``dict.get`` take effect instead of the page load
                  failing.
        """
        results = {}
        containers = self.document.getroot().cssselect('#video_text')
        if not containers:
            warning("Unable to find video details")
            return results
        div = containers[0]

        # Compare with None explicitly: an element without children is falsy.
        heading = div.find('h2')
        if heading is not None and heading.text:
            results['title'] = unicode(heading.text).strip()

        # The runtime is the text that follows the first <strong> whose label
        # starts with "Runtime", formatted as MM:SS.
        for strong in div.cssselect('strong'):
            if strong.text and strong.text.startswith('Runtime'):
                try:
                    minutes, seconds = [int(v) for v in (strong.tail or '').split(':')]
                except ValueError:
                    warning("Unable to parse duration")
                else:
                    results['duration'] = minutes * 60 + seconds
                break
        return results
|
||||
Loading…
Add table
Add a link
Reference in a new issue