fix youjizz parser

This commit is contained in:
Christophe Benz 2010-05-20 01:33:35 +02:00
commit 4a1e7e7b99
2 changed files with 40 additions and 82 deletions

View file

@ -1,29 +1,29 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" # Copyright(C) 2010 Roger Philibert
Copyright(C) 2010 Roger Philibert #
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
import lxml
import re
import urllib import urllib
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from .pages.index import IndexPage from .pages.index import IndexPage
from .pages.video import VideoPage from .video import YoujizzVideo
__all__ = ['YoujizzBrowser'] __all__ = ['YoujizzBrowser']
@ -33,13 +33,30 @@ class YoujizzBrowser(BaseBrowser):
DOMAIN = 'youjizz.com' DOMAIN = 'youjizz.com'
PROTOCOL = 'http' PROTOCOL = 'http'
PAGES = {r'http://.*youjizz\.com/?': IndexPage, PAGES = {r'http://.*youjizz\.com/?': IndexPage,
r'http://.*youjizz\.com/videos/.+\.html': VideoPage,
r'http://.*youjizz\.com/search/.+\.html': IndexPage, r'http://.*youjizz\.com/search/.+\.html': IndexPage,
} }
def get_video(self, url):
    """Fetch a video page and build a YoujizzVideo from its raw HTML.

    The page body is read through ``self.openurl`` and scraped with
    regular expressions.

    :param url: address of the video page
    :return: a YoujizzVideo (``nsfw=True``) carrying the id taken from
             *url*, the page title, the first media file address found
             and the runtime in seconds. ``title`` and ``url`` are None
             when not found; ``duration`` is 0 when the runtime is
             missing or cannot be parsed.
    """
    # ``warning`` is used below but no import visible at the top of this
    # module provides it; importing it here keeps the function self-contained.
    from logging import warning

    data = self.openurl(url).read()

    def _get_url():
        # The media file address appears in the page as a quoted
        # http://media... string ending in .flv.
        video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
        if not video_file_urls:
            return None
        if len(video_file_urls) > 1:
            # Several candidates: report it, then use the first one.
            warning('Many video file URL found for given URL: %s' % video_file_urls)
        return video_file_urls[0]

    m = re.search(r'http://.*youjizz\.com/videos/(.+)\.html', url)
    # NOTE(review): when *url* does not match, the resulting id is
    # u'youjizz:None' -- confirm whether callers rely on that value.
    _id = unicode(m.group(1)) if m else None

    m = re.search(r'<title>(.+)</title>', data)
    title = unicode(m.group(1)) if m else None

    duration = 0
    m = re.search(r'<strong>.*Runtime.*</strong>(.+)<br.*>', data)
    if m:
        try:
            fields = [int(v) for v in unicode(m.group(1).strip()).split(':')]
        except ValueError:
            # Unparseable runtime text: use the same default as a missing one.
            fields = []
        # Fold the colon-separated fields base 60, so "M:S" yields
        # M * 60 + S and "H:M:S" is handled as well.
        for value in fields:
            duration = duration * 60 + value

    return YoujizzVideo(_id=u'youjizz:%s' % _id, title=title, url=_get_url(), duration=duration, nsfw=True)
def iter_page_urls(self, mozaic_url):
    """Placeholder that is not implemented yet.

    :param mozaic_url: unused by the current body
    :raises NotImplementedError: always
    """
    raise NotImplementedError()

View file

@ -1,59 +0,0 @@
# -*- coding: utf-8 -*-
"""
Copyright(C) 2010 Roger Philibert
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
from logging import error, warning
import re
from weboob.tools.browser import BasePage
from ..video import YoujizzVideo
class VideoPage(BasePage):
    """Page handler for a single video page.

    ``on_loaded`` stores the parsed result in ``self.video``.
    NOTE(review): presumably ``on_loaded`` is invoked by the browser
    framework once ``self.document`` is available -- confirm against
    BasePage.
    """

    # Captures the numeric id that ends the page name of a video URL.
    URL_REGEX = re.compile(r'http://.*youjizz\.com/videos/.+-(\d+)\.html')
    # Captures a quoted media file address ending in .flv.
    VIDEO_FILE_REGEX = re.compile(r'"(http://media[^ ,]+\.flv)"')

    def on_loaded(self):
        """Build ``self.video`` from the id, media url and page details."""
        details = self.get_details()
        self.video = YoujizzVideo(_id=self.get_id(), title=details.get('title', u''), url=self.get_url(),
                                  duration=details.get('duration', 0), nsfw=True)

    def get_id(self):
        """Return the numeric id found in ``self.url``, or 0 if it has none."""
        m = self.URL_REGEX.match(self.url)
        if m:
            return int(m.group(1))
        warning("Unable to parse ID")
        return 0

    def get_url(self):
        """Return the first media file address in the document, or None."""
        video_file_urls = self.VIDEO_FILE_REGEX.findall(self.browser.parser.tostring(self.document))
        if not video_file_urls:
            return None
        if len(video_file_urls) > 1:
            # Several candidates: report it, then use the first one.
            error('Many video file URL found for given URL: %s' % video_file_urls)
        return video_file_urls[0]

    def get_details(self):
        """Collect the title and duration from the ``#video_text`` element.

        :return: dict that may hold ``'title'`` (unicode) and
                 ``'duration'`` (seconds, int). A key is left out when
                 its value cannot be found or parsed, so that the
                 defaults applied in ``on_loaded`` take effect.
        """
        results = {}
        containers = self.document.getroot().cssselect('#video_text')
        if not containers:
            warning("Unable to find video details")
            return results
        div = containers[0]

        # Compare with None explicitly rather than relying on the
        # truthiness of the element returned by find().
        heading = div.find('h2')
        if heading is not None and heading.text:
            results['title'] = unicode(heading.text).strip()

        # The runtime is the text that follows the first <strong> whose own
        # text starts with "Runtime"; elements without text are skipped.
        runtime_labels = [e for e in div.cssselect('strong') if (e.text or u'').startswith('Runtime')]
        if runtime_labels:
            try:
                minutes, seconds = [int(v) for v in (runtime_labels[0].tail or u'').split(':')]
            except ValueError:
                # Raised both for non-numeric fields and for a field count
                # other than two.
                warning("Unable to parse duration")
            else:
                results['duration'] = minutes * 60 + seconds
        return results