support repositories to manage backends (closes #747)

2012-01-03 12:10:21 +01:00 · 2012-01-03 12:10:21 +01:00 · 14a7a1d362
commit 14a7a1d362
parent ef16a5b726
410 changed files with 1079 additions and 297 deletions
--- a/modules/youjizz/pages/__init__.py
+++ b/modules/youjizz/pages/__init__.py
--- a/modules/youjizz/pages/index.py
+++ b/modules/youjizz/pages/index.py
@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Roger Philibert
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+import datetime
+import re
+
+from weboob.tools.browser import BasePage
+from weboob.tools.browser import BrokenPageError
+
+from ..video import YoujizzVideo
+
+
+__all__ = ['IndexPage']
+
+
+class IndexPage(BasePage):
+    """Page listing videos as thumbnails."""
+
+    def iter_videos(self):
+        """
+        Iterate over the videos listed on the page.
+
+        One YoujizzVideo is yielded per ``span#miniatura`` element, built
+        from the id found in its link, its title, its duration and the URL
+        of its thumbnail.
+
+        :raises BrokenPageError: if a displayed duration cannot be parsed
+        """
+        for span in self.parser.select(self.document.getroot(), 'span#miniatura'):
+            a = self.parser.select(span, 'a', 1)
+            url = a.attrib['href']
+            # Keep only the part between '/videos/' and '.html'.
+            _id = re.sub(r'/videos/(.+)\.html', r'\1', url)
+
+            thumbnail_url = span.find('.//img').attrib['src']
+
+            title_el = self.parser.select(span, 'span#title1', 1)
+            title = title_el.text.strip()
+
+            time_span = self.parser.select(span, 'span.thumbtime span', 1)
+            # ';' is normalised to ':' before the fields are split.
+            time_txt = time_span.text.strip().replace(';', ':')
+
+            yield YoujizzVideo(_id,
+                               title=title,
+                               duration=self._parse_duration(time_txt),
+                               thumbnail_url=thumbnail_url,
+                               )
+
+    @staticmethod
+    def _parse_duration(time_txt):
+        """
+        Convert a displayed duration into a datetime.timedelta.
+
+        Accepted forms are 'N/A' (reported as a zero duration), 'MM:SS' and
+        'H:MM:SS'.
+
+        :raises BrokenPageError: for any other text, including non-numeric
+                                 fields
+        """
+        if time_txt == 'N/A':
+            return datetime.timedelta()
+
+        try:
+            fields = [int(v) for v in time_txt.split(':')]
+        except ValueError:
+            # Non-numeric field: report it like any other unparsable value
+            # instead of leaking a bare ValueError to the caller.
+            fields = []
+
+        if len(fields) not in (2, 3):
+            raise BrokenPageError('Unable to parse the video duration: %s' % time_txt)
+
+        # Left-pad with zero hours so both accepted forms unpack the same way.
+        hours, minutes, seconds = [0] * (3 - len(fields)) + fields
+        return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
--- a/modules/youjizz/pages/video.py
+++ b/modules/youjizz/pages/video.py
@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Roger Philibert
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+import datetime
+import lxml.html
+import re
+
+from weboob.capabilities.base import NotAvailable
+from weboob.tools.browser import BasePage, BrokenPageError
+from weboob.tools.misc import to_unicode
+
+from ..video import YoujizzVideo
+
+
+__all__ = ['VideoPage']
+
+
+class VideoPage(BasePage):
+    """Page of a single video, from which its details are extracted."""
+
+    def get_video(self, video=None):
+        """
+        Fill a YoujizzVideo with the title, duration and file URL of the page.
+
+        :param video: object to fill; when None, a new YoujizzVideo is
+                      created from the 'id' entry of self.group_dict
+        :returns: the filled video
+        :raises BrokenPageError: if the runtime is missing or unparsable, or
+                                 if the page does not contain exactly one
+                                 video file URL
+        """
+        _id = to_unicode(self.group_dict['id'])
+        if video is None:
+            video = YoujizzVideo(_id)
+        title_el = self.parser.select(self.document.getroot(), 'title', 1)
+        video.title = to_unicode(title_el.text.strip())
+
+        # The remaining fields are searched with regexps in the serialized
+        # document rather than located with selectors.
+        data = lxml.html.tostring(self.document.getroot())
+        m = re.search(r'<strong>.*?Runtime.*?</strong> (.+?)<br.*>', data)
+        if m is None:
+            raise BrokenPageError('Unable to retrieve video duration')
+        video.duration = self._parse_runtime(m.group(1).strip())
+
+        video_file_urls = re.findall(r'"(http://[^",]+\.youjizz\.com[^",]+\.flv)[\?"]', data)
+        if not video_file_urls:
+            raise BrokenPageError('Video URL not found')
+        if len(video_file_urls) > 1:
+            # More than one candidate: refuse to guess which one is right.
+            raise BrokenPageError('Many video file URL found')
+        video.url = video_file_urls[0]
+
+        return video
+
+    @staticmethod
+    def _parse_runtime(txt):
+        """
+        Convert the runtime text of the page into a duration.
+
+        'Unknown' gives NotAvailable; 'MM:SS' and 'H:MM:SS' give a
+        datetime.timedelta.
+
+        :raises BrokenPageError: for any other text, including non-numeric
+                                 fields
+        """
+        if txt == 'Unknown':
+            return NotAvailable
+
+        try:
+            fields = [int(v) for v in to_unicode(txt).split(':')]
+        except ValueError:
+            # Non-numeric field: handled below as an unparsable runtime
+            # instead of leaking a bare ValueError to the caller.
+            fields = []
+
+        if len(fields) not in (2, 3):
+            raise BrokenPageError('Unable to parse the video duration: %s' % txt)
+
+        # Left-pad with zero hours so both accepted forms unpack the same way.
+        hours, minutes, seconds = [0] * (3 - len(fields)) + fields
+        return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
+