[ina] fix : site changed

This commit is contained in:
Bezleputh 2014-02-24 20:11:45 +01:00
commit 91f11bc948
5 changed files with 52 additions and 104 deletions

View file

@ -18,8 +18,6 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.video import ICapVideo from weboob.capabilities.video import ICapVideo
from weboob.tools.backend import BaseBackend from weboob.tools.backend import BaseBackend

View file

@ -21,7 +21,7 @@
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from weboob.tools.browser.decorators import id2url from weboob.tools.browser.decorators import id2url
from .pages.video import VideoPage, BoutiqueVideoPage from .pages.video import VideoPage
from .pages.search import SearchPage from .pages.search import SearchPage
from .video import InaVideo from .video import InaVideo
@ -31,14 +31,14 @@ __all__ = ['InaBrowser']
class InaBrowser(BaseBrowser): class InaBrowser(BaseBrowser):
DOMAIN = 'ina.fr' DOMAIN = 'ina.fr'
PAGES = {'http://boutique\.ina\.fr/(video|audio)/.+\.html': BoutiqueVideoPage, PAGES = {'http://player.ina.fr/notices/.+\.mrss': (VideoPage, 'xml'),
'http://www\.ina\.fr/.+\.html': VideoPage,
'http://boutique\.ina\.fr/recherche/.+': SearchPage, 'http://boutique\.ina\.fr/recherche/.+': SearchPage,
} }
@id2url(InaVideo.id2url) @id2url(InaVideo.id2url)
def get_video(self, url, video=None): def get_video(self, url, video=None):
self.location(url) self.location(url)
#assert self.is_on_page(VideoPage)
return self.page.get_video(video) return self.page.get_video(video)
def search_videos(self, pattern): def search_videos(self, pattern):

View file

@ -31,7 +31,7 @@ __all__ = ['SearchPage']
class SearchPage(BasePage): class SearchPage(BasePage):
URL_REGEXP = re.compile(r'/video/(.+)\.html') URL_REGEXP = re.compile(r'/(.+)/(.+)\.jpeg')
def iter_videos(self): def iter_videos(self):
try: try:
@ -40,12 +40,12 @@ class SearchPage(BasePage):
# It means there are no results. # It means there are no results.
return return
for li in ul.findall('li'): for li in ul.findall('li'):
id = re.sub(self.URL_REGEXP, r'\1', li.find('a').attrib['href']) url = li.find('a').find('img').attrib['src']
video = InaVideo('boutique.%s' % id) id = re.sub(self.URL_REGEXP, r'\2', url)
video = InaVideo(id)
url = u'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src'] video.thumbnail = BaseImage(u'http://boutique.ina.fr%s' % url)
video.thumbnail = BaseImage(url)
video.thumbnail.url = video.thumbnail.id video.thumbnail.url = video.thumbnail.id
video.title = unicode(self.parser.select(li, 'p.titre', 1).text) video.title = unicode(self.parser.select(li, 'p.titre', 1).text)

View file

@ -18,83 +18,65 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime from datetime import datetime
import re import re
from urlparse import parse_qs
from weboob.capabilities import NotAvailable from weboob.capabilities import NotAvailable
from weboob.tools.browser import BasePage, BrokenPageError from weboob.capabilities.image import BaseImage
from weboob.tools.browser import BasePage
from ..video import InaVideo from ..video import InaVideo
__all__ = ['VideoPage']
__all__ = ['VideoPage', 'BoutiqueVideoPage']
class BaseVideoPage(BasePage): class VideoPage(BasePage):
def get_video(self, video): URL_REGEXP = re.compile('http://player.ina.fr/notices/(.+)\.mrss')
date, duration = self.get_date_and_duration()
if not video:
video = InaVideo(self.get_id())
video.title = self.get_title()
video.url = self.get_url()
video.date = date
video.duration = duration
video.description = self.get_description()
video.set_empty_fields(NotAvailable)
return video
def get_id(self): def get_id(self):
m = self.URL_REGEXP.match(self.url) m = self.URL_REGEXP.match(self.url)
if m: if m:
return self.create_id(m.group(1)) return m.group(1)
self.logger.warning('Unable to parse ID') self.logger.warning('Unable to parse ID')
return 0 return 0
def get_url(self): def get_video(self, video):
qs = parse_qs(self.document.getroot().cssselect('param[name="flashvars"]')[0].attrib['value']) if not video:
s = self.browser.readurl('http://www.ina.fr/player/infovideo/id_notice/%s/module_request/%s' % (qs['id_notice'][0], qs['module'][0])) video = InaVideo(self.get_id())
s = s[s.find('<Media>')+7:s.find('</Media>')]
return u'%s/id_chaine/%s/module_request/%s/pkey/%s' % \
(s, qs['id_chaine'][0], qs['module'][0], qs['pkey'][0])
def parse_date_and_duration(self, text): video.title = u'%s' % self.parser.select(self.document.getroot(),
duration_regexp = re.compile('(.* - )?(.+) - ((.+)h)?((.+)min)?(.+)s') '//rss/channel/item/title',
m = duration_regexp.match(text) 1,
if m: method='xpath').text
day, month, year = [abs(int(s)) for s in m.group(2).split('/')]
date = datetime.datetime(year, month, day)
duration = datetime.timedelta(hours=int(m.group(4) if m.group(4) is not None else 0),
minutes=int(m.group(6) if m.group(6) is not None else 0),
seconds=int(m.group(7)))
return date, duration
else:
raise BrokenPageError('Unable to parse date and duration')
def create_id(self, id): _image = u'%s' % self.parser.select(self.document.getroot(),
raise NotImplementedError() '//rss/channel/item/media:content/media:thumbnail',
1,
method='xpath',
namespaces={'media': 'http://search.yahoo.com/mrss/'}).attrib['url']
video.thumbnail = BaseImage(_image)
video.thumbnail.url = video.thumbnail.id
def get_date_and_duration(self): video.url = u'%s' % self.parser.select(self.document.getroot(),
raise NotImplementedError() '//rss/channel/item/media:content',
1,
method='xpath',
namespaces={'media': 'http://search.yahoo.com/mrss/'}).attrib['url']
def get_title(self): _date = self.parser.select(self.document.getroot(),
raise NotImplementedError() '//rss/channel/item/pubDate',
1,
def get_description(self): method='xpath').text
raise NotImplementedError() video.date = datetime.strptime(_date[:-6], '%a, %d %b %Y %H:%M:%S')
class VideoPage(BaseVideoPage): video.description = u'%s' % self.parser.select(self.document.getroot(),
URL_REGEXP = re.compile('http://www.ina.fr/(.+)\.html') '//rss/channel/item/description',
1,
method='xpath').text
def create_id(self, id): video.set_empty_fields(NotAvailable)
return u'www.%s' % id return video
def get_date_and_duration(self):
qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0].find('div').findall('div')[1]
return self.parse_date_and_duration(qr.find('h2').tail.strip())
def get_title(self): def get_title(self):
qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0] qr = self.parser.select(self.document.getroot(), 'div.container-global-qr')[0]
@ -104,29 +86,3 @@ class VideoPage(BaseVideoPage):
desc = self.parser.select(self.document.getroot(), 'div.container-global-qr')[1].find('div').find('p') desc = self.parser.select(self.document.getroot(), 'div.container-global-qr')[1].find('div').find('p')
if desc: if desc:
return unicode(desc.text.strip()) return unicode(desc.text.strip())
class BoutiqueVideoPage(BaseVideoPage):
URL_REGEXP = re.compile('http://boutique.ina.fr/(audio|video)/(.+).html')
def create_id(self, id):
return u'boutique.%s' % id
def get_description(self):
el = self.document.getroot().cssselect('div.bloc-produit-haut div.contenu p')[0]
if el is not None:
return unicode(el.text.strip())
def get_date_and_duration(self):
el = self.document.getroot().cssselect('div.bloc-produit-haut p.date')[0]
if el is not None:
return self.parse_date_and_duration(el.text.strip())
else:
raise BrokenPageError('Unable to find date and duration element')
def get_title(self):
el = self.document.getroot().cssselect('div.bloc-produit-haut h1')[0]
if el is not None:
return unicode(el.text.strip())
else:
return None

View file

@ -27,10 +27,4 @@ __all__ = ['InaVideo']
class InaVideo(BaseVideo): class InaVideo(BaseVideo):
@classmethod @classmethod
def id2url(cls, _id): def id2url(cls, _id):
if not '.' in _id: return "http://player.ina.fr/notices/%s.mrss" % _id
return None
site, _id = _id.split('.', 1)
if site == 'boutique':
return 'http://boutique.ina.fr/video/%s.html' % _id
if site == 'www':
return 'http://www.ina.fr/%s.html' % _id