[imdb] simplification of iters and fill short description in all cases

This commit is contained in:
Julien Veyssier 2013-03-07 02:28:50 +01:00
commit 19b418e6d0
3 changed files with 31 additions and 36 deletions

View file

@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable, NotLoaded
from weboob.capabilities.cinema import Movie, Person
from weboob.tools.json import json
from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage, FilmographyPage
from .pages import PersonPage, MovieCrewPage, BiographyPage, FilmographyPage
from datetime import datetime
@ -36,7 +36,6 @@ class ImdbBrowser(BaseBrowser):
ENCODING = 'utf-8'
USER_AGENT = BaseBrowser.USER_AGENTS['wget']
PAGES = {
'http://www.imdb.com/title/tt[0-9]*/*': MoviePage,
'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
@ -98,10 +97,13 @@ class ImdbBrowser(BaseBrowser):
pitch = NotAvailable
country = NotAvailable
note = NotAvailable
short_description = NotAvailable
other_titles = []
roles = {}
title = unicode(jres['title'].strip())
if jres.has_key('directors'):
short_description = ', '.join(jres['directors'])
if jres.has_key('runtime'):
dur_str = jres['runtime'][0].split(':')
if len(dur_str) == 1:
@ -145,6 +147,7 @@ class ImdbBrowser(BaseBrowser):
movie.country = country
movie.note = note
movie.roles = roles
movie.short_description= short_description
return movie
def get_person(self, id):
@ -158,9 +161,10 @@ class ImdbBrowser(BaseBrowser):
return self.page.get_biography()
# Yield what the current page's iter_persons(role) produces for movie_id.
# NOTE(review): this text looks like a diff rendered without +/- markers or
# indentation. The first location/assert/return triple (title page, MoviePage)
# appears to be the pre-commit body and the fullcredits/MovieCrewPage lines its
# replacement -- confirm against the real file before relying on either.
def iter_movie_persons(self, movie_id, role):
self.location('http://www.imdb.com/title/%s' % movie_id)
assert self.is_on_page(MoviePage)
return self.page.iter_persons(movie_id, role)
# Open the full-credits URL directly and check we landed on a MovieCrewPage.
self.location('http://www.imdb.com/title/%s/fullcredits'%movie_id)
assert self.is_on_page(MovieCrewPage)
# Re-yield item by item, which makes this method itself a generator.
for p in self.page.iter_persons(role):
yield p
def iter_person_movies(self, person_id, role):
self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
@ -168,11 +172,13 @@ class ImdbBrowser(BaseBrowser):
return self.page.iter_movies(role)
# Yield what the current page's iter_movies_ids() produces for person_id.
# NOTE(review): this text looks like a diff rendered without +/- markers or
# indentation. The first location/assert/return triple (person page,
# PersonPage) appears to be the pre-commit body and the filmotype/
# FilmographyPage lines its replacement -- confirm against the real file.
def iter_person_movies_ids(self, person_id):
self.location('http://www.imdb.com/name/%s' % person_id)
assert self.is_on_page(PersonPage)
return self.page.iter_movies_ids(person_id)
# Open the per-role filmography URL and check we landed on a FilmographyPage.
self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
assert self.is_on_page(FilmographyPage)
# Re-yield item by item, which makes this method itself a generator.
for movie in self.page.iter_movies_ids():
yield movie
# Yield what the current page's iter_persons_ids() produces for movie_id.
# NOTE(review): this text looks like a diff rendered without +/- markers or
# indentation. The first location/assert/return triple (title page, MoviePage)
# appears to be the pre-commit body and the fullcredits/MovieCrewPage lines its
# replacement -- confirm against the real file before relying on either.
def iter_movie_persons_ids(self, movie_id):
self.location('http://www.imdb.com/title/%s' % movie_id)
assert self.is_on_page(MoviePage)
return self.page.iter_persons_ids(movie_id)
# Open the full-credits URL directly and check we landed on a MovieCrewPage.
self.location('http://www.imdb.com/title/%s/fullcredits'%movie_id)
assert self.is_on_page(MovieCrewPage)
# Re-yield item by item, which makes this method itself a generator.
for person in self.page.iter_persons_ids():
yield person

View file

@ -25,23 +25,7 @@ from weboob.tools.browser import BasePage
from datetime import datetime
__all__ = ['MoviePage','PersonPage','MovieCrewPage','BiographyPage','FilmographyPage']
class MoviePage(BasePage):
    ''' Page describing a movie, only used to go on the MovieCrewPage
    '''

    def _open_crew_page(self, id):
        ''' Point the browser at the full-credits URL of movie `id` and
            return the page it landed on.

            Asserts that the browser ended up on a MovieCrewPage.
        '''
        self.browser.location('http://www.imdb.com/title/%s/fullcredits' % id)
        assert self.browser.is_on_page(MovieCrewPage)
        return self.browser.page

    def iter_persons(self, id, role=None):
        ''' Yield every item of the crew page's iter_persons(role) for
            movie `id`.

            This is a generator: the navigation only happens once the
            caller starts iterating.
        '''
        for p in self._open_crew_page(id).iter_persons(role):
            yield p

    def iter_persons_ids(self, id):
        ''' Yield every item of the crew page's iter_persons_ids() for
            movie `id`.

            This is a generator: the navigation only happens once the
            caller starts iterating.
        '''
        for p in self._open_crew_page(id).iter_persons_ids():
            yield p
__all__ = ['PersonPage','MovieCrewPage','BiographyPage','FilmographyPage']
class BiographyPage(BasePage):
@ -171,16 +155,17 @@ class PersonPage(BasePage):
person.roles = roles
return person
def iter_movies_ids(self, person_id):
    ''' Yield one id per `div[class~=filmo-row]` element of the document.

        Each id is the last path component of the href carried by the
        row's `b a` link; a link without href yields an empty string.
        `person_id` is accepted for the caller's convenience but is not
        used by this method.
    '''
    root = self.document.getroot()
    for row in self.parser.select(root, 'div[class~=filmo-row]'):
        # The third argument asks the parser for exactly one match.
        link = self.parser.select(row, 'b a', 1)
        # '/title/tt0123456/' -> 'tt0123456'
        movie_id = link.attrib.get('href', '').strip('/').split('/')[-1]
        yield movie_id
class FilmographyPage(BasePage):
''' Page of detailed filmography of a person, sorted by type of role
This page is easier to parse than the main person page filmography
'''
def iter_movies_ids(self):
    ''' Yield the id of every movie link found in the `div.filmo`
        sections of the document.

        An id is the last path component of a link's href. Only ids
        starting with 'tt' are yielded; any other link is skipped.
        Ids are yielded once per link, so the same id can come out
        several times if it is linked from several sections.
    '''
    root = self.document.getroot()
    for role_div in self.parser.select(root, 'div.filmo'):
        for link in self.parser.select(role_div, 'ol > li > a'):
            # '/title/tt0123456/' -> 'tt0123456'
            movie_id = link.attrib.get('href', '').strip('/').split('/')[-1]
            if movie_id.startswith('tt'):
                yield movie_id
def get_roles(self):
roles = {}
for role_div in self.parser.select(self.document.getroot(),'div.filmo'):
@ -205,6 +190,9 @@ class FilmographyPage(BasePage):
id = a.attrib.get('href','').strip('/').split('/')[-1]
if id.startswith('tt'):
title = a.text
#movie = self.browser.get_movie(id)
role_detail = NotAvailable
if len(a.tail) > 0:
role_detail = unicode(' '.join(a.tail.replace('..','').split()))
movie = Movie(id,title)
movie.short_description = role_detail
yield movie

View file

@ -203,7 +203,8 @@ class Cineoob(ReplApplication):
inter = list(set(lid1) & set(lid2))
for common in inter:
movie = self.get_object(common, 'get_movie')
self.cached_format(movie)
if movie:
self.cached_format(movie)
self.flush()
def do_persons_in_common(self, line):