[imdb] simplification of iters and fill short description in all cases

This commit is contained in:
Julien Veyssier 2013-03-07 02:28:50 +01:00
commit 19b418e6d0
3 changed files with 31 additions and 36 deletions

View file

@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable, NotLoaded
from weboob.capabilities.cinema import Movie, Person from weboob.capabilities.cinema import Movie, Person
from weboob.tools.json import json from weboob.tools.json import json
from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage, FilmographyPage from .pages import PersonPage, MovieCrewPage, BiographyPage, FilmographyPage
from datetime import datetime from datetime import datetime
@ -36,7 +36,6 @@ class ImdbBrowser(BaseBrowser):
ENCODING = 'utf-8' ENCODING = 'utf-8'
USER_AGENT = BaseBrowser.USER_AGENTS['wget'] USER_AGENT = BaseBrowser.USER_AGENTS['wget']
PAGES = { PAGES = {
'http://www.imdb.com/title/tt[0-9]*/*': MoviePage,
'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage, 'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
'http://www.imdb.com/name/nm[0-9]*/*': PersonPage, 'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage, 'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
@ -98,10 +97,13 @@ class ImdbBrowser(BaseBrowser):
pitch = NotAvailable pitch = NotAvailable
country = NotAvailable country = NotAvailable
note = NotAvailable note = NotAvailable
short_description = NotAvailable
other_titles = [] other_titles = []
roles = {} roles = {}
title = unicode(jres['title'].strip()) title = unicode(jres['title'].strip())
if jres.has_key('directors'):
short_description = ', '.join(jres['directors'])
if jres.has_key('runtime'): if jres.has_key('runtime'):
dur_str = jres['runtime'][0].split(':') dur_str = jres['runtime'][0].split(':')
if len(dur_str) == 1: if len(dur_str) == 1:
@ -145,6 +147,7 @@ class ImdbBrowser(BaseBrowser):
movie.country = country movie.country = country
movie.note = note movie.note = note
movie.roles = roles movie.roles = roles
movie.short_description= short_description
return movie return movie
def get_person(self, id): def get_person(self, id):
@ -158,9 +161,10 @@ class ImdbBrowser(BaseBrowser):
return self.page.get_biography() return self.page.get_biography()
def iter_movie_persons(self, movie_id, role): def iter_movie_persons(self, movie_id, role):
self.location('http://www.imdb.com/title/%s' % movie_id) self.location('http://www.imdb.com/title/%s/fullcredits'%movie_id)
assert self.is_on_page(MoviePage) assert self.is_on_page(MovieCrewPage)
return self.page.iter_persons(movie_id, role) for p in self.page.iter_persons(role):
yield p
def iter_person_movies(self, person_id, role): def iter_person_movies(self, person_id, role):
self.location('http://www.imdb.com/name/%s/filmotype' % person_id) self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
@ -168,11 +172,13 @@ class ImdbBrowser(BaseBrowser):
return self.page.iter_movies(role) return self.page.iter_movies(role)
def iter_person_movies_ids(self, person_id): def iter_person_movies_ids(self, person_id):
self.location('http://www.imdb.com/name/%s' % person_id) self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
assert self.is_on_page(PersonPage) assert self.is_on_page(FilmographyPage)
return self.page.iter_movies_ids(person_id) for movie in self.page.iter_movies_ids():
yield movie
def iter_movie_persons_ids(self, movie_id): def iter_movie_persons_ids(self, movie_id):
self.location('http://www.imdb.com/title/%s' % movie_id) self.location('http://www.imdb.com/title/%s/fullcredits'%movie_id)
assert self.is_on_page(MoviePage) assert self.is_on_page(MovieCrewPage)
return self.page.iter_persons_ids(movie_id) for person in self.page.iter_persons_ids():
yield person

View file

@ -25,23 +25,7 @@ from weboob.tools.browser import BasePage
from datetime import datetime from datetime import datetime
__all__ = ['MoviePage','PersonPage','MovieCrewPage','BiographyPage','FilmographyPage'] __all__ = ['PersonPage','MovieCrewPage','BiographyPage','FilmographyPage']
class MoviePage(BasePage):
''' Page describing a movie, only used to go on the MovieCrewPage
'''
def iter_persons(self, id, role=None):
self.browser.location('http://www.imdb.com/title/%s/fullcredits'%id)
assert self.browser.is_on_page(MovieCrewPage)
for p in self.browser.page.iter_persons(role):
yield p
def iter_persons_ids(self,id):
self.browser.location('http://www.imdb.com/title/%s/fullcredits'%id)
assert self.browser.is_on_page(MovieCrewPage)
for p in self.browser.page.iter_persons_ids():
yield p
class BiographyPage(BasePage): class BiographyPage(BasePage):
@ -171,16 +155,17 @@ class PersonPage(BasePage):
person.roles = roles person.roles = roles
return person return person
def iter_movies_ids(self,person_id):
for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
a = self.parser.select(movie_div,'b a',1)
id = a.attrib.get('href','').strip('/').split('/')[-1]
yield id
class FilmographyPage(BasePage): class FilmographyPage(BasePage):
''' Page of detailed filmography of a person, sorted by type of role ''' Page of detailed filmography of a person, sorted by type of role
This page is easier to parse than the main person page filmography This page is easier to parse than the main person page filmography
''' '''
def iter_movies_ids(self):
for role_div in self.parser.select(self.document.getroot(),'div.filmo'):
for a in self.parser.select(role_div,'ol > li > a'):
id = a.attrib.get('href','').strip('/').split('/')[-1]
if id.startswith('tt'):
yield id
def get_roles(self): def get_roles(self):
roles = {} roles = {}
for role_div in self.parser.select(self.document.getroot(),'div.filmo'): for role_div in self.parser.select(self.document.getroot(),'div.filmo'):
@ -205,6 +190,9 @@ class FilmographyPage(BasePage):
id = a.attrib.get('href','').strip('/').split('/')[-1] id = a.attrib.get('href','').strip('/').split('/')[-1]
if id.startswith('tt'): if id.startswith('tt'):
title = a.text title = a.text
#movie = self.browser.get_movie(id) role_detail = NotAvailable
if len(a.tail) > 0:
role_detail = unicode(' '.join(a.tail.replace('..','').split()))
movie = Movie(id,title) movie = Movie(id,title)
movie.short_description = role_detail
yield movie yield movie

View file

@ -203,7 +203,8 @@ class Cineoob(ReplApplication):
inter = list(set(lid1) & set(lid2)) inter = list(set(lid1) & set(lid2))
for common in inter: for common in inter:
movie = self.get_object(common, 'get_movie') movie = self.get_object(common, 'get_movie')
self.cached_format(movie) if movie:
self.cached_format(movie)
self.flush() self.flush()
def do_persons_in_common(self, line): def do_persons_in_common(self, line):