diff --git a/modules/imdb/browser.py b/modules/imdb/browser.py index fad7a0e0..d6b881e9 100644 --- a/modules/imdb/browser.py +++ b/modules/imdb/browser.py @@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable, NotLoaded from weboob.capabilities.cinema import Movie, Person from weboob.tools.json import json -from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage, FilmographyPage +from .pages import PersonPage, MovieCrewPage, BiographyPage, FilmographyPage from datetime import datetime @@ -36,7 +36,6 @@ class ImdbBrowser(BaseBrowser): ENCODING = 'utf-8' USER_AGENT = BaseBrowser.USER_AGENTS['wget'] PAGES = { - 'http://www.imdb.com/title/tt[0-9]*/*': MoviePage, 'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage, 'http://www.imdb.com/name/nm[0-9]*/*': PersonPage, 'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage, @@ -98,10 +97,13 @@ class ImdbBrowser(BaseBrowser): pitch = NotAvailable country = NotAvailable note = NotAvailable + short_description = NotAvailable other_titles = [] roles = {} title = unicode(jres['title'].strip()) + if jres.has_key('directors'): + short_description = ', '.join(jres['directors']) if jres.has_key('runtime'): dur_str = jres['runtime'][0].split(':') if len(dur_str) == 1: @@ -145,6 +147,7 @@ class ImdbBrowser(BaseBrowser): movie.country = country movie.note = note movie.roles = roles + movie.short_description= short_description return movie def get_person(self, id): @@ -158,9 +161,10 @@ class ImdbBrowser(BaseBrowser): return self.page.get_biography() def iter_movie_persons(self, movie_id, role): - self.location('http://www.imdb.com/title/%s' % movie_id) - assert self.is_on_page(MoviePage) - return self.page.iter_persons(movie_id, role) + self.location('http://www.imdb.com/title/%s/fullcredits'%movie_id) + assert self.is_on_page(MovieCrewPage) + for p in self.page.iter_persons(role): + yield p def iter_person_movies(self, person_id, role): self.location('http://www.imdb.com/name/%s/filmotype' % person_id) @@ -168,11 +172,13 @@ class ImdbBrowser(BaseBrowser): return self.page.iter_movies(role) def iter_person_movies_ids(self, person_id): - self.location('http://www.imdb.com/name/%s' % person_id) - assert self.is_on_page(PersonPage) - return self.page.iter_movies_ids(person_id) + self.location('http://www.imdb.com/name/%s/filmotype' % person_id) + assert self.is_on_page(FilmographyPage) + for movie in self.page.iter_movies_ids(): + yield movie def iter_movie_persons_ids(self, movie_id): - self.location('http://www.imdb.com/title/%s' % movie_id) - assert self.is_on_page(MoviePage) - return self.page.iter_persons_ids(movie_id) + self.location('http://www.imdb.com/title/%s/fullcredits'%movie_id) + assert self.is_on_page(MovieCrewPage) + for person in self.page.iter_persons_ids(): + yield person diff --git a/modules/imdb/pages.py b/modules/imdb/pages.py index d8970e87..ad9906da 100644 --- a/modules/imdb/pages.py +++ b/modules/imdb/pages.py @@ -25,23 +25,7 @@ from weboob.tools.browser import BasePage from datetime import datetime -__all__ = ['MoviePage','PersonPage','MovieCrewPage','BiographyPage','FilmographyPage'] - - -class MoviePage(BasePage): - ''' Page describing a movie, only used to go on the MovieCrewPage - ''' - def iter_persons(self, id, role=None): - self.browser.location('http://www.imdb.com/title/%s/fullcredits'%id) - assert self.browser.is_on_page(MovieCrewPage) - for p in self.browser.page.iter_persons(role): - yield p - - def iter_persons_ids(self,id): - self.browser.location('http://www.imdb.com/title/%s/fullcredits'%id) - assert self.browser.is_on_page(MovieCrewPage) - for p in self.browser.page.iter_persons_ids(): - yield p +__all__ = ['PersonPage','MovieCrewPage','BiographyPage','FilmographyPage'] class BiographyPage(BasePage): @@ -171,16 +155,17 @@ class PersonPage(BasePage): person.roles = roles return person - def iter_movies_ids(self,person_id): - for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'): - a = self.parser.select(movie_div,'b a',1) - id = a.attrib.get('href','').strip('/').split('/')[-1] - yield id - class FilmographyPage(BasePage): ''' Page of detailed filmography of a person, sorted by type of role This page is easier to parse than the main person page filmography ''' + def iter_movies_ids(self): + for role_div in self.parser.select(self.document.getroot(),'div.filmo'): + for a in self.parser.select(role_div,'ol > li > a'): + id = a.attrib.get('href','').strip('/').split('/')[-1] + if id.startswith('tt'): + yield id + def get_roles(self): roles = {} for role_div in self.parser.select(self.document.getroot(),'div.filmo'): @@ -205,6 +190,9 @@ class FilmographyPage(BasePage): id = a.attrib.get('href','').strip('/').split('/')[-1] if id.startswith('tt'): title = a.text - #movie = self.browser.get_movie(id) + role_detail = NotAvailable + if len(a.tail) > 0: + role_detail = unicode(' '.join(a.tail.replace('..','').split())) movie = Movie(id,title) + movie.short_description = role_detail yield movie diff --git a/weboob/applications/cineoob/cineoob.py b/weboob/applications/cineoob/cineoob.py index c89119df..b96de521 100644 --- a/weboob/applications/cineoob/cineoob.py +++ b/weboob/applications/cineoob/cineoob.py @@ -203,7 +203,8 @@ class Cineoob(ReplApplication): inter = list(set(lid1) & set(lid2)) for common in inter: movie = self.get_object(common, 'get_movie') - self.cached_format(movie) + if movie: + self.cached_format(movie) self.flush() def do_persons_in_common(self, line):