From b91a1cd481ef68ffff17f6d502f5f9577b59309b Mon Sep 17 00:00:00 2001
From: Julien Veyssier
Date: Wed, 6 Mar 2013 00:19:00 +0100
Subject: [PATCH] [imdb] filmography page muuuuch more readable than person page

---
 modules/imdb/browser.py                | 18 ++++---
 modules/imdb/pages.py                  | 75 ++++++++++++++------------
 weboob/applications/cineoob/cineoob.py |  4 +-
 3 files changed, 55 insertions(+), 42 deletions(-)

diff --git a/modules/imdb/browser.py b/modules/imdb/browser.py
index bd8bc08e..c7e22f10 100644
--- a/modules/imdb/browser.py
+++ b/modules/imdb/browser.py
@@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable
 from weboob.capabilities.cinema import Movie
 from weboob.tools.json import json
 
-from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage
+from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage, FilmographyPage
 
 from datetime import datetime
 
@@ -40,6 +40,7 @@ class ImdbBrowser(BaseBrowser):
         'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
         'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
         'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
+        'http://www.imdb.com/name/nm[0-9]*/filmo.*': FilmographyPage,
         }
 
     def iter_movies(self, pattern):
@@ -48,7 +49,9 @@ class ImdbBrowser(BaseBrowser):
         for cat in ['title_popular','title_exact','title_approx']:
             if jres.has_key(cat):
                 for m in jres[cat]:
-                    yield self.get_movie(m['id'])
+                    movie = self.get_movie(m['id'])
+                    if movie != None:
+                        yield movie
 
     def iter_persons(self, pattern):
         res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q=%s' % pattern.encode('utf-8'))
@@ -60,7 +63,10 @@ class ImdbBrowser(BaseBrowser):
 
     def get_movie(self, id):
         res = self.readurl('http://imdbapi.org/?id=%s&type=json&plot=simple&episode=1&lang=en-US&aka=full&release=simple&business=0&tech=0' % id )
-        jres = json.loads(res)
+        if res != None:
+            jres = json.loads(res)
+        else:
+            return None
 
         title = NotAvailable
         duration = NotAvailable
@@ -133,9 +139,9 @@ class ImdbBrowser(BaseBrowser):
         return self.page.iter_persons(movie_id)
 
     def iter_person_movies(self, person_id):
-        self.location('http://www.imdb.com/name/%s' % person_id)
-        assert self.is_on_page(PersonPage)
-        return self.page.iter_movies(person_id)
+        self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
+        assert self.is_on_page(FilmographyPage)
+        return self.page.iter_movies()
 
     def iter_person_movies_ids(self, person_id):
         self.location('http://www.imdb.com/name/%s' % person_id)
diff --git a/modules/imdb/pages.py b/modules/imdb/pages.py
index 72ea61b4..813eec2f 100644
--- a/modules/imdb/pages.py
+++ b/modules/imdb/pages.py
@@ -25,7 +25,7 @@ from weboob.tools.browser import BasePage
 from datetime import datetime
 
 
-__all__ = ['MoviePage','PersonPage','MovieCrewPage']
+__all__ = ['MoviePage','PersonPage','MovieCrewPage','BiographyPage','FilmographyPage']
 
 
 class MoviePage(BasePage):
@@ -50,8 +50,12 @@ class BiographyPage(BasePage):
     def get_biography(self):
         bio = ''
         tn = self.parser.select(self.document.getroot(),'div#tn15content',1)
-        for p in self.parser.select(tn,'p'):
-            bio += '\n\n%s'%p.text_content().strip()
+        #for p in self.parser.select(tn,'p'):
+        #    bio += '\n\n%s'%p.text_content().strip()
+        # get children, append if label or tag = a,p,h...
+        bio = tn.text_content().strip()
+        if bio == "":
+            bio = NotAvailable
         return bio
 
 
@@ -125,31 +129,10 @@ class PersonPage(BasePage):
                 dtime.append('1')
                 dtime.append('1')
             death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2]))
-        # TODO IMPROVE THIS, apparently there's an error in parsing, quite hard to handle -----------
-
-        #filmo_block = self.parser.select(self.document.getroot(),'div#filmography',1)
-        #role_list = []
-        #for span in self.parser.select(self.document.getroot(),'span.show-link'):
-        #    role_list.append(span.attrib.get('id','').replace('show-',''))
-        #role_index = -1
-        #current_parent = None
-        ##for sp in self.parser.select(filmo_block[0],'span.show-link'):
-        #for divmovie in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
-        #    divhead = divmovie.getparent()
-        #    print "-- %s"%(self.document.getpath(divhead))
-        #    print divmovie.attrib.get('class','')
-        #    if current_parent != self.document.getpath(divhead):
-        #        role_index += 1
-        #        current_parent = self.document.getpath(divhead)
-        #    role = role_list[role_index]
-        #    a = self.parser.select(divmovie,'b a',1)
-        #    roles[role].append(a.text)
-        #print roles
-
-        roles['any activity'] = []
-        for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
-            a = self.parser.select(movie_div,'b a',1)
-            roles['any activity'].append(a.text)
+        # go to the filmography page
+        self.browser.location('http://www.imdb.com/name/%s/filmotype'%id)
+        assert self.browser.is_on_page(FilmographyPage)
+        roles = self.browser.page.get_roles()
 
         person = Person(id,name)
         person.real_name = real_name
@@ -162,14 +145,38 @@ class PersonPage(BasePage):
         person.roles = roles
         return person
 
-    def iter_movies(self,person_id):
-        for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
-            a = self.parser.select(movie_div,'b a',1)
-            id = a.attrib.get('href','').strip('/').split('/')[-1]
-            yield self.browser.get_movie(id)
-
     def iter_movies_ids(self,person_id):
         for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
             a = self.parser.select(movie_div,'b a',1)
             id = a.attrib.get('href','').strip('/').split('/')[-1]
             yield id
+
+class FilmographyPage(BasePage):
+    def get_roles(self):
+        roles = {}
+        for role_div in self.parser.select(self.document.getroot(),'div.filmo'):
+            role = self.parser.select(role_div,'h5 a',1).text.replace(':','')
+            roles[role] = []
+            for a in self.parser.select(role_div,'ol > li > a'):
+                id = a.attrib.get('href','').strip('/').split('/')[-1]
+                if id.startswith('tt'):
+                    #li = a.getparent()
+                    #between_p = li.text_content().split(')')[0].split('(')[1]
+                    if '(' in a.tail and ')' in a.tail:
+                        between_p = a.tail.split(')')[0].split('(')[1]
+                    else:
+                        between_p = '????'
+                    roles[role].append('(%s) %s'%(between_p,a.text))
+        return roles
+
+    def iter_movies(self):
+        for role_div in self.parser.select(self.document.getroot(),'div.filmo'):
+            role = self.parser.select(role_div,'h5 a',1).text.replace(':','')
+            if role != 'In Development':
+                for a in self.parser.select(role_div,'ol > li > a'):
+                    id = a.attrib.get('href','').strip('/').split('/')[-1]
+                    if id.startswith('tt'):
+                        movie = self.browser.get_movie(id)
+                        if movie != None:
+                            yield movie
+
diff --git a/weboob/applications/cineoob/cineoob.py b/weboob/applications/cineoob/cineoob.py
index d16325cf..72daad1a 100644
--- a/weboob/applications/cineoob/cineoob.py
+++ b/weboob/applications/cineoob/cineoob.py
@@ -347,7 +347,7 @@ class Cineoob(ReplApplication):
             print >>sys.stderr, 'Person not found: %s' % id
             return 3
 
-        self.change_path([u'biography'])
         for backend, bio in self.do('get_person_biography', person.id):
             print bio
-        self.flush()
+            if bio != NotAvailable:
+                self.flush()