From 27c36d412beb56bb1114e77a4b1c9eff29e51d8b Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Tue, 5 Mar 2013 16:24:10 +0100 Subject: [PATCH] [cineoob] new command : biography --- modules/imdb/backend.py | 3 ++ modules/imdb/browser.py | 10 ++++-- modules/imdb/pages.py | 49 +++++++++++++++++++------- weboob/applications/cineoob/cineoob.py | 20 +++++++++-- weboob/capabilities/cinema.py | 2 +- 5 files changed, 67 insertions(+), 17 deletions(-) diff --git a/modules/imdb/backend.py b/modules/imdb/backend.py index d4b40b6d..7219585d 100644 --- a/modules/imdb/backend.py +++ b/modules/imdb/backend.py @@ -62,3 +62,6 @@ class ImdbBackend(BaseBackend, ICapCinema): def iter_movie_persons_ids(self, id): return self.browser.iter_movie_persons_ids(id) + + def get_person_biography(self,id): + return self.browser.get_person_biography(id) diff --git a/modules/imdb/browser.py b/modules/imdb/browser.py index 076d80d5..bd8bc08e 100644 --- a/modules/imdb/browser.py +++ b/modules/imdb/browser.py @@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable from weboob.capabilities.cinema import Movie from weboob.tools.json import json -from .pages import MoviePage, PersonPage, MovieCrewPage +from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage from datetime import datetime @@ -38,7 +38,8 @@ class ImdbBrowser(BaseBrowser): PAGES = { 'http://www.imdb.com/title/tt[0-9]*/*': MoviePage, 'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage, - 'http://www.imdb.com/name/nm.*': PersonPage, + 'http://www.imdb.com/name/nm[0-9]*/*': PersonPage, + 'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage, } def iter_movies(self, pattern): @@ -121,6 +122,11 @@ class ImdbBrowser(BaseBrowser): assert self.is_on_page(PersonPage) return self.page.get_person(id) + def get_person_biography(self, id): + self.location('http://www.imdb.com/name/%s/bio' % id) + assert self.is_on_page(BiographyPage) + return self.page.get_biography() + def iter_movie_persons(self, 
movie_id): self.location('http://www.imdb.com/title/%s' % movie_id) assert self.is_on_page(MoviePage) diff --git a/modules/imdb/pages.py b/modules/imdb/pages.py index 4a3daa17..72ea61b4 100644 --- a/modules/imdb/pages.py +++ b/modules/imdb/pages.py @@ -44,6 +44,17 @@ class MoviePage(BasePage): yield p +class BiographyPage(BasePage): + ''' Page containing biography of a person + ''' + def get_biography(self): + bio = '' + tn = self.parser.select(self.document.getroot(),'div#tn15content',1) + for p in self.parser.select(tn,'p'): + bio += '\n\n%s'%p.text_content().strip() + return bio + + class MovieCrewPage(BasePage): ''' Page listing all the persons related to a movie ''' @@ -72,7 +83,7 @@ class PersonPage(BasePage): ''' def get_person(self,id): name = NotAvailable - biography = NotAvailable + short_biography = NotAvailable birth_place = NotAvailable birth_date = NotAvailable death_date = NotAvailable @@ -83,7 +94,7 @@ class PersonPage(BasePage): td_overview = self.parser.select(self.document.getroot(),'td#overview-top',1) descs = self.parser.select(td_overview,'span[itemprop=description]') if len(descs) > 0: - biography = descs[0].text + short_biography = descs[0].text rname_block = self.parser.select(td_overview,'div.txt-block h4.inline') if len(rname_block) > 0 and "born" in rname_block[0].text.lower(): links = self.parser.select(rname_block[0].getparent(),'a') @@ -114,17 +125,31 @@ class PersonPage(BasePage): dtime.append('1') dtime.append('1') death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2])) - # TODO IMPROVE THIS ----------- - #for role in ['Actor','Composer']: - # show_span = self.parser.select(self.document.getroot(),'span[id=show-%s]' % role) - # if len(show_span) > 0: - # roles[role] = [] - # filmo_block = show_span[0].getparent() - # filmo_block = filmo_block.getnext() - roles['actor'] = [] + # TODO IMPROVE THIS, apparently there's an error in parsing, quite hard to handle ----------- + + #filmo_block = 
self.parser.select(self.document.getroot(),'div#filmography',1) + #role_list = [] + #for span in self.parser.select(self.document.getroot(),'span.show-link'): + # role_list.append(span.attrib.get('id','').replace('show-','')) + #role_index = -1 + #current_parent = None + ##for sp in self.parser.select(filmo_block[0],'span.show-link'): + #for divmovie in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'): + # divhead = divmovie.getparent() + # print "-- %s"%(self.document.getpath(divhead)) + # print divmovie.attrib.get('class','') + # if current_parent != self.document.getpath(divhead): + # role_index += 1 + # current_parent = self.document.getpath(divhead) + # role = role_list[role_index] + # a = self.parser.select(divmovie,'b a',1) + # roles[role].append(a.text) + #print roles + + roles['any activity'] = [] for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'): a = self.parser.select(movie_div,'b a',1) - roles['actor'].append(a.text) + roles['any activity'].append(a.text) person = Person(id,name) person.real_name = real_name @@ -133,7 +158,7 @@ class PersonPage(BasePage): person.birth_place = birth_place person.gender = gender person.nationality = nationality - person.biography = biography + person.short_biography = short_biography person.roles = roles return person diff --git a/weboob/applications/cineoob/cineoob.py b/weboob/applications/cineoob/cineoob.py index 91214c2b..d16325cf 100644 --- a/weboob/applications/cineoob/cineoob.py +++ b/weboob/applications/cineoob/cineoob.py @@ -98,7 +98,7 @@ def num_years(begin, end=None): return num_years class PersonInfoFormatter(IFormatter): - MANDATORY_FIELDS = ('id', 'name', 'real_name', 'birth_date', 'birth_place', 'gender', 'nationality', 'biography', 'roles') + MANDATORY_FIELDS = ('id', 'name', 'real_name', 'birth_date', 'birth_place', 'gender', 'nationality', 'short_biography', 'roles') def format_obj(self, obj, alias): result = u'%s%s%s\n' % (self.BOLD, obj.name, 
self.NC) @@ -124,7 +124,7 @@ class PersonInfoFormatter(IFormatter): for movie in lmovies: result += ' * %s\n' % movie result += '\n%sBiography%s\n' % (self.BOLD, self.NC) - result += '%s'%obj.biography + result += '%s'%obj.short_biography return result @@ -335,3 +335,19 @@ class Cineoob(ReplApplication): for backend, movie in self.do('iter_person_movies', person.id): self.cached_format(movie) self.flush() + + def do_biography(self, person_id): + """ + biography person_ID + + Show the complete biography of a person. + """ + person = self.get_object(person_id, 'get_person') + if not person: + print >>sys.stderr, 'Person not found: %s' % person_id + return 3 + + self.change_path([u'biography']) + for backend, bio in self.do('get_person_biography', person.id): + print bio + self.flush() diff --git a/weboob/capabilities/cinema.py b/weboob/capabilities/cinema.py index 65b0c940..caee2ba8 100644 --- a/weboob/capabilities/cinema.py +++ b/weboob/capabilities/cinema.py @@ -53,7 +53,7 @@ class Person(CapBaseObject): birth_place = StringField('City and country of birth of a person') gender = StringField('Gender of a person') nationality = StringField('Nationality of a person') - biography = StringField('Short biography of a person') + short_biography = StringField('Short biography of a person') roles = Field('Lists of movies related to the person indexed by roles',dict) def __init__(self, id, name):