[cineoob] new command : biography

This commit is contained in:
Julien Veyssier 2013-03-05 16:24:10 +01:00
commit 27c36d412b
5 changed files with 67 additions and 17 deletions

View file

@ -62,3 +62,6 @@ class ImdbBackend(BaseBackend, ICapCinema):
def iter_movie_persons_ids(self, id): def iter_movie_persons_ids(self, id):
return self.browser.iter_movie_persons_ids(id) return self.browser.iter_movie_persons_ids(id)
def get_person_biography(self, id):
    """Return the biography of the person ``id``.

    Pure delegation: the lookup is forwarded to this backend's browser
    and its result is handed back unchanged.
    """
    browser = self.browser
    return browser.get_person_biography(id)

View file

@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable
from weboob.capabilities.cinema import Movie from weboob.capabilities.cinema import Movie
from weboob.tools.json import json from weboob.tools.json import json
from .pages import MoviePage, PersonPage, MovieCrewPage from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage
from datetime import datetime from datetime import datetime
@ -38,7 +38,8 @@ class ImdbBrowser(BaseBrowser):
PAGES = { PAGES = {
'http://www.imdb.com/title/tt[0-9]*/*': MoviePage, 'http://www.imdb.com/title/tt[0-9]*/*': MoviePage,
'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage, 'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
'http://www.imdb.com/name/nm.*': PersonPage, 'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
} }
def iter_movies(self, pattern): def iter_movies(self, pattern):
@ -121,6 +122,11 @@ class ImdbBrowser(BaseBrowser):
assert self.is_on_page(PersonPage) assert self.is_on_page(PersonPage)
return self.page.get_person(id) return self.page.get_person(id)
def get_person_biography(self, id):
    """Open the IMDb biography page of person ``id`` and return its text.

    Navigates to ``/name/<id>/bio``, checks that the browser landed on a
    BiographyPage, then returns whatever that page's ``get_biography()``
    produces.
    """
    bio_url = 'http://www.imdb.com/name/%s/bio' % id
    self.location(bio_url)
    # Same page-type guard as the other browser methods use.
    assert self.is_on_page(BiographyPage)
    current_page = self.page
    return current_page.get_biography()
def iter_movie_persons(self, movie_id): def iter_movie_persons(self, movie_id):
self.location('http://www.imdb.com/title/%s' % movie_id) self.location('http://www.imdb.com/title/%s' % movie_id)
assert self.is_on_page(MoviePage) assert self.is_on_page(MoviePage)

View file

@ -44,6 +44,17 @@ class MoviePage(BasePage):
yield p yield p
class BiographyPage(BasePage):
    ''' Page holding the full biography text of a person
    '''
    def get_biography(self):
        ''' Return the text of every <p> found under div#tn15content.

        Each paragraph is stripped and prefixed with a blank line
        ("\\n\\n"); an empty string is returned when there is no paragraph.
        '''
        root = self.document.getroot()
        # The trailing 1 is passed through to the parser's select() as-is.
        content = self.parser.select(root, 'div#tn15content', 1)
        paragraphs = self.parser.select(content, 'p')
        return ''.join('\n\n%s' % par.text_content().strip()
                       for par in paragraphs)
class MovieCrewPage(BasePage): class MovieCrewPage(BasePage):
''' Page listing all the persons related to a movie ''' Page listing all the persons related to a movie
''' '''
@ -72,7 +83,7 @@ class PersonPage(BasePage):
''' '''
def get_person(self,id): def get_person(self,id):
name = NotAvailable name = NotAvailable
biography = NotAvailable short_biography = NotAvailable
birth_place = NotAvailable birth_place = NotAvailable
birth_date = NotAvailable birth_date = NotAvailable
death_date = NotAvailable death_date = NotAvailable
@ -83,7 +94,7 @@ class PersonPage(BasePage):
td_overview = self.parser.select(self.document.getroot(),'td#overview-top',1) td_overview = self.parser.select(self.document.getroot(),'td#overview-top',1)
descs = self.parser.select(td_overview,'span[itemprop=description]') descs = self.parser.select(td_overview,'span[itemprop=description]')
if len(descs) > 0: if len(descs) > 0:
biography = descs[0].text short_biography = descs[0].text
rname_block = self.parser.select(td_overview,'div.txt-block h4.inline') rname_block = self.parser.select(td_overview,'div.txt-block h4.inline')
if len(rname_block) > 0 and "born" in rname_block[0].text.lower(): if len(rname_block) > 0 and "born" in rname_block[0].text.lower():
links = self.parser.select(rname_block[0].getparent(),'a') links = self.parser.select(rname_block[0].getparent(),'a')
@ -114,17 +125,31 @@ class PersonPage(BasePage):
dtime.append('1') dtime.append('1')
dtime.append('1') dtime.append('1')
death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2])) death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2]))
# TODO IMPROVE THIS ----------- # TODO IMPROVE THIS, apparently there's an error in parsing, quite hard to handle -----------
#for role in ['Actor','Composer']:
# show_span = self.parser.select(self.document.getroot(),'span[id=show-%s]' % role) #filmo_block = self.parser.select(self.document.getroot(),'div#filmography',1)
# if len(show_span) > 0: #role_list = []
# roles[role] = [] #for span in self.parser.select(self.document.getroot(),'span.show-link'):
# filmo_block = show_span[0].getparent() # role_list.append(span.attrib.get('id','').replace('show-',''))
# filmo_block = filmo_block.getnext() #role_index = -1
roles['actor'] = [] #current_parent = None
##for sp in self.parser.select(filmo_block[0],'span.show-link'):
#for divmovie in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
# divhead = divmovie.getparent()
# print "-- %s"%(self.document.getpath(divhead))
# print divmovie.attrib.get('class','')
# if current_parent != self.document.getpath(divhead):
# role_index += 1
# current_parent = self.document.getpath(divhead)
# role = role_list[role_index]
# a = self.parser.select(divmovie,'b a',1)
# roles[role].append(a.text)
#print roles
roles['any activity'] = []
for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'): for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
a = self.parser.select(movie_div,'b a',1) a = self.parser.select(movie_div,'b a',1)
roles['actor'].append(a.text) roles['any activity'].append(a.text)
person = Person(id,name) person = Person(id,name)
person.real_name = real_name person.real_name = real_name
@ -133,7 +158,7 @@ class PersonPage(BasePage):
person.birth_place = birth_place person.birth_place = birth_place
person.gender = gender person.gender = gender
person.nationality = nationality person.nationality = nationality
person.biography = biography person.short_biography = short_biography
person.roles = roles person.roles = roles
return person return person

View file

@ -98,7 +98,7 @@ def num_years(begin, end=None):
return num_years return num_years
class PersonInfoFormatter(IFormatter): class PersonInfoFormatter(IFormatter):
MANDATORY_FIELDS = ('id', 'name', 'real_name', 'birth_date', 'birth_place', 'gender', 'nationality', 'biography', 'roles') MANDATORY_FIELDS = ('id', 'name', 'real_name', 'birth_date', 'birth_place', 'gender', 'nationality', 'short_biography', 'roles')
def format_obj(self, obj, alias): def format_obj(self, obj, alias):
result = u'%s%s%s\n' % (self.BOLD, obj.name, self.NC) result = u'%s%s%s\n' % (self.BOLD, obj.name, self.NC)
@ -124,7 +124,7 @@ class PersonInfoFormatter(IFormatter):
for movie in lmovies: for movie in lmovies:
result += ' * %s\n' % movie result += ' * %s\n' % movie
result += '\n%sBiography%s\n' % (self.BOLD, self.NC) result += '\n%sBiography%s\n' % (self.BOLD, self.NC)
result += '%s'%obj.biography result += '%s'%obj.short_biography
return result return result
@ -335,3 +335,19 @@ class Cineoob(ReplApplication):
for backend, movie in self.do('iter_person_movies', person.id): for backend, movie in self.do('iter_person_movies', person.id):
self.cached_format(movie) self.cached_format(movie)
self.flush() self.flush()
def do_biography(self, person_id):
    """
    biography person_ID
    Show the complete biography of a person.
    """
    # NOTE(review): the docstring above is presumably surfaced as the
    # command's help text in the REPL, so its wording is left untouched.
    #
    # Flow: resolve the person through 'get_person', then print every
    # biography returned for 'get_person_biography'.
    # Returns 3 when the person cannot be resolved, None otherwise.
    person = self.get_object(person_id, 'get_person')
    if not person:
        # Report the ID that was actually requested. The previous code
        # interpolated the builtin ``id`` function, so the message showed
        # "<built-in function id>" instead of the person ID.
        # sys.stderr.write behaves the same on Python 2 and Python 3.
        sys.stderr.write('Person not found: %s\n' % person_id)
        return 3

    self.change_path([u'biography'])
    for _backend, bio in self.do('get_person_biography', person.id):
        # Parenthesised single-argument print is valid in Python 2 and 3.
        print(bio)
    self.flush()

View file

@ -53,7 +53,7 @@ class Person(CapBaseObject):
birth_place = StringField('City and country of birth of a person') birth_place = StringField('City and country of birth of a person')
gender = StringField('Gender of a person') gender = StringField('Gender of a person')
nationality = StringField('Nationality of a person') nationality = StringField('Nationality of a person')
biography = StringField('Short biography of a person') short_biography = StringField('Short biography of a person')
roles = Field('Lists of movies related to the person indexed by roles',dict) roles = Field('Lists of movies related to the person indexed by roles',dict)
def __init__(self, id, name): def __init__(self, id, name):