[imdb] filmography page is much more readable than the person page

This commit is contained in:
Julien Veyssier 2013-03-06 00:19:00 +01:00
commit b91a1cd481
3 changed files with 55 additions and 42 deletions

View file

@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable
from weboob.capabilities.cinema import Movie from weboob.capabilities.cinema import Movie
from weboob.tools.json import json from weboob.tools.json import json
from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage, FilmographyPage
from datetime import datetime from datetime import datetime
@ -40,6 +40,7 @@ class ImdbBrowser(BaseBrowser):
'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage, 'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
'http://www.imdb.com/name/nm[0-9]*/*': PersonPage, 'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage, 'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
'http://www.imdb.com/name/nm[0-9]*/filmo.*': FilmographyPage,
} }
def iter_movies(self, pattern): def iter_movies(self, pattern):
@ -48,7 +49,9 @@ class ImdbBrowser(BaseBrowser):
for cat in ['title_popular','title_exact','title_approx']: for cat in ['title_popular','title_exact','title_approx']:
if jres.has_key(cat): if jres.has_key(cat):
for m in jres[cat]: for m in jres[cat]:
yield self.get_movie(m['id']) movie = self.get_movie(m['id'])
if movie != None:
yield movie
def iter_persons(self, pattern): def iter_persons(self, pattern):
res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q=%s' % pattern.encode('utf-8')) res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q=%s' % pattern.encode('utf-8'))
@ -60,7 +63,10 @@ class ImdbBrowser(BaseBrowser):
def get_movie(self, id): def get_movie(self, id):
res = self.readurl('http://imdbapi.org/?id=%s&type=json&plot=simple&episode=1&lang=en-US&aka=full&release=simple&business=0&tech=0' % id ) res = self.readurl('http://imdbapi.org/?id=%s&type=json&plot=simple&episode=1&lang=en-US&aka=full&release=simple&business=0&tech=0' % id )
jres = json.loads(res) if res != None:
jres = json.loads(res)
else:
return None
title = NotAvailable title = NotAvailable
duration = NotAvailable duration = NotAvailable
@ -133,9 +139,9 @@ class ImdbBrowser(BaseBrowser):
return self.page.iter_persons(movie_id) return self.page.iter_persons(movie_id)
def iter_person_movies(self, person_id): def iter_person_movies(self, person_id):
self.location('http://www.imdb.com/name/%s' % person_id) self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
assert self.is_on_page(PersonPage) assert self.is_on_page(FilmographyPage)
return self.page.iter_movies(person_id) return self.page.iter_movies()
def iter_person_movies_ids(self, person_id): def iter_person_movies_ids(self, person_id):
self.location('http://www.imdb.com/name/%s' % person_id) self.location('http://www.imdb.com/name/%s' % person_id)

View file

@ -25,7 +25,7 @@ from weboob.tools.browser import BasePage
from datetime import datetime from datetime import datetime
__all__ = ['MoviePage','PersonPage','MovieCrewPage'] __all__ = ['MoviePage','PersonPage','MovieCrewPage','BiographyPage','FilmographyPage']
class MoviePage(BasePage): class MoviePage(BasePage):
@ -50,8 +50,12 @@ class BiographyPage(BasePage):
def get_biography(self): def get_biography(self):
bio = '' bio = ''
tn = self.parser.select(self.document.getroot(),'div#tn15content',1) tn = self.parser.select(self.document.getroot(),'div#tn15content',1)
for p in self.parser.select(tn,'p'): #for p in self.parser.select(tn,'p'):
bio += '\n\n%s'%p.text_content().strip() # bio += '\n\n%s'%p.text_content().strip()
# get children, append if label or tag = a,p,h...
bio = tn.text_content().strip()
if bio == "":
bio = NotAvailable
return bio return bio
@ -125,31 +129,10 @@ class PersonPage(BasePage):
dtime.append('1') dtime.append('1')
dtime.append('1') dtime.append('1')
death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2])) death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2]))
# TODO IMPROVE THIS, apparently there's an error in parsing, quite hard to handle ----------- # go to the filmography page
self.browser.location('http://www.imdb.com/name/%s/filmotype'%id)
#filmo_block = self.parser.select(self.document.getroot(),'div#filmography',1) assert self.browser.is_on_page(FilmographyPage)
#role_list = [] roles = self.browser.page.get_roles()
#for span in self.parser.select(self.document.getroot(),'span.show-link'):
# role_list.append(span.attrib.get('id','').replace('show-',''))
#role_index = -1
#current_parent = None
##for sp in self.parser.select(filmo_block[0],'span.show-link'):
#for divmovie in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
# divhead = divmovie.getparent()
# print "-- %s"%(self.document.getpath(divhead))
# print divmovie.attrib.get('class','')
# if current_parent != self.document.getpath(divhead):
# role_index += 1
# current_parent = self.document.getpath(divhead)
# role = role_list[role_index]
# a = self.parser.select(divmovie,'b a',1)
# roles[role].append(a.text)
#print roles
roles['any activity'] = []
for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
a = self.parser.select(movie_div,'b a',1)
roles['any activity'].append(a.text)
person = Person(id,name) person = Person(id,name)
person.real_name = real_name person.real_name = real_name
@ -162,14 +145,38 @@ class PersonPage(BasePage):
person.roles = roles person.roles = roles
return person return person
def iter_movies(self,person_id):
for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
a = self.parser.select(movie_div,'b a',1)
id = a.attrib.get('href','').strip('/').split('/')[-1]
yield self.browser.get_movie(id)
def iter_movies_ids(self,person_id): def iter_movies_ids(self,person_id):
for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'): for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
a = self.parser.select(movie_div,'b a',1) a = self.parser.select(movie_div,'b a',1)
id = a.attrib.get('href','').strip('/').split('/')[-1] id = a.attrib.get('href','').strip('/').split('/')[-1]
yield id yield id
class FilmographyPage(BasePage):
    """Filmography page of an IMDb person.

    Every ``div.filmo`` block of the document is treated as one role: its
    ``h5 a`` heading gives the role name (colons removed) and the
    ``ol > li > a`` links whose target id starts with ``tt`` are the titles
    credited under that role.
    """

    # Placeholder used when no parenthesised text follows a title link.
    UNKNOWN_DETAIL = '????'

    def _iter_role_blocks(self):
        """Yield ``(role_name, role_div)`` for every ``div.filmo`` block."""
        root = self.document.getroot()
        for role_div in self.parser.select(root, 'div.filmo'):
            heading = self.parser.select(role_div, 'h5 a', 1)
            yield heading.text.replace(':', ''), role_div

    def _iter_title_links(self, role_div):
        """Yield ``(movie_id, link)`` for each title link of a role block.

        The id is the last path component of the link's ``href``; links whose
        id does not start with ``tt`` are skipped.
        """
        for link in self.parser.select(role_div, 'ol > li > a'):
            movie_id = link.attrib.get('href', '').strip('/').split('/')[-1]
            if movie_id.startswith('tt'):
                yield movie_id, link

    @staticmethod
    def _parenthesized_detail(tail):
        """Return the text found between parentheses right after a link.

        ``tail`` is the text following the link element and may be ``None``
        when nothing follows it. The result is what lies between the first
        ``(`` and the next ``(`` or ``)``; ``UNKNOWN_DETAIL`` is returned when
        there is no tail, no closing parenthesis, or no opening parenthesis
        before the first closing one.
        """
        if not tail:
            return FilmographyPage.UNKNOWN_DETAIL
        before_close, close, _ = tail.partition(')')
        if not close or '(' not in before_close:
            return FilmographyPage.UNKNOWN_DETAIL
        return before_close.split('(')[1]

    def get_roles(self):
        """Return a dict mapping each role name to its list of titles.

        Each title is formatted as ``'(<detail>) <link text>'`` where
        ``<detail>`` is the parenthesised text following the link, or
        ``'????'`` when there is none. A role with no title link still gets
        an (empty) entry.
        """
        roles = {}
        for role, role_div in self._iter_role_blocks():
            roles[role] = []
            for _movie_id, link in self._iter_title_links(role_div):
                detail = self._parenthesized_detail(link.tail)
                roles[role].append('(%s) %s' % (detail, link.text))
        return roles

    def iter_movies(self):
        """Yield the movie of every title link, except 'In Development' ones.

        Each id is resolved through ``self.browser.get_movie``; ids for which
        it returns ``None`` are skipped.
        """
        for role, role_div in self._iter_role_blocks():
            if role == 'In Development':
                continue
            for movie_id, _link in self._iter_title_links(role_div):
                movie = self.browser.get_movie(movie_id)
                if movie is not None:
                    yield movie

View file

@ -347,7 +347,7 @@ class Cineoob(ReplApplication):
print >>sys.stderr, 'Person not found: %s' % id print >>sys.stderr, 'Person not found: %s' % id
return 3 return 3
self.change_path([u'biography'])
for backend, bio in self.do('get_person_biography', person.id): for backend, bio in self.do('get_person_biography', person.id):
print bio print bio
self.flush() if bio != NotAvailable:
self.flush()