[imdb] filmography page is much more readable than the person page

This commit is contained in:
Julien Veyssier 2013-03-06 00:19:00 +01:00
commit b91a1cd481
3 changed files with 55 additions and 42 deletions

View file

@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable
from weboob.capabilities.cinema import Movie
from weboob.tools.json import json
from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage
from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage, FilmographyPage
from datetime import datetime
@ -40,6 +40,7 @@ class ImdbBrowser(BaseBrowser):
'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
'http://www.imdb.com/name/nm[0-9]*/filmo.*': FilmographyPage,
}
def iter_movies(self, pattern):
@ -48,7 +49,9 @@ class ImdbBrowser(BaseBrowser):
for cat in ['title_popular','title_exact','title_approx']:
if jres.has_key(cat):
for m in jres[cat]:
yield self.get_movie(m['id'])
movie = self.get_movie(m['id'])
if movie != None:
yield movie
def iter_persons(self, pattern):
res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q=%s' % pattern.encode('utf-8'))
@ -60,7 +63,10 @@ class ImdbBrowser(BaseBrowser):
def get_movie(self, id):
res = self.readurl('http://imdbapi.org/?id=%s&type=json&plot=simple&episode=1&lang=en-US&aka=full&release=simple&business=0&tech=0' % id )
jres = json.loads(res)
if res != None:
jres = json.loads(res)
else:
return None
title = NotAvailable
duration = NotAvailable
@ -133,9 +139,9 @@ class ImdbBrowser(BaseBrowser):
return self.page.iter_persons(movie_id)
def iter_person_movies(self, person_id):
self.location('http://www.imdb.com/name/%s' % person_id)
assert self.is_on_page(PersonPage)
return self.page.iter_movies(person_id)
self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
assert self.is_on_page(FilmographyPage)
return self.page.iter_movies()
def iter_person_movies_ids(self, person_id):
self.location('http://www.imdb.com/name/%s' % person_id)

View file

@ -25,7 +25,7 @@ from weboob.tools.browser import BasePage
from datetime import datetime
__all__ = ['MoviePage','PersonPage','MovieCrewPage']
__all__ = ['MoviePage','PersonPage','MovieCrewPage','BiographyPage','FilmographyPage']
class MoviePage(BasePage):
@ -50,8 +50,12 @@ class BiographyPage(BasePage):
def get_biography(self):
bio = ''
tn = self.parser.select(self.document.getroot(),'div#tn15content',1)
for p in self.parser.select(tn,'p'):
bio += '\n\n%s'%p.text_content().strip()
#for p in self.parser.select(tn,'p'):
# bio += '\n\n%s'%p.text_content().strip()
# get children, append if label or tag = a,p,h...
bio = tn.text_content().strip()
if bio == "":
bio = NotAvailable
return bio
@ -125,31 +129,10 @@ class PersonPage(BasePage):
dtime.append('1')
dtime.append('1')
death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2]))
# TODO IMPROVE THIS, apparently there's an error in parsing, quite hard to handle -----------
#filmo_block = self.parser.select(self.document.getroot(),'div#filmography',1)
#role_list = []
#for span in self.parser.select(self.document.getroot(),'span.show-link'):
# role_list.append(span.attrib.get('id','').replace('show-',''))
#role_index = -1
#current_parent = None
##for sp in self.parser.select(filmo_block[0],'span.show-link'):
#for divmovie in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
# divhead = divmovie.getparent()
# print "-- %s"%(self.document.getpath(divhead))
# print divmovie.attrib.get('class','')
# if current_parent != self.document.getpath(divhead):
# role_index += 1
# current_parent = self.document.getpath(divhead)
# role = role_list[role_index]
# a = self.parser.select(divmovie,'b a',1)
# roles[role].append(a.text)
#print roles
roles['any activity'] = []
for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
a = self.parser.select(movie_div,'b a',1)
roles['any activity'].append(a.text)
# go to the filmography page
self.browser.location('http://www.imdb.com/name/%s/filmotype'%id)
assert self.browser.is_on_page(FilmographyPage)
roles = self.browser.page.get_roles()
person = Person(id,name)
person.real_name = real_name
@ -162,14 +145,38 @@ class PersonPage(BasePage):
person.roles = roles
return person
def iter_movies(self,person_id):
for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
a = self.parser.select(movie_div,'b a',1)
id = a.attrib.get('href','').strip('/').split('/')[-1]
yield self.browser.get_movie(id)
def iter_movies_ids(self,person_id):
    """Yield the id of every title listed in a filmography row of this page.

    The id is the last path component of the row's ``b a`` link target.
    ``person_id`` is accepted but not used by this method.
    """
    root = self.document.getroot()
    for row in self.parser.select(root, 'div[class~=filmo-row]'):
        link = self.parser.select(row, 'b a', 1)
        href = link.attrib.get('href', '')
        # '/title/tt0123456/' -> 'tt0123456'
        yield href.strip('/').split('/')[-1]
class FilmographyPage(BasePage):
    """Parser for a person's filmography page (``/name/<id>/filmotype``).

    Every ``div.filmo`` section is read as one role: its ``h5 a`` heading
    gives the role name (colons removed) and its ``ol > li > a`` links give
    the titles. Only links whose last path component starts with ``tt`` are
    treated as titles.
    """

    def get_roles(self):
        """Return a dict mapping each role name to a list of title labels.

        Each label is formatted as ``'(<parenthesised text>) <link text>'``.
        The parenthesised text is read from the text that follows the link
        (presumably the release year) and is ``'????'`` when none is found.
        """
        roles = {}
        for role, links in self._iter_role_links():
            roles[role] = ['(%s) %s' % (self._parenthesised(link.tail), link.text)
                           for _, link in links]
        return roles

    def iter_movies(self):
        """Yield a movie for every title link outside 'In Development'.

        Movies are fetched with ``self.browser.get_movie``; titles for which
        it returns ``None`` are skipped.
        """
        for role, links in self._iter_role_links():
            if role == 'In Development':
                continue
            for title_id, _ in links:
                movie = self.browser.get_movie(title_id)
                if movie is not None:
                    yield movie

    def _iter_role_links(self):
        """Yield ``(role, [(title_id, link), ...])`` per ``div.filmo`` section."""
        for role_div in self.parser.select(self.document.getroot(), 'div.filmo'):
            role = self.parser.select(role_div, 'h5 a', 1).text.replace(':', '')
            links = []
            for link in self.parser.select(role_div, 'ol > li > a'):
                # '/title/tt0123456/' -> 'tt0123456'
                title_id = link.attrib.get('href', '').strip('/').split('/')[-1]
                # Links that do not point at a 'tt...' id are ignored.
                if title_id.startswith('tt'):
                    links.append((title_id, link))
            yield role, links

    @staticmethod
    def _parenthesised(tail):
        """Return the text between the first '(' and ')' of *tail*.

        Returns ``'????'`` when *tail* is empty/``None`` or holds no
        parenthesised text.
        """
        # An element's tail is None when no text follows it (ElementTree/lxml
        # convention), so normalise before searching it.
        tail = tail or ''
        if ')' in tail:
            parts = tail.split(')')[0].split('(')
            # A ')' that comes before any '(' leaves nothing to extract.
            if len(parts) > 1:
                return parts[1]
        return '????'

View file

@ -347,7 +347,7 @@ class Cineoob(ReplApplication):
print >>sys.stderr, 'Person not found: %s' % id
return 3
self.change_path([u'biography'])
for backend, bio in self.do('get_person_biography', person.id):
print bio
self.flush()
if bio != NotAvailable:
self.flush()