From ce64153161e23e1c489f57b01b43e8f0bedc34dc Mon Sep 17 00:00:00 2001
From: Julien Veyssier
Date: Mon, 4 Mar 2013 13:30:21 +0100
Subject: [PATCH] [imdb] get movie with imdbapi

---
Review notes (this section is not part of the commit message):
 - get_movie(): the month is now read with dstr[4:6]. The year is taken
   from dstr[:4] and the day from dstr[-2:], so the month occupies two
   characters; dstr[4:5] only ever saw its first digit.
 - Line breaks and indentation were restored assuming 4-space indents.
   Hunk line counts and the diffstat are unchanged. If context whitespace
   differs from the target tree, apply with `git apply --ignore-whitespace`.

 modules/imdb/browser.py                | 95 +++++++++++++++++++-------
 modules/imdb/pages.py                  |  5 ++
 weboob/applications/cineoob/cineoob.py | 16 ++---
 weboob/capabilities/cinema.py          |  4 +-
 4 files changed, 87 insertions(+), 33 deletions(-)

diff --git a/modules/imdb/browser.py b/modules/imdb/browser.py
index 692441ee..f4c4635c 100644
--- a/modules/imdb/browser.py
+++ b/modules/imdb/browser.py
@@ -19,10 +19,13 @@
 
 
 from weboob.tools.browser import BaseBrowser
+from weboob.capabilities.base import NotAvailable
+from weboob.capabilities.cinema import Movie
 from weboob.tools.json import json
 
 from .pages import MoviePage, PersonPage, MovieCrewPage
 
+from datetime import datetime
 
 __all__ = ['ImdbBrowser']
 
@@ -39,35 +42,81 @@ class ImdbBrowser(BaseBrowser):
         }
 
     def iter_movies(self, pattern):
-        # the api leads to a json result or the html movie page if there is only one result
-        self.location('http://www.imdb.com/xml/find?json=1&tt=on&q=%s' % pattern.encode('utf-8'))
-        if self.is_on_page(MoviePage):
-            id = 'tt'+self.geturl().split('/tt')[1].split('/')[0]
-            yield self.page.get_movie(id)
-        else:
-            res = self.readurl('http://www.imdb.com/xml/find?json=1&tt=on&q=%s' % pattern.encode('utf-8'))
-            jres = json.loads(res)
-            for restype,mlist in jres.items():
-                for m in mlist:
+        res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&tt=on&q=%s' % pattern.encode('utf-8'))
+        jres = json.loads(res)
+        for cat in ['title_exact','title_popular','title_approx']:
+            if jres.has_key(cat):
+                for m in jres[cat]:
                     yield self.get_movie(m['id'])
 
     def iter_persons(self, pattern):
-        # the api leads to a json result or the html person page if there is only one result
-        self.location('http://www.imdb.com/xml/find?json=1&nm=on&q=%s' % pattern.encode('utf-8'))
-        if self.is_on_page(PersonPage):
-            id = 'nm'+self.geturl().split('/nm')[1].split('/')[0]
-            yield self.page.get_person(id)
-        else:
-            res = self.readurl('http://www.imdb.com/xml/find?json=1&nm=on&q=%s' % pattern.encode('utf-8'))
-            jres = json.loads(res)
-            for restype,plist in jres.items():
-                for p in plist:
+        res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q=%s' % pattern.encode('utf-8'))
+        jres = json.loads(res)
+        for cat in ['name_exact','name_popular','name_approx']:
+            if jres.has_key(cat):
+                for p in jres[cat]:
                     yield self.get_person(p['id'])
 
     def get_movie(self, id):
-        self.location('http://www.imdb.com/title/%s' % id)
-        assert self.is_on_page(MoviePage)
-        return self.page.get_movie(id)
+        res = self.readurl('http://imdbapi.org/?id=%s&type=json&plot=simple&episode=1&lang=en-US&aka=full&release=simple&business=0&tech=0' % id )
+        jres = json.loads(res)
+
+        title = NotAvailable
+        duration = NotAvailable
+        release_date = NotAvailable
+        description = NotAvailable
+        country = NotAvailable
+        note = NotAvailable
+        other_titles = []
+        roles = {}
+
+        title = jres['title']
+        if jres.has_key('runtime'):
+            duration = int(jres['runtime'][0].split()[0])
+        if jres.has_key('also_known_as'):
+            for other_t in jres['also_known_as']:
+                if other_t.has_key('country') and other_t.has_key('title'):
+                    other_titles.append('%s : %s' % (other_t['country'],other_t['title']))
+        if jres.has_key('release_date'):
+            dstr = str(jres['release_date'])
+            year = int(dstr[:4])
+            if year == 0:
+                year = 1
+            month = int(dstr[4:6])  # two characters, between the 4-char year and the 2-char day
+            if month == 0:
+                month = 1
+            day = int(dstr[-2:])
+            if day == 0:
+                day = 1
+            release_date = datetime(year,month,day)
+        if jres.has_key('country'):
+            country = ''
+            for c in jres['country']:
+                country += '%s, '%c
+            country = country[:-2]
+        if jres.has_key('plot_simple'):
+            description = jres['plot_simple']
+        if jres.has_key('rating') and jres.has_key('rating_count'):
+            note = "%s/10 (%s votes)"%(jres['rating'],jres['rating_count'])
+        for r in ['actor','director','writer']:
+            if jres.has_key('%ss'%r):
+                roles['%s'%r] = list(jres['%ss'%r])
+
+
+        movie = Movie(id,title.strip())
+        movie.other_titles = other_titles
+        movie.release_date = release_date
+        movie.duration = duration
+        movie.description = description
+        movie.country = country
+        movie.note = note
+        movie.roles = roles
+        return movie
+
+
+        #self.location('http://www.imdb.com/title/%s' % id)
+        #assert self.is_on_page(MoviePage)
+        #return self.page.get_movie(id)
 
     def get_person(self, id):
         self.location('http://www.imdb.com/name/%s' % id)
diff --git a/modules/imdb/pages.py b/modules/imdb/pages.py
index 9ca4922c..ba9241bc 100644
--- a/modules/imdb/pages.py
+++ b/modules/imdb/pages.py
@@ -108,6 +108,11 @@ class PersonPage(BasePage):
         times = self.parser.select(td_overview,'time[itemprop=birthDate]')
         if len(times) > 0:
             time = times[0].attrib.get('datetime','').split('-')
+            if len(time) == 2:
+                time.append('1')
+            elif len(time) == 1:
+                time.append('1')
+                time.append('1')
             birth_date = datetime(int(time[0]),int(time[1]),int(time[2]))
 
         person = Person(id,name)
diff --git a/weboob/applications/cineoob/cineoob.py b/weboob/applications/cineoob/cineoob.py
index 14be0b8d..7ed06899 100644
--- a/weboob/applications/cineoob/cineoob.py
+++ b/weboob/applications/cineoob/cineoob.py
@@ -32,25 +32,25 @@ __all__ = ['Cineoob']
 
 
 class MovieInfoFormatter(IFormatter):
-    MANDATORY_FIELDS = ('id', 'original_title', 'release_date', 'other_titles', 'duration', 'description', 'note', 'awards','roles')
+    MANDATORY_FIELDS = ('id', 'original_title', 'release_date', 'other_titles', 'duration', 'description', 'note', 'roles', 'country')
 
     def format_obj(self, obj, alias):
         result = u'%s%s%s\n' % (self.BOLD, obj.original_title, self.NC)
         result += 'ID: %s\n' % obj.fullid
-        result += 'Other titles: %s\n' % obj.other_titles
         result += 'Released: %s\n' % obj.release_date
+        result += 'Country: %s\n' % obj.country
         result += 'Duration: %s\n' % obj.duration
         result += 'Note: %s\n' % obj.note
         if obj.roles:
             result += '\n%sRelated persons%s\n' % (self.BOLD, self.NC)
             for role,lpersons in obj.roles.items():
                 result += ' -- %s\n' % role
-                for person in lpersons:
-                    result += ' * %s\n' % person.name
-        if obj.awards:
-            result += '\n%sAwards%s\n' % (self.BOLD, self.NC)
-            for a in obj.awards:
-                result += ' * %s\n' % a
+                for name in lpersons:
+                    result += ' * %s\n' % name
+        if obj.other_titles:
+            result += '\n%sOther titles%s\n' % (self.BOLD, self.NC)
+            for t in obj.other_titles:
+                result += ' * %s\n' % t
         result += '\n%sDescription%s\n' % (self.BOLD, self.NC)
         result += '%s'%obj.description
         return result
diff --git a/weboob/capabilities/cinema.py b/weboob/capabilities/cinema.py
index 659ca82a..57b85e7e 100644
--- a/weboob/capabilities/cinema.py
+++ b/weboob/capabilities/cinema.py
@@ -29,12 +29,12 @@ class Movie(CapBaseObject):
     Movie object.
     """
     original_title = StringField('Original title of the movie')
-    other_titles = StringField('Titles in other languages')
+    other_titles = Field('Titles in other countries',list)
     release_date = DateField('Release date of the movie')
     duration = IntField('Duration of the movie in minutes')
     description = StringField('Short description of the movie')
+    country = StringField('Origin country of the movie')
     note = StringField('Notation of the movie')
-    awards = Field('Awards won by the movie',list)
     roles = Field('Lists of Persons related to the movie indexed by roles',dict)
 
     def __init__(self, id, original_title):