From 3492dbb9d6532a2727c91d839c80002e98879f37 Mon Sep 17 00:00:00 2001 From: Julien Veyssier Date: Mon, 4 Mar 2013 04:07:12 +0100 Subject: [PATCH] imdb and cineoob in progress --- modules/imdb/__init__.py | 22 ++++++ modules/imdb/backend.py | 58 ++++++++++++++ modules/imdb/browser.py | 83 ++++++++++++++++++++ modules/imdb/pages.py | 102 +++++++++++++++++++++++++ modules/imdb/test.py | 39 ++++++++++ weboob/applications/cineoob/cineoob.py | 34 ++++++--- weboob/capabilities/base.py | 2 +- weboob/capabilities/cinema.py | 10 +-- 8 files changed, 334 insertions(+), 16 deletions(-) create mode 100644 modules/imdb/__init__.py create mode 100644 modules/imdb/backend.py create mode 100644 modules/imdb/browser.py create mode 100644 modules/imdb/pages.py create mode 100644 modules/imdb/test.py diff --git a/modules/imdb/__init__.py b/modules/imdb/__init__.py new file mode 100644 index 00000000..d20b6ee1 --- /dev/null +++ b/modules/imdb/__init__.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2013 Julien Veyssier +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . 
+ +from .backend import ImdbBackend + +__all__ = ['ImdbBackend'] diff --git a/modules/imdb/backend.py b/modules/imdb/backend.py new file mode 100644 index 00000000..f358727c --- /dev/null +++ b/modules/imdb/backend.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2013 Julien Veyssier +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . + +from weboob.capabilities.cinema import ICapCinema +from weboob.tools.backend import BaseBackend + +from .browser import ImdbBrowser + +from urllib import quote_plus + +__all__ = ['ImdbBackend'] + + +class ImdbBackend(BaseBackend, ICapCinema): + NAME = 'imdb' + MAINTAINER = u'Julien Veyssier' + EMAIL = 'julien.veyssier@aiur.fr' + VERSION = '0.f' + DESCRIPTION = 'Internet Movie Database service' + LICENSE = 'AGPLv3+' + BROWSER = ImdbBrowser + + def create_default_browser(self): + return self.create_browser() + + def get_movie(self, id): + return self.browser.get_movie(id) + + def get_person(self, id): + return self.browser.get_person(id) + + def iter_movies(self, pattern): + return self.browser.iter_movies(quote_plus(pattern.encode('utf-8'))) + + def iter_persons(self, pattern): + return self.browser.iter_persons(quote_plus(pattern.encode('utf-8'))) + + def iter_movie_persons(self, id): + return self.browser.iter_movie_persons(id) + + def iter_person_movies(self, id): + return self.browser.iter_person_movies(id) diff --git 
a/modules/imdb/browser.py b/modules/imdb/browser.py
new file mode 100644
index 00000000..2bfec383
--- /dev/null
+++ b/modules/imdb/browser.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013 Julien Veyssier
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.tools.browser import BaseBrowser
+from weboob.tools.json import json
+
+from .pages import MoviePage, PersonPage, MovieCrewPage
+
+
+__all__ = ['ImdbBrowser']
+
+
+class ImdbBrowser(BaseBrowser):
+    """Browser for www.imdb.com: search, details and casting of movies/persons."""
+
+    DOMAIN = 'www.imdb.com'
+    PROTOCOL = 'http'
+    ENCODING = 'utf-8'
+    USER_AGENT = BaseBrowser.USER_AGENTS['wget']
+    PAGES = {
+        'http://www.imdb.com/title/tt[0-9]*/*': MoviePage,
+        'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
+        'http://www.imdb.com/name/nm.*': PersonPage,
+    }
+    # Search API; the first placeholder is the result-type switch ('tt' or
+    # 'nm', as used by iter_movies/iter_persons), the second one the query.
+    SEARCH_URL = 'http://www.imdb.com/xml/find?json=1&%s=on&q=%s'
+
+    def _page_id(self):
+        """Return the last path segment of the current page URL."""
+        # NOTE(review): assumes the page object exposes the fetched URL as
+        # ``url`` -- confirm against weboob.tools.browser.BasePage.
+        return self.page.url.split('?')[0].strip('/').split('/')[-1]
+
+    def _search_ids(self, url):
+        """Return the 'id' of every entry of the JSON search answer at *url*."""
+        jres = json.loads(self.readurl(url))
+        # the answer maps a result type to a list of entries
+        return [entry['id'] for entries in jres.values() for entry in entries]
+
+    def iter_movies(self, pattern):
+        """Yield the movie built by get_movie for each search result."""
+        url = self.SEARCH_URL % ('tt', pattern.encode('utf-8'))
+        # the api leads to a json result or the html movie page if there is
+        # only one result
+        self.location(url)
+        if self.is_on_page(MoviePage):
+            # the page method requires the id, which only the URL carries here
+            yield self.page.get_movie(self._page_id())
+            return
+        for movie_id in self._search_ids(url):
+            yield self.get_movie(movie_id)
+
+    def iter_persons(self, pattern):
+        """Yield the person built by get_person for each search result."""
+        url = self.SEARCH_URL % ('nm', pattern.encode('utf-8'))
+        # the api leads to a json result or the html person page if there is
+        # only one result
+        self.location(url)
+        if self.is_on_page(PersonPage):
+            yield self.page.get_person(self._page_id())
+            return
+        for person_id in self._search_ids(url):
+            yield self.get_person(person_id)
+
+    def get_movie(self, id):
+        """Open the title page of *id* and return what the page parses."""
+        self.location('http://www.imdb.com/title/%s' % id)
+        assert self.is_on_page(MoviePage)
+        return self.page.get_movie(id)
+
+    def get_person(self, id):
+        """Open the name page of *id* and return what the page parses."""
+        self.location('http://www.imdb.com/name/%s' % id)
+        assert self.is_on_page(PersonPage)
+        return self.page.get_person(id)
+
+    def iter_movie_persons(self, movie_id):
+        """Open the title page of *movie_id* and return its persons iterator."""
+        self.location('http://www.imdb.com/title/%s' % movie_id)
+        assert self.is_on_page(MoviePage)
+        return self.page.iter_persons(movie_id)
+
+    def iter_person_movies(self, person_id):
+        """Open the name page of *person_id* and return its movies."""
+        self.location('http://www.imdb.com/name/%s' % person_id)
+        assert self.is_on_page(PersonPage)
+        return self.page.iter_movies(person_id)
diff --git a/modules/imdb/pages.py b/modules/imdb/pages.py
new file mode 100644
index 00000000..ae23bf9a
--- /dev/null
+++ b/modules/imdb/pages.py
@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013 Julien Veyssier
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.capabilities.cinema import Movie, Person
+from weboob.capabilities.base import NotAvailable
+from weboob.tools.browser import BasePage
+
+import string
+from datetime import datetime
+
+
+__all__ = ['MoviePage','PersonPage','MovieCrewPage']
+
+
+def _clean(text):
+    """Return *text* stripped, or NotAvailable if it is missing or blank."""
+    if text is None or text is NotAvailable:
+        return NotAvailable
+    return text.strip() or NotAvailable
+
+
+def _parse_release_date(text):
+    """Build a datetime from a 'YYYY[-MM[-DD]]' string.
+
+    A missing month or day defaults to 1. NotAvailable is returned when
+    *text* does not describe a valid date.
+    """
+    parts = text.strip().split('-')[:3]
+    parts += ['1'] * (3 - len(parts))
+    try:
+        return datetime(*[int(part) for part in parts])
+    except ValueError:
+        return NotAvailable
+
+
+class MoviePage(BasePage):
+    """Page of a title, as opened with http://www.imdb.com/title/<id>."""
+
+    def get_movie(self,id):
+        """Parse the 'overview-top' cell of the page into the Movie *id*.
+
+        Titles, duration and release date are NotAvailable when the page
+        gives no usable value for them.
+        """
+        title = NotAvailable
+        other_titles = NotAvailable
+        duration = NotAvailable
+        description = NotAvailable.__unicode__()
+        td_overview = self.parser.select(self.document.getroot(),'td#overview-top',1)
+        for span in self.parser.select(td_overview,'h1.header span[itemprop=name]'):
+            css_class = span.attrib.get('class','')
+            if css_class == 'itemprop':
+                other_titles = _clean(span.text)
+                if title is NotAvailable:
+                    title = other_titles
+            elif css_class == 'title-extra':
+                title = _clean(span.text)
+        meta = self.parser.select(td_overview,'meta[itemprop=datePublished]',1)
+        times = self.parser.select(td_overview,'time[itemprop=duration]')
+        if len(times) > 0:
+            try:
+                duration = int(times[0].attrib.get('datetime','').strip(string.letters))
+            except ValueError:
+                # no number between the letters: duration stays NotAvailable
+                pass
+        desc = self.parser.select(td_overview,'p[itemprop=description]')
+        if len(desc) > 0 and desc[0].text is not None:
+            description = desc[0].text
+        movie = Movie(id,title)
+        movie.other_titles = other_titles
+        movie.release_date = _parse_release_date(meta.attrib.get('content',''))
+        movie.duration = duration
+        movie.description = description
+        # TODO placeholder values, nothing below is parsed from the page yet
+        movie.note = "10/10"
+        movie.awards = ["aw1","aw2"]
+        movie.roles = {}
+        return movie
+
+    def iter_persons(self,id):
+        """Open the full credits page of the movie *id* and yield its persons."""
+        self.browser.location('http://www.imdb.com/title/%s/fullcredits'%id)
+        assert self.browser.is_on_page(MovieCrewPage)
+        for p in self.browser.page.iter_persons():
+            yield p
+
+
+class MovieCrewPage(BasePage):
+    """Full credits page of a title."""
+
+    def iter_persons(self):
+        """Yield a Person, with only id and name set, per 'td.nm' cell of the first 'table.cast'."""
+        tables = self.parser.select(self.document.getroot(),'table.cast')
+        if len(tables) > 0:
+            for td in self.parser.select(tables[0],'td.nm'):
+                link = td.find('a')
+                if link is None:
+                    # no link means no id to build a Person from
+                    continue
+                person_id = link.attrib.get('href','').strip('/').split('/')[-1]
+                person = Person(person_id,td.text_content())
+                person.real_name = NotAvailable
+                person.birth_date = NotAvailable
+                person.nationality = NotAvailable
+                person.gender = NotAvailable
+                yield person
+
+
+class PersonPage(BasePage):
+    """Page of a person, as opened with http://www.imdb.com/name/<id>."""
+
+    def get_person(self,id):
+        """Return a Person for *id*.
+
+        TODO nothing is parsed from the page yet: every value set below is
+        a placeholder.
+        """
+        person = Person(id,'nameplop')
+        person.real_name = 'rn'
+        person.birth_date = datetime.now()
+        person.birth_place = "place"
+        person.gender = "M"
+        person.nationality = "nn"
+        person.biography = 'bio'
+        person.awards = ["aw1","aw2"]
+        person.roles = {}
+        return person
+
+    def iter_movies(self,person_id):
+        """Return the movies of *person_id*; not implemented yet, so none."""
+        # an empty list instead of None, so that callers can iterate on it
+        return []
diff --git a/modules/imdb/test.py b/modules/imdb/test.py
new file mode 100644
index 00000000..e12ed63e
--- /dev/null
+++ b/modules/imdb/test.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013 Julien Veyssier
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+ +from weboob.tools.test import BackendTest + +#from random import choice + +class ImdbTest(BackendTest): + BACKEND = 'imdb' + + def test_movie(self): + movies = list(self.backend.iter_movies('spiderman')) + for movie in movies: + assert movie.id + assert movie.original_title + assert movie.release_date + + def test_persons(self): + persons = list(self.backend.iter_persons('robert')) + for person in persons: + assert person.id + assert person.name + assert person.birth_date diff --git a/weboob/applications/cineoob/cineoob.py b/weboob/applications/cineoob/cineoob.py index 98c6f1b4..0b8632c4 100644 --- a/weboob/applications/cineoob/cineoob.py +++ b/weboob/applications/cineoob/cineoob.py @@ -23,6 +23,7 @@ import sys from datetime import datetime from weboob.capabilities.cinema import ICapCinema +from weboob.capabilities.base import NotAvailable from weboob.tools.application.repl import ReplApplication from weboob.tools.application.formatters.iformatter import IFormatter, PrettyFormatter @@ -38,7 +39,7 @@ class MovieInfoFormatter(IFormatter): result += 'ID: %s\n' % obj.fullid result += 'Other titles: %s\n' % obj.other_titles result += 'Released: %s\n' % obj.release_date - result += 'Duration: %d\n' % obj.duration + result += 'Duration: %s\n' % obj.duration result += 'Note: %s\n' % obj.note if obj.roles: result += '\n%sRelated persons%s\n' % (self.BOLD, self.NC) @@ -62,7 +63,7 @@ class MovieListFormatter(PrettyFormatter): return obj.original_title def get_description(self, obj): - return 'Released: %s (note: %d, duration: %d)' % (obj.release_date, obj.note, obj.duration) + return 'Released: %s (note: %s, duration: %s)' % (obj.release_date, obj.note, obj.duration) def yearsago(years, from_date=None): if from_date is None: @@ -94,7 +95,7 @@ class PersonInfoFormatter(IFormatter): result += 'Birth date: %s\n' % obj.birth_date age = num_years(obj.birth_date) result += 'Age: %s\n' % age - result += 'Birth place: %d\n' % obj.birth_place + result += 'Birth place: %s\n' % 
obj.birth_place result += 'Gender: %s\n' % obj.gender result += 'Nationality: %s\n' % obj.nationality if obj.roles: @@ -107,7 +108,7 @@ class PersonInfoFormatter(IFormatter): result += '\n%sAwards%s\n' % (self.BOLD, self.NC) for a in obj.awards: result += ' * %s\n' % a - result += '\n%Biography%s\n' % (self.BOLD, self.NC) + result += '\n%sBiography%s\n' % (self.BOLD, self.NC) result += obj.biography return result @@ -119,8 +120,11 @@ class PersonListFormatter(PrettyFormatter): return obj.name def get_description(self, obj): - age = num_years(obj.birth_date) - return 'Real name: %s (age: %d, nationality: %s, gender: %s)' % (obj.real_name, age, obj.nationality, obj.gender) + if obj.birth_date != NotAvailable: + age = num_years(obj.birth_date) + else: + age = NotAvailable + return 'Real name: %s (age: %s, nationality: %s, gender: %s)' % (obj.real_name, age, obj.nationality, obj.gender) class Cineoob(ReplApplication): @@ -155,7 +159,7 @@ class Cineoob(ReplApplication): Get information about a movie. """ - + # TODO verify if path = search movie or filmo movie = self.get_object(id, 'get_movie') if not movie: print >>sys.stderr, 'Movie not found: %s' % id @@ -171,7 +175,7 @@ class Cineoob(ReplApplication): Get information about a person. """ - + # TODO verify if path = search person or casting person = self.get_object(id, 'get_person') if not person: print >>sys.stderr, 'Person not found: %s' % id @@ -217,8 +221,13 @@ class Cineoob(ReplApplication): List persons related to a movie. """ + movie = self.get_object(movie_id, 'get_movie') + if not movie: + print >>sys.stderr, 'Movie not found: %s' % movie_id + return 3 + self.change_path([u'casting']) - for backend, person in self.do('iter_movie_persons', movie_id): + for backend, person in self.do('iter_movie_persons', movie.id): self.cached_format(person) self.flush() @@ -228,7 +237,12 @@ class Cineoob(ReplApplication): List movies of a person. 
""" + person = self.get_object(person_id, 'get_person') + if not person: + print >>sys.stderr, 'Person not found: %s' % person_id + return 3 + self.change_path([u'filmography']) - for backend, movie in self.do('iter_person_movies', person_id): + for backend, movie in self.do('iter_person_movies', person.id): self.cached_format(movie) self.flush() diff --git a/weboob/capabilities/base.py b/weboob/capabilities/base.py index cd3a7657..d05a4661 100644 --- a/weboob/capabilities/base.py +++ b/weboob/capabilities/base.py @@ -30,7 +30,7 @@ from weboob.tools.ordereddict import OrderedDict __all__ = ['UserError', 'FieldNotFound', 'NotAvailable', 'NotLoaded', 'IBaseCap', 'Field', 'IntField', 'DecimalField', 'FloatField', 'StringField', 'BytesField', 'DateField', - 'DeltaField', 'HashTableField', 'empty', 'CapBaseObject'] + 'DeltaField', 'empty', 'CapBaseObject'] def empty(value): diff --git a/weboob/capabilities/cinema.py b/weboob/capabilities/cinema.py index 702a3954..659ca82a 100644 --- a/weboob/capabilities/cinema.py +++ b/weboob/capabilities/cinema.py @@ -18,7 +18,7 @@ # along with weboob. If not, see . 
-from .base import IBaseCap, CapBaseObject, DateField, StringField, IntField, HashTableField +from .base import IBaseCap, CapBaseObject, DateField, StringField, IntField, Field __all__ = ['Movie', 'Person', 'ICapCinema'] @@ -34,8 +34,8 @@ class Movie(CapBaseObject): duration = IntField('Duration of the movie in minutes') description = StringField('Short description of the movie') note = StringField('Notation of the movie') - awards = StringField('Awards won by the movie') - roles = HashTableField('Lists of Persons related to the movie indexed by roles') + awards = Field('Awards won by the movie',list) + roles = Field('Lists of Persons related to the movie indexed by roles',dict) def __init__(self, id, original_title): CapBaseObject.__init__(self, id) @@ -53,8 +53,8 @@ class Person(CapBaseObject): gender = StringField('Gender of a person') nationality = StringField('Nationality of a person') biography = StringField('Short biography of a person') - awards = StringField('Awards won by the person') - roles = HashTableField('Lists of movies related to the person indexed by roles') + awards = Field('Awards won by the person',list) + roles = Field('Lists of movies related to the person indexed by roles',dict) def __init__(self, id, name): CapBaseObject.__init__(self, id)