imdb and cineoob in progress

This commit is contained in:
Julien Veyssier 2013-03-04 04:07:12 +01:00
commit 3492dbb9d6
8 changed files with 334 additions and 16 deletions

22
modules/imdb/__init__.py Normal file
View file

@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Julien Veyssier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from .backend import ImdbBackend
__all__ = ['ImdbBackend']

58
modules/imdb/backend.py Normal file
View file

@ -0,0 +1,58 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Julien Veyssier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.cinema import ICapCinema
from weboob.tools.backend import BaseBackend
from .browser import ImdbBrowser
from urllib import quote_plus
__all__ = ['ImdbBackend']
class ImdbBackend(BaseBackend, ICapCinema):
    """Backend named 'imdb' that forwards every cinema call to its browser.

    Lookups by id are passed through untouched; search patterns are encoded
    to UTF-8 and URL-quoted with quote_plus() before reaching the browser.
    """

    NAME = 'imdb'
    MAINTAINER = u'Julien Veyssier'
    EMAIL = 'julien.veyssier@aiur.fr'
    VERSION = '0.f'
    DESCRIPTION = 'Internet Movie Database service'
    LICENSE = 'AGPLv3+'
    BROWSER = ImdbBrowser

    def create_default_browser(self):
        """Return a browser built by create_browser() with no arguments."""
        default_browser = self.create_browser()
        return default_browser

    def get_movie(self, id):
        """Return the browser's movie for the given id."""
        fetch = self.browser.get_movie
        return fetch(id)

    def get_person(self, id):
        """Return the browser's person for the given id."""
        fetch = self.browser.get_person
        return fetch(id)

    def iter_movies(self, pattern):
        """Search movies matching pattern (URL-quoted before the request)."""
        search = self.browser.iter_movies
        quoted_pattern = quote_plus(pattern.encode('utf-8'))
        return search(quoted_pattern)

    def iter_persons(self, pattern):
        """Search persons matching pattern (URL-quoted before the request)."""
        search = self.browser.iter_persons
        quoted_pattern = quote_plus(pattern.encode('utf-8'))
        return search(quoted_pattern)

    def iter_movie_persons(self, id):
        """Return the browser's persons related to the movie with this id."""
        lister = self.browser.iter_movie_persons
        return lister(id)

    def iter_person_movies(self, id):
        """Return the browser's movies related to the person with this id."""
        lister = self.browser.iter_person_movies
        return lister(id)

83
modules/imdb/browser.py Normal file
View file

@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Julien Veyssier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BaseBrowser
from weboob.tools.json import json
from .pages import MoviePage, PersonPage, MovieCrewPage
__all__ = ['ImdbBrowser']
class ImdbBrowser(BaseBrowser):
    """Browser for www.imdb.com: search, movie pages and person pages.

    Searches go through the /xml/find endpoint; details are read from the
    HTML pages registered in PAGES.
    """

    DOMAIN = 'www.imdb.com'
    PROTOCOL = 'http'
    ENCODING = 'utf-8'
    USER_AGENT = BaseBrowser.USER_AGENTS['wget']
    PAGES = {
        'http://www.imdb.com/title/tt[0-9]*/*': MoviePage,
        'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
        'http://www.imdb.com/name/nm.*': PersonPage,
    }

    @staticmethod
    def _find_url(kind, pattern):
        """Build the search URL; kind is 'tt' for titles or 'nm' for names."""
        return 'http://www.imdb.com/xml/find?json=1&%s=on&q=%s' % (
            kind, pattern.encode('utf-8'))

    @staticmethod
    def _id_from_url(url, prefix):
        """Return the path segment of url made of prefix followed by digits.

        Raises ValueError when the URL holds no such segment.
        """
        # drop fragment and query string so they cannot pollute the segments
        path = url.split('#', 1)[0].split('?', 1)[0]
        for segment in path.split('/'):
            if segment.startswith(prefix) and segment[len(prefix):].isdigit():
                return segment
        raise ValueError('no %s identifier found in URL %r' % (prefix, url))

    def iter_movies(self, pattern):
        """Yield the movies matching pattern.

        The find API answers with JSON, or with the HTML movie page itself
        when there is only one result.
        """
        url = self._find_url('tt', pattern)
        self.location(url)
        if self.is_on_page(MoviePage):
            # MoviePage.get_movie() needs the id; recover it from the URL we
            # landed on.
            # NOTE(review): assumes the page object exposes the fetched URL
            # as `.url` -- confirm against BasePage.
            yield self.page.get_movie(self._id_from_url(self.page.url, 'tt'))
        else:
            # NOTE(review): the same URL is fetched a second time here to get
            # the raw JSON body.
            jres = json.loads(self.readurl(url))
            # the JSON maps a result type to a list of entries; every entry
            # carries the 'id' used to load the full movie page
            for mlist in jres.values():
                for m in mlist:
                    yield self.get_movie(m['id'])

    def iter_persons(self, pattern):
        """Yield the persons matching pattern.

        The find API answers with JSON, or with the HTML person page itself
        when there is only one result.
        """
        url = self._find_url('nm', pattern)
        self.location(url)
        if self.is_on_page(PersonPage):
            # PersonPage.get_person() needs the id; recover it from the URL.
            # NOTE(review): assumes the page object exposes the fetched URL
            # as `.url` -- confirm against BasePage.
            yield self.page.get_person(self._id_from_url(self.page.url, 'nm'))
        else:
            # NOTE(review): the same URL is fetched a second time here to get
            # the raw JSON body.
            jres = json.loads(self.readurl(url))
            for plist in jres.values():
                for p in plist:
                    yield self.get_person(p['id'])

    def get_movie(self, id):
        """Open the title page for id and return the Movie parsed from it."""
        self.location('http://www.imdb.com/title/%s' % id)
        assert self.is_on_page(MoviePage)
        return self.page.get_movie(id)

    def get_person(self, id):
        """Open the name page for id and return the Person parsed from it."""
        self.location('http://www.imdb.com/name/%s' % id)
        assert self.is_on_page(PersonPage)
        return self.page.get_person(id)

    def iter_movie_persons(self, movie_id):
        """Open the title page for movie_id and return its related persons."""
        self.location('http://www.imdb.com/title/%s' % movie_id)
        assert self.is_on_page(MoviePage)
        return self.page.iter_persons(movie_id)

    def iter_person_movies(self, person_id):
        """Open the name page for person_id and return its related movies."""
        self.location('http://www.imdb.com/name/%s' % person_id)
        assert self.is_on_page(PersonPage)
        return self.page.iter_movies(person_id)

102
modules/imdb/pages.py Normal file
View file

@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Julien Veyssier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.cinema import Movie, Person
from weboob.capabilities.base import NotAvailable
from weboob.tools.browser import BasePage
import string
from datetime import datetime
__all__ = ['MoviePage','PersonPage','MovieCrewPage']
class MoviePage(BasePage):
    """Page object for an IMDb title page."""

    @staticmethod
    def _parse_release_date(content):
        """Turn 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD' into a datetime.

        Missing month/day default to 1. Returns NotAvailable when the text
        is empty or is not a valid date.
        """
        parts = content.strip().split('-')
        # pad year-only and year-month values up to three components
        parts += ['1'] * (3 - len(parts))
        try:
            year, month, day = [int(part) for part in parts[:3]]
            return datetime(year, month, day)
        except ValueError:
            return NotAvailable

    def get_movie(self, id):
        """Build a Movie from the 'overview-top' cell of the page.

        id is stored as the movie id. Title, other titles, release date,
        duration and description are read from the page and left as
        NotAvailable when the page does not provide them. note, awards and
        roles are still hard-coded placeholders.
        """
        title = NotAvailable
        other_titles = NotAvailable
        duration = NotAvailable
        description = NotAvailable.__unicode__()

        td_overview = self.parser.select(self.document.getroot(), 'td#overview-top', 1)

        # The header holds one or two name spans: class 'itemprop' always
        # feeds other_titles and is the title fallback; class 'title-extra',
        # when present, wins as the title.
        for span in self.parser.select(td_overview, 'h1.header span[itemprop=name]'):
            css_class = span.attrib.get('class', '')
            text = (span.text or u'').strip()
            if css_class == 'itemprop':
                other_titles = text
                if title is NotAvailable:
                    title = text
            elif css_class == 'title-extra':
                title = text

        meta = self.parser.select(td_overview, 'meta[itemprop=datePublished]', 1)
        release_date = self._parse_release_date(meta.attrib.get('content', ''))

        duration_tags = self.parser.select(td_overview, 'time[itemprop=duration]')
        if len(duration_tags) > 0:
            # the attribute wraps the number in letters; keep only the number
            raw_duration = duration_tags[0].attrib.get('datetime', '').strip(string.letters)
            try:
                duration = int(raw_duration)
            except ValueError:
                # attribute missing or not numeric: leave NotAvailable
                pass

        desc = self.parser.select(td_overview, 'p[itemprop=description]')
        if len(desc) > 0 and desc[0].text is not None:
            description = desc[0].text

        movie = Movie(id, title)
        movie.other_titles = other_titles
        movie.release_date = release_date
        movie.duration = duration
        movie.description = description
        # TODO: placeholders, not parsed from the page yet
        movie.note = "10/10"
        movie.awards = ["aw1", "aw2"]
        movie.roles = {}
        return movie

    def iter_persons(self, id):
        """Yield the persons listed on the full-credits page of movie id.

        Moves the browser to the fullcredits URL and relays what the crew
        page yields.
        """
        self.browser.location('http://www.imdb.com/title/%s/fullcredits' % id)
        assert self.browser.is_on_page(MovieCrewPage)
        for p in self.browser.page.iter_persons():
            yield p
class MovieCrewPage(BasePage):
    """Page object for the full-credits page of a title."""

    def iter_persons(self):
        """Yield one Person per 'td.nm' cell of the first 'table.cast'.

        Only id and name are filled; real_name, birth_date, nationality and
        gender are set to NotAvailable. Nothing is yielded when the page has
        no cast table.
        """
        cast_tables = self.parser.select(self.document.getroot(), 'table.cast')
        if not len(cast_tables) > 0:
            return

        first_table = cast_tables[0]
        for cell in self.parser.select(first_table, 'td.nm'):
            person_name = cell.text_content()
            # the person id is the last path component of the cell's link
            link_target = cell.find('a').attrib.get('href', '')
            person_id = link_target.strip('/').split('/')[-1]

            entry = Person(person_id, person_name)
            entry.real_name = NotAvailable
            entry.birth_date = NotAvailable
            entry.nationality = NotAvailable
            entry.gender = NotAvailable
            yield entry
class PersonPage(BasePage):
    """Page object for an IMDb name page (parsing not implemented yet)."""

    def get_person(self, id):
        """Return a Person carrying id and placeholder values.

        TODO: every field below is a hard-coded stub; nothing is read from
        the page yet.
        """
        person = Person(id, 'nameplop')
        person.real_name = 'rn'
        person.birth_date = datetime.now()
        person.birth_place = "place"
        person.gender = "M"
        person.nationality = "nn"
        person.biography = 'bio'
        person.awards = ["aw1", "aw2"]
        person.roles = {}
        return person

    def iter_movies(self, person_id):
        """Return an iterator over the movies of person_id.

        TODO: not implemented; an empty iterator is returned so that code
        looping over the result gets no items instead of None.
        """
        return iter([])

39
modules/imdb/test.py Normal file
View file

@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Julien Veyssier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.test import BackendTest
#from random import choice
class ImdbTest(BackendTest):
    """Checks for the 'imdb' backend search methods."""

    BACKEND = 'imdb'

    def test_movie(self):
        """Every movie found for 'spiderman' has an id, a title and a date."""
        found = list(self.backend.iter_movies('spiderman'))
        for entry in found:
            assert entry.id
            assert entry.original_title
            assert entry.release_date

    def test_persons(self):
        """Every person found for 'robert' has an id, a name and a birth date."""
        found = list(self.backend.iter_persons('robert'))
        for entry in found:
            assert entry.id
            assert entry.name
            assert entry.birth_date

View file

@ -23,6 +23,7 @@ import sys
from datetime import datetime from datetime import datetime
from weboob.capabilities.cinema import ICapCinema from weboob.capabilities.cinema import ICapCinema
from weboob.capabilities.base import NotAvailable
from weboob.tools.application.repl import ReplApplication from weboob.tools.application.repl import ReplApplication
from weboob.tools.application.formatters.iformatter import IFormatter, PrettyFormatter from weboob.tools.application.formatters.iformatter import IFormatter, PrettyFormatter
@ -38,7 +39,7 @@ class MovieInfoFormatter(IFormatter):
result += 'ID: %s\n' % obj.fullid result += 'ID: %s\n' % obj.fullid
result += 'Other titles: %s\n' % obj.other_titles result += 'Other titles: %s\n' % obj.other_titles
result += 'Released: %s\n' % obj.release_date result += 'Released: %s\n' % obj.release_date
result += 'Duration: %d\n' % obj.duration result += 'Duration: %s\n' % obj.duration
result += 'Note: %s\n' % obj.note result += 'Note: %s\n' % obj.note
if obj.roles: if obj.roles:
result += '\n%sRelated persons%s\n' % (self.BOLD, self.NC) result += '\n%sRelated persons%s\n' % (self.BOLD, self.NC)
@ -62,7 +63,7 @@ class MovieListFormatter(PrettyFormatter):
return obj.original_title return obj.original_title
def get_description(self, obj): def get_description(self, obj):
return 'Released: %s (note: %d, duration: %d)' % (obj.release_date, obj.note, obj.duration) return 'Released: %s (note: %s, duration: %s)' % (obj.release_date, obj.note, obj.duration)
def yearsago(years, from_date=None): def yearsago(years, from_date=None):
if from_date is None: if from_date is None:
@ -94,7 +95,7 @@ class PersonInfoFormatter(IFormatter):
result += 'Birth date: %s\n' % obj.birth_date result += 'Birth date: %s\n' % obj.birth_date
age = num_years(obj.birth_date) age = num_years(obj.birth_date)
result += 'Age: %s\n' % age result += 'Age: %s\n' % age
result += 'Birth place: %d\n' % obj.birth_place result += 'Birth place: %s\n' % obj.birth_place
result += 'Gender: %s\n' % obj.gender result += 'Gender: %s\n' % obj.gender
result += 'Nationality: %s\n' % obj.nationality result += 'Nationality: %s\n' % obj.nationality
if obj.roles: if obj.roles:
@ -107,7 +108,7 @@ class PersonInfoFormatter(IFormatter):
result += '\n%sAwards%s\n' % (self.BOLD, self.NC) result += '\n%sAwards%s\n' % (self.BOLD, self.NC)
for a in obj.awards: for a in obj.awards:
result += ' * %s\n' % a result += ' * %s\n' % a
result += '\n%Biography%s\n' % (self.BOLD, self.NC) result += '\n%sBiography%s\n' % (self.BOLD, self.NC)
result += obj.biography result += obj.biography
return result return result
@ -119,8 +120,11 @@ class PersonListFormatter(PrettyFormatter):
return obj.name return obj.name
def get_description(self, obj): def get_description(self, obj):
age = num_years(obj.birth_date) if obj.birth_date != NotAvailable:
return 'Real name: %s (age: %d, nationality: %s, gender: %s)' % (obj.real_name, age, obj.nationality, obj.gender) age = num_years(obj.birth_date)
else:
age = NotAvailable
return 'Real name: %s (age: %s, nationality: %s, gender: %s)' % (obj.real_name, age, obj.nationality, obj.gender)
class Cineoob(ReplApplication): class Cineoob(ReplApplication):
@ -155,7 +159,7 @@ class Cineoob(ReplApplication):
Get information about a movie. Get information about a movie.
""" """
# TODO verify if path = search movie or filmo
movie = self.get_object(id, 'get_movie') movie = self.get_object(id, 'get_movie')
if not movie: if not movie:
print >>sys.stderr, 'Movie not found: %s' % id print >>sys.stderr, 'Movie not found: %s' % id
@ -171,7 +175,7 @@ class Cineoob(ReplApplication):
Get information about a person. Get information about a person.
""" """
# TODO verify if path = search person or casting
person = self.get_object(id, 'get_person') person = self.get_object(id, 'get_person')
if not person: if not person:
print >>sys.stderr, 'Person not found: %s' % id print >>sys.stderr, 'Person not found: %s' % id
@ -217,8 +221,13 @@ class Cineoob(ReplApplication):
List persons related to a movie. List persons related to a movie.
""" """
movie = self.get_object(movie_id, 'get_movie')
if not movie:
print >>sys.stderr, 'Movie not found: %s' % id
return 3
self.change_path([u'casting']) self.change_path([u'casting'])
for backend, person in self.do('iter_movie_persons', movie_id): for backend, person in self.do('iter_movie_persons', movie.id):
self.cached_format(person) self.cached_format(person)
self.flush() self.flush()
@ -228,7 +237,12 @@ class Cineoob(ReplApplication):
List movies of a person. List movies of a person.
""" """
person = self.get_object(person_id, 'get_person')
if not person:
print >>sys.stderr, 'Person not found: %s' % id
return 3
self.change_path([u'filmography']) self.change_path([u'filmography'])
for backend, movie in self.do('iter_person_movies', person_id): for backend, movie in self.do('iter_person_movies', person.id):
self.cached_format(movie) self.cached_format(movie)
self.flush() self.flush()

View file

@ -30,7 +30,7 @@ from weboob.tools.ordereddict import OrderedDict
__all__ = ['UserError', 'FieldNotFound', 'NotAvailable', __all__ = ['UserError', 'FieldNotFound', 'NotAvailable',
'NotLoaded', 'IBaseCap', 'Field', 'IntField', 'DecimalField', 'NotLoaded', 'IBaseCap', 'Field', 'IntField', 'DecimalField',
'FloatField', 'StringField', 'BytesField', 'DateField', 'FloatField', 'StringField', 'BytesField', 'DateField',
'DeltaField', 'HashTableField', 'empty', 'CapBaseObject'] 'DeltaField', 'empty', 'CapBaseObject']
def empty(value): def empty(value):

View file

@ -18,7 +18,7 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from .base import IBaseCap, CapBaseObject, DateField, StringField, IntField, HashTableField from .base import IBaseCap, CapBaseObject, DateField, StringField, IntField, Field
__all__ = ['Movie', 'Person', 'ICapCinema'] __all__ = ['Movie', 'Person', 'ICapCinema']
@ -34,8 +34,8 @@ class Movie(CapBaseObject):
duration = IntField('Duration of the movie in minutes') duration = IntField('Duration of the movie in minutes')
description = StringField('Short description of the movie') description = StringField('Short description of the movie')
note = StringField('Notation of the movie') note = StringField('Notation of the movie')
awards = StringField('Awards won by the movie') awards = Field('Awards won by the movie',list)
roles = HashTableField('Lists of Persons related to the movie indexed by roles') roles = Field('Lists of Persons related to the movie indexed by roles',dict)
def __init__(self, id, original_title): def __init__(self, id, original_title):
CapBaseObject.__init__(self, id) CapBaseObject.__init__(self, id)
@ -53,8 +53,8 @@ class Person(CapBaseObject):
gender = StringField('Gender of a person') gender = StringField('Gender of a person')
nationality = StringField('Nationality of a person') nationality = StringField('Nationality of a person')
biography = StringField('Short biography of a person') biography = StringField('Short biography of a person')
awards = StringField('Awards won by the person') awards = Field('Awards won by the person',list)
roles = HashTableField('Lists of movies related to the person indexed by roles') roles = Field('Lists of movies related to the person indexed by roles',dict)
def __init__(self, id, name): def __init__(self, id, name):
CapBaseObject.__init__(self, id) CapBaseObject.__init__(self, id)