[imdb] fix : site changed

This commit is contained in:
Bezleputh 2014-07-17 10:30:59 +02:00
commit 7b02b2750b
3 changed files with 34 additions and 53 deletions

View file

@ -25,7 +25,7 @@ from weboob.capabilities.base import NotAvailable, NotLoaded
from weboob.capabilities.cinema import Movie, Person from weboob.capabilities.cinema import Movie, Person
from weboob.tools.json import json from weboob.tools.json import json
from .pages import PersonPage, MovieCrewPage, BiographyPage, FilmographyPage, ReleasePage from .pages import PersonPage, MovieCrewPage, BiographyPage, ReleasePage
from datetime import datetime from datetime import datetime
@ -42,7 +42,6 @@ class ImdbBrowser(BaseBrowser):
'http://www.imdb.com/title/tt[0-9]*/releaseinfo.*': ReleasePage, 'http://www.imdb.com/title/tt[0-9]*/releaseinfo.*': ReleasePage,
'http://www.imdb.com/name/nm[0-9]*/*': PersonPage, 'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage, 'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
'http://www.imdb.com/name/nm[0-9]*/filmo.*': FilmographyPage,
} }
def iter_movies(self, pattern): def iter_movies(self, pattern):
@ -174,13 +173,13 @@ class ImdbBrowser(BaseBrowser):
yield p yield p
def iter_person_movies(self, person_id, role): def iter_person_movies(self, person_id, role):
self.location('http://www.imdb.com/name/%s/filmotype' % person_id) self.location('http://www.imdb.com/name/%s' % person_id)
assert self.is_on_page(FilmographyPage) assert self.is_on_page(PersonPage)
return self.page.iter_movies(role) return self.page.iter_movies(role)
def iter_person_movies_ids(self, person_id): def iter_person_movies_ids(self, person_id):
self.location('http://www.imdb.com/name/%s/filmotype' % person_id) self.location('http://www.imdb.com/name/%s' % person_id)
assert self.is_on_page(FilmographyPage) assert self.is_on_page(PersonPage)
for movie in self.page.iter_movies_ids(): for movie in self.page.iter_movies_ids():
yield movie yield movie

View file

@ -21,12 +21,12 @@
from weboob.capabilities.cinema import Person, Movie from weboob.capabilities.cinema import Person, Movie
from weboob.capabilities.base import NotAvailable, NotLoaded from weboob.capabilities.base import NotAvailable, NotLoaded
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.html import html2text
from datetime import datetime from datetime import datetime
import re import re
__all__ = ['PersonPage', 'MovieCrewPage', 'BiographyPage', 'FilmographyPage', 'ReleasePage'] __all__ = ['PersonPage', 'MovieCrewPage', 'BiographyPage', 'ReleasePage']
class ReleasePage(BasePage): class ReleasePage(BasePage):
@ -62,13 +62,15 @@ class BiographyPage(BasePage):
''' '''
def get_biography(self): def get_biography(self):
bio = unicode() bio = unicode()
tn = self.parser.select(self.document.getroot(), 'div#tn15content', 1) start = False
# we only read paragraphs, titles and links tn = self.parser.select(self.document.getroot(), 'div#bio_content', 1)
for ch in tn.getchildren(): for el in tn.getchildren():
if ch.tag in ['p', 'h5', 'a']: if el.attrib.get('name') == 'mini_bio':
bio += '%s\n\n' % ch.text_content().strip() start = True
if bio == u'':
bio = NotAvailable if start:
bio += html2text(self.parser.tostring(el))
return bio return bio
@ -173,10 +175,7 @@ class PersonPage(BasePage):
if len(img_thumbnail) > 0: if len(img_thumbnail) > 0:
thumbnail_url = unicode(img_thumbnail[0].attrib.get('src', '')) thumbnail_url = unicode(img_thumbnail[0].attrib.get('src', ''))
# go to the filmography page roles = self.get_roles()
self.browser.location('http://www.imdb.com/name/%s/filmotype' % id)
assert self.browser.is_on_page(FilmographyPage)
roles = self.browser.page.get_roles()
person = Person(id, name) person = Person(id, name)
person.real_name = real_name person.real_name = real_name
@ -191,45 +190,27 @@ class PersonPage(BasePage):
person.thumbnail_url = thumbnail_url person.thumbnail_url = thumbnail_url
return person return person
class FilmographyPage(BasePage):
''' Page of detailed filmography of a person, sorted by type of role
This page is easier to parse than the main person page filmography
'''
def iter_movies_ids(self): def iter_movies_ids(self):
for role_div in self.parser.select(self.document.getroot(), 'div.filmo'): for role_div in self.parser.select(self.document.getroot(), 'div#filmography div.filmo-category-section > div'):
for a in self.parser.select(role_div, 'ol > li > a'): for a in self.parser.select(role_div, 'a'):
id = a.attrib.get('href', '').strip('/').split('/')[-1] m = re.search('/title/(tt.*)/\?.*', a.attrib.get('href'))
if id.startswith('tt'): if m:
yield id yield m.group(1)
def get_roles(self): def get_roles(self):
roles = {} roles = {}
for role_div in self.parser.select(self.document.getroot(), 'div.filmo'): for role_div in self.parser.select(self.document.getroot(), 'div#filmography > div.head'):
role = self.parser.select(role_div, 'h5 a', 1).text.replace(':', '') role = self.parser.select(role_div, 'a')[-1].text
roles[role] = [] roles[role] = []
for a in self.parser.select(role_div, 'ol > li > a'): category = role_div.attrib.get('data-category')
id = a.attrib.get('href', '').strip('/').split('/')[-1] for infos in self.parser.select(self.document.getroot(), 'div#filmography > div.filmo-category-section > div'):
if id.startswith('tt'): if category in infos.attrib.get('id'):
if '(' in a.tail and ')' in a.tail: roles[role].append(infos.text_content().replace('\n', ' ').strip())
between_p = a.tail.split(')')[0].split('(')[1]
else:
between_p = '????'
roles[role].append('(%s) %s' % (between_p, a.text))
return roles return roles
def iter_movies(self, role_filter=None): def iter_movies(self, role_filter=None):
for role_div in self.parser.select(self.document.getroot(), 'div.filmo'): for role_div in self.parser.select(self.document.getroot(), 'div#filmography > div.filmo-category-section > div'):
role = self.parser.select(role_div, 'h5 a', 1).text.replace(':', '') for a in self.parser.select(role_div, 'a'):
if (role_filter is None or (role_filter is not None and role.lower().strip() == role_filter))\ m = re.search('/title/(tt.*)/\?.*', a.attrib.get('href'))
and role != 'In Development': if m:
for a in self.parser.select(role_div, 'ol > li > a'): yield Movie(m.group(1), a.text)
id = a.attrib.get('href', '').strip('/').split('/')[-1]
if id.startswith('tt'):
title = unicode(a.text)
role_detail = NotAvailable
if len(a.tail) > 0:
role_detail = unicode(' '.join(a.tail.replace('..', '').split()))
movie = Movie(id, title)
movie.short_description = role_detail
yield movie

View file

@ -19,6 +19,7 @@
from weboob.tools.test import BackendTest from weboob.tools.test import BackendTest
class ImdbTest(BackendTest): class ImdbTest(BackendTest):
BACKEND = 'imdb' BACKEND = 'imdb'