[imdb] fix : site changed
This commit is contained in:
parent
49caf13c6b
commit
7b02b2750b
3 changed files with 34 additions and 53 deletions
|
|
@ -25,7 +25,7 @@ from weboob.capabilities.base import NotAvailable, NotLoaded
|
||||||
from weboob.capabilities.cinema import Movie, Person
|
from weboob.capabilities.cinema import Movie, Person
|
||||||
from weboob.tools.json import json
|
from weboob.tools.json import json
|
||||||
|
|
||||||
from .pages import PersonPage, MovieCrewPage, BiographyPage, FilmographyPage, ReleasePage
|
from .pages import PersonPage, MovieCrewPage, BiographyPage, ReleasePage
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
@ -42,7 +42,6 @@ class ImdbBrowser(BaseBrowser):
|
||||||
'http://www.imdb.com/title/tt[0-9]*/releaseinfo.*': ReleasePage,
|
'http://www.imdb.com/title/tt[0-9]*/releaseinfo.*': ReleasePage,
|
||||||
'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
|
'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
|
||||||
'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
|
'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
|
||||||
'http://www.imdb.com/name/nm[0-9]*/filmo.*': FilmographyPage,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def iter_movies(self, pattern):
|
def iter_movies(self, pattern):
|
||||||
|
|
@ -174,13 +173,13 @@ class ImdbBrowser(BaseBrowser):
|
||||||
yield p
|
yield p
|
||||||
|
|
||||||
def iter_person_movies(self, person_id, role):
|
def iter_person_movies(self, person_id, role):
|
||||||
self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
|
self.location('http://www.imdb.com/name/%s' % person_id)
|
||||||
assert self.is_on_page(FilmographyPage)
|
assert self.is_on_page(PersonPage)
|
||||||
return self.page.iter_movies(role)
|
return self.page.iter_movies(role)
|
||||||
|
|
||||||
def iter_person_movies_ids(self, person_id):
|
def iter_person_movies_ids(self, person_id):
|
||||||
self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
|
self.location('http://www.imdb.com/name/%s' % person_id)
|
||||||
assert self.is_on_page(FilmographyPage)
|
assert self.is_on_page(PersonPage)
|
||||||
for movie in self.page.iter_movies_ids():
|
for movie in self.page.iter_movies_ids():
|
||||||
yield movie
|
yield movie
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -21,12 +21,12 @@
|
||||||
from weboob.capabilities.cinema import Person, Movie
|
from weboob.capabilities.cinema import Person, Movie
|
||||||
from weboob.capabilities.base import NotAvailable, NotLoaded
|
from weboob.capabilities.base import NotAvailable, NotLoaded
|
||||||
from weboob.tools.browser import BasePage
|
from weboob.tools.browser import BasePage
|
||||||
|
from weboob.tools.html import html2text
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['PersonPage', 'MovieCrewPage', 'BiographyPage', 'FilmographyPage', 'ReleasePage']
|
__all__ = ['PersonPage', 'MovieCrewPage', 'BiographyPage', 'ReleasePage']
|
||||||
|
|
||||||
|
|
||||||
class ReleasePage(BasePage):
|
class ReleasePage(BasePage):
|
||||||
|
|
@ -62,13 +62,15 @@ class BiographyPage(BasePage):
|
||||||
'''
|
'''
|
||||||
def get_biography(self):
|
def get_biography(self):
|
||||||
bio = unicode()
|
bio = unicode()
|
||||||
tn = self.parser.select(self.document.getroot(), 'div#tn15content', 1)
|
start = False
|
||||||
# we only read paragraphs, titles and links
|
tn = self.parser.select(self.document.getroot(), 'div#bio_content', 1)
|
||||||
for ch in tn.getchildren():
|
for el in tn.getchildren():
|
||||||
if ch.tag in ['p', 'h5', 'a']:
|
if el.attrib.get('name') == 'mini_bio':
|
||||||
bio += '%s\n\n' % ch.text_content().strip()
|
start = True
|
||||||
if bio == u'':
|
|
||||||
bio = NotAvailable
|
if start:
|
||||||
|
bio += html2text(self.parser.tostring(el))
|
||||||
|
|
||||||
return bio
|
return bio
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -173,10 +175,7 @@ class PersonPage(BasePage):
|
||||||
if len(img_thumbnail) > 0:
|
if len(img_thumbnail) > 0:
|
||||||
thumbnail_url = unicode(img_thumbnail[0].attrib.get('src', ''))
|
thumbnail_url = unicode(img_thumbnail[0].attrib.get('src', ''))
|
||||||
|
|
||||||
# go to the filmography page
|
roles = self.get_roles()
|
||||||
self.browser.location('http://www.imdb.com/name/%s/filmotype' % id)
|
|
||||||
assert self.browser.is_on_page(FilmographyPage)
|
|
||||||
roles = self.browser.page.get_roles()
|
|
||||||
|
|
||||||
person = Person(id, name)
|
person = Person(id, name)
|
||||||
person.real_name = real_name
|
person.real_name = real_name
|
||||||
|
|
@ -191,45 +190,27 @@ class PersonPage(BasePage):
|
||||||
person.thumbnail_url = thumbnail_url
|
person.thumbnail_url = thumbnail_url
|
||||||
return person
|
return person
|
||||||
|
|
||||||
|
|
||||||
class FilmographyPage(BasePage):
|
|
||||||
''' Page of detailed filmography of a person, sorted by type of role
|
|
||||||
This page is easier to parse than the main person page filmography
|
|
||||||
'''
|
|
||||||
def iter_movies_ids(self):
|
def iter_movies_ids(self):
|
||||||
for role_div in self.parser.select(self.document.getroot(), 'div.filmo'):
|
for role_div in self.parser.select(self.document.getroot(), 'div#filmography div.filmo-category-section > div'):
|
||||||
for a in self.parser.select(role_div, 'ol > li > a'):
|
for a in self.parser.select(role_div, 'a'):
|
||||||
id = a.attrib.get('href', '').strip('/').split('/')[-1]
|
m = re.search('/title/(tt.*)/\?.*', a.attrib.get('href'))
|
||||||
if id.startswith('tt'):
|
if m:
|
||||||
yield id
|
yield m.group(1)
|
||||||
|
|
||||||
def get_roles(self):
|
def get_roles(self):
|
||||||
roles = {}
|
roles = {}
|
||||||
for role_div in self.parser.select(self.document.getroot(), 'div.filmo'):
|
for role_div in self.parser.select(self.document.getroot(), 'div#filmography > div.head'):
|
||||||
role = self.parser.select(role_div, 'h5 a', 1).text.replace(':', '')
|
role = self.parser.select(role_div, 'a')[-1].text
|
||||||
roles[role] = []
|
roles[role] = []
|
||||||
for a in self.parser.select(role_div, 'ol > li > a'):
|
category = role_div.attrib.get('data-category')
|
||||||
id = a.attrib.get('href', '').strip('/').split('/')[-1]
|
for infos in self.parser.select(self.document.getroot(), 'div#filmography > div.filmo-category-section > div'):
|
||||||
if id.startswith('tt'):
|
if category in infos.attrib.get('id'):
|
||||||
if '(' in a.tail and ')' in a.tail:
|
roles[role].append(infos.text_content().replace('\n', ' ').strip())
|
||||||
between_p = a.tail.split(')')[0].split('(')[1]
|
|
||||||
else:
|
|
||||||
between_p = '????'
|
|
||||||
roles[role].append('(%s) %s' % (between_p, a.text))
|
|
||||||
return roles
|
return roles
|
||||||
|
|
||||||
def iter_movies(self, role_filter=None):
|
def iter_movies(self, role_filter=None):
|
||||||
for role_div in self.parser.select(self.document.getroot(), 'div.filmo'):
|
for role_div in self.parser.select(self.document.getroot(), 'div#filmography > div.filmo-category-section > div'):
|
||||||
role = self.parser.select(role_div, 'h5 a', 1).text.replace(':', '')
|
for a in self.parser.select(role_div, 'a'):
|
||||||
if (role_filter is None or (role_filter is not None and role.lower().strip() == role_filter))\
|
m = re.search('/title/(tt.*)/\?.*', a.attrib.get('href'))
|
||||||
and role != 'In Development':
|
if m:
|
||||||
for a in self.parser.select(role_div, 'ol > li > a'):
|
yield Movie(m.group(1), a.text)
|
||||||
id = a.attrib.get('href', '').strip('/').split('/')[-1]
|
|
||||||
if id.startswith('tt'):
|
|
||||||
title = unicode(a.text)
|
|
||||||
role_detail = NotAvailable
|
|
||||||
if len(a.tail) > 0:
|
|
||||||
role_detail = unicode(' '.join(a.tail.replace('..', '').split()))
|
|
||||||
movie = Movie(id, title)
|
|
||||||
movie.short_description = role_detail
|
|
||||||
yield movie
|
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,7 @@
|
||||||
|
|
||||||
from weboob.tools.test import BackendTest
|
from weboob.tools.test import BackendTest
|
||||||
|
|
||||||
|
|
||||||
class ImdbTest(BackendTest):
|
class ImdbTest(BackendTest):
|
||||||
BACKEND = 'imdb'
|
BACKEND = 'imdb'
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue