From 7b02b2750bae7fc98c7eb7c4a9518f2415d564f6 Mon Sep 17 00:00:00 2001 From: Bezleputh Date: Thu, 17 Jul 2014 10:30:59 +0200 Subject: [PATCH] [imdb] fix : site changed --- modules/imdb/browser.py | 11 +++--- modules/imdb/pages.py | 75 +++++++++++++++-------------------------- modules/imdb/test.py | 1 + 3 files changed, 34 insertions(+), 53 deletions(-) diff --git a/modules/imdb/browser.py b/modules/imdb/browser.py index 947f9d0f..15864ce5 100644 --- a/modules/imdb/browser.py +++ b/modules/imdb/browser.py @@ -25,7 +25,7 @@ from weboob.capabilities.base import NotAvailable, NotLoaded from weboob.capabilities.cinema import Movie, Person from weboob.tools.json import json -from .pages import PersonPage, MovieCrewPage, BiographyPage, FilmographyPage, ReleasePage +from .pages import PersonPage, MovieCrewPage, BiographyPage, ReleasePage from datetime import datetime @@ -42,7 +42,6 @@ class ImdbBrowser(BaseBrowser): 'http://www.imdb.com/title/tt[0-9]*/releaseinfo.*': ReleasePage, 'http://www.imdb.com/name/nm[0-9]*/*': PersonPage, 'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage, - 'http://www.imdb.com/name/nm[0-9]*/filmo.*': FilmographyPage, } def iter_movies(self, pattern): @@ -174,13 +173,13 @@ class ImdbBrowser(BaseBrowser): yield p def iter_person_movies(self, person_id, role): - self.location('http://www.imdb.com/name/%s/filmotype' % person_id) - assert self.is_on_page(FilmographyPage) + self.location('http://www.imdb.com/name/%s' % person_id) + assert self.is_on_page(PersonPage) return self.page.iter_movies(role) def iter_person_movies_ids(self, person_id): - self.location('http://www.imdb.com/name/%s/filmotype' % person_id) - assert self.is_on_page(FilmographyPage) + self.location('http://www.imdb.com/name/%s' % person_id) + assert self.is_on_page(PersonPage) for movie in self.page.iter_movies_ids(): yield movie diff --git a/modules/imdb/pages.py b/modules/imdb/pages.py index 5426bd77..fe4ab40d 100644 --- a/modules/imdb/pages.py +++ b/modules/imdb/pages.py @@ -21,12 +21,12 @@ from weboob.capabilities.cinema import Person, Movie from weboob.capabilities.base import NotAvailable, NotLoaded from weboob.tools.browser import BasePage - +from weboob.tools.html import html2text from datetime import datetime import re -__all__ = ['PersonPage', 'MovieCrewPage', 'BiographyPage', 'FilmographyPage', 'ReleasePage'] +__all__ = ['PersonPage', 'MovieCrewPage', 'BiographyPage', 'ReleasePage'] class ReleasePage(BasePage): @@ -62,13 +62,15 @@ class BiographyPage(BasePage): ''' def get_biography(self): bio = unicode() - tn = self.parser.select(self.document.getroot(), 'div#tn15content', 1) - # we only read paragraphs, titles and links - for ch in tn.getchildren(): - if ch.tag in ['p', 'h5', 'a']: - bio += '%s\n\n' % ch.text_content().strip() - if bio == u'': - bio = NotAvailable + start = False + tn = self.parser.select(self.document.getroot(), 'div#bio_content', 1) + for el in tn.getchildren(): + if el.attrib.get('name') == 'mini_bio': + start = True + + if start: + bio += html2text(self.parser.tostring(el)) + return bio @@ -173,10 +175,7 @@ class PersonPage(BasePage): if len(img_thumbnail) > 0: thumbnail_url = unicode(img_thumbnail[0].attrib.get('src', '')) - # go to the filmography page - self.browser.location('http://www.imdb.com/name/%s/filmotype' % id) - assert self.browser.is_on_page(FilmographyPage) - roles = self.browser.page.get_roles() + roles = self.get_roles() person = Person(id, name) person.real_name = real_name @@ -191,45 +190,27 @@ class PersonPage(BasePage): person.thumbnail_url = thumbnail_url return person - -class FilmographyPage(BasePage): - ''' Page of detailed filmography of a person, sorted by type of role - This page is easier to parse than the main person page filmography - ''' def iter_movies_ids(self): - for role_div in self.parser.select(self.document.getroot(), 'div.filmo'): - for a in self.parser.select(role_div, 'ol > li > a'): - id = a.attrib.get('href', '').strip('/').split('/')[-1] - if id.startswith('tt'): - yield id + for role_div in self.parser.select(self.document.getroot(), 'div#filmography div.filmo-category-section > div'): + for a in self.parser.select(role_div, 'a'): + m = re.search('/title/(tt.*)/\?.*', a.attrib.get('href')) + if m: + yield m.group(1) def get_roles(self): roles = {} - for role_div in self.parser.select(self.document.getroot(), 'div.filmo'): - role = self.parser.select(role_div, 'h5 a', 1).text.replace(':', '') + for role_div in self.parser.select(self.document.getroot(), 'div#filmography > div.head'): + role = self.parser.select(role_div, 'a')[-1].text roles[role] = [] - for a in self.parser.select(role_div, 'ol > li > a'): - id = a.attrib.get('href', '').strip('/').split('/')[-1] - if id.startswith('tt'): - if '(' in a.tail and ')' in a.tail: - between_p = a.tail.split(')')[0].split('(')[1] - else: - between_p = '????' - roles[role].append('(%s) %s' % (between_p, a.text)) + category = role_div.attrib.get('data-category') + for infos in self.parser.select(self.document.getroot(), 'div#filmography > div.filmo-category-section > div'): + if category in infos.attrib.get('id'): + roles[role].append(infos.text_content().replace('\n', ' ').strip()) return roles def iter_movies(self, role_filter=None): - for role_div in self.parser.select(self.document.getroot(), 'div.filmo'): - role = self.parser.select(role_div, 'h5 a', 1).text.replace(':', '') - if (role_filter is None or (role_filter is not None and role.lower().strip() == role_filter))\ - and role != 'In Development': - for a in self.parser.select(role_div, 'ol > li > a'): - id = a.attrib.get('href', '').strip('/').split('/')[-1] - if id.startswith('tt'): - title = unicode(a.text) - role_detail = NotAvailable - if len(a.tail) > 0: - role_detail = unicode(' '.join(a.tail.replace('..', '').split())) - movie = Movie(id, title) - movie.short_description = role_detail - yield movie + for role_div in self.parser.select(self.document.getroot(), 'div#filmography > div.filmo-category-section > div'): + for a in self.parser.select(role_div, 'a'): + m = re.search('/title/(tt.*)/\?.*', a.attrib.get('href')) + if m: + yield Movie(m.group(1), a.text) diff --git a/modules/imdb/test.py b/modules/imdb/test.py index 81e66854..a2b94fa3 100644 --- a/modules/imdb/test.py +++ b/modules/imdb/test.py @@ -19,6 +19,7 @@ from weboob.tools.test import BackendTest + class ImdbTest(BackendTest): BACKEND = 'imdb'