diff --git a/modules/allocine/browser.py b/modules/allocine/browser.py index d76a9423..c088d3de 100644 --- a/modules/allocine/browser.py +++ b/modules/allocine/browser.py @@ -18,14 +18,11 @@ # along with weboob. If not, see . -import HTMLParser -from weboob.tools.browser import BaseBrowser, BrowserHTTPNotFound +from weboob.tools.browser import BaseBrowser from weboob.capabilities.base import NotAvailable, NotLoaded from weboob.capabilities.cinema import Movie, Person from weboob.tools.json import json -from .pages import PersonPage, MovieCrewPage, BiographyPage, FilmographyPage, ReleasePage - from datetime import datetime __all__ = ['AllocineBrowser'] @@ -36,13 +33,6 @@ class AllocineBrowser(BaseBrowser): PROTOCOL = 'http' ENCODING = 'utf-8' USER_AGENT = BaseBrowser.USER_AGENTS['wget'] - #PAGES = { - # 'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage, - # 'http://www.imdb.com/title/tt[0-9]*/releaseinfo.*': ReleasePage, - # 'http://www.imdb.com/name/nm[0-9]*/*': PersonPage, - # 'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage, - # 'http://www.imdb.com/name/nm[0-9]*/filmo.*': FilmographyPage, - #} def iter_movies(self, pattern): res = self.readurl('http://api.allocine.fr/rest/v3/search?partner=YW5kcm9pZC12M3M&filter=movie&q=%s&format=json' % pattern.encode('utf-8')) @@ -236,10 +226,34 @@ class AllocineBrowser(BaseBrowser): return person def iter_movie_persons(self, movie_id, role): - self.location('http://www.imdb.com/title/%s/fullcredits' % movie_id) - assert self.is_on_page(MovieCrewPage) - for p in self.page.iter_persons(role): - yield p + res = self.readurl( + 'http://api.allocine.fr/rest/v3/movie?partner=YW5kcm9pZC12M3M&code=%s&profile=large&mediafmt=mp4-lc&format=json&filter=movie&striptags=synopsis,synopsisshort' % movie_id) + if res is not None: + jres = json.loads(res)['movie'] + else: + return + if 'castMember' in jres: + for cast in jres['castMember']: + id = cast['person']['code'] + name = unicode(cast['person']['name']) + short_description = unicode(cast['activity']['$']) + if 'role' in cast: + short_description += ', %s' % cast['role'] + thumbnail_url = NotAvailable + if 'picture' in cast: + thumbnail_url = unicode(cast['picture']['href']) + person = Person(id, name) + person.short_description = short_description + person.real_name = NotLoaded + person.birth_place = NotLoaded + person.birth_date = NotLoaded + person.death_date = NotLoaded + person.gender = NotLoaded + person.nationality = NotLoaded + person.short_biography = NotLoaded + person.roles = NotLoaded + person.thumbnail_url = thumbnail_url + yield person def iter_person_movies(self, person_id, role_filter): res = self.readurl( @@ -270,45 +284,25 @@ class AllocineBrowser(BaseBrowser): yield movie def iter_person_movies_ids(self, person_id): - self.location('http://www.imdb.com/name/%s/filmotype' % person_id) - assert self.is_on_page(FilmographyPage) - for movie in self.page.iter_movies_ids(): - yield movie + res = self.readurl( + 'http://api.allocine.fr/rest/v3/filmography?partner=YW5kcm9pZC12M3M&profile=medium&code=%s&filter=movie&format=json' % person_id) + if res is not None: + jres = json.loads(res)['person'] + else: + return + for m in jres['participation']: + yield unicode(m['movie']['code']) def iter_movie_persons_ids(self, movie_id): - self.location('http://www.imdb.com/title/%s/fullcredits' % movie_id) - assert self.is_on_page(MovieCrewPage) - for person in self.page.iter_persons_ids(): - yield person + res = self.readurl( + 'http://api.allocine.fr/rest/v3/movie?partner=YW5kcm9pZC12M3M&code=%s&profile=large&mediafmt=mp4-lc&format=json&filter=movie&striptags=synopsis,synopsisshort' % movie_id) + if res is not None: + jres = json.loads(res)['movie'] + else: + return + if 'castMember' in jres: + for cast in jres['castMember']: + yield unicode(cast['person']['code']) def get_movie_releases(self, id, country): return - self.location('http://www.imdb.com/title/%s/releaseinfo' % id) - assert self.is_on_page(ReleasePage) - return self.page.get_movie_releases(country) - - -dict_hex = {'á': u'á', - 'é': u'é', - 'è': u'è', - 'í': u'í', - 'ñ': u'ñ', - 'ó': u'ó', - 'ú': u'ú', - 'ü': u'ü', - '&': u'&', - ''': u"'", - 'à': u'à', - 'À': u'À', - 'â': u'â', - 'É': u'É', - 'ë': u'ë', - 'ô': u'ô', - 'ç': u'ç' - } - - -def latin2unicode(word): - for key in dict_hex.keys(): - word = word.replace(key, dict_hex[key]) - return unicode(word) diff --git a/modules/allocine/pages.py b/modules/allocine/pages.py deleted file mode 100644 index 43f51715..00000000 --- a/modules/allocine/pages.py +++ /dev/null @@ -1,231 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright(C) 2013 Julien Veyssier -# -# This file is part of weboob. -# -# weboob is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# weboob is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with weboob. If not, see . - - -from weboob.capabilities.cinema import Person, Movie -from weboob.capabilities.base import NotAvailable, NotLoaded -from weboob.tools.browser import BasePage - -from datetime import datetime - - -__all__ = ['PersonPage', 'MovieCrewPage', 'BiographyPage', 'FilmographyPage', 'ReleasePage'] - - -class ReleasePage(BasePage): - ''' Page containing releases of a movie - ''' - def get_movie_releases(self, country_filter): - result = unicode() - links = self.parser.select(self.document.getroot(), 'b a') - for a in links: - href = a.attrib.get('href', '') - if href.strip('/').split('/')[0] == 'calendar' and\ - (country_filter is None or href.split('region=')[-1].lower() == country_filter): - country = a.text - td_date = self.parser.select(a.getparent().getparent().getparent(), 'td')[1] - date_links = self.parser.select(td_date, 'a') - if len(date_links) > 1: - date = date_links[1].attrib.get('href', '').strip('/').split('/')[-1] - date += '-'+date_links[0].attrib.get('href', '').strip('/').split('/')[-1] - else: - date = unicode(self.parser.select(a.getparent().getparent().getparent(), 'td')[1].text_content()) - result += '%s : %s\n' % (country, date) - if result == u'': - result = NotAvailable - else: - result = result.strip() - return result - - -class BiographyPage(BasePage): - ''' Page containing biography of a person - ''' - def get_biography(self): - bio = unicode() - tn = self.parser.select(self.document.getroot(), 'div#tn15content', 1) - # we only read paragraphs, titles and links - for ch in tn.getchildren(): - if ch.tag in ['p', 'h5', 'a']: - bio += '%s\n\n' % ch.text_content().strip() - if bio == u'': - bio = NotAvailable - return bio - - -class MovieCrewPage(BasePage): - ''' Page listing all the persons related to a movie - ''' - def iter_persons(self, role_filter=None): - if (role_filter is None or (role_filter is not None and role_filter == 'actor')): - tables = self.parser.select(self.document.getroot(), 'table.cast') - if len(tables) > 0: - table = tables[0] - tds = self.parser.select(table, 'td.nm') - for td in tds: - id = td.find('a').attrib.get('href', '').strip('/').split('/')[-1] - name = unicode(td.find('a').text) - char_name = unicode(self.parser.select(td.getparent(), 'td.char', 1).text_content()) - person = Person(id, name) - person.short_description = char_name - person.real_name = NotLoaded - person.birth_place = NotLoaded - person.birth_date = NotLoaded - person.death_date = NotLoaded - person.gender = NotLoaded - person.nationality = NotLoaded - person.short_biography = NotLoaded - person.roles = NotLoaded - person.thumbnail_url = NotLoaded - yield person - - for gloss_link in self.parser.select(self.document.getroot(), 'table[cellspacing=1] h5 a'): - role = gloss_link.attrib.get('name', '').rstrip('s') - if (role_filter is None or (role_filter is not None and role == role_filter)): - tbody = gloss_link.getparent().getparent().getparent().getparent() - for line in self.parser.select(tbody, 'tr')[1:]: - for a in self.parser.select(line, 'a'): - role_detail = NotAvailable - href = a.attrib.get('href', '') - if '/name/nm' in href: - id = href.strip('/').split('/')[-1] - name = unicode(a.text) - if 'glossary' in href: - role_detail = unicode(a.text) - person = Person(id, name) - person.short_description = role_detail - yield person - # yield self.browser.get_person(id) - - def iter_persons_ids(self): - tables = self.parser.select(self.document.getroot(), 'table.cast') - if len(tables) > 0: - table = tables[0] - tds = self.parser.select(table, 'td.nm') - for td in tds: - id = td.find('a').attrib.get('href', '').strip('/').split('/')[-1] - yield id - - -class PersonPage(BasePage): - ''' Page giving informations about a person - It is used to build a Person instance and to get the movie list related to a person - ''' - def get_person(self, id): - name = NotAvailable - short_biography = NotAvailable - short_description = NotAvailable - birth_place = NotAvailable - birth_date = NotAvailable - death_date = NotAvailable - real_name = NotAvailable - gender = NotAvailable - thumbnail_url = NotAvailable - roles = {} - nationality = NotAvailable - td_overview = self.parser.select(self.document.getroot(), 'td#overview-top', 1) - descs = self.parser.select(td_overview, 'span[itemprop=description]') - if len(descs) > 0: - short_biography = unicode(descs[0].text) - rname_block = self.parser.select(td_overview, 'div.txt-block h4.inline') - if len(rname_block) > 0 and "born" in rname_block[0].text.lower(): - links = self.parser.select(rname_block[0].getparent(), 'a') - for a in links: - href = a.attrib.get('href', '').strip() - if href == 'bio': - real_name = unicode(a.text.strip()) - elif 'birth_place' in href: - birth_place = unicode(a.text.lower().strip()) - names = self.parser.select(td_overview, 'h1[itemprop=name]') - if len(names) > 0: - name = unicode(names[0].text.strip()) - times = self.parser.select(td_overview, 'time[itemprop=birthDate]') - if len(times) > 0: - time = times[0].attrib.get('datetime', '').split('-') - if len(time) == 3 and int(time[0]) >= 1900: - birth_date = datetime(int(time[0]), int(time[1]), int(time[2])) - dtimes = self.parser.select(td_overview, 'time[itemprop=deathDate]') - if len(dtimes) > 0: - dtime = dtimes[0].attrib.get('datetime', '').split('-') - if len(dtime) == 3 and int(dtime[0]) >= 1900: - death_date = datetime(int(dtime[0]), int(dtime[1]), int(dtime[2])) - img_thumbnail = self.parser.select(self.document.getroot(), 'td#img_primary img') - if len(img_thumbnail) > 0: - thumbnail_url = unicode(img_thumbnail[0].attrib.get('src', '')) - - # go to the filmography page - self.browser.location('http://www.imdb.com/name/%s/filmotype' % id) - assert self.browser.is_on_page(FilmographyPage) - roles = self.browser.page.get_roles() - - person = Person(id, name) - person.real_name = real_name - person.birth_date = birth_date - person.death_date = death_date - person.birth_place = birth_place - person.gender = gender - person.nationality = nationality - person.short_biography = short_biography - person.short_description = short_description - person.roles = roles - person.thumbnail_url = thumbnail_url - return person - - -class FilmographyPage(BasePage): - ''' Page of detailed filmography of a person, sorted by type of role - This page is easier to parse than the main person page filmography - ''' - def iter_movies_ids(self): - for role_div in self.parser.select(self.document.getroot(), 'div.filmo'): - for a in self.parser.select(role_div, 'ol > li > a'): - id = a.attrib.get('href', '').strip('/').split('/')[-1] - if id.startswith('tt'): - yield id - - def get_roles(self): - roles = {} - for role_div in self.parser.select(self.document.getroot(), 'div.filmo'): - role = self.parser.select(role_div, 'h5 a', 1).text.replace(':', '') - roles[role] = [] - for a in self.parser.select(role_div, 'ol > li > a'): - id = a.attrib.get('href', '').strip('/').split('/')[-1] - if id.startswith('tt'): - if '(' in a.tail and ')' in a.tail: - between_p = a.tail.split(')')[0].split('(')[1] - else: - between_p = '????' - roles[role].append('(%s) %s' % (between_p, a.text)) - return roles - - def iter_movies(self, role_filter=None): - for role_div in self.parser.select(self.document.getroot(), 'div.filmo'): - role = self.parser.select(role_div, 'h5 a', 1).text.replace(':', '') - if (role_filter is None or (role_filter is not None and role.lower().strip() == role_filter))\ - and role != 'In Development': - for a in self.parser.select(role_div, 'ol > li > a'): - id = a.attrib.get('href', '').strip('/').split('/')[-1] - if id.startswith('tt'): - title = unicode(a.text) - role_detail = NotAvailable - if len(a.tail) > 0: - role_detail = unicode(' '.join(a.tail.replace('..', '').split())) - movie = Movie(id, title) - movie.short_description = role_detail - yield movie diff --git a/modules/allocine/test.py b/modules/allocine/test.py index aae7ba71..46c12193 100644 --- a/modules/allocine/test.py +++ b/modules/allocine/test.py @@ -20,8 +20,8 @@ from weboob.tools.test import BackendTest -class ImdbTest(BackendTest): - BACKEND = 'imdb' +class AllocineTest(BackendTest): + BACKEND = 'allocine' def test_search_movie(self): movies = list(self.backend.iter_movies('spiderman')) @@ -29,7 +29,7 @@ class ImdbTest(BackendTest): assert movie.id def test_get_movie(self): - movie = self.backend.get_movie('tt0079980') + movie = self.backend.get_movie('5032') assert movie.id assert movie.original_title @@ -39,29 +39,19 @@ class ImdbTest(BackendTest): assert person.id def test_get_person(self): - person = self.backend.get_person('nm0223033') + person = self.backend.get_person('1116') assert person.id assert person.name assert person.birth_date def test_movie_persons(self): - persons = list(self.backend.iter_movie_persons('tt0079980')) + persons = list(self.backend.iter_movie_persons('5032')) for person in persons: assert person.id assert person.name def test_person_movies(self): - movies = list(self.backend.iter_person_movies('nm0223033')) + movies = list(self.backend.iter_person_movies('1115')) for movie in movies: assert movie.id assert movie.original_title - - def test_get_person_biography(self): - bio = self.backend.get_person_biography('nm0223033') - assert bio != '' - assert bio is not None - - def test_get_movie_releases(self): - rel = self.backend.get_movie_releases('tt0079980') - assert rel != '' - assert rel is not None diff --git a/weboob/applications/qcineoob/person.py b/weboob/applications/qcineoob/person.py index 32137dbc..eb166c87 100644 --- a/weboob/applications/qcineoob/person.py +++ b/weboob/applications/qcineoob/person.py @@ -82,7 +82,9 @@ class Person(QFrame): def biography(self): QApplication.setOverrideCursor(Qt.WaitCursor) - bio = self.backend.get_person_biography(self.person.id) + self.backend.fill_person(self.person, 'biography') + bio = self.person.biography + #bio = self.backend.get_person_biography(self.person.id) self.ui.shortBioPlain.setPlainText(bio) self.ui.biographyLabel.setText('Full biography:') self.ui.biographyButton.hide()