[imdb] filmography page muuuuch more readable than person page

2013-03-06 00:19:00 +01:00 · 2013-03-06 00:19:00 +01:00 · b91a1cd481
commit b91a1cd481
parent 27c36d412b
3 changed files with 55 additions and 42 deletions
--- a/modules/imdb/browser.py
+++ b/modules/imdb/browser.py
@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable
 from weboob.capabilities.cinema import Movie
 from weboob.tools.json import json
-from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage
+from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage, FilmographyPage
 from datetime import datetime
@ -40,6 +40,7 @@ class ImdbBrowser(BaseBrowser):
        'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
        'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
        'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
        'http://www.imdb.com/name/nm[0-9]*/filmo.*': FilmographyPage,
        }
    def iter_movies(self, pattern):
@ -48,7 +49,9 @@ class ImdbBrowser(BaseBrowser):
        for cat in ['title_popular','title_exact','title_approx']:
            if jres.has_key(cat):
                for m in jres[cat]:
-                    yield self.get_movie(m['id'])
+                    movie = self.get_movie(m['id'])
                    if movie != None:
                        yield movie
    def iter_persons(self, pattern):
        res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q=%s' % pattern.encode('utf-8'))
@ -60,7 +63,10 @@ class ImdbBrowser(BaseBrowser):
    def get_movie(self, id):
        res = self.readurl('http://imdbapi.org/?id=%s&type=json&plot=simple&episode=1&lang=en-US&aka=full&release=simple&business=0&tech=0' % id )
-        jres = json.loads(res)
+        if res != None:
            jres = json.loads(res)
        else:
            return None
        title = NotAvailable
        duration = NotAvailable
@ -133,9 +139,9 @@ class ImdbBrowser(BaseBrowser):
        return self.page.iter_persons(movie_id)
    def iter_person_movies(self, person_id):
-        self.location('http://www.imdb.com/name/%s' % person_id)
+        self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
-        assert self.is_on_page(PersonPage)
+        assert self.is_on_page(FilmographyPage)
-        return self.page.iter_movies(person_id)
+        return self.page.iter_movies()
    def iter_person_movies_ids(self, person_id):
        self.location('http://www.imdb.com/name/%s' % person_id)
--- a/modules/imdb/pages.py
+++ b/modules/imdb/pages.py
@ -25,7 +25,7 @@ from weboob.tools.browser import BasePage
 from datetime import datetime
-__all__ = ['MoviePage','PersonPage','MovieCrewPage']
+__all__ = ['MoviePage','PersonPage','MovieCrewPage','BiographyPage','FilmographyPage']
 class MoviePage(BasePage):
@ -50,8 +50,12 @@ class BiographyPage(BasePage):
    def get_biography(self):
        bio = ''
        tn = self.parser.select(self.document.getroot(),'div#tn15content',1)
-        for p in self.parser.select(tn,'p'):
+        #for p in self.parser.select(tn,'p'):
-            bio += '\n\n%s'%p.text_content().strip()
+        #    bio += '\n\n%s'%p.text_content().strip()
        # get children, append if label or tag = a,p,h...
        bio = tn.text_content().strip()
        if bio == "":
            bio = NotAvailable
        return bio
@ -125,31 +129,10 @@ class PersonPage(BasePage):
                dtime.append('1')
                dtime.append('1')
            death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2]))
-        # TODO IMPROVE THIS, apparently there's an error in parsing, quite hard to handle -----------
+        # go to the filmography page
-
+        self.browser.location('http://www.imdb.com/name/%s/filmotype'%id)
-        #filmo_block =  self.parser.select(self.document.getroot(),'div#filmography',1)
+        assert self.browser.is_on_page(FilmographyPage)
-        #role_list = []
+        roles = self.browser.page.get_roles()
        #for span in self.parser.select(self.document.getroot(),'span.show-link'):
        #    role_list.append(span.attrib.get('id','').replace('show-',''))
        #role_index = -1
        #current_parent = None
        ##for sp in self.parser.select(filmo_block[0],'span.show-link'):
        #for divmovie in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
        #    divhead = divmovie.getparent()
        #    print "-- %s"%(self.document.getpath(divhead))
        #    print divmovie.attrib.get('class','')
        #    if current_parent != self.document.getpath(divhead):
        #        role_index += 1
        #        current_parent = self.document.getpath(divhead)
        #    role = role_list[role_index]
        #    a = self.parser.select(divmovie,'b a',1)
        #    roles[role].append(a.text)
        #print roles
        roles['any activity'] = []
        for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
            a = self.parser.select(movie_div,'b a',1)
            roles['any activity'].append(a.text)
        person = Person(id,name)
        person.real_name       = real_name
@ -162,14 +145,38 @@ class PersonPage(BasePage):
        person.roles           = roles
        return person
    def iter_movies(self,person_id):
        """Yield a movie for every ``div`` with class ``filmo-row`` on this page.

        The movie id is the last path component of the ``b a`` link found in
        each row, and the movie itself is fetched via ``self.browser.get_movie``.
        Rows for which ``get_movie`` returns ``None`` are skipped, so callers
        never receive ``None``.

        :param person_id: not used by this method; kept so existing callers
                          that pass it keep working.
        """
        root = self.document.getroot()
        for movie_div in self.parser.select(root,'div[class~=filmo-row]'):
            link = self.parser.select(movie_div,'b a',1)
            # e.g. '/title/tt0123456/' -> 'tt0123456'
            movie_id = link.attrib.get('href','').strip('/').split('/')[-1]
            movie = self.browser.get_movie(movie_id)
            if movie is not None:
                yield movie
    def iter_movies_ids(self,person_id):
        """Yield the movie id found in every ``div`` with class ``filmo-row``.

        Each id is the last path component of the row's ``b a`` link href
        (an empty string when the link has no href). Nothing is fetched.

        :param person_id: not used by this method.
        """
        root = self.document.getroot()
        for row in self.parser.select(root,'div[class~=filmo-row]'):
            href = self.parser.select(row,'b a',1).attrib.get('href','')
            yield href.strip('/').split('/')[-1]
 class FilmographyPage(BasePage):
    """Page listing a person's titles grouped by role.

    The page is read as a sequence of ``div.filmo`` blocks: the role name
    comes from the block's ``h5 a`` heading (colons removed) and the titles
    from its ``ol > li > a`` links.
    """
    @staticmethod
    def _parenthesized(tail):
        """Return the text inside the first ``(...)`` of *tail*, or ``'????'``.

        *tail* is the text following a title link; it is ``None`` when the
        link has no trailing text, which must not raise.
        """
        if not tail or ')' not in tail:
            return '????'
        before_close = tail.split(')')[0]
        # A ')' that comes before any '(' leaves nothing to extract.
        if '(' not in before_close:
            return '????'
        return before_close.split('(')[1]
    def _iter_role_blocks(self):
        """Yield ``(role, titles)`` for every ``div.filmo`` block.

        ``titles`` is a list of ``(movie_id, link)`` pairs, restricted to links
        whose last href path component starts with ``tt``.
        """
        for role_div in self.parser.select(self.document.getroot(),'div.filmo'):
            role = self.parser.select(role_div,'h5 a',1).text.replace(':','')
            titles = []
            for link in self.parser.select(role_div,'ol > li > a'):
                movie_id = link.attrib.get('href','').strip('/').split('/')[-1]
                if movie_id.startswith('tt'):
                    titles.append((movie_id,link))
            yield role, titles
    def get_roles(self):
        """Return a dict mapping each role to a list of ``'(<text>) <title>'`` strings.

        ``<text>`` is whatever stands between the first pair of parentheses
        after the title link (presumably the release year; not verified here),
        or ``'????'`` when there is none. A role with no title link still gets
        an empty list.
        """
        roles = {}
        for role, titles in self._iter_role_blocks():
            roles[role] = ['(%s) %s'%(self._parenthesized(link.tail),link.text)
                           for _movie_id, link in titles]
        return roles
    def iter_movies(self):
        """Yield a movie for every title of every role except ``'In Development'``.

        Movies are fetched one by one with ``self.browser.get_movie``; titles
        for which it returns ``None`` are skipped.
        """
        for role, titles in self._iter_role_blocks():
            if role == 'In Development':
                continue
            for movie_id, _link in titles:
                movie = self.browser.get_movie(movie_id)
                if movie is not None:
                    yield movie
--- a/weboob/applications/cineoob/cineoob.py
+++ b/weboob/applications/cineoob/cineoob.py
@ -347,7 +347,7 @@ class Cineoob(ReplApplication):
            print >>sys.stderr, 'Person not found: %s' % id
            return 3
        self.change_path([u'biography'])
        for backend, bio in self.do('get_person_biography', person.id):
            print bio
-        self.flush()
+        if bio != NotAvailable:
            self.flush()