[imdb] filmography page muuuuch more readable than person page

2013-03-06 00:19:00 +01:00 · 2013-03-06 00:19:00 +01:00 · b91a1cd481
commit b91a1cd481
parent 27c36d412b
3 changed files with 55 additions and 42 deletions
--- a/modules/imdb/browser.py
+++ b/modules/imdb/browser.py
@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable
 from weboob.capabilities.cinema import Movie
 from weboob.tools.json import json

-from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage
+from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage, FilmographyPage

 from datetime import datetime

@ -40,6 +40,7 @@ class ImdbBrowser(BaseBrowser):
        'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
        'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
        'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
+        'http://www.imdb.com/name/nm[0-9]*/filmo.*': FilmographyPage,
        }

    def iter_movies(self, pattern):
@ -48,7 +49,9 @@ class ImdbBrowser(BaseBrowser):
        for cat in ['title_popular','title_exact','title_approx']:
            if jres.has_key(cat):
                for m in jres[cat]:
-                    yield self.get_movie(m['id'])
+                    movie = self.get_movie(m['id'])
+                    if movie != None:
+                        yield movie

    def iter_persons(self, pattern):
        res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q=%s' % pattern.encode('utf-8'))
@ -60,7 +63,10 @@ class ImdbBrowser(BaseBrowser):

    def get_movie(self, id):
        res = self.readurl('http://imdbapi.org/?id=%s&type=json&plot=simple&episode=1&lang=en-US&aka=full&release=simple&business=0&tech=0' % id )
-        jres = json.loads(res)
+        if res != None:
+            jres = json.loads(res)
+        else:
+            return None

        title = NotAvailable
        duration = NotAvailable
@ -133,9 +139,9 @@ class ImdbBrowser(BaseBrowser):
        return self.page.iter_persons(movie_id)

    def iter_person_movies(self, person_id):
-        self.location('http://www.imdb.com/name/%s' % person_id)
-        assert self.is_on_page(PersonPage)
-        return self.page.iter_movies(person_id)
+        self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
+        assert self.is_on_page(FilmographyPage)
+        return self.page.iter_movies()

    def iter_person_movies_ids(self, person_id):
        self.location('http://www.imdb.com/name/%s' % person_id)
--- a/modules/imdb/pages.py
+++ b/modules/imdb/pages.py
@ -25,7 +25,7 @@ from weboob.tools.browser import BasePage
 from datetime import datetime


-__all__ = ['MoviePage','PersonPage','MovieCrewPage']
+__all__ = ['MoviePage','PersonPage','MovieCrewPage','BiographyPage','FilmographyPage']


 class MoviePage(BasePage):
@ -50,8 +50,12 @@ class BiographyPage(BasePage):
    def get_biography(self):
        bio = ''
        tn = self.parser.select(self.document.getroot(),'div#tn15content',1)
-        for p in self.parser.select(tn,'p'):
-            bio += '\n\n%s'%p.text_content().strip()
+        #for p in self.parser.select(tn,'p'):
+        #    bio += '\n\n%s'%p.text_content().strip()
+        # get children, append if label or tag = a,p,h...
+        bio = tn.text_content().strip()
+        if bio == "":
+            bio = NotAvailable
        return bio


@ -125,31 +129,10 @@ class PersonPage(BasePage):
                dtime.append('1')
                dtime.append('1')
            death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2]))
-        # TODO IMPROVE THIS, apparently there's an error in parsing, quite hard to handle -----------
-
-        #filmo_block =  self.parser.select(self.document.getroot(),'div#filmography',1)
-        #role_list = []
-        #for span in self.parser.select(self.document.getroot(),'span.show-link'):
-        #    role_list.append(span.attrib.get('id','').replace('show-',''))
-        #role_index = -1
-        #current_parent = None
-        ##for sp in self.parser.select(filmo_block[0],'span.show-link'):
-        #for divmovie in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
-        #    divhead = divmovie.getparent()
-        #    print "-- %s"%(self.document.getpath(divhead))
-        #    print divmovie.attrib.get('class','')
-        #    if current_parent != self.document.getpath(divhead):
-        #        role_index += 1
-        #        current_parent = self.document.getpath(divhead)
-        #    role = role_list[role_index]
-        #    a = self.parser.select(divmovie,'b a',1)
-        #    roles[role].append(a.text)
-        #print roles
-
-        roles['any activity'] = []
-        for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
-            a = self.parser.select(movie_div,'b a',1)
-            roles['any activity'].append(a.text)
+        # go to the filmography page
+        self.browser.location('http://www.imdb.com/name/%s/filmotype'%id)
+        assert self.browser.is_on_page(FilmographyPage)
+        roles = self.browser.page.get_roles()

        person = Person(id,name)
        person.real_name       = real_name
@ -162,14 +145,38 @@ class PersonPage(BasePage):
        person.roles           = roles
        return person

-    def iter_movies(self,person_id):
-        for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
-            a = self.parser.select(movie_div,'b a',1)
-            id = a.attrib.get('href','').strip('/').split('/')[-1]
-            yield self.browser.get_movie(id)
-
    def iter_movies_ids(self,person_id):
        for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
            a = self.parser.select(movie_div,'b a',1)
            id = a.attrib.get('href','').strip('/').split('/')[-1]
            yield id
+
+class FilmographyPage(BasePage):
+    """Filmography page of a person, with titles grouped by role.
+
+    Every ``div.filmo`` block is read as one role: its ``h5 a`` heading gives
+    the role name and its ``ol > li > a`` links give the titles.
+    """
+
+    def _iter_role_divs(self):
+        """Yield ``(role, role_div)`` for each ``div.filmo`` block of the page.
+
+        The role name is the heading text with every ``:`` removed.
+        """
+        for role_div in self.parser.select(self.document.getroot(), 'div.filmo'):
+            role = self.parser.select(role_div, 'h5 a', 1).text.replace(':', '')
+            yield role, role_div
+
+    def _iter_title_links(self, role_div):
+        """Yield ``(movie_id, link)`` for the title links found in *role_div*.
+
+        The id is the last path component of the link's ``href``; links whose
+        id does not start with ``tt`` are skipped.
+        """
+        for link in self.parser.select(role_div, 'ol > li > a'):
+            movie_id = link.attrib.get('href', '').strip('/').split('/')[-1]
+            if movie_id.startswith('tt'):
+                yield movie_id, link
+
+    @staticmethod
+    def _text_in_parentheses(tail):
+        """Return the first parenthesised text of *tail*, or ``'????'``.
+
+        *tail* is the text following a title link (presumably holding the
+        release year -- confirm against the page markup).  It may be ``None``
+        when nothing follows the link, and the parentheses may be missing or
+        out of order; all those cases give ``'????'`` instead of raising.
+        """
+        tail = tail or ''
+        if ')' not in tail:
+            return '????'
+        before_close = tail.split(')')[0]
+        if '(' not in before_close:
+            return '????'
+        return before_close.split('(')[1]
+
+    def get_roles(self):
+        """Return a dict mapping each role name to a list of title strings.
+
+        Each entry is formatted as ``'(<parenthesised text>) <link text>'``.
+        A role with no title link is still present, with an empty list.
+        """
+        roles = {}
+        for role, role_div in self._iter_role_divs():
+            roles[role] = []
+            for movie_id, link in self._iter_title_links(role_div):
+                between_p = self._text_in_parentheses(link.tail)
+                roles[role].append('(%s) %s' % (between_p, link.text))
+        return roles
+
+    def iter_movies(self):
+        """Yield the movies of every role except ``'In Development'``.
+
+        Each title id is resolved with ``self.browser.get_movie``; titles for
+        which it returns ``None`` are skipped.
+        """
+        for role, role_div in self._iter_role_divs():
+            if role == 'In Development':
+                continue
+            for movie_id, link in self._iter_title_links(role_div):
+                movie = self.browser.get_movie(movie_id)
+                if movie is not None:
+                    yield movie
+            
--- a/weboob/applications/cineoob/cineoob.py
+++ b/weboob/applications/cineoob/cineoob.py
@ -347,7 +347,7 @@ class Cineoob(ReplApplication):
            print >>sys.stderr, 'Person not found: %s' % id
            return 3

-        self.change_path([u'biography'])
        for backend, bio in self.do('get_person_biography', person.id):
            print bio
-        self.flush()
+        if bio != NotAvailable:
+            self.flush()