From b91a1cd481ef68ffff17f6d502f5f9577b59309b Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien.veyssier@aiur.fr>
Date: Wed, 6 Mar 2013 00:19:00 +0100
Subject: [PATCH] [imdb] filmography page is much more readable than person
 page

---
 modules/imdb/browser.py                | 18 ++++---
 modules/imdb/pages.py                  | 75 ++++++++++++++------------
 weboob/applications/cineoob/cineoob.py |  4 +-
 3 files changed, 55 insertions(+), 42 deletions(-)

diff --git a/modules/imdb/browser.py b/modules/imdb/browser.py
index bd8bc08e..c7e22f10 100644
--- a/modules/imdb/browser.py
+++ b/modules/imdb/browser.py
@@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable
 from weboob.capabilities.cinema import Movie
 from weboob.tools.json import json
 
-from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage
+from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage, FilmographyPage
 
 from datetime import datetime
 
@@ -40,6 +40,7 @@ class ImdbBrowser(BaseBrowser):
         'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
         'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
         'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
+        'http://www.imdb.com/name/nm[0-9]*/filmo.*': FilmographyPage,
         }
 
     def iter_movies(self, pattern):
@@ -48,7 +49,9 @@ class ImdbBrowser(BaseBrowser):
         for cat in ['title_popular','title_exact','title_approx']:
             if jres.has_key(cat):
                 for m in jres[cat]:
-                    yield self.get_movie(m['id'])
+                    movie = self.get_movie(m['id'])
+                    if movie != None:
+                        yield movie
 
     def iter_persons(self, pattern):
         res = self.readurl('http://www.imdb.com/xml/find?json=1&nr=1&nm=on&q=%s' % pattern.encode('utf-8'))
@@ -60,7 +63,10 @@ class ImdbBrowser(BaseBrowser):
 
     def get_movie(self, id):
         res = self.readurl('http://imdbapi.org/?id=%s&type=json&plot=simple&episode=1&lang=en-US&aka=full&release=simple&business=0&tech=0' % id )
-        jres = json.loads(res)
+        if res != None:
+            jres = json.loads(res)
+        else:
+            return None
 
         title = NotAvailable
         duration = NotAvailable
@@ -133,9 +139,9 @@ class ImdbBrowser(BaseBrowser):
         return self.page.iter_persons(movie_id)
 
     def iter_person_movies(self, person_id):
-        self.location('http://www.imdb.com/name/%s' % person_id)
-        assert self.is_on_page(PersonPage)
-        return self.page.iter_movies(person_id)
+        self.location('http://www.imdb.com/name/%s/filmotype' % person_id)
+        assert self.is_on_page(FilmographyPage)
+        return self.page.iter_movies()
 
     def iter_person_movies_ids(self, person_id):
         self.location('http://www.imdb.com/name/%s' % person_id)
diff --git a/modules/imdb/pages.py b/modules/imdb/pages.py
index 72ea61b4..813eec2f 100644
--- a/modules/imdb/pages.py
+++ b/modules/imdb/pages.py
@@ -25,7 +25,7 @@ from weboob.tools.browser import BasePage
 from datetime import datetime
 
 
-__all__ = ['MoviePage','PersonPage','MovieCrewPage']
+__all__ = ['MoviePage','PersonPage','MovieCrewPage','BiographyPage','FilmographyPage']
 
 
 class MoviePage(BasePage):
@@ -50,8 +50,12 @@ class BiographyPage(BasePage):
     def get_biography(self):
         bio = ''
         tn = self.parser.select(self.document.getroot(),'div#tn15content',1)
-        for p in self.parser.select(tn,'p'):
-            bio += '\n\n%s'%p.text_content().strip()
+        #for p in self.parser.select(tn,'p'):
+        #    bio += '\n\n%s'%p.text_content().strip()
+        # get children, append if label or tag = a,p,h...
+        bio = tn.text_content().strip()
+        if bio == "":
+            bio = NotAvailable
         return bio
 
 
@@ -125,31 +129,10 @@ class PersonPage(BasePage):
                 dtime.append('1')
                 dtime.append('1')
             death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2]))
-        # TODO IMPROVE THIS, apparently there's an error in parsing, quite hard to handle -----------
-
-        #filmo_block =  self.parser.select(self.document.getroot(),'div#filmography',1)
-        #role_list = []
-        #for span in self.parser.select(self.document.getroot(),'span.show-link'):
-        #    role_list.append(span.attrib.get('id','').replace('show-',''))
-        #role_index = -1
-        #current_parent = None
-        ##for sp in self.parser.select(filmo_block[0],'span.show-link'):
-        #for divmovie in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
-        #    divhead = divmovie.getparent()
-        #    print "-- %s"%(self.document.getpath(divhead))
-        #    print divmovie.attrib.get('class','')
-        #    if current_parent != self.document.getpath(divhead):
-        #        role_index += 1
-        #        current_parent = self.document.getpath(divhead)
-        #    role = role_list[role_index]
-        #    a = self.parser.select(divmovie,'b a',1)
-        #    roles[role].append(a.text)
-        #print roles
-
-        roles['any activity'] = []
-        for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
-            a = self.parser.select(movie_div,'b a',1)
-            roles['any activity'].append(a.text)
+        # go to the filmography page
+        self.browser.location('http://www.imdb.com/name/%s/filmotype'%id)
+        assert self.browser.is_on_page(FilmographyPage)
+        roles = self.browser.page.get_roles()
 
         person = Person(id,name)
         person.real_name       = real_name
@@ -162,14 +145,38 @@ class PersonPage(BasePage):
         person.roles           = roles
         return person
 
-    def iter_movies(self,person_id):
-        for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
-            a = self.parser.select(movie_div,'b a',1)
-            id = a.attrib.get('href','').strip('/').split('/')[-1]
-            yield self.browser.get_movie(id)
-
     def iter_movies_ids(self,person_id):
         for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
             a = self.parser.select(movie_div,'b a',1)
             id = a.attrib.get('href','').strip('/').split('/')[-1]
             yield id
+
+class FilmographyPage(BasePage):
+    """Parser for a person's filmography page (one 'div.filmo' block per role)."""
+    def get_roles(self):
+        """Return a dict: role name -> list of '(parenthesised text) title' strings."""
+        roles = {}
+        for role_div in self.parser.select(self.document.getroot(),'div.filmo'):
+            role = self.parser.select(role_div,'h5 a',1).text.replace(':','')
+            roles[role] = []
+            for a in self.parser.select(role_div,'ol > li > a'):
+                movie_id = a.attrib.get('href','').strip('/').split('/')[-1]
+                if movie_id.startswith('tt'):
+                    # a.tail may be None (no text after the link); treat it as empty
+                    tail = a.tail or ''
+                    has_parens = '(' in tail and ')' in tail
+                    between_p = tail.split(')')[0].split('(')[1] if has_parens else '????'
+                    roles[role].append('(%s) %s'%(between_p,a.text))
+        return roles
+
+    def iter_movies(self):
+        """Yield browser.get_movie() results for 'tt' links, except 'In Development'."""
+        for role_div in self.parser.select(self.document.getroot(),'div.filmo'):
+            role = self.parser.select(role_div,'h5 a',1).text.replace(':','')
+            if role != 'In Development':
+                for a in self.parser.select(role_div,'ol > li > a'):
+                    movie_id = a.attrib.get('href','').strip('/').split('/')[-1]
+                    if movie_id.startswith('tt'):
+                        movie = self.browser.get_movie(movie_id)
+                        if movie is not None:
+                            yield movie
diff --git a/weboob/applications/cineoob/cineoob.py b/weboob/applications/cineoob/cineoob.py
index d16325cf..72daad1a 100644
--- a/weboob/applications/cineoob/cineoob.py
+++ b/weboob/applications/cineoob/cineoob.py
@@ -347,7 +347,7 @@ class Cineoob(ReplApplication):
             print >>sys.stderr, 'Person not found: %s' % id
             return 3
 
-        self.change_path([u'biography'])
         for backend, bio in self.do('get_person_biography', person.id):
             print bio
-        self.flush()
+        if bio != NotAvailable:
+            self.flush()