From 27c36d412beb56bb1114e77a4b1c9eff29e51d8b Mon Sep 17 00:00:00 2001
From: Julien Veyssier <julien.veyssier@aiur.fr>
Date: Tue, 5 Mar 2013 16:24:10 +0100
Subject: [PATCH] [cineoob] new command : biography

---
 modules/imdb/backend.py                |  3 ++
 modules/imdb/browser.py                | 10 ++++--
 modules/imdb/pages.py                  | 49 +++++++++++++++++++-------
 weboob/applications/cineoob/cineoob.py | 20 +++++++++--
 weboob/capabilities/cinema.py          |  2 +-
 5 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/modules/imdb/backend.py b/modules/imdb/backend.py
index d4b40b6d..7219585d 100644
--- a/modules/imdb/backend.py
+++ b/modules/imdb/backend.py
@@ -62,3 +62,6 @@ class ImdbBackend(BaseBackend, ICapCinema):
 
     def iter_movie_persons_ids(self, id):
         return self.browser.iter_movie_persons_ids(id)
+
+    def get_person_biography(self, id):
+        return self.browser.get_person_biography(id)  # plain delegation to the browser
diff --git a/modules/imdb/browser.py b/modules/imdb/browser.py
index 076d80d5..bd8bc08e 100644
--- a/modules/imdb/browser.py
+++ b/modules/imdb/browser.py
@@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable
 from weboob.capabilities.cinema import Movie
 from weboob.tools.json import json
 
-from .pages import MoviePage, PersonPage, MovieCrewPage
+from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage
 
 from datetime import datetime
 
@@ -38,7 +38,8 @@ class ImdbBrowser(BaseBrowser):
     PAGES = {
         'http://www.imdb.com/title/tt[0-9]*/*': MoviePage,
         'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
-        'http://www.imdb.com/name/nm.*': PersonPage,
+        'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
+        'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
         }
 
     def iter_movies(self, pattern):
@@ -121,6 +122,11 @@ class ImdbBrowser(BaseBrowser):
         assert self.is_on_page(PersonPage)
         return self.page.get_person(id)
 
+    def get_person_biography(self, id):
+        self.location('http://www.imdb.com/name/%s/bio' % id)  # go to the person's /bio URL
+        assert self.is_on_page(BiographyPage)  # fails unless a BiographyPage was reached
+        return self.page.get_biography()
+
     def iter_movie_persons(self, movie_id):
         self.location('http://www.imdb.com/title/%s' % movie_id)
         assert self.is_on_page(MoviePage)
diff --git a/modules/imdb/pages.py b/modules/imdb/pages.py
index 4a3daa17..72ea61b4 100644
--- a/modules/imdb/pages.py
+++ b/modules/imdb/pages.py
@@ -44,6 +44,17 @@ class MoviePage(BasePage):
             yield p
 
 
+class BiographyPage(BasePage):
+    ''' Page holding the full biography text of a person
+    '''
+    def get_biography(self):
+        # each <p> found under div#tn15content is emitted preceded by '\n\n'
+        content = self.parser.select(self.document.getroot(), 'div#tn15content', 1)
+        paragraphs = self.parser.select(content, 'p')
+        chunks = ['\n\n%s' % par.text_content().strip() for par in paragraphs]
+        return ''.join(chunks)
+
+
 class MovieCrewPage(BasePage):
     ''' Page listing all the persons related to a movie
     '''
@@ -72,7 +83,7 @@ class PersonPage(BasePage):
     '''
     def get_person(self,id):
         name = NotAvailable
-        biography = NotAvailable
+        short_biography = NotAvailable
         birth_place = NotAvailable
         birth_date = NotAvailable
         death_date = NotAvailable
@@ -83,7 +94,7 @@ class PersonPage(BasePage):
         td_overview = self.parser.select(self.document.getroot(),'td#overview-top',1)
         descs = self.parser.select(td_overview,'span[itemprop=description]')
         if len(descs) > 0:
-            biography = descs[0].text
+            short_biography = descs[0].text
         rname_block = self.parser.select(td_overview,'div.txt-block h4.inline')
         if len(rname_block) > 0 and "born" in rname_block[0].text.lower():
             links = self.parser.select(rname_block[0].getparent(),'a')
@@ -114,17 +125,31 @@ class PersonPage(BasePage):
                 dtime.append('1')
                 dtime.append('1')
             death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2]))
-        # TODO IMPROVE THIS -----------
-        #for role in ['Actor','Composer']:
-        #    show_span =  self.parser.select(self.document.getroot(),'span[id=show-%s]' % role)
-        #    if len(show_span) > 0:
-        #        roles[role] = []
-        #        filmo_block = show_span[0].getparent()
-        #        filmo_block = filmo_block.getnext()
-        roles['actor'] = []
+        # TODO IMPROVE THIS, apparently there's an error in parsing, quite hard to handle -----------
+
+        #filmo_block =  self.parser.select(self.document.getroot(),'div#filmography',1)
+        #role_list = []
+        #for span in self.parser.select(self.document.getroot(),'span.show-link'):
+        #    role_list.append(span.attrib.get('id','').replace('show-',''))
+        #role_index = -1
+        #current_parent = None
+        ##for sp in self.parser.select(filmo_block[0],'span.show-link'):
+        #for divmovie in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
+        #    divhead = divmovie.getparent()
+        #    print "-- %s"%(self.document.getpath(divhead))
+        #    print divmovie.attrib.get('class','')
+        #    if current_parent != self.document.getpath(divhead):
+        #        role_index += 1
+        #        current_parent = self.document.getpath(divhead)
+        #    role = role_list[role_index]
+        #    a = self.parser.select(divmovie,'b a',1)
+        #    roles[role].append(a.text)
+        #print roles
+
+        roles['any activity'] = []
         for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
             a = self.parser.select(movie_div,'b a',1)
-            roles['actor'].append(a.text)
+            roles['any activity'].append(a.text)
 
         person = Person(id,name)
         person.real_name       = real_name
@@ -133,7 +158,7 @@ class PersonPage(BasePage):
         person.birth_place     = birth_place
         person.gender          = gender
         person.nationality     = nationality
-        person.biography       = biography
+        person.short_biography = short_biography
         person.roles           = roles
         return person
 
diff --git a/weboob/applications/cineoob/cineoob.py b/weboob/applications/cineoob/cineoob.py
index 91214c2b..d16325cf 100644
--- a/weboob/applications/cineoob/cineoob.py
+++ b/weboob/applications/cineoob/cineoob.py
@@ -98,7 +98,7 @@ def num_years(begin, end=None):
         return num_years
 
 class PersonInfoFormatter(IFormatter):
-    MANDATORY_FIELDS = ('id', 'name', 'real_name', 'birth_date', 'birth_place', 'gender', 'nationality', 'biography', 'roles')
+    MANDATORY_FIELDS = ('id', 'name', 'real_name', 'birth_date', 'birth_place', 'gender', 'nationality', 'short_biography', 'roles')
 
     def format_obj(self, obj, alias):
         result = u'%s%s%s\n' % (self.BOLD, obj.name, self.NC)
@@ -124,7 +124,7 @@ class PersonInfoFormatter(IFormatter):
                 for movie in lmovies:
                     result += '   * %s\n' % movie
         result += '\n%sBiography%s\n' % (self.BOLD, self.NC)
-        result += '%s'%obj.biography
+        result += '%s'%obj.short_biography
         return result
 
 
@@ -335,3 +335,19 @@ class Cineoob(ReplApplication):
         for backend, movie in self.do('iter_person_movies', person.id):
             self.cached_format(movie)
         self.flush()
+
+    def do_biography(self, person_id):
+        """
+        biography  person_ID
+
+        Show the complete biography of a person.
+        """
+        person = self.get_object(person_id, 'get_person')
+        if not person:  # lookup failed: report the requested ID and stop
+            print >>sys.stderr, 'Person not found: %s' % person_id
+            return 3
+
+        self.change_path([u'biography'])
+        for backend, bio in self.do('get_person_biography', person.id):
+            print bio  # one biography text per answering backend
+        self.flush()
diff --git a/weboob/capabilities/cinema.py b/weboob/capabilities/cinema.py
index 65b0c940..caee2ba8 100644
--- a/weboob/capabilities/cinema.py
+++ b/weboob/capabilities/cinema.py
@@ -53,7 +53,7 @@ class Person(CapBaseObject):
     birth_place     = StringField('City and country of birth of a person')
     gender          = StringField('Gender of a person')
     nationality     = StringField('Nationality of a person')
-    biography       = StringField('Short biography of a person')
+    short_biography = StringField('Short biography of a person')
     roles           = Field('Lists of movies related to the person indexed by roles',dict)
 
     def __init__(self, id, name):