[cineoob] new command : biography

2013-03-05 16:24:10 +01:00 · 2013-03-05 16:24:10 +01:00 · 27c36d412b
commit 27c36d412b
parent 393800e7fd
5 changed files with 67 additions and 17 deletions
--- a/modules/imdb/backend.py
+++ b/modules/imdb/backend.py
@ -62,3 +62,6 @@ class ImdbBackend(BaseBackend, ICapCinema):

    def iter_movie_persons_ids(self, id):
        return self.browser.iter_movie_persons_ids(id)
+
+    def get_person_biography(self,id):  # `id` is forwarded unchanged (presumably the IMDb person ID)
+        return self.browser.get_person_biography(id)  # the backend only delegates to its browser
--- a/modules/imdb/browser.py
+++ b/modules/imdb/browser.py
@ -23,7 +23,7 @@ from weboob.capabilities.base import NotAvailable
 from weboob.capabilities.cinema import Movie
 from weboob.tools.json import json

-from .pages import MoviePage, PersonPage, MovieCrewPage
+from .pages import MoviePage, PersonPage, MovieCrewPage, BiographyPage

 from datetime import datetime

@ -38,7 +38,8 @@ class ImdbBrowser(BaseBrowser):
    PAGES = {
        'http://www.imdb.com/title/tt[0-9]*/*': MoviePage,
        'http://www.imdb.com/title/tt[0-9]*/fullcredits.*': MovieCrewPage,
-        'http://www.imdb.com/name/nm.*': PersonPage,
+        'http://www.imdb.com/name/nm[0-9]*/*': PersonPage,
+        'http://www.imdb.com/name/nm[0-9]*/bio.*': BiographyPage,
        }

    def iter_movies(self, pattern):
@ -121,6 +122,11 @@ class ImdbBrowser(BaseBrowser):
        assert self.is_on_page(PersonPage)
        return self.page.get_person(id)

+    def get_person_biography(self, id):  # `id` is substituted into the /name/<id>/bio URL below
+        self.location('http://www.imdb.com/name/%s/bio' % id)
+        assert self.is_on_page(BiographyPage)  # stop here unless the browser reports a BiographyPage
+        return self.page.get_biography()  # biography text as built by BiographyPage.get_biography
+
    def iter_movie_persons(self, movie_id):
        self.location('http://www.imdb.com/title/%s' % movie_id)
        assert self.is_on_page(MoviePage)
--- a/modules/imdb/pages.py
+++ b/modules/imdb/pages.py
@ -44,6 +44,17 @@ class MoviePage(BasePage):
            yield p


+class BiographyPage(BasePage):
+    ''' Page holding the complete biography text of a person
+    '''
+    def get_biography(self):
+        # every <p> found under div#tn15content becomes one chunk, each preceded by a blank line
+        content = self.parser.select(self.document.getroot(),'div#tn15content',1)
+        paragraphs = self.parser.select(content,'p')
+        chunks = ['\n\n%s'%par.text_content().strip() for par in paragraphs]
+        return ''.join(chunks)
+
+
 class MovieCrewPage(BasePage):
    ''' Page listing all the persons related to a movie
    '''
@ -72,7 +83,7 @@ class PersonPage(BasePage):
    '''
    def get_person(self,id):
        name = NotAvailable
-        biography = NotAvailable
+        short_biography = NotAvailable
        birth_place = NotAvailable
        birth_date = NotAvailable
        death_date = NotAvailable
@ -83,7 +94,7 @@ class PersonPage(BasePage):
        td_overview = self.parser.select(self.document.getroot(),'td#overview-top',1)
        descs = self.parser.select(td_overview,'span[itemprop=description]')
        if len(descs) > 0:
-            biography = descs[0].text
+            short_biography = descs[0].text
        rname_block = self.parser.select(td_overview,'div.txt-block h4.inline')
        if len(rname_block) > 0 and "born" in rname_block[0].text.lower():
            links = self.parser.select(rname_block[0].getparent(),'a')
@ -114,17 +125,31 @@ class PersonPage(BasePage):
                dtime.append('1')
                dtime.append('1')
            death_date = datetime(int(dtime[0]),int(dtime[1]),int(dtime[2]))
-        # TODO IMPROVE THIS -----------
-        #for role in ['Actor','Composer']:
-        #    show_span =  self.parser.select(self.document.getroot(),'span[id=show-%s]' % role)
-        #    if len(show_span) > 0:
-        #        roles[role] = []
-        #        filmo_block = show_span[0].getparent()
-        #        filmo_block = filmo_block.getnext()
-        roles['actor'] = []
+        # TODO IMPROVE THIS, apparently there's an error in parsing, quite hard to handle -----------
+
+        #filmo_block =  self.parser.select(self.document.getroot(),'div#filmography',1)
+        #role_list = []
+        #for span in self.parser.select(self.document.getroot(),'span.show-link'):
+        #    role_list.append(span.attrib.get('id','').replace('show-',''))
+        #role_index = -1
+        #current_parent = None
+        ##for sp in self.parser.select(filmo_block[0],'span.show-link'):
+        #for divmovie in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
+        #    divhead = divmovie.getparent()
+        #    print "-- %s"%(self.document.getpath(divhead))
+        #    print divmovie.attrib.get('class','')
+        #    if current_parent != self.document.getpath(divhead):
+        #        role_index += 1
+        #        current_parent = self.document.getpath(divhead)
+        #    role = role_list[role_index]
+        #    a = self.parser.select(divmovie,'b a',1)
+        #    roles[role].append(a.text)
+        #print roles
+
+        roles['any activity'] = []
        for movie_div in self.parser.select(self.document.getroot(),'div[class~=filmo-row]'):
            a = self.parser.select(movie_div,'b a',1)
-            roles['actor'].append(a.text)
+            roles['any activity'].append(a.text)

        person = Person(id,name)
        person.real_name       = real_name
@ -133,7 +158,7 @@ class PersonPage(BasePage):
        person.birth_place     = birth_place
        person.gender          = gender
        person.nationality     = nationality
-        person.biography       = biography
+        person.short_biography = short_biography
        person.roles           = roles
        return person

--- a/weboob/applications/cineoob/cineoob.py
+++ b/weboob/applications/cineoob/cineoob.py
@ -98,7 +98,7 @@ def num_years(begin, end=None):
        return num_years

 class PersonInfoFormatter(IFormatter):
-    MANDATORY_FIELDS = ('id', 'name', 'real_name', 'birth_date', 'birth_place', 'gender', 'nationality', 'biography', 'roles')
+    MANDATORY_FIELDS = ('id', 'name', 'real_name', 'birth_date', 'birth_place', 'gender', 'nationality', 'short_biography', 'roles')

    def format_obj(self, obj, alias):
        result = u'%s%s%s\n' % (self.BOLD, obj.name, self.NC)
@ -124,7 +124,7 @@ class PersonInfoFormatter(IFormatter):
                for movie in lmovies:
                    result += '   * %s\n' % movie
        result += '\n%sBiography%s\n' % (self.BOLD, self.NC)
-        result += '%s'%obj.biography
+        result += '%s'%obj.short_biography
        return result


@ -335,3 +335,19 @@ class Cineoob(ReplApplication):
        for backend, movie in self.do('iter_person_movies', person.id):
            self.cached_format(movie)
        self.flush()
+
+    def do_biography(self, person_id):
+        """
+        biography  person_ID
+
+        Show the complete biography of a person.
+        """
+        person = self.get_object(person_id, 'get_person')  # look the person up first so unknown IDs are reported
+        if not person:
+            print >>sys.stderr, 'Person not found: %s' % person_id  # was `% id`, which formatted the builtin function
+            return 3
+
+        self.change_path([u'biography'])
+        for backend, bio in self.do('get_person_biography', person.id):  # self.do yields (backend, result) pairs
+            print bio
+        self.flush()
--- a/weboob/capabilities/cinema.py
+++ b/weboob/capabilities/cinema.py
@ -53,7 +53,7 @@ class Person(CapBaseObject):
    birth_place     = StringField('City and country of birth of a person')
    gender          = StringField('Gender of a person')
    nationality     = StringField('Nationality of a person')
-    biography       = StringField('Short biography of a person')
+    short_biography = StringField('Short biography of a person')
    roles           = Field('Lists of movies related to the person indexed by roles',dict)

    def __init__(self, id, name):