weboob-devel/weboob/backends/aum/pages/profile.py

# -*- coding: utf-8 -*-

# Copyright(C) 2008-2011  Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.


from weboob.backends.aum.pages.base import PageBase
from weboob.tools.ordereddict import OrderedDict

from copy import deepcopy
from logging import warning
import re

class FieldBase(object):
    """
    Base class of the parsers which store one profile value in a dict.

    Each instance is bound to a destination key; subclasses implement
    put_value() to convert the raw text and store the result.
    """

    def __init__(self, key):
        """
        :param key: key of the destination dict this field writes to.
        """
        self.key = key

    def put_value(self, d, value):
        """
        Store the parsed form of *value* in the dict *d*.

        :param d: destination dict, modified in place.
        :param value: raw text of the field.
        :raises NotImplementedError: always; subclasses must override it.
        """
        raise NotImplementedError

class FieldString(FieldBase):
    """Field whose value is stored as a single unicode string."""

    def put_value(self, d, value):
        """Store *value*, converted to unicode, under this field's key."""
        text = unicode(value)
        d[self.key] = text

class FieldList(FieldBase):
    """Field whose raw value is a ', '-separated enumeration."""

    def put_value(self, d, value):
        """Split *value* on ', ' and store the resulting list under this field's key."""
        items = value.split(', ')
        d[self.key] = items

class FieldWideList(FieldBase):
    """Field which accumulates one value per call instead of replacing it."""

    def put_value(self, d, value):
        """Append *value* to the collection already stored under this field's key."""
        collected = d[self.key]
        # In-place extension followed by a write-back, exactly what an
        # augmented assignment on the dict item does.
        collected += [value]
        d[self.key] = collected

class FieldOld(FieldBase):
    """
    Field parsing an age, optionally followed by a birth date.

    The age is stored under this field's key as an int. When the birth date
    is present, it is stored under the 'birthday' key as a
    (day, month number, year) tuple.
    """

    regexp = re.compile(u'([0-9]+) ans( \(Née le  ([0-9]+) ([^ ]+) ([0-9]+)\))?')
    # French month names; the position in the list is the month number
    # (the empty first entry makes 'janvier' be 1).
    month2i = ['', 'janvier', u'février', 'mars', 'avril', 'mai', 'juin', 'juillet', u'août', 'septembre', 'octobre', 'novembre', u'décembre']

    def put_value(self, d, value):
        """
        Parse *value* and store the age (and the birthday, if any) in *d*.

        Nothing is stored when *value* does not start with an age. When the
        month name of the birth date is unknown, a warning is logged and
        only the age is stored.
        """
        m = self.regexp.match(value)
        if not m:
            return

        d[self.key] = int(m.group(1))
        if not m.group(2):
            # No birth date after the age.
            return

        month_name = m.group(4)
        if month_name not in self.month2i:
            # Day and year are guaranteed to be digits by the regexp, so an
            # unknown month name is the only thing which can go wrong here.
            warning('Unable to parse the birthday month "%s"' % month_name)
            return

        d['birthday'] = (int(m.group(3)),
                         self.month2i.index(month_name),
                         int(m.group(5)))

class FieldLocation(FieldBase):
    """
    Field parsing a place written as "<location>[ (<zipcode>)], <country>".

    It is not bound to a single key: it fills the 'country', 'location' and
    'zipcode' entries of the destination dict.
    """

    location = re.compile('(?P<location>.+?)( \((?P<zipcode>[0-9]{5})\))?, (?P<country>.*)')

    def __init__(self):
        FieldBase.__init__(self, '')

    def put_value(self, d, value):
        """
        Split *value* into its parts and store them in *d*.

        A part missing from the text (the zip code is optional) is stored as
        None. When *value* cannot be parsed at all, a warning is logged and
        the whole text is stored under 'location'.
        """
        # TODO: also work out a distance, or something similar.
        parsed = self.location.match(value)
        if not parsed:
            warning('Unable to parse the location "%s"' % value)
            d['location'] = unicode(value)
            return

        groups = parsed.groupdict()
        for name in ('country', 'location', 'zipcode'):
            d[name] = groups.get(name)

class FieldMeasurements(FieldBase):
    """
    Field parsing measurements such as "170 cm, 60 kg".

    It fills the 'height' and 'weight' entries of the destination dict and,
    when both are known, the derived 'BMI' and 'fat' entries.
    """

    height = re.compile('([0-9]{1,3}) cm')
    weight = re.compile('([0-9]{1,3}) kg')
    # TODO: parse third parameter

    def __init__(self):
        FieldBase.__init__(self, '')

    def put_value(self, d, value):
        """
        Parse the ', '-separated measurements of *value* into *d*.

        *d* must already contain the 'height' and 'weight' keys, as they are
        read back to decide whether the body mass index can be computed.
        """
        for chunk in value.split(', '):
            found = self.height.match(chunk)
            if found:
                d['height'] = int(found.group(1))
            else:
                found = self.weight.match(chunk)
                if found:
                    d['weight'] = int(found.group(1))

        if not (d['height'] and d['weight']):
            return

        # Weight in kg divided by the square of the height in metres.
        bmi = d['weight'] / ((d['height'] / 100.0) ** 2)
        # Exclusive upper bounds of each category, in increasing order.
        for upper_bound, category in ((15.5, 'severely underweight'),
                                      (18.4, 'underweight'),
                                      (24.9, 'normal'),
                                      (30, 'overweight')):
            if bmi < upper_bound:
                d['fat'] = category
                break
        else:
            d['fat'] = 'obese'
        d['BMI'] = bmi

class FieldParticularSignes(FieldBase):
    """
    Field parsing a ', '-separated list of distinguishing marks.

    It is not bound to a single key: it sets boolean flags of the
    destination dict according to the words found in each item.
    """

    def __init__(self):
        FieldBase.__init__(self, '')

    def put_value(self, d, value):
        """Set to True the flag of *d* matching each item of *value*."""
        # (word looked for in the item, flag to set); for a given item, only
        # the first matching word is taken into account.
        markers = (('tatouages', 'tatoos'),
                   ('piercing', 'piercing'),
                   ('lunettes', 'glasses'),
                   ('rousseur', 'freckle'))
        for item in value.split(', '):
            for word, flag in markers:
                if word in item:
                    d[flag] = True
                    break

class ProfilePage(PageBase):
    """
    Page of a member profile.

    on_loaded() parses the document and fills the name, description, status,
    id, photos, stats and table attributes, which are then exposed through
    the get_*() methods.
    """

    # Template of the profile table: one ordered dict of default values per
    # section. on_loaded() works on a deep copy of it, so this class-level
    # template is not modified by the parsing.
    empty_table = OrderedDict((
                    ('details',    OrderedDict((
                                    ('old',           0),
                                    ('birthday',      (0,0,0)),
                                    ('zipcode',       0),
                                    ('location',      ''),
                                    ('country',       ''),
                                    ('eyes',          ''),
                                    ('hairs',         []),
                                    ('height',        0),
                                    ('weight',        0),
                                    ('BMI',           0),
                                    ('fat',           ''),
                                    ('from',          ''),
                                    ('tatoos',        False),
                                    ('piercing',      False),
                                    ('freckle',       False),
                                    ('glasses',       False),
                                    ('job',           ''),
                                    ('style',         ''),
                                    ('alimentation',  ''),
                                    ('alcool',        ''),
                                    ('tabac',         ''),
                                   ))),
                    ('liking',      OrderedDict((
                                    ('activities',    ''),
                                    ('music',         []),
                                    ('cinema',        []),
                                    ('books',         []),
                                    ('tv',            []),
                                   ))),
                    ('sex',         OrderedDict((
                                    ('underwear',     []),
                                    ('top',           []),
                                    ('bottom',        []),
                                    ('interval',      ''),
                                    ('favorite',      []),
                                    ('practices',     []),
                                    ('toys',          []),
                                   ))),
                   ('personality', OrderedDict((
                                    ('snap',          ''),
                                    ('exciting',      ''),
                                    ('hate',          ''),
                                    ('vices',         ''),
                                    ('assets',        ''),
                                    ('fantasies',     ''),
                                    ('is',            []),
                                   ))),
                ))

    # Maps the id attribute of a tab <div> to the section of the table it
    # describes (used by parse_table()).
    tables = {'tab_0': 'details',
              'tab_1': 'liking',
              'tab_2': 'sex',
              'tab_3': 'personality'
             }

    # For each section, maps the label text found in the page to the field
    # object in charge of parsing and storing the associated value. The
    # labels are compared verbatim, trailing spaces included.
    # NOTE(review): several labels of the 'sex' section feed the same key;
    # presumably they are alternative wordings used by the site -- confirm.
    fields =      {'details':      {'Age':                  FieldOld('old'),
                                    u'Réside à':            FieldLocation(),
                                    'Yeux':                 FieldString('eyes'),
                                    'Cheveux ':             FieldList('hairs'),
                                    'Mensurations ':        FieldMeasurements(),
                                    'Origines ':            FieldString('from'),
                                    'Signes particuliers ': FieldParticularSignes(),
                                    'Style ':               FieldString('style'),
                                    'Profession ':          FieldString('job'),
                                    'Alimentation':         FieldString('alimentation'),
                                    'Alcool':               FieldString('alcool'),
                                    'Tabac':                FieldString('tabac'),
                                   },
                   'liking':       {'Hobbies ':             FieldString('activities'),
                                    'Musique ':             FieldWideList('music'),
                                   u'Cinéma':               FieldWideList('cinema'),
                                    'Livres ':              FieldWideList('books'),
                                   u'Télé':                 FieldWideList('tv'),
                                   },
                   'sex':          {u'Sous-v\xeatements ':  FieldList('underwear'),
                                    '... en haut ':         FieldList('top'),
                                    '... en bas ':          FieldList('bottom'),
                                    u'Fréquence idéale des rapports sexuels ':
                                                            FieldString('interval'),
                                   u'Position favorite ':   FieldList('favorite'),
                                    'Pratiques sexuelles ': FieldList('practices'),
                                   u'Accessoires préférés ':FieldList('toys'),
                                   u'Ce qui se cache en dessous ':
                                                            FieldList('underwear'),
                                   u"Ce qui m'\xe9moustille ":
                                                            FieldList('favorite'),
                                   u"Au lit j'aime ":       FieldList('practices'),
                                   u'Mes accessoires ':     FieldList('toys'),
                                   },
                   'personality':  {u'Ça la fait craquer ': FieldString('snap'),
                                    u'Ça l\'excite ':       FieldString('exciting'),
                                    u'Elle déteste ':       FieldString('hate'),
                                    'Ses vices ':           FieldString('vices'),
                                    'Ses atouts ':          FieldString('assets'),
                                    'Ses fantasmes ':       FieldString('fantasies'),
                                    'Elle est  ':           FieldList('is'),
                                   },
                }

    # Matched against the onclick attribute of <img> and <td> elements; the
    # second group is the numeric id of the profile.
    ID_REGEXP = re.compile('(charm|addBasket|openAlbum)\(([0-9]+)(,[\s\'\d]+)?\)')
    # Matched against the background attribute of <td> elements to recognize
    # the ones which display a photo.
    PHOTO_REGEXP = re.compile('http://(s|p)([0-9]+)\.adopteunmec\.com/(.*)')

    # Maps a label of the statistics box to the key used in self.stats.
    STATS2ID = {'visites':   'visits',
                'charmes':   'charms',
                'paniers':   'baskets',
                'mails':     'mails',
                'POPULARIT': 'score',
               }
    # Captures the leading run of digits and whitespace of a statistics
    # value; whatever follows is ignored.
    STATS_VALUE_REGEXP = re.compile('([0-9\s]+).*')

    def __repr__(self):
        """Return '<Profile name="...">', a unicode name being encoded to UTF-8 first."""
        name = self.name
        if isinstance(name, unicode):
            name = name.encode('utf-8', 'backslashreplace')
        return '<Profile name="%s">' % name

    def on_loaded(self):
        """
        Parse the document and fill the attributes of the profile.

        The following attributes are (re)initialized, then filled with what
        is found in self.document:

        * name, status, description: texts of the profile;
        * id: numeric id of the profile, 0 when it has not been found;
        * photos: list of dicts with the 'url' and 'hidden' keys;
        * stats: ordered dict of the counters (score, visits, ...);
        * table: deep copy of empty_table filled by parse_table().
        """
        self.name = u''
        self.description = u''
        self.table = deepcopy(self.empty_table)
        self.id = 0
        self.photos = []
        self.status = ''
        self.stats = OrderedDict((
                      ('score',   0),
                      ('visits',  0),
                      ('charms',  0),
                      ('baskets', 0),
                      ('mails',   0),
                     ))

        # The name and the status are only recognizable by the inline style
        # of the cell which contains them.
        name_style = "color:#ffffff;font-size:32px;font-weight:bold;letter-spacing:-2px"
        status_style = "font-size:12px;font-weight:bold"

        for td in self.document.getElementsByTagName('td'):
            if td.hasAttribute('style') and hasattr(td.firstChild, 'data'):
                style = td.getAttribute('style')
                if style == name_style:
                    self.name = td.firstChild.data
                elif style == status_style:
                    self.status = td.firstChild.data

            if td.hasAttribute('background'):
                background = td.getAttribute('background')
                if self.PHOTO_REGEXP.match(background):
                    # The cell shows a thumbnail; build the URL of the full
                    # size picture from it.
                    self.photos.append(dict(url=re.sub(u'thumb[0-2]_', u'image', background),
                                            hidden=False))

            # The statistics box is the cell which is 226 wide.
            if td.hasAttribute('width') and str(td.getAttribute('width')) == '226':
                self._parse_stats(td)

        for div in self.document.getElementsByTagName('div'):
            if not div.hasAttribute('id'):
                continue
            div_id = div.getAttribute('id')
            if div_id == 'about_div':
                self.parse_description(div)
            if div_id.startswith('tab_'):
                self.parse_table(div)

        # The id of the profile is only available as the argument of the
        # javascript calls bound to some elements.
        for tag in ('img', 'td'):
            for node in self.document.getElementsByTagName(tag):
                if node.hasAttribute('onclick'):
                    m = self.ID_REGEXP.match(node.getAttribute('onclick'))
                    if m:
                        self.id = int(m.group(2))
                        break
            if self.id:
                break

        if self.photos:
            self._add_hidden_photos()

    def _parse_stats(self, container):
        """Fill self.stats from the rows of the statistics box *container*."""
        for tr in container.getElementsByTagName('tr'):
            tds = tr.getElementsByTagName('td')
            if len(tds) > 2 and hasattr(tds[2].firstChild, 'data'):
                label = tds[0].firstChild.data
                value = tds[2].firstChild.data
            elif len(tds) == 2:
                label = unicode(tds[0].childNodes[1].data)
                value = tds[1].childNodes[1].data
            else:
                continue

            m = self.STATS_VALUE_REGEXP.match(value)
            if not m or label not in self.STATS2ID:
                continue

            # The captured group may mix digits with any kind of whitespace
            # (thousands separator, line breaks): keep the digits only, and
            # ignore the value when there is none instead of letting int()
            # fail.
            digits = ''.join(m.group(1).split())
            if digits:
                self.stats[self.STATS2ID[label]] = int(digits)

    def _add_hidden_photos(self):
        """
        Append to self.photos the photos which are not displayed.

        Photos are numbered: every number from 1 up to the highest one seen
        on the page which is not displayed is added as a hidden photo.
        """
        photo_regex = re.compile(r'(?P<base_url>http://.+\.adopteunmec\.com/.+/)image(?P<id>.+)\.jpg')

        base_url = None
        photo_max_id = 0
        for photo in self.photos:
            m = photo_regex.match(photo['url'])
            if not m:
                # Not a numbered photo URL: it gives no information.
                continue
            try:
                photo_id = int(m.group('id'))
            except ValueError:
                continue
            if base_url is None:
                base_url = m.group('base_url')
            photo_max_id = max(photo_max_id, photo_id)

        if base_url is None:
            return

        known_urls = set(photo['url'] for photo in self.photos)
        for photo_id in xrange(1, photo_max_id + 1):
            url = '%simage%s.jpg' % (base_url, photo_id)
            if url not in known_urls:
                known_urls.add(url)
                self.photos.append(dict(url=url, hidden=True))

    def parse_description(self, div):
        """
        Extract the description of the profile from *div*.

        Text nodes are concatenated with their line feeds removed, <br>
        elements are turned into line feeds and the text starting an <i>
        element is kept. The stripped result is stored in self.description.

        :param div: DOM element containing the description.
        """
        parts = []
        for node in div.childNodes:
            if hasattr(node, 'data'):
                # Line feeds of the markup are not meaningful, only <br> are.
                parts.append(node.data.replace('\n', ''))
                continue

            tag = getattr(node, 'tagName', None)
            if tag == 'br':
                parts.append('\n')
            elif tag == 'i' and hasattr(node.firstChild, 'data'):
                # An empty <i>, or one starting with another element, has no
                # text to take: skip it instead of failing.
                parts.append(node.firstChild.data.replace('\n', ''))

        self.description = ''.join(parts).strip()

    def parse_table(self, div):
        """
        Parse one tab of the profile and fill the matching section of self.table.

        The second <table> of *div* is read row by row; only rows made of
        two cells are considered. A row is either:

        * a label and its value: the value is given to the field of the label;
        * two labels: they are the titles of two lists, whose items are the
          values found on the following rows, one list per column;
        * values: items of the lists introduced by the last two-label row.

        Labels which are not declared in self.fields are reported with a
        warning and ignored.

        :param div: DOM element of the tab; its id selects the section.
        """
        section = self.tables.get(div.getAttribute('id'))
        if section is None:
            # on_loaded() calls this method for every id starting with
            # 'tab_', which may be a tab this class knows nothing about.
            self.logger.warning('Unknown profile table "%s"' % div.getAttribute('id'))
            return

        d = self.table[section]
        fields = self.fields[section]
        table = div.getElementsByTagName('table')[1]

        # Fields of the lists being filled, one per column.
        field1 = None
        field2 = None

        for tr in table.getElementsByTagName('tr'):
            tds = tr.getElementsByTagName('td')
            if len(tds) != 2:
                continue

            label1 = ''
            label2 = ''
            value1 = ''
            value2 = ''
            # Check for first column: a label is the <b> element found as
            # third child of the cell, anything else is a value.
            if len(tds[0].childNodes) > 0:
                b = len(tds[0].childNodes) > 2 and tds[0].childNodes[2]
                if b and hasattr(b, 'tagName') and b.tagName == 'b':
                    for child in b.childNodes:
                        if hasattr(child, 'data'):
                            label1 += child.data
                else:
                    for child in tds[0].childNodes:
                        if hasattr(child, 'data') and child.data != u'\xa0': # strip nbsp
                            value1 += child.data
                    value1 = value1.strip()

            # Check for second column: a label is a <b> element found as
            # first child of the cell, anything else is a value.
            if len(tds[1].childNodes) > 0:
                b = tds[1].childNodes[0]
                if b and hasattr(b, 'tagName') and b.tagName == 'b':
                    for child in b.firstChild.childNodes:
                        label2 += child.data
                else:
                    for child in tds[1].childNodes:
                        if hasattr(child, 'data') and child.data != u'\xa0': # strip nbsp
                            value2 += child.data

            if label1 and value2:
                # This is a typical key/value pair on the line.
                field = fields.get(label1)
                if field is None:
                    self.logger.warning('Unable to find "%s" (%s)' % (label1, repr(label1)))
                else:
                    field.put_value(d, value2)
            elif label1 and label2:
                # Two titles, so the next lines hold a list of values in
                # each column. An unknown title resets its column, so that
                # its values are not given to the previous field.
                field1 = fields.get(label1)
                field2 = fields.get(label2)
                for label, field in ((label1, field1), (label2, field2)):
                    if field is None:
                        self.logger.warning('Unable to find "%s" (%s)' % (label, repr(label)))
            elif not label1:
                # A line of values for the current lists.
                # NOTE(review): the original condition tested label1 twice,
                # which is equivalent to this; testing label2 as well would
                # drop the first column value of a row whose second cell is
                # a label, so the behaviour is kept as is.
                if field1 and value1:
                    field1.put_value(d, value1)
                if field2 and value2:
                    field2.put_value(d, value2)

    def get_name(self):
        """Return the name of the profile, as set by on_loaded() (u'' by default)."""
        return self.name

    def get_description(self):
        """Return the description text of the profile (u'' when the page has none)."""
        return self.description

    def get_table(self):
        """Return the ordered dict which maps each section name to the ordered dict of its values."""
        return self.table

    def get_id(self):
        """Return the numeric id of the profile, or 0 when on_loaded() has not found it."""
        return self.id

    def get_photos(self):
        """Return the list of photos, each one being a dict with the 'url' and 'hidden' keys."""
        return self.photos

    def get_status(self):
        """Return the status text of the profile ('' when the page has none)."""
        return self.status

    def is_online(self):
        """Tell whether the status text of the profile contains 'en ligne'."""
        return 'en ligne' in self.status

    def get_stats(self):
        """Return the ordered dict of the counters: score, visits, charms, baskets and mails."""
        return self.stats

    def get_profile_text(self):
        """
        Return a printable summary of the profile, as a unicode string.

        It is made of the status, the URLs of the photos (if any), the
        statistics, every section of the table and the description.
        """
        chunks = [u'Status: %s' % unicode(self.status)]

        if self.photos:
            chunks.append(u'\nPhotos:')
            for photo in self.photos:
                suffix = ' (hidden)' if photo['hidden'] else ''
                chunks.append(u'\n\t\t%s%s' % (unicode(photo['url']), suffix))

        chunks.append(u'\nStats:')
        for label, value in self.get_stats().iteritems():
            chunks.append(u'\n\t\t%-15s %s' % (label + ':', value))

        chunks.append(u'\n\nInformations:')
        for section, d in self.get_table().iteritems():
            chunks.append(u'\n\t%s\n' % section)
            for key, value in d.items():
                title = '%s:' % key
                if isinstance(value, list):
                    # Lists are displayed as a comma separated enumeration.
                    line = u'\t\t%-15s %s\n' % (title, u', '.join(unicode(s) for s in value))
                elif isinstance(value, float):
                    line = u'\t\t%-15s %.2f\n' % (title, value)
                else:
                    line = u'\t\t%-15s %s\n' % (title, unicode(value))
                chunks.append(line)

        chunks.append(u'\n\nDescription:\n%s' % unicode(self.get_description()))

        return u''.join(chunks)