Starting work on okc module: adding optimization

This commit is contained in:
Ahmed Boussadia 2014-05-07 11:38:51 +02:00 committed by Romain Bignon
commit 296d6b7c9f
8 changed files with 501 additions and 8 deletions

View file

@ -27,13 +27,15 @@ from dateutil.parser import parse as _parse_dt
from weboob.capabilities.base import NotLoaded
from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread
#from weboob.capabilities.dating import ICapDating, OptimizationNotFound, Event
from weboob.capabilities.dating import ICapDating, OptimizationNotFound, Event
from weboob.capabilities.contact import ICapContact, ContactPhoto, Contact
from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.tools.value import Value, ValueBackendPassword
from weboob.tools.misc import local2utc
from .browser import OkCBrowser
from .optim.visibility import Visibility
from .optim.queries_queue import QueriesQueue
__all__ = ['OkCBackend']
@ -64,7 +66,7 @@ def parse_dt(s):
return local2utc(d)
class OkCBackend(BaseBackend, ICapMessages, ICapContact, ICapMessagesPost):
class OkCBackend(BaseBackend, ICapMessages, ICapContact, ICapMessagesPost, ICapDating):
NAME = 'okc'
MAINTAINER = u'Roger Philibert'
EMAIL = 'roger.philibert@gmail.com'
@ -73,7 +75,7 @@ class OkCBackend(BaseBackend, ICapMessages, ICapContact, ICapMessagesPost):
DESCRIPTION = u'OkCupid dating website'
CONFIG = BackendConfig(Value('username', label='Username'),
ValueBackendPassword('password', label='Password'))
STORAGE = {
STORAGE = {'queries_queue': {'queue': []},
'sluts': {},
#'notes': {},
}
@ -82,6 +84,32 @@ class OkCBackend(BaseBackend, ICapMessages, ICapContact, ICapMessagesPost):
def create_default_browser(self):
    """Create the browser using the username and password from the backend config."""
    username = self.config['username'].get()
    password = self.config['password'].get()
    return self.create_browser(username, password)
# ---- ICapDating methods ---------------------
def init_optimizations(self):
    """Register the optimizations offered by this backend.

    Two optimizations are added, under the names 'VISIBILITY' and
    'QUERIES_QUEUE'; both share the weboob scheduler and this backend's
    browser, and the queries queue additionally receives the backend storage.
    """
    scheduler = self.weboob.scheduler
    visibility = Visibility(scheduler, self.browser)
    self.add_optimization('VISIBILITY', visibility)
    queries_queue = QueriesQueue(scheduler, self.storage, self.browser)
    self.add_optimization('QUERIES_QUEUE', queries_queue)
def iter_events(self):
    """Yield one Event per visit returned by the browser.

    Each event is built from the visitor id (``event['who']['id']``), gets
    its date from ``parse_dt(event['date'])`` and its type from the key of
    the event source (currently only u'visits').
    """
    # Event sources: type -> (callable returning raw events, message template).
    with self.browser:
        sources = {u'visits': (self.browser.get_visits, 'Visited by %s')}

    for kind, (fetch, template) in sources.iteritems():
        for raw in fetch():
            evt = Event(raw['who']['id'])
            evt.date = parse_dt(raw['date'])
            evt.type = kind
            # Contact lookup is disabled for now, so the message template
            # is not used yet:
            # if 'who' in raw:
            #     evt.contact = self._get_partial_contact(raw['who'])
            # else:
            #     evt.contact = self._get_partial_contact(raw)
            # if not evt.contact:
            #     continue
            # evt.message = template % evt.contact.name
            yield evt
# ---- ICapMessages methods ---------------------
def fill_thread(self, thread, fields):

View file

@ -22,7 +22,7 @@ import urllib
from weboob.tools.browser import BaseBrowser, BasePage
from weboob.tools.ordereddict import OrderedDict
from .pages import LoginPage, ThreadPage, MessagesPage, PostMessagePage, ProfilePage, PhotosPage
from .pages import LoginPage, ThreadPage, MessagesPage, PostMessagePage, ProfilePage, PhotosPage, VisitsPage
__all__ = ['OkCBrowser']
@ -43,6 +43,7 @@ class OkCBrowser(BaseBrowser):
('http://%s/messages\?.*' % DOMAIN, MessagesPage),
('http://%s/profile/.*/photos' % DOMAIN, PhotosPage),
('http://%s/profile/[^/]*' % DOMAIN, ProfilePage),
('http://%s/visitors' % DOMAIN, VisitsPage)
))
logged_in = False
@ -120,10 +121,10 @@ class OkCBrowser(BaseBrowser):
# r = self.api_request('me', 'flashs')
# return r['result']['all']
#@check_login
#def get_visits(self):
# r = self.api_request('me', 'visits')
# return r['result']['news'] + r['result']['olds']
@check_login
def get_visits(self):
    """Open the visitors page and return what its get_visits() yields.

    NOTE(review): the host is hard-coded here while the page patterns are
    built from DOMAIN -- confirm both name the same host, otherwise the
    loaded page would not be matched as a VisitsPage.
    """
    visitors_url = 'http://m.okcupid.com/visitors'
    self.location(visitors_url)
    current_page = self.page
    return current_page.get_visits()
@check_login
def get_threads_list(self, count=30):

View file

View file

@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import random
from weboob.tools.browser import BrowserUnavailable, BrowserIncorrectPassword
from weboob.capabilities.dating import Optimization
from weboob.capabilities.account import AccountRegisterError
from weboob.tools.log import getLogger
from weboob.tools.value import Value, ValuesDict, ValueInt
from aum.captcha import CaptchaError
from aum.exceptions import AdopteWait, AdopteBanned
from aum.browser import AuMBrowser
__all__ = ['PriorityConnection']
class PriorityConnection(Optimization):
    """Optimization that keeps a minimal number of "godchild" accounts.

    A periodic check registers new accounts (with the main account as
    godfather) until the configured minimum is reached; another periodic job
    makes one of the stored fake accounts perform some activity.

    State is kept in the storage under the 'priority_connection' section:
    'config' holds the parameters and 'fakes' maps each generated name to a
    dict with the 'username' and 'password' of the account.
    """

    CONFIG = ValuesDict(ValueInt('minimal', label='Minimal of godchilds', default=5),
                        Value('domain', label='Domain to use for fake accounts emails', default='aum.example.com'),
                        ValueInt('interval', label='Interval of checks (seconds)', default=3600)
                       )

    def __init__(self, sched, storage, browser):
        """Keep references to the scheduler, storage and browser, and load the stored config."""
        self.sched = sched
        self.storage = storage
        self.browser = browser
        self.logger = getLogger('priorityconn', browser.logger)

        self.config = storage.get('priority_connection', 'config', default=None)
        # An empty stored config is treated the same as no config at all.
        if self.config == {}:
            self.config = None

        self.check_cron = None
        self.activity_cron = None

    def start(self):
        """Schedule both periodic jobs.

        Returns False (and schedules nothing) when no configuration is set,
        True otherwise.
        """
        if self.config is None:
            return False

        self.check_cron = self.sched.repeat(int(self.config['interval']), self.check_godchilds)
        self.activity_cron = self.sched.repeat(600, self.activity_fakes)
        return True

    def stop(self):
        """Cancel both periodic jobs and return True."""
        self.sched.cancel(self.check_cron)
        self.check_cron = None
        self.sched.cancel(self.activity_cron)
        self.activity_cron = None
        return True

    def is_running(self):
        """Tell whether the periodic check is currently scheduled."""
        return self.check_cron is not None

    def set_config(self, params):
        """Replace the configuration and persist it in the storage."""
        self.config = params
        self.storage.set('priority_connection', 'config', self.config)
        self.storage.save()

    def get_config(self):
        """Return the current configuration (None when unset)."""
        return self.config

    def generate_name(self):
        """Return a random 8-letter login not already used by a stored fake.

        Letters alternate consonant/vowel. While the login collides with a
        stored fake (either its storage key or the local part of its
        e-mail username), an underscore is appended.
        """
        login = u''
        for x in xrange(8):
            if x % 2:
                login += random.choice(u'aeiou')
            else:
                login += random.choice(u'bcdfghjklmnprstv')

        # Nothing may be stored yet, hence the explicit default.
        fakes = self.storage.get('priority_connection', 'fakes', default={}) or {}

        # Fakes are stored under their bare name while their username is the
        # full e-mail address: check both forms so collisions are detected.
        taken = set(fakes)
        taken.update(fake.get('username') for fake in fakes.values())

        domain = self.config['domain']
        while login in taken or ('%s@%s' % (login, domain)) in taken:
            login += '_'
        return login

    def generate_password(self):
        """Return a random password made of 8 hexadecimal digits."""
        return '%08x' % random.randint(1, int('ffffffff', 16))

    def check_godchilds(self):
        """Register new godchild accounts until the configured minimum is met.

        Does nothing when the browser is unavailable, when the id of the main
        account could not be fetched, or when enough godchilds already exist.
        """
        my_id = None
        nb_godchilds = 0
        with self.browser:
            try:
                my_id = self.browser.get_my_id()
                nb_godchilds = self.browser.nb_godchilds()
            except AdopteWait:
                nb_godchilds = 0
            except BrowserUnavailable:
                # We'll check later
                return

        if my_id is None:
            # AdopteWait was raised before the id was known: without it the
            # new accounts cannot be attached to a godfather.
            self.logger.warning('Unable to get my id, godchilds will be checked later')
            return

        missing_godchilds = int(self.config['minimal']) - nb_godchilds

        self.logger.info('Missing godchilds: %s' % missing_godchilds)

        if missing_godchilds <= 0:
            return

        for _ in xrange(missing_godchilds):
            registered = False
            # NOTE(review): registration is retried without any limit or
            # delay -- confirm this is intended.
            while not registered:
                name = self.generate_name()
                password = self.generate_password()

                browser = AuMBrowser('%s@%s' % (name, self.config['domain']), proxy=self.browser.proxy)
                try:
                    # NOTE(review): the meaning of the `sex` value is defined
                    # by AuMBrowser.register -- confirm.
                    browser.register(password=password,
                                     sex=1,
                                     birthday_d=random.randint(1, 28),
                                     birthday_m=random.randint(1, 12),
                                     birthday_y=random.randint(1975, 1990),
                                     zipcode=75001,
                                     country='fr',
                                     godfather=my_id)
                except AccountRegisterError as e:
                    self.logger.warning('Unable to register account: %s' % e)
                except CaptchaError:
                    self.logger.warning('Unable to solve captcha... Retrying')
                else:
                    registered = True

                    # set nickname
                    browser.set_nickname(name.strip('_').capitalize())
                    # rate my own profile with good score
                    for index in xrange(4):
                        browser.rate(my_id, index, 5.0)

                    # save fake in storage
                    fake = {'username': browser.username,
                            'password': password}
                    self.storage.set('priority_connection', 'fakes', name, fake)
                    self.storage.save()
                    self.logger.info('Fake account "%s" created (godfather=%s)' % (name, my_id))

    def activity_fakes(self):
        """Make one stored fake account perform some activity.

        Fakes are tried in random order, each at most once per call, until
        one of them logs in and finds a profile; that profile is then rated
        and deblocked. BrowserUnavailable is silently ignored.
        """
        try:
            fakes = self.storage.get('priority_connection', 'fakes', default={})
            if not fakes:
                return

            # Try every fake at most once so the job always terminates, even
            # when all of them fail to log in or find no profile.
            names = list(fakes)
            random.shuffle(names)
            for name in names:
                fake = fakes[name]
                try:
                    browser = AuMBrowser(fake['username'], fake['password'], proxy=self.browser.proxy)
                except (AdopteBanned, BrowserIncorrectPassword) as e:
                    self.logger.warning('Fake %s can\'t login: %s' % (name, e))
                    continue

                profiles = browser.search_profiles(country="fr",
                                                   dist='10',
                                                   save=True)

                if not profiles:
                    continue

                profile_id = profiles.pop()
                profile = browser.get_profile(profile_id)
                # bad rate
                for index in xrange(4):
                    browser.rate(profile.get_id(), index, 0.6)
                # deblock
                browser.deblock(profile.get_id())
                return
        except BrowserUnavailable:
            # don't care
            pass

View file

@ -0,0 +1,104 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon, Christophe Benz
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from random import randint
from weboob.tools.browser import BrowserUnavailable
from weboob.capabilities.dating import Optimization
from weboob.tools.log import getLogger
__all__ = ['ProfilesWalker']
class ProfilesWalker(Optimization):
    """Optimization that periodically searches for profiles and visits them.

    The ids of the visited profiles are kept in the storage under
    ('profiles_walker', 'viewed') so they are not visited again.
    """

    def __init__(self, sched, storage, browser):
        """Keep references to the collaborators and load the visited ids."""
        self.sched = sched
        self.storage = storage
        self.browser = browser
        self.logger = getLogger('walker', browser.logger)

        self.walk_cron = None
        self.view_cron = None
        # Nothing is stored before the first save(): fall back to an empty
        # list instead of passing None to set().
        viewed = storage.get('profiles_walker', 'viewed', default=[]) or []
        self.visited_profiles = set(viewed)
        self.logger.info(u'Loaded %d already visited profiles from storage.' % len(self.visited_profiles))
        self.profiles_queue = set()

    def save(self):
        """Persist the set of visited profiles (stored as a list)."""
        self.storage.set('profiles_walker', 'viewed', list(self.visited_profiles))
        self.storage.save()

    def start(self):
        """Schedule the periodic search and the first profile visit."""
        self.walk_cron = self.sched.repeat(60, self.enqueue_profiles)
        self.view_cron = self.sched.schedule(randint(5, 10), self.view_profile)
        return True

    def stop(self):
        """Cancel both scheduled jobs and return True."""
        self.sched.cancel(self.walk_cron)
        self.sched.cancel(self.view_cron)
        self.walk_cron = None
        self.view_cron = None
        return True

    def is_running(self):
        """Tell whether the periodic search is currently scheduled."""
        return self.walk_cron is not None

    def enqueue_profiles(self):
        """Replace the queue with the searched profiles not visited yet.

        Does nothing when the browser is unavailable.
        """
        try:
            with self.browser:
                profiles_to_visit = self.browser.search_profiles().difference(self.visited_profiles)
                self.logger.info(u'Enqueuing profiles to visit: %s' % profiles_to_visit)
                self.profiles_queue = set(profiles_to_visit)
            self.save()
        except BrowserUnavailable:
            return

    def view_profile(self):
        """Visit one queued profile, then schedule the next visit.

        A profile whose visit failed because the browser was unavailable is
        put back in the queue. Any other error is logged and the profile is
        dropped from the queue. The next visit is scheduled in every case,
        unless the optimization has been stopped meanwhile.
        """
        try:
            try:
                profile_id = self.profiles_queue.pop()
            except KeyError:
                return  # empty queue

            try:
                with self.browser:
                    profile = self.browser.get_profile(profile_id)
                self.logger.info(u'Visited profile %s (%s)' % (profile['pseudo'], profile_id))

                # do not forget that we visited this profile, to avoid re-visiting it.
                self.visited_profiles.add(profile_id)
                self.save()
            except BrowserUnavailable:
                # We consider this profile hasn't been [correctly] analysed
                self.profiles_queue.add(profile_id)
                return
            except Exception as e:
                # Best effort: report through the logger rather than stdout,
                # and keep walking.
                self.logger.warning(u'Unable to visit profile %s: %s' % (profile_id, e))
        finally:
            # view_cron is reset to None by stop(): do not reschedule then.
            if self.view_cron is not None:
                self.view_cron = self.sched.schedule(randint(5, 10), self.view_profile)

View file

@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BrowserUnavailable
from weboob.capabilities.dating import Optimization
from weboob.capabilities.contact import QueryError
from weboob.tools.log import getLogger
__all__ = ['QueriesQueue']
class QueriesQueue(Optimization):
    """Optimization holding a persistent queue of (priority, id) queries.

    The queue lives in the storage under ('queries_queue', 'queue') and is
    flushed once an hour while the optimization is running, as well as each
    time a query is enqueued.
    """

    def __init__(self, sched, storage, browser):
        """Keep references to the collaborators and load the stored queue."""
        self.sched = sched
        self.storage = storage
        self.browser = browser
        self.logger = getLogger('queriesqueue', browser.logger)

        self.queue = storage.get('queries_queue', 'queue', default=[])

        self.check_cron = None

    def save(self):
        """Write the queue back to the storage."""
        self.storage.set('queries_queue', 'queue', self.queue)
        self.storage.save()

    def start(self):
        """Schedule the hourly flush and return True."""
        self.check_cron = self.sched.repeat(3600, self.flush_queue)
        return True

    def stop(self):
        """Cancel the hourly flush and return True."""
        self.sched.cancel(self.check_cron)
        self.check_cron = None
        return True

    def is_running(self):
        """Tell whether the hourly flush is currently scheduled."""
        return self.check_cron is not None

    def enqueue_query(self, id, priority=999):
        """Queue a query for `id` and try to send it right away.

        Raises QueryError when the id is already queued. Returns True when
        the id is no longer in the queue after the flush, False when it is
        still waiting.
        """
        queued_ids = [entry[1] for entry in self.queue]
        if int(id) in queued_ids:
            raise QueryError('This id is already queued')

        self.queue.append((int(priority), int(id)))
        self.save()

        # Try to flush queue to send it now.
        self.flush_queue()

        # The query has been sent only if its id left the queue.
        for _priority, queued_id in self.queue:
            if queued_id == int(id):
                return False
        return True

    def flush_queue(self):
        """Send the queued queries until the queue is empty or a send fails.

        The queue is sorted, then entries are taken from its end. An entry
        that could not be sent is put back and the flush stops. The queue is
        saved in every case.
        """
        self.queue.sort()

        current_priority = 0
        current_id = None

        try:
            while self.queue:
                current_priority, current_id = self.queue.pop()

                if not current_id:
                    continue

                with self.browser:
                    sent = self.browser.send_charm(current_id)
                    if not sent:
                        self.queue.append((current_priority, current_id))
                        self.logger.info("Charm can't be send to %s" % current_id)
                        break
                    self.logger.info('Charm sent to %s' % current_id)

                # The charm has been sent (no exception raised): forget the
                # entry so that a later failure does not re-queue it.
                current_id = None
                current_priority = 0
        except BrowserUnavailable:
            # The entry being processed was not sent: put it back.
            if current_id is not None:
                self.queue.append((current_priority, current_id))
        finally:
            self.save()

View file

@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BrowserUnavailable
from weboob.capabilities.dating import Optimization
__all__ = ['Visibility']
class Visibility(Optimization):
    """Optimization that logs in again every five minutes while running."""

    def __init__(self, sched, browser):
        """Keep references to the scheduler and the browser."""
        self.sched = sched
        self.browser = browser
        self.cron = None

    def start(self):
        """Schedule the periodic reconnection and return True."""
        self.cron = self.sched.repeat(60*5, self.reconnect)
        return True

    def stop(self):
        """Cancel the periodic reconnection and return True."""
        self.sched.cancel(self.cron)
        self.cron = None
        return True

    def is_running(self):
        """Tell whether the periodic reconnection is currently scheduled."""
        return self.cron is not None

    def reconnect(self):
        """Log in again; a BrowserUnavailable error is logged and ignored."""
        try:
            with self.browser:
                self.browser.login()
        except BrowserUnavailable as e:
            # Best effort: the next scheduled run will try again. Report
            # through the browser's logger rather than on stdout.
            self.browser.logger.warning('Unable to reconnect: %s' % e)

View file

@ -174,3 +174,19 @@ class PostMessagePage(BasePage):
self.browser['r1'] = id
self.browser['body'] = content
self.browser.submit()
class VisitsPage(BasePage):
    """Page listing the visits received by the account."""

    def get_visits(self):
        """Return the visits found on the page.

        Every child of the third ``ul`` under the ``page_content`` element
        yields a dict ``{'who': {'id': ...}, 'date': ...}``: the id is the
        element's ``id`` attribute without its first four characters, the
        date is the stripped text of the first ``div/span`` below it.
        """
        root = self.document.getroot()
        visitors_list = self.parser.select(root, '//*[@id="page_content"]/ul[3]', method='xpath')[0]

        visits = []
        for item in visitors_list:
            # NOTE(review): the first four characters of the attribute are
            # presumably a fixed prefix -- confirm against the page markup.
            who = {'id': unicode(item.get('id')[4:])}
            stamp = self.parser.select(item, './/div/span', method='xpath')[0]
            visits.append({'who': who, 'date': unicode(stamp.text.strip())})
        return visits