From 296d6b7c9f9fb598d6d48cd79d766fd82fa7de1b Mon Sep 17 00:00:00 2001 From: Ahmed Boussadia Date: Wed, 7 May 2014 11:38:51 +0200 Subject: [PATCH] Starting work on okc module : adding optimization --- modules/okc/backend.py | 34 ++++- modules/okc/browser.py | 11 +- modules/okc/optim/__init__.py | 0 modules/okc/optim/priority_connection.py | 185 +++++++++++++++++++++++ modules/okc/optim/profiles_walker.py | 104 +++++++++++++ modules/okc/optim/queries_queue.py | 107 +++++++++++++ modules/okc/optim/visibility.py | 52 +++++++ modules/okc/pages.py | 16 ++ 8 files changed, 501 insertions(+), 8 deletions(-) create mode 100644 modules/okc/optim/__init__.py create mode 100644 modules/okc/optim/priority_connection.py create mode 100644 modules/okc/optim/profiles_walker.py create mode 100644 modules/okc/optim/queries_queue.py create mode 100644 modules/okc/optim/visibility.py diff --git a/modules/okc/backend.py b/modules/okc/backend.py index 1216688d..2d7b8ace 100644 --- a/modules/okc/backend.py +++ b/modules/okc/backend.py @@ -27,13 +27,15 @@ from dateutil.parser import parse as _parse_dt from weboob.capabilities.base import NotLoaded from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread -#from weboob.capabilities.dating import ICapDating, OptimizationNotFound, Event +from weboob.capabilities.dating import ICapDating, OptimizationNotFound, Event from weboob.capabilities.contact import ICapContact, ContactPhoto, Contact from weboob.tools.backend import BaseBackend, BackendConfig from weboob.tools.value import Value, ValueBackendPassword from weboob.tools.misc import local2utc from .browser import OkCBrowser +from .optim.visibility import Visibility +from .optim.queries_queue import QueriesQueue __all__ = ['OkCBackend'] @@ -64,7 +66,7 @@ def parse_dt(s): return local2utc(d) -class OkCBackend(BaseBackend, ICapMessages, ICapContact, ICapMessagesPost): +class OkCBackend(BaseBackend, ICapMessages, ICapContact, ICapMessagesPost, ICapDating): NAME 
= 'okc' MAINTAINER = u'Roger Philibert' EMAIL = 'roger.philibert@gmail.com' @@ -73,7 +75,7 @@ class OkCBackend(BaseBackend, ICapMessages, ICapContact, ICapMessagesPost): DESCRIPTION = u'OkCupid dating website' CONFIG = BackendConfig(Value('username', label='Username'), ValueBackendPassword('password', label='Password')) - STORAGE = { + STORAGE = {'queries_queue': {'queue': []}, 'sluts': {}, #'notes': {}, } @@ -82,6 +84,32 @@ class OkCBackend(BaseBackend, ICapMessages, ICapContact, ICapMessagesPost): def create_default_browser(self): return self.create_browser(self.config['username'].get(), self.config['password'].get()) + # ---- ICapDating methods --------------------- + def init_optimizations(self): + self.add_optimization('VISIBILITY', Visibility(self.weboob.scheduler, self.browser)) + self.add_optimization('QUERIES_QUEUE', QueriesQueue(self.weboob.scheduler, self.storage, self.browser)) + + def iter_events(self): + all_events = {} + with self.browser: + all_events[u'visits'] = (self.browser.get_visits, 'Visited by %s') + for type, (events, message) in all_events.iteritems(): + for event in events(): + e = Event(event['who']['id']) + + e.date = parse_dt(event['date']) + e.type = type + # if 'who' in event: + # e.contact = self._get_partial_contact(event['who']) + # else: + # e.contact = self._get_partial_contact(event) + + # if not e.contact: + # continue + + # e.message = message % e.contact.name + yield e + # ---- ICapMessages methods --------------------- def fill_thread(self, thread, fields): diff --git a/modules/okc/browser.py b/modules/okc/browser.py index 57b13247..177a6deb 100644 --- a/modules/okc/browser.py +++ b/modules/okc/browser.py @@ -22,7 +22,7 @@ import urllib from weboob.tools.browser import BaseBrowser, BasePage from weboob.tools.ordereddict import OrderedDict -from .pages import LoginPage, ThreadPage, MessagesPage, PostMessagePage, ProfilePage, PhotosPage +from .pages import LoginPage, ThreadPage, MessagesPage, PostMessagePage, ProfilePage, 
PhotosPage, VisitsPage __all__ = ['OkCBrowser'] @@ -43,6 +43,7 @@ class OkCBrowser(BaseBrowser): ('http://%s/messages\?.*' % DOMAIN, MessagesPage), ('http://%s/profile/.*/photos' % DOMAIN, PhotosPage), ('http://%s/profile/[^/]*' % DOMAIN, ProfilePage), + ('http://%s/visitors' % DOMAIN, VisitsPage) )) logged_in = False @@ -120,10 +121,10 @@ class OkCBrowser(BaseBrowser): # r = self.api_request('me', 'flashs') # return r['result']['all'] - #@check_login - #def get_visits(self): - # r = self.api_request('me', 'visits') - # return r['result']['news'] + r['result']['olds'] + @check_login + def get_visits(self): + self.location('http://m.okcupid.com/visitors') + return self.page.get_visits() @check_login def get_threads_list(self, count=30): diff --git a/modules/okc/optim/__init__.py b/modules/okc/optim/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/modules/okc/optim/priority_connection.py b/modules/okc/optim/priority_connection.py new file mode 100644 index 00000000..64b0f711 --- /dev/null +++ b/modules/okc/optim/priority_connection.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010-2011 Romain Bignon +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . 
import random

from weboob.tools.browser import BrowserUnavailable, BrowserIncorrectPassword
from weboob.capabilities.dating import Optimization
from weboob.capabilities.account import AccountRegisterError
from weboob.tools.log import getLogger
from weboob.tools.value import Value, ValuesDict, ValueInt

# NOTE(review): this file lives under modules/okc but still depends on the
# AuM browser, captcha and exception classes -- confirm whether it is meant
# to be ported to the OkC browser before it is enabled.
from aum.captcha import CaptchaError
from aum.exceptions import AdopteWait, AdopteBanned
from aum.browser import AuMBrowser


__all__ = ['PriorityConnection']


class PriorityConnection(Optimization):
    """Keep a minimal number of fake "godchild" accounts and animate them.

    Two periodic jobs are scheduled by start():

    * check_godchilds() registers new fake accounts, with the logged-in
      account as godfather, until the configured minimum is reached;
    * activity_fakes() logs in as one stored fake account and makes it rate
      and deblock one profile returned by a search.

    The configuration and the fake accounts are kept in storage under the
    'priority_connection' key.
    """

    CONFIG = ValuesDict(ValueInt('minimal', label='Minimal of godchilds', default=5),
                        Value('domain', label='Domain to use for fake accounts emails', default='aum.example.com'),
                        ValueInt('interval', label='Interval of checks (seconds)', default=3600)
                        )

    def __init__(self, sched, storage, browser):
        """Remember the collaborators and load the stored configuration.

        :param sched: scheduler providing repeat() and cancel()
        :param storage: backend storage providing get(), set() and save()
        :param browser: logged-in browser; its logger and proxy are reused
        """
        self.sched = sched
        self.storage = storage
        self.browser = browser
        self.logger = getLogger('priorityconn', browser.logger)

        self.config = storage.get('priority_connection', 'config', default=None)
        # An empty dict means "never configured": start() then refuses to run.
        if self.config == {}:
            self.config = None

        self.check_cron = None
        self.activity_cron = None

    def start(self):
        """Schedule both periodic jobs; return False when not configured."""
        if self.config is None:
            return False

        self.check_cron = self.sched.repeat(int(self.config['interval']), self.check_godchilds)
        self.activity_cron = self.sched.repeat(600, self.activity_fakes)
        return True

    def stop(self):
        """Cancel both periodic jobs and return True."""
        self.sched.cancel(self.check_cron)
        self.check_cron = None
        self.sched.cancel(self.activity_cron)
        self.activity_cron = None
        return True

    def is_running(self):
        """Return True while the godchild check is scheduled."""
        return self.check_cron is not None

    def set_config(self, params):
        """Replace the configuration and persist it to storage."""
        self.config = params
        self.storage.set('priority_connection', 'config', self.config)
        self.storage.save()

    def get_config(self):
        """Return the current configuration (None when not configured)."""
        return self.config

    def generate_name(self):
        """Return a random login that collides with no stored fake account.

        The login is 8 letters alternating consonant and vowel; underscores
        are appended for as long as it matches an already stored fake.
        """
        login = u''
        for position in range(8):
            if position % 2:
                login += random.choice(u'aeiou')
            else:
                login += random.choice(u'bcdfghjklmnprstv')

        # Nothing is stored before the first fake is created, so fall back to
        # an empty mapping instead of failing on a missing value.
        fakes = self.storage.get('priority_connection', 'fakes', default={}) or {}
        domain = self.config['domain']
        # Fakes are stored under their bare name (see _register_godchild), so
        # the bare login must be tested; the e-mail form is still checked in
        # case older entries were keyed that way.
        while login in fakes or ('%s@%s' % (login, domain)) in fakes:
            login += '_'
        return login

    def generate_password(self):
        """Return a random password made of 8 hexadecimal digits."""
        return '%08x' % random.randint(1, int('ffffffff', 16))

    def check_godchilds(self):
        """Register fake accounts until 'minimal' godchilds exist.

        Returns without doing anything when the browser is unavailable, when
        no godchild is missing, or when the godfather id could not be read.
        """
        my_id = None
        with self.browser:
            try:
                my_id = self.browser.get_my_id()
                nb_godchilds = self.browser.nb_godchilds()
            except AdopteWait:
                nb_godchilds = 0
            except BrowserUnavailable:
                # We'll check later
                return

        missing_godchilds = int(self.config['minimal']) - nb_godchilds

        self.logger.info('Missing godchilds: %s' % missing_godchilds)

        if missing_godchilds <= 0:
            return

        if my_id is None:
            # get_my_id() itself raised AdopteWait: there is no godfather id
            # to register godchilds with, so wait for the next check.
            self.logger.warning('Unable to get my id, godchilds will be checked later')
            return

        for _ in range(missing_godchilds):
            self._register_godchild(my_id)

    def _register_godchild(self, my_id):
        """Register one fake account with my_id as godfather and store it.

        Registration is retried with a freshly generated name and password
        each time it fails with AccountRegisterError or CaptchaError.
        """
        # NOTE(review): the retry is unbounded, exactly as before -- confirm
        # that a persistent registration failure cannot happen here.
        while True:
            name = self.generate_name()
            password = self.generate_password()

            browser = AuMBrowser('%s@%s' % (name, self.config['domain']), proxy=self.browser.proxy)
            try:
                browser.register(password=password,
                                 sex=1,
                                 birthday_d=random.randint(1, 28),
                                 birthday_m=random.randint(1, 12),
                                 birthday_y=random.randint(1975, 1990),
                                 zipcode=75001,
                                 country='fr',
                                 godfather=my_id)
            except AccountRegisterError as e:
                self.logger.warning('Unable to register account: %s' % e)
            except CaptchaError:
                self.logger.warning('Unable to solve captcha... Retrying')
            else:
                break

        # set nickname
        browser.set_nickname(name.strip('_').capitalize())
        # rate my own profile with good score
        for index in range(4):
            browser.rate(my_id, index, 5.0)

        # save fake in storage
        fake = {'username': browser.username,
                'password': password}
        self.storage.set('priority_connection', 'fakes', name, fake)
        self.storage.save()
        self.logger.info('Fake account "%s" created (godfather=%s)' % (name, my_id))

    def activity_fakes(self):
        """Make one stored fake account rate and deblock one profile.

        Fakes are tried in random order, each at most once per call, and the
        method returns after the first fake that completes the sequence. A
        fake that cannot log in or whose search returns nothing is skipped,
        so the call always terminates.
        """
        try:
            fakes = self.storage.get('priority_connection', 'fakes', default={})
            if not fakes:
                return

            names = list(fakes)
            random.shuffle(names)
            for name in names:
                fake = fakes[name]
                try:
                    browser = AuMBrowser(fake['username'], fake['password'], proxy=self.browser.proxy)
                except (AdopteBanned, BrowserIncorrectPassword) as e:
                    self.logger.warning('Fake %s can\'t login: %s' % (name, e))
                    continue

                profiles = browser.search_profiles(country="fr",
                                                   dist='10',
                                                   save=True)

                if not profiles:
                    continue

                profile = browser.get_profile(profiles.pop())
                # bad rate
                for index in range(4):
                    browser.rate(profile.get_id(), index, 0.6)
                # deblock
                browser.deblock(profile.get_id())
                return
        except BrowserUnavailable:
            # don't care
            pass
from random import randint

from weboob.tools.browser import BrowserUnavailable
from weboob.capabilities.dating import Optimization
from weboob.tools.log import getLogger


__all__ = ['ProfilesWalker']


class ProfilesWalker(Optimization):
    """Periodically search for profiles and visit them one at a time.

    enqueue_profiles() runs every 60 seconds and fills the queue with search
    results that were not visited yet; view_profile() pops one profile,
    fetches it and re-schedules itself after a random 5-10 second delay.
    Visited ids are persisted in storage under 'profiles_walker' / 'viewed'.
    """

    def __init__(self, sched, storage, browser):
        """Remember the collaborators and load the visited ids from storage.

        :param sched: scheduler providing repeat(), schedule() and cancel()
        :param storage: backend storage providing get(), set() and save()
        :param browser: browser used to search and fetch profiles
        """
        self.sched = sched
        self.storage = storage
        self.browser = browser
        self.logger = getLogger('walker', browser.logger)

        self.walk_cron = None
        self.view_cron = None
        # Nothing is stored on the first run: fall back to an empty list so
        # that set() is never handed a missing value.
        viewed = storage.get('profiles_walker', 'viewed', default=[]) or []
        self.visited_profiles = set(viewed)
        self.logger.info(u'Loaded %d already visited profiles from storage.' % len(self.visited_profiles))
        self.profiles_queue = set()

    def save(self):
        """Persist the visited ids (as a list) to storage."""
        self.storage.set('profiles_walker', 'viewed', list(self.visited_profiles))
        self.storage.save()

    def start(self):
        """Schedule the search job and the first profile visit."""
        self.walk_cron = self.sched.repeat(60, self.enqueue_profiles)
        self.view_cron = self.sched.schedule(randint(5, 10), self.view_profile)
        return True

    def stop(self):
        """Cancel both jobs; view_profile() stops re-scheduling itself."""
        self.sched.cancel(self.walk_cron)
        self.sched.cancel(self.view_cron)
        self.walk_cron = None
        self.view_cron = None
        return True

    def is_running(self):
        """Return True while the search job is scheduled."""
        return self.walk_cron is not None

    def enqueue_profiles(self):
        """Replace the queue with search results that were not visited yet.

        Does nothing when the browser is unavailable.
        """
        try:
            with self.browser:
                profiles_to_visit = self.browser.search_profiles().difference(self.visited_profiles)
                self.logger.info(u'Enqueuing profiles to visit: %s' % profiles_to_visit)
                self.profiles_queue = set(profiles_to_visit)
            self.save()
        except BrowserUnavailable:
            return

    def view_profile(self):
        """Visit one queued profile, then schedule the next visit.

        The profile goes back into the queue when the browser is unavailable.
        Any other error is logged and the profile is dropped from the queue
        without being marked as visited. The next visit is scheduled in every
        case unless stop() was called.
        """
        try:
            try:
                profile_id = self.profiles_queue.pop()
            except KeyError:
                return  # empty queue

            try:
                with self.browser:
                    profile = self.browser.get_profile(profile_id)
                self.logger.info(u'Visited profile %s (%s)' % (profile['pseudo'], profile_id))

                # do not forget that we visited this profile, to avoid re-visiting it.
                self.visited_profiles.add(profile_id)
                self.save()

            except BrowserUnavailable:
                # We consider this profile hasn't been [correctly] analysed
                self.profiles_queue.add(profile_id)
                return
            except Exception as e:
                # Keep the walker alive, but report through the logger (with
                # traceback) instead of printing to stdout.
                self.logger.exception('Unable to visit profile %s: %s', profile_id, e)
        finally:
            # view_cron is reset to None by stop(): only chain the next visit
            # while the optimization is still running.
            if self.view_cron is not None:
                self.view_cron = self.sched.schedule(randint(5, 10), self.view_profile)
from weboob.tools.browser import BrowserUnavailable
from weboob.capabilities.dating import Optimization
from weboob.capabilities.contact import QueryError
from weboob.tools.log import getLogger


__all__ = ['QueriesQueue']


class QueriesQueue(Optimization):
    """Queue of (priority, id) queries flushed through browser.send_charm().

    The queue is loaded from and saved to storage under
    'queries_queue' / 'queue'; while started, it is flushed every hour.
    """

    def __init__(self, sched, storage, browser):
        """Remember the collaborators and load the stored queue."""
        self.sched, self.storage, self.browser = sched, storage, browser
        self.logger = getLogger('queriesqueue', browser.logger)

        self.queue = storage.get('queries_queue', 'queue', default=[])

        self.check_cron = None

    def save(self):
        """Write the current queue back to storage."""
        storage = self.storage
        storage.set('queries_queue', 'queue', self.queue)
        storage.save()

    def start(self):
        """Schedule flush_queue() every 3600 seconds and return True."""
        self.check_cron = self.sched.repeat(3600, self.flush_queue)
        return True

    def stop(self):
        """Cancel the scheduled flush and return True."""
        cron, self.check_cron = self.check_cron, None
        self.sched.cancel(cron)
        return True

    def is_running(self):
        """Return True while the periodic flush is scheduled."""
        return self.check_cron is not None

    def enqueue_query(self, id, priority=999):
        """Queue a query for ``id`` and try to send it immediately.

        :returns: True when the id is no longer queued after the flush,
                  False when it is still waiting in the queue
        :raises QueryError: when the id is already queued
        """
        queued_ids = [entry[1] for entry in self.queue]
        wanted = int(id)
        if wanted in queued_ids:
            raise QueryError('This id is already queued')

        self.queue.append((int(priority), wanted))
        self.save()
        # Try to flush queue to send it now.
        self.flush_queue()

        # The query was sent if and only if its id left the queue.
        return not any(queued == wanted for _, queued in self.queue)

    def flush_queue(self):
        """Send queued queries until the queue is empty or one is refused.

        The queue is sorted and entries are taken from its end. An entry that
        send_charm() refuses is put back and the flush stops; an entry in
        flight when BrowserUnavailable is raised is put back as well. The
        queue is saved in every case.
        """
        self.queue.sort()

        priority, target = 0, None

        try:
            while self.queue:
                priority, target = self.queue.pop()

                # Entries with an empty id are dropped.
                if not target:
                    continue

                with self.browser:
                    sent = self.browser.send_charm(target)
                    if not sent:
                        self.queue.append((priority, target))
                        self.logger.info("Charm can't be send to %s" % target)
                        break
                    self.logger.info('Charm sent to %s' % target)

                # The charm went through without an exception: forget the
                # entry so that a later failure does not re-queue it.
                priority, target = 0, None
        except BrowserUnavailable:
            # The entry in flight was not sent: keep it for the next flush.
            if target is not None:
                self.queue.append((priority, target))
        finally:
            self.save()
from weboob.tools.browser import BrowserUnavailable
from weboob.capabilities.dating import Optimization


__all__ = ['Visibility']


class Visibility(Optimization):
    """Optimization that logs the browser in again every five minutes."""

    def __init__(self, sched, browser):
        """Remember the scheduler and the browser; nothing is scheduled yet."""
        self.cron = None
        self.sched = sched
        self.browser = browser

    def start(self):
        """Schedule reconnect() every 300 seconds and return True."""
        self.cron = self.sched.repeat(60*5, self.reconnect)
        return True

    def stop(self):
        """Cancel the scheduled reconnection and return True."""
        cron, self.cron = self.cron, None
        self.sched.cancel(cron)
        return True

    def is_running(self):
        """Return True while the reconnection job is scheduled."""
        return self.cron is not None

    def reconnect(self):
        """Log in again; a BrowserUnavailable error is printed, not raised."""
        try:
            with self.browser:
                self.browser.login()
        except BrowserUnavailable as e:
            print(str(e))


# The class below belongs to a different file of the same patch
# (modules/okc/pages.py), where BasePage is already in scope.
class VisitsPage(BasePage):
    """Page listing the visitors of the logged-in profile."""

    def get_visits(self):
        """Return one dict per visitor found in the visitors list.

        Each dict has the shape {'who': {'id': <unicode>}, 'date': <unicode>}.
        The id is the ``id`` attribute of the list item without its first
        four characters; the date is the stripped text of the first
        ``div/span`` under the item.
        """
        select = self.parser.select
        root = self.document.getroot()
        ul_item = select(root, '//*[@id="page_content"]/ul[3]', method='xpath')[0]

        def describe(li):
            # Same evaluation order as before: id first, then timestamp.
            who = unicode(li.get('id')[4:])
            when = unicode(select(li, './/div/span', method='xpath')[0].text.strip())
            return {'who': {'id': who}, 'date': when}

        return [describe(li) for li in ul_item]