split BaseBrowser into new class StandardBrowser which is usable without inheriting it
This commit is contained in:
parent
cc626bfd81
commit
5337e53b49
3 changed files with 176 additions and 147 deletions
|
|
@ -289,8 +289,8 @@ class BaseApplication(object):
|
||||||
|
|
||||||
if self.options.debug or self.options.save_responses:
|
if self.options.debug or self.options.save_responses:
|
||||||
level = logging.DEBUG
|
level = logging.DEBUG
|
||||||
from weboob.tools.browser import BaseBrowser
|
from weboob.tools.browser import StandardBrowser
|
||||||
BaseBrowser.DEBUG_MECHANIZE = True
|
StandardBrowser.DEBUG_MECHANIZE = True
|
||||||
# required to actually display or save the stuff
|
# required to actually display or save the stuff
|
||||||
logger = logging.getLogger("mechanize")
|
logger = logging.getLogger("mechanize")
|
||||||
logger.setLevel(logging.INFO)
|
logger.setLevel(logging.INFO)
|
||||||
|
|
@ -306,8 +306,8 @@ class BaseApplication(object):
|
||||||
if self.options.save_responses:
|
if self.options.save_responses:
|
||||||
responses_dirname = tempfile.mkdtemp(prefix='weboob_session_')
|
responses_dirname = tempfile.mkdtemp(prefix='weboob_session_')
|
||||||
print >>sys.stderr, 'Debug data will be saved in this directory: %s' % responses_dirname
|
print >>sys.stderr, 'Debug data will be saved in this directory: %s' % responses_dirname
|
||||||
BaseBrowser.SAVE_RESPONSES = True
|
StandardBrowser.SAVE_RESPONSES = True
|
||||||
BaseBrowser.responses_dirname = responses_dirname
|
StandardBrowser.responses_dirname = responses_dirname
|
||||||
self.add_logging_file_handler(os.path.join(responses_dirname, 'debug.log'))
|
self.add_logging_file_handler(os.path.join(responses_dirname, 'debug.log'))
|
||||||
|
|
||||||
# file logger
|
# file logger
|
||||||
|
|
|
||||||
|
|
@ -21,8 +21,10 @@
|
||||||
from weboob.tools.browser.browser import BrowserIncorrectPassword, BrowserBanned, \
|
from weboob.tools.browser.browser import BrowserIncorrectPassword, BrowserBanned, \
|
||||||
BrowserUnavailable, BrowserRetry, \
|
BrowserUnavailable, BrowserRetry, \
|
||||||
BrowserHTTPNotFound, BrowserHTTPError, \
|
BrowserHTTPNotFound, BrowserHTTPError, \
|
||||||
BasePage, BaseBrowser, BrokenPageError
|
BasePage, BaseBrowser, BrokenPageError, \
|
||||||
|
StandardBrowser
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
|
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
|
||||||
'BrowserHTTPNotFound', 'BrowserHTTPError', 'BasePage', 'BaseBrowser', 'BrokenPageError']
|
'BrowserHTTPNotFound', 'BrowserHTTPError', 'BasePage', 'BaseBrowser',
|
||||||
|
'BrokenPageError', 'StandardBrowser']
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,7 @@ else:
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
|
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
|
||||||
'BrowserHTTPNotFound', 'BrowserHTTPError', 'BasePage', 'BaseBrowser']
|
'BrowserHTTPNotFound', 'BrowserHTTPError', 'BasePage', 'BaseBrowser', 'StandardBrowser']
|
||||||
|
|
||||||
|
|
||||||
# Exceptions
|
# Exceptions
|
||||||
|
|
@ -116,17 +116,23 @@ class BasePage(object):
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class BaseBrowser(mechanize.Browser):
|
def check_location(func):
|
||||||
"""
|
def inner(self, *args, **kwargs):
|
||||||
Base browser class to navigate on a website.
|
if args and isinstance(args[0], basestring):
|
||||||
"""
|
url = args[0]
|
||||||
|
if url.startswith('/') and (not self.request or self.request.host != self.DOMAIN):
|
||||||
|
url = '%s://%s%s' % (self.PROTOCOL, self.DOMAIN, url)
|
||||||
|
url = re.sub('(.*)#.*', r'\1', url)
|
||||||
|
|
||||||
|
args = (url,) + args[1:]
|
||||||
|
return func(self, *args, **kwargs)
|
||||||
|
return inner
|
||||||
|
|
||||||
|
class StandardBrowser(mechanize.Browser):
|
||||||
|
|
||||||
# ------ Class attributes --------------------------------------
|
# ------ Class attributes --------------------------------------
|
||||||
|
|
||||||
DOMAIN = None
|
|
||||||
PROTOCOL = 'http'
|
|
||||||
ENCODING = 'utf-8'
|
ENCODING = 'utf-8'
|
||||||
PAGES = {}
|
|
||||||
USER_AGENTS = {
|
USER_AGENTS = {
|
||||||
'desktop_firefox': 'Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.2.13) Gecko/20101209 Fedora/3.6.13-1.fc13 Firefox/3.6.13',
|
'desktop_firefox': 'Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.2.13) Gecko/20101209 Fedora/3.6.13-1.fc13 Firefox/3.6.13',
|
||||||
'android': 'Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17',
|
'android': 'Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17',
|
||||||
|
|
@ -141,33 +147,6 @@ class BaseBrowser(mechanize.Browser):
|
||||||
responses_dirname = None
|
responses_dirname = None
|
||||||
responses_count = 0
|
responses_count = 0
|
||||||
|
|
||||||
# ------ Abstract methods --------------------------------------
|
|
||||||
|
|
||||||
def home(self):
|
|
||||||
"""
|
|
||||||
Go to the home page.
|
|
||||||
"""
|
|
||||||
if self.DOMAIN is not None:
|
|
||||||
self.location('%s://%s/' % (self.PROTOCOL, self.DOMAIN))
|
|
||||||
|
|
||||||
def login(self):
|
|
||||||
"""
|
|
||||||
Login to the website.
|
|
||||||
|
|
||||||
This function is called when is_logged() returns False and the password
|
|
||||||
attribute is not None.
|
|
||||||
"""
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
def is_logged(self):
|
|
||||||
"""
|
|
||||||
Return True if we are logged on website. When Browser tries to access
|
|
||||||
to a page, if this method returns False, it calls login().
|
|
||||||
|
|
||||||
It is never called if the password attribute is None.
|
|
||||||
"""
|
|
||||||
raise NotImplementedError()
|
|
||||||
|
|
||||||
# ------ Browser methods ---------------------------------------
|
# ------ Browser methods ---------------------------------------
|
||||||
|
|
||||||
# I'm not a robot, so disable the check of permissions in robots.txt.
|
# I'm not a robot, so disable the check of permissions in robots.txt.
|
||||||
|
|
@ -175,22 +154,17 @@ class BaseBrowser(mechanize.Browser):
|
||||||
default_features.remove('_robots')
|
default_features.remove('_robots')
|
||||||
default_features.remove('_refresh')
|
default_features.remove('_refresh')
|
||||||
|
|
||||||
def __init__(self, username=None, password=None, firefox_cookies=None,
|
def __init__(self, firefox_cookies=None, parser=None, history=NoHistory(), proxy=None, logger=None,
|
||||||
parser=None, history=NoHistory(), proxy=None, logger=None,
|
factory=None):
|
||||||
factory=None, get_home=True):
|
|
||||||
"""
|
"""
|
||||||
Constructor of Browser.
|
Constructor of Browser.
|
||||||
|
|
||||||
@param username [str] username on website.
|
|
||||||
@param password [str] password on website. If it is None, Browser will
|
|
||||||
not try to login.
|
|
||||||
@param firefox_cookies [str] Path to cookies' sqlite file.
|
@param firefox_cookies [str] Path to cookies' sqlite file.
|
||||||
@param parser [IParser] parser to use on HTML files.
|
@param parser [IParser] parser to use on HTML files.
|
||||||
@param hisory [object] History manager. Default value is an object
|
@param history [object] History manager. Default value is an object
|
||||||
which does not keep history.
|
which does not keep history.
|
||||||
@param proxy [str] proxy URL to use.
|
@param proxy [str] proxy URL to use.
|
||||||
@param factory [object] Mechanize factory. None to use Mechanize's default.
|
@param factory [object] Mechanize factory. None to use Mechanize's default.
|
||||||
@param get_home [bool] Try to get the homepage.
|
|
||||||
"""
|
"""
|
||||||
mechanize.Browser.__init__(self, history=history, factory=factory)
|
mechanize.Browser.__init__(self, history=history, factory=factory)
|
||||||
self.logger = getLogger('browser', logger)
|
self.logger = getLogger('browser', logger)
|
||||||
|
|
@ -219,13 +193,9 @@ class BaseBrowser(mechanize.Browser):
|
||||||
|
|
||||||
if parser is None:
|
if parser is None:
|
||||||
parser = get_parser()()
|
parser = get_parser()()
|
||||||
elif isinstance(parser, (tuple,list)):
|
elif isinstance(parser, (tuple,list,str,unicode)):
|
||||||
parser = get_parser(parser)()
|
parser = get_parser(parser)()
|
||||||
self.parser = parser
|
self.parser = parser
|
||||||
self.page = None
|
|
||||||
self.last_update = 0.0
|
|
||||||
self.username = username
|
|
||||||
self.password = password
|
|
||||||
self.lock = RLock()
|
self.lock = RLock()
|
||||||
|
|
||||||
if self.DEBUG_HTTP:
|
if self.DEBUG_HTTP:
|
||||||
|
|
@ -236,43 +206,12 @@ class BaseBrowser(mechanize.Browser):
|
||||||
# Enable log messages from mechanize.Browser
|
# Enable log messages from mechanize.Browser
|
||||||
self.set_debug_redirects(True)
|
self.set_debug_redirects(True)
|
||||||
|
|
||||||
if self.password and get_home:
|
|
||||||
try:
|
|
||||||
self.home()
|
|
||||||
# Do not abort the build of browser when the website is down.
|
|
||||||
except BrowserUnavailable:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
self.lock.acquire()
|
self.lock.acquire()
|
||||||
|
|
||||||
def __exit__(self, t, v, tb):
|
def __exit__(self, t, v, tb):
|
||||||
self.lock.release()
|
self.lock.release()
|
||||||
|
|
||||||
def pageaccess(func):
|
|
||||||
def inner(self, *args, **kwargs):
|
|
||||||
if not self.page or self.password and not self.page.is_logged():
|
|
||||||
self.home()
|
|
||||||
|
|
||||||
return func(self, *args, **kwargs)
|
|
||||||
return inner
|
|
||||||
|
|
||||||
@pageaccess
|
|
||||||
def keepalive(self):
|
|
||||||
self.home()
|
|
||||||
|
|
||||||
def check_location(func):
|
|
||||||
def inner(self, *args, **kwargs):
|
|
||||||
if args and isinstance(args[0], basestring):
|
|
||||||
url = args[0]
|
|
||||||
if url.startswith('/') and (not self.request or self.request.host != self.DOMAIN):
|
|
||||||
url = '%s://%s%s' % (self.PROTOCOL, self.DOMAIN, url)
|
|
||||||
url = re.sub('(.*)#.*', r'\1', url)
|
|
||||||
|
|
||||||
args = (url,) + args[1:]
|
|
||||||
return func(self, *args, **kwargs)
|
|
||||||
return inner
|
|
||||||
|
|
||||||
@check_location
|
@check_location
|
||||||
@retry(BrowserHTTPError, tries=3)
|
@retry(BrowserHTTPError, tries=3)
|
||||||
def openurl(self, *args, **kwargs):
|
def openurl(self, *args, **kwargs):
|
||||||
|
|
@ -337,6 +276,155 @@ class BaseBrowser(mechanize.Browser):
|
||||||
else:
|
else:
|
||||||
self.logger.info(msg)
|
self.logger.info(msg)
|
||||||
|
|
||||||
|
def get_document(self, result):
|
||||||
|
return self.parser.parse(result, self.ENCODING)
|
||||||
|
|
||||||
|
def location(self, *args, **kwargs):
|
||||||
|
return self.get_document(self.openurl(*args, **kwargs))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def buildurl(base, *args, **kwargs):
|
||||||
|
"""
|
||||||
|
Build an URL and escape arguments.
|
||||||
|
You can give a series of tuples in *args (and the order is kept), or
|
||||||
|
a dict in **kwargs (but the order is lost).
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> buildurl('/blah.php', ('a', '&'), ('b', '='))
|
||||||
|
'/blah.php?a=%26&b=%3D'
|
||||||
|
>>> buildurl('/blah.php', a='&', b='=')
|
||||||
|
'/blah.php?b=%3D&a=%26'
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not args:
|
||||||
|
args = kwargs
|
||||||
|
if not args:
|
||||||
|
return base
|
||||||
|
else:
|
||||||
|
return '%s?%s' % (base, urllib.urlencode(args))
|
||||||
|
|
||||||
|
def str(self, s):
|
||||||
|
if isinstance(s, unicode):
|
||||||
|
s = s.encode('iso-8859-15', 'replace')
|
||||||
|
return s
|
||||||
|
|
||||||
|
def set_field(self, args, label, field=None, value=None, is_list=False):
|
||||||
|
"""
|
||||||
|
Set a value to a form field.
|
||||||
|
|
||||||
|
@param args [dict] arguments where to look for value.
|
||||||
|
@param label [str] label in args.
|
||||||
|
@param field [str] field name. If None, use label instead.
|
||||||
|
@param value [str] value to give on field.
|
||||||
|
@param is_list [bool] the field is a list.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
if not field:
|
||||||
|
field = label
|
||||||
|
if args.get(label, None) is not None:
|
||||||
|
if not value:
|
||||||
|
if is_list:
|
||||||
|
if isinstance(is_list, (list, tuple)):
|
||||||
|
try:
|
||||||
|
value = [self.str(is_list.index(args[label]))]
|
||||||
|
except ValueError, e:
|
||||||
|
if args[label]:
|
||||||
|
print >>sys.stderr, '[%s] %s: %s' % (label, args[label], e)
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
value = [self.str(args[label])]
|
||||||
|
else:
|
||||||
|
value = self.str(args[label])
|
||||||
|
self[field] = value
|
||||||
|
except ControlNotFoundError:
|
||||||
|
return
|
||||||
|
|
||||||
|
class BaseBrowser(StandardBrowser):
|
||||||
|
"""
|
||||||
|
Base browser class to navigate on a website.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# ------ Class attributes --------------------------------------
|
||||||
|
|
||||||
|
DOMAIN = None
|
||||||
|
PROTOCOL = 'http'
|
||||||
|
PAGES = {}
|
||||||
|
|
||||||
|
# ------ Abstract methods --------------------------------------
|
||||||
|
|
||||||
|
def home(self):
|
||||||
|
"""
|
||||||
|
Go to the home page.
|
||||||
|
"""
|
||||||
|
if self.DOMAIN is not None:
|
||||||
|
self.location('%s://%s/' % (self.PROTOCOL, self.DOMAIN))
|
||||||
|
|
||||||
|
def login(self):
|
||||||
|
"""
|
||||||
|
Login to the website.
|
||||||
|
|
||||||
|
This function is called when is_logged() returns False and the password
|
||||||
|
attribute is not None.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def is_logged(self):
|
||||||
|
"""
|
||||||
|
Return True if we are logged on website. When Browser tries to access
|
||||||
|
to a page, if this method returns False, it calls login().
|
||||||
|
|
||||||
|
It is never called if the password attribute is None.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
# ------ Browser methods ---------------------------------------
|
||||||
|
|
||||||
|
def __init__(self, username=None, password=None, firefox_cookies=None,
|
||||||
|
parser=None, history=NoHistory(), proxy=None, logger=None,
|
||||||
|
factory=None, get_home=True):
|
||||||
|
"""
|
||||||
|
Constructor of Browser.
|
||||||
|
|
||||||
|
@param username [str] username on website.
|
||||||
|
@param password [str] password on website. If it is None, Browser will
|
||||||
|
not try to login.
|
||||||
|
@param firefox_cookies [str] Path to cookies' sqlite file.
|
||||||
|
@param parser [IParser] parser to use on HTML files.
|
||||||
|
@param history [object] History manager. Default value is an object
|
||||||
|
which does not keep history.
|
||||||
|
@param proxy [str] proxy URL to use.
|
||||||
|
@param factory [object] Mechanize factory. None to use Mechanize's default.
|
||||||
|
@param get_home [bool] Try to get the homepage.
|
||||||
|
"""
|
||||||
|
StandardBrowser.__init__(self, firefox_cookies, parser, history, proxy, logger, factory)
|
||||||
|
self.page = None
|
||||||
|
self.last_update = 0.0
|
||||||
|
self.username = username
|
||||||
|
self.password = password
|
||||||
|
|
||||||
|
if self.password and get_home:
|
||||||
|
try:
|
||||||
|
self.home()
|
||||||
|
# Do not abort the build of browser when the website is down.
|
||||||
|
except BrowserUnavailable:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def pageaccess(func):
|
||||||
|
"""
|
||||||
|
Decorator to use around a method which accesses a page.
|
||||||
|
"""
|
||||||
|
def inner(self, *args, **kwargs):
|
||||||
|
if not self.page or self.password and not self.page.is_logged():
|
||||||
|
self.home()
|
||||||
|
|
||||||
|
return func(self, *args, **kwargs)
|
||||||
|
return inner
|
||||||
|
|
||||||
|
@pageaccess
|
||||||
|
def keepalive(self):
|
||||||
|
self.home()
|
||||||
|
|
||||||
def submit(self, *args, **kwargs):
|
def submit(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
Submit the selected form.
|
Submit the selected form.
|
||||||
|
|
@ -400,9 +488,6 @@ class BaseBrowser(mechanize.Browser):
|
||||||
self.home()
|
self.home()
|
||||||
self.location(*keep_args, **keep_kwargs)
|
self.location(*keep_args, **keep_kwargs)
|
||||||
|
|
||||||
def get_document(self, result):
|
|
||||||
return self.parser.parse(result, self.ENCODING)
|
|
||||||
|
|
||||||
# DO NOT ENABLE THIS FUCKING PIECE OF CODE EVEN IF IT WOULD BE BETTER
|
# DO NOT ENABLE THIS FUCKING PIECE OF CODE EVEN IF IT WOULD BE BETTER
|
||||||
# TO SANITIZE FUCKING HTML.
|
# TO SANITIZE FUCKING HTML.
|
||||||
#def _set_response(self, response, *args, **kwargs):
|
#def _set_response(self, response, *args, **kwargs):
|
||||||
|
|
@ -470,61 +555,3 @@ class BaseBrowser(mechanize.Browser):
|
||||||
|
|
||||||
if self._cookie:
|
if self._cookie:
|
||||||
self._cookie.save()
|
self._cookie.save()
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def buildurl(base, *args, **kwargs):
|
|
||||||
"""
|
|
||||||
Build an URL and escape arguments.
|
|
||||||
You can give a serie of tuples in *args (and the order is keept), or
|
|
||||||
a dict in **kwargs (but the order is lost).
|
|
||||||
|
|
||||||
Example:
|
|
||||||
>>> buildurl('/blah.php', ('a', '&'), ('b', '=')
|
|
||||||
'/blah.php?a=%26&b=%3D'
|
|
||||||
>>> buildurl('/blah.php', a='&', 'b'='=')
|
|
||||||
'/blah.php?b=%3D&a=%26'
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
if not args:
|
|
||||||
args = kwargs
|
|
||||||
if not args:
|
|
||||||
return base
|
|
||||||
else:
|
|
||||||
return '%s?%s' % (base, urllib.urlencode(args))
|
|
||||||
|
|
||||||
def str(self, s):
|
|
||||||
if isinstance(s, unicode):
|
|
||||||
s = s.encode('iso-8859-15', 'replace')
|
|
||||||
return s
|
|
||||||
|
|
||||||
def set_field(self, args, label, field=None, value=None, is_list=False):
|
|
||||||
"""
|
|
||||||
Set a value to a form field.
|
|
||||||
|
|
||||||
@param args [dict] arguments where to look for value.
|
|
||||||
@param label [str] label in args.
|
|
||||||
@param field [str] field name. If None, use label instead.
|
|
||||||
@param value [str] value to give on field.
|
|
||||||
@param is_list [bool] the field is a list.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
if not field:
|
|
||||||
field = label
|
|
||||||
if args.get(label, None) is not None:
|
|
||||||
if not value:
|
|
||||||
if is_list:
|
|
||||||
if isinstance(is_list, (list, tuple)):
|
|
||||||
try:
|
|
||||||
value = [self.str(is_list.index(args[label]))]
|
|
||||||
except ValueError, e:
|
|
||||||
if args[label]:
|
|
||||||
print >>sys.stderr, '[%s] %s: %s' % (label, args[label], e)
|
|
||||||
return
|
|
||||||
else:
|
|
||||||
value = [self.str(args[label])]
|
|
||||||
else:
|
|
||||||
value = self.str(args[label])
|
|
||||||
self[field] = value
|
|
||||||
except ControlNotFoundError:
|
|
||||||
return
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue