From 6a108221db7f04f6ce222ca833e618e139d4e335 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Wed, 25 Feb 2015 11:30:13 +0100 Subject: [PATCH] add HTMLPage.REFRESH_MAX parameter (disabled by default) to configure handle of refreshes --- weboob/browser/pages.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/weboob/browser/pages.py b/weboob/browser/pages.py index 819efd3f..48741c39 100644 --- a/weboob/browser/pages.py +++ b/weboob/browser/pages.py @@ -457,6 +457,14 @@ class HTMLPage(Page): The class to instanciate when using :meth:`HTMLPage.get_form`. Default to :class:`Form`. """ + REFRESH_MAX = None + """ + When handling a "Refresh" meta header, the page considers it only if the sleep + time is less than or equal to this value. + + Default value is None, which means refreshes aren't handled. + """ + def __init__(self, *args, **kwargs): import lxml.html as html ns = html.etree.FunctionNamespace(None) @@ -469,16 +477,22 @@ class HTMLPage(Page): self.handle_refresh() def handle_refresh(self): + if self.REFRESH_MAX is None: + return + for refresh in self.doc.xpath('//head/meta[@http-equiv="Refresh"]'): m = self.browser.REFRESH_RE.match(refresh.get('content', '')) if not m: continue url = urlparse.urljoin(self.url, m.groupdict().get('url', None)) + sleep = float(m.groupdict()['sleep']) - self.logger.info('Redirecting to %s', url) - self.browser.location(url) - break - + if sleep <= self.REFRESH_MAX: + self.logger.info('Redirecting to %s', url) + self.browser.location(url) + break + else: + self.logger.debug('Do not refresh to %s because %s > REFRESH_MAX(%s)' % (url, sleep, self.REFRESH_MAX)) def define_xpath_functions(self, ns): """