handle "Refresh" meta tag

This commit is contained in:
Romain Bignon 2015-02-21 17:48:36 +01:00
commit 0445c92a77

View file

@ -23,6 +23,7 @@ import warnings
from io import BytesIO
import codecs
from cgi import parse_header
import urlparse
import requests
@ -193,7 +194,7 @@ class Page(object):
overriden in modules pages to preprocess or postprocess data. It must
return an object -- that will be assigned to :attr:`doc`.
"""
raise NotImplemented
raise NotImplementedError()
def detect_encoding(self):
"""
@ -463,6 +464,22 @@ class HTMLPage(Page):
super(HTMLPage, self).__init__(*args, **kwargs)
def on_load(self):
    """
    Default load hook: handle a "Refresh" meta tag, if the page has one.

    The whole job is delegated to :meth:`handle_refresh`.
    """
    self.handle_refresh()
def handle_refresh(self):
    """
    Follow the first usable "Refresh" meta tag found in the document head.

    Every ``<meta http-equiv="refresh">`` element under ``<head>`` is
    examined in document order. Its ``content`` attribute is matched
    against ``self.browser.REFRESH_RE``; elements that do not match are
    skipped. For the first one that matches, the ``url`` group is resolved
    against :attr:`url`, the target is logged and the browser is sent
    there with ``self.browser.location``. Later tags are ignored.

    Nothing happens when no tag matches.
    """
    # The http-equiv keyword is case-insensitive in HTML, so "refresh",
    # "Refresh" and "REFRESH" must all be honoured. XPath 1.0 has no
    # lower-case(); translate() folds just the letters that matter here.
    query = '//head/meta[translate(@http-equiv, "REFSH", "refsh")="refresh"]'

    for refresh in self.doc.xpath(query):
        m = self.browser.REFRESH_RE.match(refresh.get('content', ''))
        if not m:
            # Unparseable content: try the next tag.
            continue

        # NOTE(review): REFRESH_RE is assumed to expose an optional "url"
        # named group. When it is absent the value is None and urljoin()
        # returns self.url unchanged, i.e. the current page is reloaded.
        url = urlparse.urljoin(self.url, m.groupdict().get('url'))
        self.logger.info('Redirecting to %s', url)
        self.browser.location(url)
        # Only the first valid refresh directive is followed.
        break
def define_xpath_functions(self, ns):
"""
Define XPath functions on the given lxml function namespace.