handle "Refresh" meta tag
This commit is contained in:
parent
cedfb5670e
commit
0445c92a77
1 changed files with 18 additions and 1 deletions
|
|
@ -23,6 +23,7 @@ import warnings
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import codecs
|
import codecs
|
||||||
from cgi import parse_header
|
from cgi import parse_header
|
||||||
|
import urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|
@ -193,7 +194,7 @@ class Page(object):
|
||||||
overriden in modules pages to preprocess or postprocess data. It must
|
overriden in modules pages to preprocess or postprocess data. It must
|
||||||
return an object -- that will be assigned to :attr:`doc`.
|
return an object -- that will be assigned to :attr:`doc`.
|
||||||
"""
|
"""
|
||||||
raise NotImplemented
|
raise NotImplementedError()
|
||||||
|
|
||||||
def detect_encoding(self):
|
def detect_encoding(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -463,6 +464,22 @@ class HTMLPage(Page):
|
||||||
|
|
||||||
super(HTMLPage, self).__init__(*args, **kwargs)
|
super(HTMLPage, self).__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
def on_load(self):
|
||||||
|
# Default on_load handle "Refresh" meta tag.
|
||||||
|
self.handle_refresh()
|
||||||
|
|
||||||
|
def handle_refresh(self):
|
||||||
|
for refresh in self.doc.xpath('//head/meta[@http-equiv="Refresh"]'):
|
||||||
|
m = self.browser.REFRESH_RE.match(refresh.get('content', ''))
|
||||||
|
if not m:
|
||||||
|
continue
|
||||||
|
url = urlparse.urljoin(self.url, m.groupdict().get('url', None))
|
||||||
|
|
||||||
|
self.logger.info('Redirecting to %s', url)
|
||||||
|
self.browser.location(url)
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
def define_xpath_functions(self, ns):
|
def define_xpath_functions(self, ns):
|
||||||
"""
|
"""
|
||||||
Define XPath functions on the given lxml function namespace.
|
Define XPath functions on the given lxml function namespace.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue