Add the ability to specify a parser to use for a page handler
This commit is contained in:
parent
75e482fb5f
commit
db6ee276fa
1 changed file with 17 additions and 5 deletions
|
|
@ -216,7 +216,7 @@ class StandardBrowser(mechanize.Browser):
|
||||||
|
|
||||||
if parser is None:
|
if parser is None:
|
||||||
parser = get_parser()()
|
parser = get_parser()()
|
||||||
elif isinstance(parser, (tuple,list,str,unicode)):
|
elif isinstance(parser, (tuple,list,basestring)):
|
||||||
parser = get_parser(parser)()
|
parser = get_parser(parser)()
|
||||||
self.parser = parser
|
self.parser = parser
|
||||||
self.lock = RLock()
|
self.lock = RLock()
|
||||||
|
|
@ -324,14 +324,19 @@ class StandardBrowser(mechanize.Browser):
|
||||||
else:
|
else:
|
||||||
self.logger.info(msg)
|
self.logger.info(msg)
|
||||||
|
|
||||||
def get_document(self, result):
|
def get_document(self, result, parser=None):
|
||||||
"""
|
"""
|
||||||
Get a parsed document from a stream.
|
Get a parsed document from a stream.
|
||||||
|
|
||||||
:param result: HTML page stream
|
:param result: HTML page stream
|
||||||
:type result: stream
|
:type result: stream
|
||||||
"""
|
"""
|
||||||
return self.parser.parse(result, self.ENCODING)
|
if parser is None:
|
||||||
|
parser = self.parser
|
||||||
|
elif isinstance(parser, (basestring, list, tuple)):
|
||||||
|
parser = get_parser(parser)()
|
||||||
|
|
||||||
|
return parser.parse(result, self.ENCODING)
|
||||||
|
|
||||||
def location(self, *args, **kwargs):
|
def location(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|
@ -622,6 +627,7 @@ class BaseBrowser(StandardBrowser):
|
||||||
|
|
||||||
# Find page from url
|
# Find page from url
|
||||||
pageCls = None
|
pageCls = None
|
||||||
|
parser = None
|
||||||
page_groups = None
|
page_groups = None
|
||||||
page_group_dict = None
|
page_group_dict = None
|
||||||
for key, value in self.PAGES.items():
|
for key, value in self.PAGES.items():
|
||||||
|
|
@ -634,7 +640,13 @@ class BaseBrowser(StandardBrowser):
|
||||||
regexp = key
|
regexp = key
|
||||||
m = regexp.search(result.geturl())
|
m = regexp.search(result.geturl())
|
||||||
if m:
|
if m:
|
||||||
pageCls = value
|
if isinstance(value, (list, tuple)):
|
||||||
|
pageCls = value[0]
|
||||||
|
parser = value[1]
|
||||||
|
else:
|
||||||
|
pageCls = value
|
||||||
|
parser = self.parser
|
||||||
|
|
||||||
page_groups = m.groups()
|
page_groups = m.groups()
|
||||||
page_group_dict = m.groupdict()
|
page_group_dict = m.groupdict()
|
||||||
break
|
break
|
||||||
|
|
@ -652,7 +664,7 @@ class BaseBrowser(StandardBrowser):
|
||||||
if self.SAVE_RESPONSES:
|
if self.SAVE_RESPONSES:
|
||||||
self.save_response(result)
|
self.save_response(result)
|
||||||
|
|
||||||
document = self.get_document(result)
|
document = self.get_document(result, parser)
|
||||||
self.page = pageCls(self, document, result.geturl(), groups=page_groups, group_dict=page_group_dict, logger=self.logger)
|
self.page = pageCls(self, document, result.geturl(), groups=page_groups, group_dict=page_group_dict, logger=self.logger)
|
||||||
|
|
||||||
if not no_login and self.password is not None and not self.is_logged():
|
if not no_login and self.password is not None and not self.is_logged():
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue