Add the ability to specify a parser to use on a page handler

This commit is contained in:
Romain Bignon 2013-01-02 13:48:42 +01:00
commit db6ee276fa

View file

@ -216,7 +216,7 @@ class StandardBrowser(mechanize.Browser):
if parser is None: if parser is None:
parser = get_parser()() parser = get_parser()()
elif isinstance(parser, (tuple,list,str,unicode)): elif isinstance(parser, (tuple,list,basestring)):
parser = get_parser(parser)() parser = get_parser(parser)()
self.parser = parser self.parser = parser
self.lock = RLock() self.lock = RLock()
@ -324,14 +324,19 @@ class StandardBrowser(mechanize.Browser):
else: else:
self.logger.info(msg) self.logger.info(msg)
def get_document(self, result): def get_document(self, result, parser=None):
""" """
Get a parsed document from a stream. Get a parsed document from a stream.
:param result: HTML page stream :param result: HTML page stream
:type result: stream :type result: stream
""" """
return self.parser.parse(result, self.ENCODING) if parser is None:
parser = self.parser
elif isinstance(parser, (basestring, list, tuple)):
parser = get_parser(parser)()
return parser.parse(result, self.ENCODING)
def location(self, *args, **kwargs): def location(self, *args, **kwargs):
""" """
@ -622,6 +627,7 @@ class BaseBrowser(StandardBrowser):
# Find page from url # Find page from url
pageCls = None pageCls = None
parser = None
page_groups = None page_groups = None
page_group_dict = None page_group_dict = None
for key, value in self.PAGES.items(): for key, value in self.PAGES.items():
@ -634,7 +640,13 @@ class BaseBrowser(StandardBrowser):
regexp = key regexp = key
m = regexp.search(result.geturl()) m = regexp.search(result.geturl())
if m: if m:
pageCls = value if isinstance(value, (list, tuple)):
pageCls = value[0]
parser = value[1]
else:
pageCls = value
parser = self.parser
page_groups = m.groups() page_groups = m.groups()
page_group_dict = m.groupdict() page_group_dict = m.groupdict()
break break
@ -652,7 +664,7 @@ class BaseBrowser(StandardBrowser):
if self.SAVE_RESPONSES: if self.SAVE_RESPONSES:
self.save_response(result) self.save_response(result)
document = self.get_document(result) document = self.get_document(result, parser)
self.page = pageCls(self, document, result.geturl(), groups=page_groups, group_dict=page_group_dict, logger=self.logger) self.page = pageCls(self, document, result.geturl(), groups=page_groups, group_dict=page_group_dict, logger=self.logger)
if not no_login and self.password is not None and not self.is_logged(): if not no_login and self.password is not None and not self.is_logged():