can change parser

This commit is contained in:
Romain Bignon 2010-03-23 22:38:29 +01:00
commit b920e20570

View file

@ -21,6 +21,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import mechanize import mechanize
import urllib2 import urllib2
import html5lib import html5lib
import ClientForm
from html5lib import treebuilders from html5lib import treebuilders
import re import re
import time import time
@ -59,6 +60,13 @@ class BasePage:
def loaded(self):
    """Hook invoked on a page right after it has been built.

    The default implementation does nothing.
    """
class StandardParser(html5lib.HTMLParser):
    """Default page parser: an html5lib parser producing a DOM tree.

    Input is always decoded as ISO-8859-1.
    """

    def __init__(self):
        # The base initializer is called through the class, so the instance
        # has to be passed explicitly; omitting it raises a TypeError.
        html5lib.HTMLParser.__init__(self, tree=treebuilders.getTreeBuilder("dom"))

    def parse(self, data):
        """Parse *data* and return the resulting document.

        data -- the HTML input handed to html5lib's ``HTMLParser.parse``.
        """
        # Same as above: pass ``self`` explicitly, otherwise ``data`` would be
        # taken for the parser instance.
        return html5lib.HTMLParser.parse(self, data, encoding='iso-8859-1')
class Browser(mechanize.Browser): class Browser(mechanize.Browser):
# ------ Class attributes -------------------------------------- # ------ Class attributes --------------------------------------
@ -84,7 +92,7 @@ class Browser(mechanize.Browser):
# ------ Browser methods --------------------------------------- # ------ Browser methods ---------------------------------------
def __init__(self, username, password=None, firefox_cookies=None): def __init__(self, username, password=None, firefox_cookies=None, parser=StandardParser):
mechanize.Browser.__init__(self, history=NoHistory()) mechanize.Browser.__init__(self, history=NoHistory())
self.addheaders = [ self.addheaders = [
['User-agent', self.USER_AGENT] ['User-agent', self.USER_AGENT]
@ -98,7 +106,7 @@ class Browser(mechanize.Browser):
else: else:
self.__cookie = None self.__cookie = None
self.__parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom")) self.__parser = parser()
self.page = None self.page = None
self.last_update = 0.0 self.last_update = 0.0
self.username = username self.username = username
@ -109,6 +117,9 @@ class Browser(mechanize.Browser):
except BrowserUnavailable: except BrowserUnavailable:
pass pass
def set_parser(self, parser):
    """Replace the parser used by this browser.

    parser -- a ready-to-use parser object; it is stored as given and not
              called, unlike the ``parser`` argument of the constructor,
              which is invoked to build the parser.
    """
    self.__parser = parser
def pageaccess(func): def pageaccess(func):
def inner(self, *args, **kwargs): def inner(self, *args, **kwargs):
if not self.page or not self.page.is_logged() and self.password: if not self.page or not self.page.is_logged() and self.password:
@ -207,7 +218,7 @@ class Browser(mechanize.Browser):
print '[%s] Gone on %s' % (self.username, result.geturl()) print '[%s] Gone on %s' % (self.username, result.geturl())
self.last_update = time.time() self.last_update = time.time()
document = self.__parser.parse(result, encoding='iso-8859-1') document = self.__parser.parse()
self.page = pageCls(self, document, result.geturl()) self.page = pageCls(self, document, result.geturl())
self.page.loaded() self.page.loaded()