fix parsing of HTML entities with HTMLParser

This commit is contained in:
Romain Bignon 2010-04-12 14:21:38 +02:00
commit 3027c1ece2

View file

@ -38,6 +38,7 @@ try:
HTMLTreeBuilder = TidyHTMLTreeBuilder.TidyHTMLTreeBuilder
except ImportError:
from HTMLParser import HTMLParser
import htmlentitydefs
class HTMLTreeBuilder(HTMLParser):
def __init__(self, encoding=None):
@ -58,6 +59,12 @@ except ImportError:
self._target.start(tag, dict(attrs))
self._target.end(tag)
def handle_charref(self, name):
    """Forward a numeric character reference to the target as text.

    HTMLParser passes the text found between '&#' and ';', so *name* is
    either decimal digits ("233") or an 'x'/'X' followed by hexadecimal
    digits ("xE9").  The previous implementation only understood the
    decimal form and raised ValueError on hexadecimal references.

    A reference that cannot be converted to a character (malformed
    digits, or a code point outside the range unichr accepts) is passed
    through as its literal source text instead of aborting the parse.
    """
    try:
        if name[:1] in ('x', 'X'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        text = unichr(codepoint)
    except (ValueError, OverflowError):
        # Same fallback the stdlib HTMLParser.unescape uses: keep the
        # reference verbatim rather than lose it or crash.
        text = u'&#%s;' % name
    self._target.data(text)
def handle_entityref(self, name):
    """Forward a named entity reference (e.g. "eacute") to the target.

    *name* is the entity name without the surrounding '&' and ';'.
    Known names are resolved through htmlentitydefs.name2codepoint and
    emitted as the corresponding unicode character.

    Unknown names -- frequent in real-world HTML, for instance an
    unescaped '&' inside a URL query string -- used to raise KeyError
    and abort the whole parse.  They are now passed through as the
    literal text '&name;'.
    """
    try:
        text = unichr(htmlentitydefs.name2codepoint[name])
    except KeyError:
        # Not a defined HTML entity: keep it visible in the output
        # instead of failing.
        text = u'&%s;' % name
    self._target.data(text)
def handle_data(self, data):
    """Pass a run of character data from the parser to the target as-is."""
    sink = self._target
    sink.data(data)