Slightly improved the TokenExtractor class.

This commit is contained in:
Xavier G 2012-09-09 17:33:08 +02:00
commit 882e88da07

View file

@ -22,7 +22,12 @@ from lxml import html
class TokenExtractor:
""" Extracts text tokens from an HTML document """
iterated_elements = []
def __init__(self):
self.iterated_elements = []
def clear(self):
""" Reset any content stored within a TokenExtractor: object. Useful to
start a new parsing without creating a new instance. """
self.iterated_elements = []
def element_iterated_already(self, html_element):
if html_element in self.iterated_elements:
return True