Slightly improved the TokenExtractor class.
This commit is contained in:
parent
1f078850bf
commit
882e88da07
1 changed files with 6 additions and 1 deletions
|
|
@@ -22,7 +22,12 @@ from lxml import html
|
|||
|
||||
class TokenExtractor:
|
||||
""" Extracts text tokens from an HTML document """
|
||||
iterated_elements = []
|
||||
def __init__(self):
|
||||
self.iterated_elements = []
|
||||
def clear(self):
|
||||
""" Reset any content stored within a TokenExtractor object. Useful to
|
||||
start a new parsing without creating a new instance. """
|
||||
self.iterated_elements = []
|
||||
def element_iterated_already(self, html_element):
|
||||
if html_element in self.iterated_elements:
|
||||
return True
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue