Slightly improved the TokenExtractor class.
This commit is contained in:
parent
1f078850bf
commit
882e88da07
1 changed file with 6 additions and 1 deletion
|
|
@@ -22,7 +22,12 @@ from lxml import html
|
||||||
|
|
||||||
class TokenExtractor:
|
class TokenExtractor:
|
||||||
""" Extracts texts token from an HTML document """
|
""" Extracts texts token from an HTML document """
|
||||||
iterated_elements = []
|
def __init__(self):
|
||||||
|
self.iterated_elements = []
|
||||||
|
def clear(self):
|
||||||
|
""" Reset any content stored within a TokenExtractor: object. Useful to
|
||||||
|
start a new parsing without creating a new instance. """
|
||||||
|
self.iterated_elements = []
|
||||||
def element_iterated_already(self, html_element):
|
def element_iterated_already(self, html_element):
|
||||||
if html_element in self.iterated_elements:
|
if html_element in self.iterated_elements:
|
||||||
return True
|
return True
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue