From 882e88da0715e448407b18ee7416e56a61323080 Mon Sep 17 00:00:00 2001 From: Xavier G Date: Sun, 9 Sep 2012 17:33:08 +0200 Subject: [PATCH] Slightly improved the TokenExtractor class. --- modules/cragr/pages/tokenextractor.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/cragr/pages/tokenextractor.py b/modules/cragr/pages/tokenextractor.py index 54c5128b..9758d40b 100644 --- a/modules/cragr/pages/tokenextractor.py +++ b/modules/cragr/pages/tokenextractor.py @@ -22,7 +22,12 @@ from lxml import html class TokenExtractor: """ Extracts texts token from an HTML document """ - iterated_elements = [] + def __init__(self): + self.iterated_elements = [] + def clear(self): + """ Reset any content stored within a TokenExtractor object. Useful to + start a new parsing without creating a new instance. """ + self.iterated_elements = [] def element_iterated_already(self, html_element): if html_element in self.iterated_elements: return True