weboob-devel/modules/cragr/pages/tokenextractor.py
2012-09-12 13:48:33 +02:00

53 lines
2 KiB
Python

# -*- coding: utf-8 -*-
# Copyright(C) 2010-2012 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
class TokenExtractor(object):
    """Extracts text tokens from an HTML document.

    Each element (together with its descendants) is only walked once per
    extractor lifetime; call clear() to start a fresh extraction without
    creating a new instance.
    """

    def __init__(self):
        # Elements already walked by extract_tokens(). A set gives O(1)
        # membership tests; lxml elements hash by identity, so `in` keeps
        # the same identity semantics the original list had.
        self.iterated_elements = set()

    def clear(self):
        """
        Reset any content stored within a TokenExtractor object. Useful to start
        a new parsing without creating a new instance.
        """
        self.iterated_elements = set()

    def element_iterated_already(self, html_element):
        """Return True if this element, or any of its ancestors, was already walked.

        NOTE(review): only ancestors are checked (via iterancestors), so
        iterating a parent *after* one of its children will re-yield the
        child's text — matches the original behavior.
        """
        if html_element in self.iterated_elements:
            return True
        for ancestor in html_element.iterancestors():
            if ancestor in self.iterated_elements:
                return True
        return False

    def extract_tokens(self, html_element):
        """Yield stripped, relevant text tokens found under html_element.

        Elements already visited in a previous call (directly, or via an
        ancestor) are skipped entirely.
        """
        if self.element_iterated_already(html_element):
            return
        self.iterated_elements.add(html_element)
        for text in html_element.itertext():
            # Normalize non-breaking spaces and newlines to plain spaces
            # before tokenizing.
            text = text.replace(u'\xa0', ' ')
            text = text.replace("\n", ' ')
            for token in self.split_text_into_smaller_tokens(text):
                if self.token_looks_relevant(token):
                    yield token.strip()

    @staticmethod
    def split_text_into_smaller_tokens(text):
        """Split a text run on tab characters, yielding each piece."""
        for subtext in text.split('\t'):
            yield subtext

    @staticmethod
    def token_looks_relevant(token):
        """A token is relevant when it has more than one non-whitespace-padded character."""
        return len(token.strip()) > 1