Move CleanHTML to html filters
This commit is contained in:
parent
9275de4bdb
commit
fbd8cf1a64
2 changed files with 17 additions and 17 deletions
|
|
@ -18,10 +18,13 @@
|
||||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
|
||||||
|
import lxml.html as html
|
||||||
from .standard import _Selector, _NO_DEFAULT, Filter, FilterError
|
from .standard import _Selector, _NO_DEFAULT, Filter, FilterError
|
||||||
|
from weboob.tools.html import html2text
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['CSS', 'XPath', 'XPathNotFound', 'AttributeNotFound', 'Attr', 'Link']
|
__all__ = ['CSS', 'XPath', 'XPathNotFound', 'AttributeNotFound',
|
||||||
|
'Attr', 'Link', 'CleanHTML']
|
||||||
|
|
||||||
|
|
||||||
class XPathNotFound(FilterError):
|
class XPathNotFound(FilterError):
|
||||||
|
|
@ -65,3 +68,15 @@ class Link(Attr):
|
||||||
|
|
||||||
def __init__(self, selector=None, default=_NO_DEFAULT):
|
def __init__(self, selector=None, default=_NO_DEFAULT):
|
||||||
super(Link, self).__init__(selector, 'href', default=default)
|
super(Link, self).__init__(selector, 'href', default=default)
|
||||||
|
|
||||||
|
class CleanHTML(Filter):
|
||||||
|
def filter(self, txt):
|
||||||
|
if isinstance(txt, (tuple, list)):
|
||||||
|
return u' '.join([self.clean(item) for item in txt])
|
||||||
|
return self.clean(txt)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def clean(cls, txt):
|
||||||
|
if not isinstance(txt, basestring):
|
||||||
|
txt = html.tostring(txt, encoding=unicode)
|
||||||
|
return html2text(txt)
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,6 @@ from dateutil.parser import parse as parse_date
|
||||||
from weboob.capabilities.base import empty
|
from weboob.capabilities.base import empty
|
||||||
from weboob.tools.compat import basestring
|
from weboob.tools.compat import basestring
|
||||||
from weboob.tools.exceptions import ParseError
|
from weboob.tools.exceptions import ParseError
|
||||||
from weboob.tools.html import html2text
|
|
||||||
from weboob.tools.browser2 import URL
|
from weboob.tools.browser2 import URL
|
||||||
|
|
||||||
class NoDefault(object):
|
class NoDefault(object):
|
||||||
|
|
@ -40,7 +39,7 @@ _NO_DEFAULT = NoDefault()
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['FilterError', 'ColumnNotFound', 'RegexpError', 'ItemNotFound',
|
__all__ = ['FilterError', 'ColumnNotFound', 'RegexpError', 'ItemNotFound',
|
||||||
'Filter', 'Base', 'Env', 'TableCell', 'CleanHTML', 'RawText',
|
'Filter', 'Base', 'Env', 'TableCell', 'RawText',
|
||||||
'CleanText', 'Lower', 'CleanDecimal', 'Field', 'Regexp', 'Map',
|
'CleanText', 'Lower', 'CleanDecimal', 'Field', 'Regexp', 'Map',
|
||||||
'DateTime', 'Date', 'Time', 'DateGuesser', 'Duration',
|
'DateTime', 'Date', 'Time', 'DateGuesser', 'Duration',
|
||||||
'MultiFilter', 'CombineDate', 'Format', 'Join', 'Type',
|
'MultiFilter', 'CombineDate', 'Format', 'Join', 'Type',
|
||||||
|
|
@ -223,20 +222,6 @@ class TableCell(_Filter):
|
||||||
return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
|
return self.default_or_raise(ColumnNotFound('Unable to find column %s' % ' or '.join(self.names)))
|
||||||
|
|
||||||
|
|
||||||
class CleanHTML(Filter):
|
|
||||||
def filter(self, txt):
|
|
||||||
if isinstance(txt, (tuple, list)):
|
|
||||||
return u' '.join([self.clean(item) for item in txt])
|
|
||||||
return self.clean(txt)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def clean(cls, txt):
|
|
||||||
if not isinstance(txt, basestring):
|
|
||||||
import lxml.html as html
|
|
||||||
txt = html.tostring(txt, encoding=unicode)
|
|
||||||
return html2text(txt)
|
|
||||||
|
|
||||||
|
|
||||||
class RawText(Filter):
|
class RawText(Filter):
|
||||||
def filter(self, el):
|
def filter(self, el):
|
||||||
if isinstance(el, (tuple, list)):
|
if isinstance(el, (tuple, list)):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue