rename things related to browsers

weboob.tools.browser -> weboob.deprecated.browser
weboob.tools.parsers -> weboob.deprecated.browser.parsers
weboob.tools.mech -> weboob.deprecated.mech
weboob.browser2 -> weboob.browser
weboob.core.exceptions -> weboob.exceptions

Also, the new tree for browser2 is:

weboob.browser: import weboob.browser.browsers.* and weboob.browser.url.*
weboob.browser.browsers: all browsers (including PagesBrowser and LoginBrowser)
weboob.browser.url: the URL class
weboob.browser.profiles: all Profile classes
weboob.browser.sessions: WeboobSession and FuturesSession
weboob.browser.cookies: cookie-related helpers
weboob.browser.pages: all Page and derived classes, and Form class
weboob.browser.exceptions: specific browser exceptions
weboob.browser.elements: AbstractElement classes, and 'method' decorator
weboob.browser.filters.*: all filters
This commit is contained in:
Romain Bignon 2014-10-07 00:23:18 +02:00
commit d61e15cf84
396 changed files with 1442 additions and 1382 deletions

View file

@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Christophe Benz, Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import logging
__all__ = ['get_parser', 'NoParserFound']
class NoParserFound(Exception):
    """Raised by get_parser() when no requested parser could be loaded."""
def load_lxml():
    """
    Import the lxml HTML parser on demand and return its class.

    The import is done at call time, so an import failure is raised from
    this call rather than when the package itself is imported.
    """
    from .lxmlparser import LxmlHtmlParser as parser_class
    return parser_class
def load_lxmlsoup():
    """
    Import the lxml elementsoup parser on demand and return its class.

    The import is done at call time, so an import failure is raised from
    this call rather than when the package itself is imported.
    """
    from .lxmlsoupparser import LxmlSoupParser as parser_class
    return parser_class
def load_xml():
    """
    Import the lxml XML parser on demand and return its class.

    The import is done at call time, so an import failure is raised from
    this call rather than when the package itself is imported.
    """
    from .lxmlparser import LxmlXmlParser as parser_class
    return parser_class
def load_json():
    """
    Import the JSON parser on demand and return its class.
    """
    # Not an HTML parser: keep it out of the default value of
    # get_parser()'s preference_order argument.
    from .jsonparser import JsonParser as parser_class
    return parser_class
def load_csv():
    """
    Import the CSV parser on demand and return its class.
    """
    # Not an HTML parser: keep it out of the default value of
    # get_parser()'s preference_order argument.
    from .csvparser import CsvParser as parser_class
    return parser_class
def load_raw():
    """
    Import the raw (pass-through) parser on demand and return its class.
    """
    # Not an HTML parser: keep it out of the default value of
    # get_parser()'s preference_order argument.
    from .iparser import RawParser as parser_class
    return parser_class
def get_parser(preference_order=('lxml', 'lxmlsoup')):
    """
    Get a parser class from a preference order list.

    Every entry ``kind`` is resolved to the module-level loader function
    named ``load_<kind>``. Entries are tried in order; the result of the
    first loader that imports successfully is returned.

    :param preference_order: a single parser name, or a tuple/list of
                             names ordered from most to least preferred
    :type preference_order: str or tuple or list
    :return: what the selected loader returns (the loaders of this module
             return a parser class, not an instance)
    :raises NoParserFound: if no entry could be resolved and loaded
    """
    if not isinstance(preference_order, (tuple, list)):
        preference_order = [preference_order]

    loaders = globals()
    for kind in preference_order:
        # Wrap kind in a 1-tuple so that a tuple entry is formatted as a
        # single value instead of being unpacked by the % operator.
        loader_name = 'load_%s' % (kind,)
        if loader_name not in loaders:
            # Unknown parser name: silently try the next preference.
            continue
        try:
            return loaders[loader_name]()
        except ImportError:
            # The backing library is missing; fall through to the next one.
            logging.debug('%s is not installed.', kind)

    # str() each entry so that a non-string preference cannot turn the
    # expected NoParserFound into a TypeError raised by join().
    requested = ','.join(str(kind) for kind in preference_order)
    raise NoParserFound("No parser found (%s)" % requested)

View file

@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2013 Laurent Bachelier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import csv
from .iparser import IParser
class Csv(object):
    """
    Result of a CSV parse.

    Attributes:
      header -- the first row if it is a header, None otherwise
      rows   -- the raw rows
      drows  -- the rows with cells indexed by header title
    """

    def __init__(self):
        # Start empty; each instance gets its own fresh lists.
        self.header, self.rows, self.drows = None, [], []
class CsvParser(IParser):
    """
    CSV Parser.

    Since CSV files are not normalized, this parser is intended to be
    derived: subclasses adjust DIALECT, FMTPARAMS and HEADER.

    NOTE(review): decode_row() and tostring() rely on the Python 2 builtins
    ``unicode`` and ``basestring``.
    """

    # Dialect name and extra formatting parameters passed to csv.reader().
    DIALECT = 'excel'
    FMTPARAMS = {}

    # If True, the first line is considered to be a header. This means the
    # rows are also available as dictionaries (Csv.drows).
    HEADER = False

    def parse(self, data, encoding=None):
        """
        Read every row of a CSV document.

        :param data: iterable of lines handed to csv.reader()
        :param encoding: if set, every cell is decoded with it
        :return: a Csv object; ``header`` is filled only when HEADER is
                 True, ``drows`` only when a non-empty header was read
        """
        reader = csv.reader(data, dialect=self.DIALECT, **self.FMTPARAMS)
        c = Csv()
        for row in reader:
            row = self.decode_row(row, encoding)
            if c.header is None and self.HEADER:
                c.header = row
                continue
            c.rows.append(row)
            if c.header:
                # zip() stops at the shorter of header and row, so a row
                # with more cells than the header no longer raises
                # IndexError; the surplus cells stay available in c.rows.
                c.drows.append(dict(zip(c.header, row)))
        return c

    def decode_row(self, row, encoding):
        """
        Decode each cell of a row.

        :param row: list of cells as produced by csv.reader()
        :param encoding: encoding name, or a false value to skip decoding
        :return: a new list of decoded cells, or ``row`` itself when no
                 encoding is given
        """
        if encoding:
            return [unicode(cell, encoding) for cell in row]
        return row

    def tostring(self, element):
        """
        Return element as a string; non-string values are converted with
        unicode(), strings are returned unchanged.
        """
        if not isinstance(element, basestring):
            return unicode(element)
        return element

View file

@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
__all__ = ['IParser', 'RawParser']
class IParser(object):
    """
    Parser interface: subclasses implement parse() and tostring(); the
    tag-stripping helpers are provided here.
    """

    def parse(self, data, encoding=None):
        """
        Parse a HTML document with a specific encoding to get a tree.

        @param data  [str] HTML document
                     (NOTE(review): implementations in this package read
                     from it like a file object -- confirm expected type)
        @param encoding  [str] encoding to use
        @return  an object with the structured document
        """
        raise NotImplementedError()

    def tostring(self, elem):
        """
        Serialize an element to its HTML string. Must be overridden.
        """
        raise NotImplementedError()

    def tocleanstring(self, elem):
        """
        Serialize an element with tostring(), then remove its markup
        with strip().
        """
        markup = self.tostring(elem)
        return self.strip(markup)

    def strip(self, data):
        """
        Replace every ``<...>`` tag of a HTML string with a single space
        and trim the surrounding whitespace.
        """
        without_tags = re.sub(r'<.*?>', ' ', data)
        return without_tags.strip()
class RawParser(IParser):
    """
    Parser doing no parsing at all: the "tree" is the raw document content.
    """

    def parse(self, data, encoding=None):
        """
        Return everything read from the file-like ``data``.

        ``encoding`` is accepted for interface compatibility and not used.
        """
        content = data.read()
        return content

    def tostring(self, elem):
        """
        Return ``elem`` unchanged.
        """
        return elem

View file

@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.json import json
from .iparser import IParser
__all__ = ['JsonParser']
class JsonParser(IParser):
    """
    Parser for JSON documents.
    """

    def parse(self, data, encoding=None):
        """
        Deserialize the JSON document read from the file-like ``data``.

        ``encoding`` is forwarded as-is to json.load().
        """
        document = json.load(data, encoding=encoding)
        return document

    def tostring(self, element):
        """
        Serialize ``element`` back to a JSON string.
        """
        serialized = json.dumps(element)
        return serialized

View file

@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Christophe Benz
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
import lxml.html as html
import lxml.etree as etree
from .iparser import IParser
from ..browser import BrokenPageError
__all__ = ['LxmlHtmlParser', 'LxmlXmlParser']
class LxmlParser(IParser):
    """
    Parser using lxml.

    Note that it is not available on every system.

    This base class reads ``self.module`` (the lxml module providing
    parse(), tostring() and fromstring()) but never sets it; the concrete
    subclasses assign it in their constructor and override get_parser().
    """

    def get_parser(self, encoding=None):
        """
        Return the lxml parser object to use for ``encoding``.

        The base implementation returns None, which parse() hands to
        ``self.module.parse`` unchanged.
        """
        # The original signature lacked ``self``, so the call made by
        # parse() raised TypeError unless a subclass overrode this method.
        return None

    def parse(self, data, encoding=None):
        """
        Parse a document into an lxml tree.

        :param data: document source handed to ``self.module.parse``
        :param encoding: if not None, a dedicated parser is requested from
                         get_parser() for this encoding
        :return: the object returned by ``self.module.parse``
        """
        if encoding is None:
            parser = None
        else:
            parser = self.get_parser(encoding=encoding)
        return self.module.parse(data, parser)

    def tostring(self, element):
        """
        Serialize an element to a unicode string.
        """
        return self.module.tostring(element, encoding=unicode)

    def tocleanstring(self, element):
        """
        Return the text content of an element, with every run of
        whitespace collapsed to a single space and the ends trimmed.
        """
        parts = [text.strip() for text in element.itertext()]
        txt = u' '.join(parts)  # 'foo   bar'
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        txt = re.sub(r'\s+', ' ', txt)  # 'foo bar'
        return txt.strip()

    def strip(self, s):
        """
        Parse a html/xml string and return its clean text content.
        """
        doc = self.module.fromstring(s)  # parse html/xml string
        return self.tocleanstring(doc)

    @classmethod
    def select(cls, element, selector, nb=None, method='cssselect', **kwargs):
        """
        Select one or many elements from an element, using lxml cssselect by default.

        Raises :class:`weboob.deprecated.browser.browser.BrokenPageError` if not found.

        :param element: element on which to apply selector
        :type element: object
        :param selector: CSS or XPath expression
        :type selector: str
        :param method: (cssselect|xpath)
        :type method: str
        :param nb: number of elements expected to be found. Use None for
                   undefined number (results are returned unchecked),
                   'many' to require at least two elements (zero or a
                   single match is rejected), or a positive int to
                   require at least that many elements
        :type nb: :class:`int` or :class:`str`
        :rtype: the first Element when nb is 1, the result list otherwise
        :raises NotImplementedError: if method is neither cssselect nor xpath
        :raises Exception: if nb has any other value (including 0 or a
                           negative int)
        """
        if method == 'cssselect':
            results = element.cssselect(selector, **kwargs)
        elif method == 'xpath':
            results = element.xpath(selector, **kwargs)
        else:
            raise NotImplementedError('Only the cssselect and xpath methods are supported')

        if nb is None:
            return results
        elif isinstance(nb, basestring) and nb == 'many':
            if results is None or len(results) == 0:
                raise BrokenPageError('Element not found with selector "%s"' % selector)
            elif len(results) == 1:
                raise BrokenPageError('Only one element found with selector "%s"' % selector)
            else:
                return results
        elif isinstance(nb, int) and nb > 0:
            if results is None:
                raise BrokenPageError('Element not found with selector "%s"' % selector)
            elif len(results) < nb:
                raise BrokenPageError('Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
            else:
                # More matches than requested are tolerated.
                return results[0] if nb == 1 else results
        else:
            raise Exception('Unhandled value for kwarg "nb": %s' % nb)
class LxmlHtmlParser(LxmlParser):
    """
    HTML flavour of the lxml parser, backed by lxml.html.

    Note that it is not available on every system.
    """

    def __init__(self, *args, **kwargs):
        # Arguments are accepted but not used; only the backing lxml
        # module is recorded.
        self.module = html

    def get_parser(self, encoding=None):
        """
        Return a new lxml.html.HTMLParser for the given encoding.
        """
        parser = html.HTMLParser(encoding=encoding)
        return parser
class LxmlXmlParser(LxmlParser):
    """
    XML flavour of the lxml parser, backed by lxml.etree.

    Note that it is not available on every system.
    """

    def __init__(self, *args, **kwargs):
        # Arguments are accepted but not used; only the backing lxml
        # module is recorded.
        self.module = etree

    def get_parser(self, encoding=None):
        """
        Return a new lxml.etree.XMLParser for the given encoding, created
        with strip_cdata=False.
        """
        parser = etree.XMLParser(encoding=encoding, strip_cdata=False)
        return parser

View file

@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010-2011 Christophe Benz
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import lxml.html
import lxml.html.soupparser
from .iparser import IParser
__all__ = ['LxmlSoupParser']
class LxmlSoupParser(IParser):
    """
    Parser using lxml elementsoup.

    Note that it is not available on every system.
    """

    def parse(self, data, encoding=None):
        """
        Parse a document with lxml.html.soupparser.

        ``encoding`` is accepted for interface compatibility but is not
        forwarded to the underlying parser.
        """
        tree = lxml.html.soupparser.parse(data)
        return tree

    def tostring(self, element):
        """
        Serialize an element to a unicode HTML string.
        """
        markup = lxml.html.tostring(element, encoding=unicode)
        return markup