rename things related to browsers

weboob.tools.browser -> weboob.deprecated.browser
weboob.tools.parsers -> weboob.deprecated.browser.parsers
weboob.tools.mech -> weboob.deprecated.mech
weboob.browser2 -> weboob.browser
weboob.core.exceptions -> weboob.exceptions

Also, the new tree for browser2 is:

weboob.browser: imports weboob.browser.browsers.* and weboob.browser.url.*
weboob.browser.browsers: all browsers (including PagesBrowser and LoginBrowser)
weboob.browser.url: the URL class
weboob.browser.profiles: all Profile classes
weboob.browser.sessions: WeboobSession and FuturesSession
weboob.browser.cookies: that's a cookies thing
weboob.browser.pages: all Page and derived classes, and the Form class
weboob.browser.exceptions: specific browser exceptions
weboob.browser.elements: AbstractElement classes, and the 'method' decorator
weboob.browser.filters.*: all filters
2014-10-07 00:23:18 +02:00 · 2014-10-07 00:23:18 +02:00 · d61e15cf84
commit d61e15cf84
parent 1f95e7631f
396 changed files with 1442 additions and 1382 deletions
--- a/weboob/deprecated/browser/parsers/__init__.py
+++ b/weboob/deprecated/browser/parsers/__init__.py
@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Christophe Benz, Romain Bignon
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+import logging
+
+
+__all__ = ['get_parser', 'NoParserFound']
+
+
+class NoParserFound(Exception):
+    """Raised by get_parser() when none of the requested parsers can be loaded."""
+
+
+def load_lxml():
+    """
+    Import the lxml parser module on demand and return its HTML parser class.
+    """
+    from . import lxmlparser
+    return lxmlparser.LxmlHtmlParser
+
+
+def load_lxmlsoup():
+    """
+    Import the lxml soup parser module on demand and return its parser class.
+    """
+    from . import lxmlsoupparser
+    return lxmlsoupparser.LxmlSoupParser
+
+
+def load_xml():
+    """
+    Import the lxml parser module on demand and return its XML parser class.
+    """
+    from . import lxmlparser
+    return lxmlparser.LxmlXmlParser
+
+
+def load_json():
+    """
+    Import the JSON parser module on demand and return its parser class.
+    """
+    # Not an HTML parser: keep it out of the default preference_order
+    # of get_parser() below.
+    from . import jsonparser
+    return jsonparser.JsonParser
+
+
+def load_csv():
+    """
+    Import the CSV parser module on demand and return its parser class.
+    """
+    # Not an HTML parser: keep it out of the default preference_order
+    # of get_parser() below.
+    from . import csvparser
+    return csvparser.CsvParser
+
+
+def load_raw():
+    """
+    Import the parser interface module on demand and return the raw parser class.
+    """
+    # Not an HTML parser: keep it out of the default preference_order
+    # of get_parser() below.
+    from . import iparser
+    return iparser.RawParser
+
+
+def get_parser(preference_order=('lxml', 'lxmlsoup')):
+    """
+    Get a parser from a preference order list.
+
+    Each entry names a parser kind and is resolved to the module-level
+    ``load_<kind>`` function. Kinds without such a function are skipped;
+    kinds whose loader raises ImportError are logged at debug level and
+    skipped as well.
+
+    :param preference_order: a single kind, or a tuple/list of kinds that
+                             are tried in order
+    :return: whatever the first successful loader returns (the loaders in
+             this module return parser classes)
+    :raises NoParserFound: if no entry of preference_order could be loaded
+    """
+    if not isinstance(preference_order, (tuple, list)):
+        preference_order = [preference_order]
+
+    loaders = globals()
+    for kind in preference_order:
+        loader = loaders.get('load_%s' % kind)
+        if loader is None:
+            continue
+
+        try:
+            return loader()
+        except ImportError:
+            logging.debug('%s is not installed.', kind)
+
+    # Format every entry before joining, so that a non-string kind still ends
+    # in NoParserFound rather than a TypeError raised by join().
+    kinds = ','.join('%s' % kind for kind in preference_order)
+    raise NoParserFound("No parser found (%s)" % kinds)
--- a/weboob/deprecated/browser/parsers/csvparser.py
+++ b/weboob/deprecated/browser/parsers/csvparser.py
@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013      Laurent Bachelier
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+import csv
+from .iparser import IParser
+
+
+class Csv(object):
+    """
+    Container for the result of a CSV parse.
+
+    header: the first row when it is treated as a header, None otherwise
+    rows: the raw rows
+    drows: the rows as dicts, with cells indexed by header title
+    """
+    def __init__(self):
+        self.rows, self.drows = [], []
+        self.header = None
+
+
+class CsvParser(IParser):
+    """
+    CSV Parser.
+    Since CSV files are not normalized, this parser is intended to be derived.
+    """
+    # Dialect name handed to csv.reader().
+    DIALECT = 'excel'
+    # Extra keyword arguments forwarded to csv.reader().
+    FMTPARAMS = {}
+
+    # If True, will consider the first line as a header.
+    # This means the rows will be also available as dictionaries.
+    HEADER = False
+
+    def parse(self, data, encoding=None):
+        """
+        Read CSV rows from data and collect them in a Csv object.
+
+        :param data: object passed to csv.reader() to iterate over lines
+        :param encoding: if set, every cell is decoded with it (see decode_row)
+        :return: a Csv object; drows is only filled when HEADER is True
+        """
+        reader = csv.reader(data, dialect=self.DIALECT, **self.FMTPARAMS)
+        c = Csv()
+        for row in reader:
+            row = self.decode_row(row, encoding)
+            if c.header is None and self.HEADER:
+                c.header = row
+                continue
+
+            c.rows.append(row)
+            if c.header:
+                # zip() stops at the shorter sequence, so a row with more
+                # cells than the header no longer raises IndexError; the
+                # extra cells remain available in c.rows.
+                c.drows.append(dict(zip(c.header, row)))
+        return c
+
+    def decode_row(self, row, encoding):
+        """
+        Return the row with every cell decoded to unicode using encoding,
+        or the row unchanged when no encoding is given.
+        """
+        if encoding:
+            return [unicode(cell, encoding) for cell in row]
+        return row
+
+    def tostring(self, element):
+        """
+        Return element unchanged if it is already a string, otherwise its
+        unicode() conversion.
+        """
+        if isinstance(element, basestring):
+            return element
+        return unicode(element)
--- a/weboob/deprecated/browser/parsers/iparser.py
+++ b/weboob/deprecated/browser/parsers/iparser.py
@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Romain Bignon
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+import re
+
+
+__all__ = ['IParser', 'RawParser']
+
+
+class IParser(object):
+    """
+    Parser interface: parse() and tostring() have to be provided by
+    subclasses, tocleanstring() and strip() are built on top of them.
+    """
+
+    def parse(self, data, encoding=None):
+        """
+        Parse a HTML document with a specific encoding to get a tree.
+
+        @param data  [str] HTML document
+        @param encoding  [str] encoding to use
+        @return  an object with the structured document
+        """
+        raise NotImplementedError()
+
+    def tostring(self, elem):
+        """
+        Get HTML string from an element.
+        """
+        raise NotImplementedError()
+
+    def tocleanstring(self, elem):
+        """
+        Get a clean string from an element: its tostring() output with the
+        tags stripped.
+        """
+        markup = self.tostring(elem)
+        return self.strip(markup)
+
+    def strip(self, data):
+        """
+        Strip a HTML string: every <...> tag is replaced by a space, then
+        leading and trailing whitespace is removed.
+        """
+        return re.sub(r'<.*?>', ' ', data).strip()
+
+
+class RawParser(IParser):
+    """
+    Parser that does no parsing: the document is its raw content.
+    """
+
+    def parse(self, data, encoding=None):
+        """
+        Return everything read from data; encoding is not used.
+        """
+        content = data.read()
+        return content
+
+    def tostring(self, elem):
+        """
+        Return elem unchanged.
+        """
+        return elem
--- a/weboob/deprecated/browser/parsers/jsonparser.py
+++ b/weboob/deprecated/browser/parsers/jsonparser.py
@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2011 Romain Bignon
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.tools.json import json
+from .iparser import IParser
+
+
+__all__ = ['JsonParser']
+
+
+class JsonParser(IParser):
+    """
+    Parser for JSON documents.
+    """
+
+    def parse(self, data, encoding=None):
+        """
+        Load a JSON document from the file-like object data, passing
+        encoding through to json.load().
+        """
+        document = json.load(data, encoding=encoding)
+        return document
+
+    def tostring(self, element):
+        """
+        Serialize element back to a JSON string.
+        """
+        dumped = json.dumps(element)
+        return dumped
--- a/weboob/deprecated/browser/parsers/lxmlparser.py
+++ b/weboob/deprecated/browser/parsers/lxmlparser.py
@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Christophe Benz
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+import re
+import lxml.html as html
+import lxml.etree as etree
+
+from .iparser import IParser
+from ..browser import BrokenPageError
+
+
+__all__ = ['LxmlHtmlParser', 'LxmlXmlParser']
+
+
+class LxmlParser(IParser):
+    """
+    Parser using lxml.
+
+    Subclasses are expected to set ``self.module`` to the lxml module used
+    for parsing and serializing, and to override get_parser().
+
+    Note that it is not available on every systems.
+    """
+
+    def get_parser(self, encoding=None):
+        """
+        Return the lxml parser object to use for the given encoding.
+
+        The base implementation returns None; subclasses override it.
+        """
+        # Takes self so that self.get_parser(encoding=...) in parse() also
+        # works when a subclass does not override this method.
+        return None
+
+    def parse(self, data, encoding=None):
+        """
+        Parse data with ``self.module.parse``.
+
+        When an encoding is given, the parser built by get_parser() is
+        passed along; otherwise None is passed as parser.
+        """
+        parser = None if encoding is None else self.get_parser(encoding=encoding)
+        return self.module.parse(data, parser)
+
+    def tostring(self, element):
+        """
+        Serialize element to a unicode string with ``self.module.tostring``.
+        """
+        return self.module.tostring(element, encoding=unicode)
+
+    def tocleanstring(self, element):
+        """
+        Return the text content of element with whitespace normalized.
+        """
+        txt = [txt.strip() for txt in element.itertext()]
+        txt = u' '.join(txt)            # 'foo   bar'
+        txt = re.sub(r'\s+', ' ', txt)  # 'foo bar'
+        return txt.strip()
+
+    def strip(self, s):
+        """
+        Parse the html/xml string s and return its clean text content.
+        """
+        doc = self.module.fromstring(s)   # parse html/xml string
+        return self.tocleanstring(doc)
+
+    @classmethod
+    def select(cls, element, selector, nb=None, method='cssselect', **kwargs):
+        """
+        Select one or many elements from an element, using lxml cssselect by default.
+
+        Raises :class:`weboob.deprecated.browser.browser.BrokenPageError` if not found.
+
+        :param element: element on which to apply selector
+        :type element: object
+        :param selector: CSS or XPath expression
+        :type selector: str
+        :param method: (cssselect|xpath)
+        :type method: str
+        :param nb: number of elements expected to be found. Use None to get
+                   the results without any check, 'many' to require at
+                   least two elements, or a positive integer to require at
+                   least that many elements
+        :type nb: :class:`int` or :class:`str`
+        :rtype: the first element when nb is 1, the list of results otherwise
+        """
+        if method == 'cssselect':
+            results = element.cssselect(selector, **kwargs)
+        elif method == 'xpath':
+            results = element.xpath(selector, **kwargs)
+        else:
+            raise NotImplementedError('Only the cssselect and xpath methods are supported')
+        if nb is None:
+            return results
+        elif isinstance(nb, basestring) and nb == 'many':
+            if results is None or len(results) == 0:
+                raise BrokenPageError('Element not found with selector "%s"' % selector)
+            elif len(results) == 1:
+                raise BrokenPageError('Only one element found with selector "%s"' % selector)
+            else:
+                return results
+        elif isinstance(nb, int) and nb > 0:
+            if results is None:
+                raise BrokenPageError('Element not found with selector "%s"' % selector)
+            elif len(results) < nb:
+                raise BrokenPageError('Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
+            else:
+                return results[0] if nb == 1 else results
+        else:
+            raise Exception('Unhandled value for kwarg "nb": %s' % nb)
+
+
+class LxmlHtmlParser(LxmlParser):
+    """
+    HTML flavour of the lxml parser, backed by lxml.html.
+
+    Note that it is not available on every systems.
+    """
+    def __init__(self, *args, **kwargs):
+        # Arguments are accepted but not used; only the backing module is set.
+        self.module = html
+
+    def get_parser(self, encoding=None):
+        """
+        Build an lxml.html HTMLParser for the given encoding.
+        """
+        html_parser = html.HTMLParser(encoding=encoding)
+        return html_parser
+
+
+class LxmlXmlParser(LxmlParser):
+    """
+    XML flavour of the lxml parser, backed by lxml.etree.
+
+    Note that it is not available on every systems.
+    """
+    def __init__(self, *args, **kwargs):
+        # Arguments are accepted but not used; only the backing module is set.
+        self.module = etree
+
+    def get_parser(self, encoding=None):
+        """
+        Build an lxml.etree XMLParser for the given encoding, with
+        strip_cdata disabled.
+        """
+        xml_parser = etree.XMLParser(encoding=encoding, strip_cdata=False)
+        return xml_parser
--- a/weboob/deprecated/browser/parsers/lxmlsoupparser.py
+++ b/weboob/deprecated/browser/parsers/lxmlsoupparser.py
@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Christophe Benz
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+import lxml.html
+import lxml.html.soupparser
+
+from .iparser import IParser
+
+
+__all__ = ['LxmlSoupParser']
+
+
+class LxmlSoupParser(IParser):
+    """
+    Parser built on lxml's elementsoup support (lxml.html.soupparser).
+
+    Note that it is not available on every systems.
+    """
+
+    def parse(self, data, encoding=None):
+        """
+        Parse data with lxml.html.soupparser; encoding is not used.
+        """
+        tree = lxml.html.soupparser.parse(data)
+        return tree
+
+    def tostring(self, element):
+        """
+        Serialize element to a unicode string with lxml.html.tostring.
+        """
+        serialized = lxml.html.tostring(element, encoding=unicode)
+        return serialized