Add a CSV parser

2013-02-05 14:20:51 +01:00 · 2013-02-05 14:20:51 +01:00 · 91f59a9cef
commit 91f59a9cef
parent efbc3b9ef6
2 changed files with 85 additions and 0 deletions
--- a/weboob/tools/parsers/__init__.py
+++ b/weboob/tools/parsers/__init__.py
@ -59,6 +59,14 @@ def load_json():
    from .jsonparser import JsonParser
    return JsonParser

+
+def load_csv():
+    """
+    Import the CSV parser module on demand and return its CsvParser class.
+    """
+    # This parser doesn't read HTML, don't include it in the
+    # preference_order default value below.
+    from . import csvparser
+    return csvparser.CsvParser
+
+
 def load_raw():
    # This parser doesn't read HTML, don't include it in the
    # preference_order default value below.
--- a/weboob/tools/parsers/csvparser.py
+++ b/weboob/tools/parsers/csvparser.py
@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2013      Laurent Bachelier
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+import csv
+from .iparser import IParser
+
+
+class Csv(object):
+    """
+    Container for the result of a CSV parse.
+
+    header: the first row if it is a header, None otherwise
+    rows: the raw rows
+    drows: the rows with cells indexed by header title
+    """
+    def __init__(self):
+        # Every instance gets its own fresh lists.
+        self.header, self.rows, self.drows = None, [], []
+
+
+class CsvParser(IParser):
+    """
+    CSV Parser.
+    Since CSV files are not normalized, this parser is intended to be derived.
+
+    Subclasses tune the parsing through the class attributes below.
+    """
+    # Dialect name and extra formatting parameters handed to csv.reader().
+    DIALECT = 'excel'
+    FMTPARAMS = {}
+
+    # If True, will consider the first line as a header.
+    # This means the rows will also be available as dictionaries.
+    HEADER = False
+
+    def parse(self, data, encoding=None):
+        """
+        Read every row from data and return them in a Csv object.
+
+        data is handed to csv.reader() as is.
+        If encoding is set, every cell is decoded with it (see decode_row).
+
+        When HEADER is True, the first row is stored as the header and every
+        later row is also added to drows as a dictionary keyed by header
+        title. A row shorter than the header only gets the keys it has cells
+        for; cells beyond the header length are kept in rows but left out of
+        the dictionary, instead of raising IndexError.
+        """
+        reader = csv.reader(data, dialect=self.DIALECT, **self.FMTPARAMS)
+        c = Csv()
+        for row in reader:
+            row = self.decode_row(row, encoding)
+            if self.HEADER and c.header is None:
+                c.header = row
+                continue
+            c.rows.append(row)
+            # An empty header row yields no dictionaries at all.
+            if c.header:
+                # zip() stops at the shorter input, so ragged rows are safe.
+                c.drows.append(dict(zip(c.header, row)))
+        return c
+
+    def decode_row(self, row, encoding):
+        """
+        Return row with every cell decoded to unicode using encoding.
+        The row is returned unchanged when encoding is empty or None.
+        """
+        if encoding:
+            return [unicode(cell, encoding) for cell in row]
+        return row
+
+    def tostring(self, element):
+        """
+        Return element unchanged if it is a string, else its unicode() form.
+        """
+        if isinstance(element, basestring):
+            return element
+        return unicode(element)