# ---------------------------------------------------------------------------
# weboob/tools/parsers/__init__.py
# (loader added between load_json() and load_raw())
# ---------------------------------------------------------------------------


def load_csv():
    """
    Return the CsvParser class.

    The import is done inside the function, so the csvparser module is only
    loaded when this loader is actually called.
    """
    # This parser doesn't read HTML, don't include it in the
    # preference_order default value below.
    from .csvparser import CsvParser
    return CsvParser


# ---------------------------------------------------------------------------
# weboob/tools/parsers/csvparser.py (new file)
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Copyright(C) 2013 Laurent Bachelier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.


import csv
from .iparser import IParser


# String type aliases so that CsvParser.tostring() works on both Python 2
# (basestring/unicode builtins) and Python 3 (where they do not exist).
try:
    _STRING_TYPES = basestring
    _TEXT_TYPE = unicode
except NameError:
    _STRING_TYPES = (str, bytes)
    _TEXT_TYPE = str


class Csv(object):
    """
    CSV parser result.

    header contains the first row if it is a header
    rows contains the raw rows
    drows contains the rows with cells indexed by header title
    """
    def __init__(self):
        self.header = None
        self.rows = []
        self.drows = []


class CsvParser(IParser):
    """
    CSV Parser.
    Since CSV files are not normalized, this parser is intended to be derived.
    """
    # Dialect name and extra formatting parameters passed to csv.reader().
    # Subclasses override these to match the CSV flavour they have to read.
    DIALECT = 'excel'
    FMTPARAMS = {}

    # If True, will consider the first line as a header.
    # This means the rows will also be available as dictionaries.
    HEADER = False

    def parse(self, data, encoding=None):
        """
        Parse CSV data and return a Csv object.

        data is handed as-is to csv.reader(), so it must be an iterable
        yielding lines. If encoding is given, each row goes through
        decode_row() with it.

        When HEADER is True, the first row is stored in Csv.header instead of
        Csv.rows, and each following row is also added to Csv.drows as a
        dictionary keyed by the header titles.
        """
        reader = csv.reader(data, dialect=self.DIALECT, **self.FMTPARAMS)
        result = Csv()
        for row in reader:
            row = self.decode_row(row, encoding)
            if result.header is None and self.HEADER:
                result.header = row
                continue
            result.rows.append(row)
            if result.header:
                # zip() stops at the shortest sequence: a row with more cells
                # than the header no longer raises IndexError, the extra
                # cells simply stay available through the raw row in "rows".
                result.drows.append(dict(zip(result.header, row)))
        return result

    def decode_row(self, row, encoding):
        """
        Decode the cells of a row with the given encoding.

        The row is returned untouched when encoding is empty or None.
        Only byte strings are decoded; cells which are already text are kept
        as they are instead of raising an error.
        """
        if not encoding:
            return row
        return [cell.decode(encoding) if isinstance(cell, bytes) else cell
                for cell in row]

    def tostring(self, element):
        """
        Return element unchanged if it is already a string, otherwise
        return its text representation.
        """
        if isinstance(element, _STRING_TYPES):
            return element
        return _TEXT_TYPE(element)