# -*- coding: utf-8 -*-

# Copyright(C) 2014      Oleg Plakhotniuk
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

import os
import subprocess
from tempfile import mkstemp

__all__ = ['decompress_pdf']


def decompress_pdf(inpdf):
    """
    Decompress the streams of a PDF document so it can be parsed as text.

    The data is written to a temporary file, processed with
    ``mutool clean -d`` into a second temporary file, and the content of
    that second file is returned. Both temporary files are removed before
    returning, including when an error occurs along the way.

    External dependencies: MuPDF (http://www.mupdf.com).

    :param inpdf: raw content of the PDF file (bytes; a plain ``str`` on
                  Python 2)
    :returns: raw content of the file produced by ``mutool``, read in
              binary mode
    :raises OSError: if the ``mutool`` executable cannot be started or a
                     temporary file cannot be created, read or removed

    The exit status of ``mutool`` is not inspected: if the tool fails, the
    result is whatever it left in the output file (empty if it wrote
    nothing).
    """
    inh, inname = mkstemp(suffix='.pdf')
    try:
        # A buffered file object guarantees the whole payload is written
        # (a bare os.write() may write only part of it) and closes the
        # descriptor even if the write fails.
        with os.fdopen(inh, 'wb') as infile:
            infile.write(inpdf)

        outh, outname = mkstemp(suffix='.pdf')
        try:
            # mutool opens the output by name; only the reserved path is
            # needed, not the descriptor.
            os.close(outh)

            subprocess.call(['mutool', 'clean', '-d', inname, outname])

            # PDF content is binary: read it untouched, with no newline
            # translation or text decoding.
            with open(outname, 'rb') as outfile:
                return outfile.read()
        finally:
            os.remove(outname)
    finally:
        os.remove(inname)