#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ft=python et softtabstop=4 cinoptions=4 shiftwidth=4 ts=4 ai

# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.


from weboob.applications.cleanboob import CleanBoob


def _launch():
    """Start the CleanBoob application through its ``run`` entry point."""
    CleanBoob.run()


# Only launch when executed as a script, not when imported.
if __name__ == '__main__':
    _launch()
class CleanBoob(ReplApplication):
    """Console application that fetches an article and prints it.

    The URL of the article is taken from the first command-line argument
    and handed to the ``get_content`` method of the ICapContent backends.
    """

    APPNAME = 'CleanBoob'
    VERSION = '0.1'
    COPYRIGHT = 'Copyright(C) 2011-2012 Julien Hébert'
    DESCRIPTION = "CleanBoob is a console application to extract article from website."
    CAPS = ICapContent

    def main(self, argv):
        """Fetch the article at ``argv[1]`` and format every result.

        :param argv: command-line arguments; ``argv[0]`` is the program
                     name and ``argv[1]`` the URL of the article.
        :returns: 0 on success, 2 when no URL was given.
        """
        # Local import keeps this method self-contained.
        import sys

        if len(argv) < 2:
            # Without this guard a missing URL surfaced as a bare
            # IndexError traceback instead of a usage message.
            progname = argv[0] if argv else self.APPNAME
            sys.stderr.write('Syntax: %s URL\n' % progname)
            return 2

        # self.do() yields (backend, result) pairs; only the result is used.
        for _backend, content in self.do('get_content', argv[1]):
            self.format(content)
        return 0
class Newspaper20minutesBackend(BaseBackend, ICapContent):
    """Content backend for the 20minutes French news website.

    Only ``get_content`` is implemented; ``log_content`` and
    ``push_content`` raise ``NotImplementedError``.
    """

    NAME = 'minutes20'
    MAINTAINER = 'Julien Hebert'
    EMAIL = 'juke@free.fr'
    VERSION = '0.1'
    LICENSE = 'GPLv3'
    DESCRIPTION = u'20minutes French news website'
    BROWSER = Newspaper20minutesBrowser

    def get_content(self, url):
        """Download an article and return it as a ``Content`` object.

        :param url: either the article URL as a string, or an existing
                    content object whose ``_id`` attribute holds the URL.
        :returns: the content object with ``title`` and ``content`` filled
                  from the browser's ``get_content`` result.
        """
        if isinstance(url, basestring):
            content = Content(url)
        else:
            content = url
            # NOTE(review): reads the private-looking ``_id`` attribute;
            # confirm that Content really exposes it under this name.
            url = content._id

        with self.browser:
            # The browser returns a two-item sequence: title, then body.
            title, body = self.browser.get_content(url)

        content.content = body
        content.title = title
        return content

    def log_content(self, id):
        """Not supported by this backend."""
        raise NotImplementedError()

    def push_content(self, content, message=None):
        """Not supported by this backend."""
        raise NotImplementedError()
# Browser
class Newspaper20minutesBrowser(BaseBrowser):
    """Browser for article pages of www.20minutes.fr."""

    # URL pattern -> page class; presumably used by BaseBrowser to choose
    # the page object for a visited URL (verify against BaseBrowser).
    PAGES = {'http://www.20minutes.fr/article/?.*': ArticlePage}

    def get_content(self, url):
        """Open ``url`` and return what the resulting page's
        ``get_content`` method returns.
        """
        self.location(url)
        current_page = self.page
        return current_page.get_content()
class ArticlePage(BasePage):
    """Article page of 20minutes.fr: gives access to title and body text."""

    def get_title(self):
        """Return the text of the page's ``h1`` element."""
        return select(self.document.getroot(), "h1", 1).text_content()

    def get_article(self):
        """Return the article body text without its tool and comment blocks.

        The text of ``div.mna-tools`` and ``div.mna-comment-call`` is removed
        (first occurrence only) from the text of the article body. Either
        block may be absent, in which case nothing is removed for it.

        :raises IndexError: when the article body itself cannot be found.
        """
        root = self.document.getroot()
        article_body = select(root, "div.mn-line>div.mna-body")[0]
        text = article_body.text_content()

        # Look both blocks up before editing the text, as the text to
        # remove must come from the untouched document.
        unwanted_matches = (
            select(article_body, "div.mna-tools"),
            select(root, "div.mn-line>div.mna-body>div.mna-comment-call"),
        )
        for matches in unwanted_matches:
            try:
                node = matches[0]
            except IndexError:
                # Block not present on this article: nothing to strip.
                continue
            text = text.replace(node.text_content(), '', 1)
        return text

    def get_content(self):
        """Return ``[title, article_text]`` for this page."""
        return [self.get_title(), self.get_article()]


# From weboob/backends/minutes20/pages/index.py
class IndexPage(BasePage):
    """Placeholder page class; adds nothing to BasePage."""