20 minutes backend + cleanboob

This commit is contained in:
juke 2011-02-01 18:09:15 +01:00 committed by Romain Bignon
commit 4bd3a976a6
9 changed files with 262 additions and 0 deletions

25
scripts/cleanboob Executable file
View file

@ -0,0 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ft=python et softtabstop=4 cinoptions=4 shiftwidth=4 ts=4 ai
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from weboob.applications.cleanboob import CleanBoob
# Start the CleanBoob application only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    CleanBoob.run()

View file

@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# vim: ft=python et softtabstop=4 cinoptions=4 shiftwidth=4 ts=4 ai
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from .cleanboob import CleanBoob
__all__ = ['CleanBoob']

View file

@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hébert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import sys
from weboob.capabilities.content import ICapContent
from weboob.tools.application.repl import ReplApplication
__all__ = ['CleanBoob']
class CleanBoob(ReplApplication):
    """Console application that fetches an article by URL and prints it.

    The URL is taken from the first command-line argument and handed to
    the ``get_content`` method of the loaded backends.
    """

    APPNAME = 'CleanBoob'
    VERSION = '0.1'
    COPYRIGHT = 'Copyright(C) 2011-2012 Julien Hébert'
    DESCRIPTION = "CleanBoob is a console application to extract article from website."
    CAPS = ICapContent

    def main(self, argv):
        """Fetch the article at ``argv[1]`` and format every result.

        :param argv: command-line arguments; ``argv[1]`` is the article URL.
        :returns: 0 on success, 1 when no URL was supplied.
        """
        # Without this guard a missing URL surfaced as a bare IndexError.
        if len(argv) < 2:
            sys.stderr.write('Usage: %s URL\n' % self.APPNAME.lower())
            return 1

        url = argv[1]
        # self.do() yields (backend, content) pairs; only the content is used.
        for _backend, content in self.do('get_content', url):
            self.format(content)
        return 0

View file

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from .backend import Newspaper20minutesBackend
__all__ = ['Newspaper20minutesBackend']

View file

@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# python2.5 compatibility
from __future__ import with_statement
from weboob.capabilities.content import ICapContent, Content
from weboob.tools.backend import BaseBackend
from .browser import Newspaper20minutesBrowser
__all__ = ['Newspaper20minutesBackend']
class Newspaper20minutesBackend(BaseBackend, ICapContent):
    """Backend exposing 20minutes articles through the content capability.

    Only ``get_content`` is implemented; ``log_content`` and
    ``push_content`` raise NotImplementedError.
    """

    NAME = 'minutes20'
    MAINTAINER = 'Julien Hebert'
    EMAIL = 'juke@free.fr'
    VERSION = '0.1'
    LICENSE = 'GPLv3'
    DESCRIPTION = u'20minutes French news website'
    #CONFIG = ValuesDict(Value('login', label='Account ID'),
    #                    Value('password', label='Password', masked=True))
    BROWSER = Newspaper20minutesBrowser

    def get_content(self, url):
        """Fetch an article and return it as a Content object.

        :param url: either the article URL as a string, or an existing
            Content object to fill in.
        :returns: the Content object with ``title`` and ``content`` set
            from what the browser returned.
        """
        if isinstance(url, basestring):
            content = Content(url)
        else:
            content = url
            # NOTE(review): this reads the private ``_id`` attribute of the
            # passed object -- confirm that Content really exposes ``_id``.
            url = content._id

        with self.browser:
            data = self.browser.get_content(url)

        # The browser returns a two-item sequence: title first, then body.
        content.title = data[0]
        content.content = data[1]
        return content

    def log_content(self, id):
        """Not supported by this backend; always raises NotImplementedError."""
        raise NotImplementedError()

    def push_content(self, content, message=None):
        """Not supported by this backend; always raises NotImplementedError."""
        raise NotImplementedError()

View file

@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from urlparse import urlsplit
from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword
from .pages.index import IndexPage
from .pages.article import ArticlePage
__all__ = ['Newspaper20minutesBrowser']
# Browser
class Newspaper20minutesBrowser(BaseBrowser):
    """Browser for article pages hosted on www.20minutes.fr."""

    # Maps a URL pattern to the page class that handles it. The dots of the
    # host name are escaped so they only match a literal '.'.
    # NOTE(review): assumes BaseBrowser treats PAGES keys as regular
    # expressions (the '/?.*' suffix suggests so) -- confirm.
    PAGES = {
        r'http://www\.20minutes\.fr/article/?.*': ArticlePage,
    }

    def get_content(self, url):
        """Open ``url`` and return what the resulting page's get_content() gives.

        :param url: address of the article to load.
        :returns: the value returned by ``self.page.get_content()``.
        """
        self.location(url)
        return self.page.get_content()

View file

@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
class ArticlePage(BasePage):
    """Page wrapper for one 20minutes article: extracts title and body text."""

    def get_title(self):
        """Return the text content of the element selected by ``"h1"``."""
        return select(self.document.getroot(), "h1", 1).text_content()

    def get_article(self):
        """Return the article body text without its tool and comment blocks.

        The text of the first ``div.mna-tools`` block inside the body and of
        the first ``div.mna-comment-call`` block is removed from the body
        text (first occurrence only). Either block may be absent, in which
        case nothing is removed for it. The body lookup itself is unchanged
        and still uses ``[0]`` on the match result.
        """
        root = self.document.getroot()
        body = select(root, "div.mn-line>div.mna-body")[0]
        text = body.text_content()

        # (search root, CSS selector) of each block to strip, in order.
        unwanted_blocks = (
            (body, "div.mna-tools"),
            (root, "div.mn-line>div.mna-body>div.mna-comment-call"),
        )
        for scope, selector in unwanted_blocks:
            # [:1] keeps only the first match and tolerates no match at all,
            # where plain [0] indexing raised IndexError on a missing block.
            for block in select(scope, selector)[:1]:
                text = text.replace(block.text_content(), '', 1)
        return text

    def get_content(self):
        """Return ``[title, article_text]`` for this page."""
        title = self.get_title()
        content = self.get_article()
        return [title, content]

View file

@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Julien Hebert
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, version 3 of the License.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
from weboob.tools.browser import BasePage
class IndexPage(BasePage):
    """Placeholder page class that adds nothing on top of BasePage."""