From 144bb8a7e43de4899e4af0035e5738dd3bc79451 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Thu, 24 Feb 2011 21:36:19 +0100 Subject: [PATCH] works with DLFP2.0RoR-ng --- weboob/backends/dlfp/backend.py | 37 ++++---- weboob/backends/dlfp/browser.py | 119 ++++++++++++------------- weboob/backends/dlfp/pages/index.py | 4 +- weboob/backends/dlfp/pages/news.py | 131 ++++++++++++++-------------- weboob/backends/dlfp/tools.py | 61 +++++++------ weboob/capabilities/messages.py | 5 +- 6 files changed, 175 insertions(+), 182 deletions(-) diff --git a/weboob/backends/dlfp/backend.py b/weboob/backends/dlfp/backend.py index 2d2569fc..fc0da7d9 100644 --- a/weboob/backends/dlfp/backend.py +++ b/weboob/backends/dlfp/backend.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2010 Romain Bignon +# Copyright(C) 2010-2011 Romain Bignon # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -24,7 +24,7 @@ from weboob.tools.value import Value, ValueBool, ValuesDict from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread, CantSendMessage from .browser import DLFP -from .tools import url2id +from .tools import rssid, id2url __all__ = ['DLFPBackend'] @@ -40,11 +40,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost): CONFIG = ValuesDict(Value('username', label='Username', regexp='.+'), Value('password', label='Password', regexp='.+', masked=True), ValueBool('get_news', label='Get newspapers', default=True), - ValueBool('get_telegrams', label='Get telegrams', default=False)) + ValueBool('get_diaries', label='Get diaries', default=False)) STORAGE = {'seen': {}} BROWSER = DLFP - RSS_TELEGRAMS= "https://linuxfr.org/backend/journaux/rss20.rss" - RSS_NEWSPAPERS = "https://linuxfr.org/backend/news/rss20.rss" + RSS_NEWSPAPERS = "https://linuxfr.org/news.atom" + RSS_DIARIES = "https://linuxfr.org/journaux.atom" def create_default_browser(self): @@ -62,12 +62,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost): whats = set() if self.config['get_news']: whats.add(self.RSS_NEWSPAPERS) - if self.config['get_telegrams']: - whats.add(self.RSS_TELEGRAMS) - + if self.config['get_diaries']: + whats.add(self.RSS_DIARIES) for what in whats: - for article in Newsfeed(what, url2id).iter_entries(): + for article in Newsfeed(what, rssid).iter_entries(): thread = Thread(article.id) thread.title = article.title if article.datetime: @@ -84,8 +83,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost): with self.browser: content = self.browser.get_content(id) + if not content: + return None + if not thread: - thread = Thread(id) + thread = Thread(content.id) flags = Message.IS_HTML if not thread.id in self.storage.get('seen', default={}): @@ -102,8 +104,8 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost): receivers=None, date=thread.date, #TODO XXX WTF this is None parent=None, - content=''.join([content.body, content.part2]), - signature='URL: %s' % content.url, + content=content.body, + signature='URL: %s' % self.browser.absurl(id2url(content.id)), children=[], flags=flags) @@ -151,16 +153,15 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost): def post_message(self, message): if not message.parent: - raise CantSendMessage('Posting news and telegrams on DLFP is not supported yet') + raise CantSendMessage('Posting news and diaries on DLFP is not supported yet') assert message.thread with self.browser: - return self.browser.post_reply(message.thread.id, - message.parent.id, - message.title, - message.content, - message.flags & message.IS_HTML) + return self.browser.post_comment(message.thread.id, + message.parent.id, + message.title, + message.content) def fill_thread(self, thread, fields): return self.get_thread(thread) diff --git a/weboob/backends/dlfp/browser.py b/weboob/backends/dlfp/browser.py index 2ef2cdff..8b50d8fa 100644 --- a/weboob/backends/dlfp/browser.py +++ b/weboob/backends/dlfp/browser.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2010 Romain Bignon +# Copyright(C) 2010-2011 Romain Bignon # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -17,98 +17,87 @@ import urllib -from cStringIO import StringIO -from weboob.tools.browser import BaseBrowser -from weboob.tools.parsers.lxmlparser import LxmlHtmlParser +from weboob.tools.browser import BaseBrowser, BrowserHTTPError, BrowserIncorrectPassword +from weboob.capabilities.messages import CantSendMessage from .pages.index import IndexPage, LoginPage -from .pages.news import ContentPage -from .tools import id2url, id2threadid, id2contenttype - -class Parser(LxmlHtmlParser): - def parse(self, data, encoding=None): - # Want to kill templeet coders - data = StringIO(data.read().replace('<<', '<').replace('cite>', 'i>').replace('tt>', 'i>')) - return LxmlHtmlParser.parse(self, data, encoding) +from .pages.news import ContentPage, NewCommentPage, NodePage +from .tools import id2url, url2id # Browser class DLFP(BaseBrowser): DOMAIN = 'linuxfr.org' PROTOCOL = 'https' - PAGES = {'https://linuxfr.org/': IndexPage, - 'https://linuxfr.org/pub/': IndexPage, - 'https://linuxfr.org/my/': IndexPage, + PAGES = {'https://linuxfr.org/?': IndexPage, 'https://linuxfr.org/login.html': LoginPage, - 'https://linuxfr.org/.*/\d+.html': ContentPage + 'https://linuxfr.org/news/[^\.]+': ContentPage, + 'https://linuxfr.org/users/[\w_]+/journaux/[^\.]+': ContentPage, + 'https://linuxfr.org/nodes/(\d+)/comments/nouveau': NewCommentPage, + 'https://linuxfr.org/nodes/(\d+)/comments$': NodePage, } - def __init__(self, *args, **kwargs): - kwargs['parser'] = Parser() - BaseBrowser.__init__(self, *args, **kwargs) - def home(self): return self.location('https://linuxfr.org') def get_content(self, _id): - self.location(id2url(_id)) - return self.page.get_article() + url = id2url(_id) + if url is None: + if url2id(_id) is not None: + url = _id + _id = url2id(url) + else: + return None - def post_reply(self, thread, reply_id, title, message, is_html=False): - content_type = id2contenttype(thread) - thread_id = id2threadid(thread) - thread_url = '%s://%s%s' % (self.PROTOCOL, self.DOMAIN, id2url(thread)) - reply_id = int(reply_id) + self.location(url) + content = self.page.get_article() + content.id = _id + return content - if not content_type or not thread_id: - return False + def _is_comment_submit_form(self, form): + return 'comment_new' in form.action - url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL, - self.DOMAIN, - thread_id, - reply_id, - content_type) + def post_comment(self, thread, reply_id, title, message): + url = id2url(thread) + if url is None: + raise CantSendMessage('%s is not a right ID' % thread) - timestamp = '' - if content_type == 1: - res = self.openurl(url).read() - const = 'name="timestamp" value="' - i = res.find(const) - if i >= 0: - res = res[i + len(const):] - timestamp = res[:res.find('"/>')] + self.location(url) + assert self.is_on_page(ContentPage) + self.location(self.page.get_post_comment_url()) + assert self.is_on_page(NewCommentPage) - if is_html: - format = 1 - else: - format = 3 + self.select_form(predicate=self._is_comment_submit_form) + self.set_all_readonly(False) + if title is not None: + self['comment[title]'] = title + self['comment[wiki_body]'] = message + if int(reply_id) > 0: + self['comment[parent_id]'] = str(reply_id) + self['commit'] = 'Poster le commentaire' - # Define every data fields - data = {'news_id': thread_id, - 'com_parent': reply_id, - 'timestamp': timestamp, - 'res_type': content_type, - 'referer': thread_url, - 'subject': unicode(title).encode('utf-8'), - 'body': unicode(message).encode('utf-8'), - 'format': format, - 'submit': 'Envoyer', - } + try: + self.submit() + except BrowserHTTPError, e: + raise CantSendMessage('Unable to send message to %s.%s: %s' % (thread, reply_id, e)) - url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL, self.DOMAIN, thread_id, reply_id, content_type) + if self.is_on_page(NodePage): + errors = self.page.get_errors() + if len(errors) > 0: + raise CantSendMessage('Unable to send message: %s' % ', '.join(errors)) - request = self.request_class(url, urllib.urlencode(data), {'Referer': url}) - result = self.openurl(request) - request = self.request_class(thread_url, None, {'Referer': result.geturl()}) - self.openurl(request).read() return None def login(self): - self.location('/login.html', 'login=%s&passwd=%s&isauto=1' % (self.username, self.password)) + data = {'account[login]': self.username, + 'account[password]': self.password, + 'account[remember_me]': 1} + self.location('/compte/connexion', urllib.urlencode(data), no_login=True) + if not self.is_logged(): + raise BrowserIncorrectPassword() def is_logged(self): return (self.page and self.page.is_logged()) def close_session(self): - self.openurl('/close_session.html') - + self.openurl('/compte/deconnexion') diff --git a/weboob/backends/dlfp/pages/index.py b/weboob/backends/dlfp/pages/index.py index 69ad45ba..7380bec8 100644 --- a/weboob/backends/dlfp/pages/index.py +++ b/weboob/backends/dlfp/pages/index.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2010 Romain Bignon +# Copyright(C) 2010-2011 Romain Bignon # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -21,7 +21,7 @@ from weboob.tools.browser import BrowserIncorrectPassword, BasePage class DLFPPage(BasePage): def is_logged(self): for form in self.document.getiterator('form'): - if form.attrib.get('id', None) == 'formulaire': + if form.attrib.get('id', None) == 'new_account_sidebar': return False return True diff --git a/weboob/backends/dlfp/pages/news.py b/weboob/backends/dlfp/pages/news.py index 6c4722dd..32cf3817 100644 --- a/weboob/backends/dlfp/pages/news.py +++ b/weboob/backends/dlfp/pages/news.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2010 Romain Bignon +# Copyright(C) 2010-2011 Romain Bignon # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -17,9 +17,8 @@ from datetime import datetime -from logging import warning -from weboob.tools.misc import local2utc +from weboob.tools.parsers.lxmlparser import select, SelectElementException from weboob.backends.dlfp.tools import url2id from .index import DLFPPage @@ -37,31 +36,23 @@ class Comment(object): self.url = u'' self.comments = [] - for sub in div.getchildren(): - if sub.tag == 'a': - self.id = sub.attrib['name'] - self.url = u'https://linuxfr.org/comments/%s.html#%s' % (self.id, self.id) - elif sub.tag == 'h1': - try: - self.title = sub.find('b').text - except UnicodeError: - warning('Bad encoded title, but DLFP sucks') - elif sub.tag == 'div' and sub.attrib.get('class', '').startswith('comment'): - self.author = sub.find('a').text if sub.find('a') is not None else 'Unknown' - self.date = self.parse_date(sub.find('i').tail) - self.score = int(sub.findall('i')[-1].find('span').text) - self.body = self.browser.parser.tostring(sub.find('p')) - elif sub.attrib.get('class', '') == 'commentsul': - comment = Comment(self.browser, sub.find('li'), self.id) - self.comments.append(comment) + self.id = div.attrib['id'].split('-')[1] + self.title = unicode(select(div.find('h2'), 'a.title', 1).text) + try: + self.author = unicode(select(div.find('p'), 'a[rel=author]', 1).text) + except SelectElementException: + self.author = 'Anonyme' + self.date = datetime.strptime(select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0], + '%Y-%m-%dT%H:%M:%S') + self.body = self.browser.parser.tostring(div.find('div')) + self.score = int(select(div.find('p'), 'span.score', 1).text) + self.url = select(div.find('h2'), 'a.title', 1).attrib['href'] - def parse_date(self, date_s): - date_s = date_s.strip().encode('utf-8') - if not date_s: - date = datetime.now() - else: - date = datetime.strptime(date_s, u'le %d/%m/%Y \xe0 %H:%M.'.encode('utf-8')) - return local2utc(date) + subs = div.find('ul') + if subs is not None: + for sub in subs.findall('li'): + comment = Comment(self.browser, sub, self.id) + self.comments.append(comment) def iter_all_comments(self): for comment in self.comments: @@ -70,35 +61,25 @@ class Comment(object): yield c def __repr__(self): - return u"" % (self.id, self.author, self.title) + return u"" % (self.id, self.author, self.title) class Article(object): - def __init__(self, browser, _id, tree): + def __init__(self, browser, url, tree): self.browser = browser - self.id = _id - self.title = u'' - self.author = u'' - self.body = u'' - self.part2 = u'' - self.date = None - self.url = u'' - self.comments = [] + self.url = url + self.id = url2id(self.url) - for div in tree.findall('div'): - if div.attrib.get('class', '').startswith('titlediv '): - self.author = div.find('a').text - for a in div.find('h1').getiterator('a'): - if a.text: self.title += a.text - if a.tail: self.title += a.tail - self.title = self.title.strip() - # TODO use the date_s - #subdivs = div.findall('a') - #if len(subdivs) > 1: - # date_s = unicode(subdivs[1].text) - #else: - # date_s = unicode(div.find('i').tail) - if div.attrib.get('class', '').startswith('bodydiv '): - self.body = self.browser.parser.tostring(div) + header = tree.find('header') + self.title = u' — '.join([a.text for a in header.find('h1').findall('a')]) + try: + self.author = select(header, 'a[rel=author]', 1).text + except SelectElementException: + self.author = 'Anonyme' + self.body = self.browser.parser.tostring(select(tree, 'div.content', 1)) + self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0], + '%Y-%m-%dT%H:%M:%S') + + self.comments = [] def append_comment(self, comment): self.comments.append(comment) @@ -115,21 +96,37 @@ class Article(object): class ContentPage(DLFPPage): def on_loaded(self): self.article = None - for div in self.document.find('body').find('div').findall('div'): - self.parse_div(div) - if div.attrib.get('class', '') == 'centraldiv': - for subdiv in div.findall('div'): - self.parse_div(subdiv) - - def parse_div(self, div): - if div.attrib.get('class', '') in ('newsdiv', 'centraldiv'): - self.article = Article(self.browser, url2id(self.url), div) - self.article.url = self.url - if div.attrib.get('class', '') == 'articlediv': - self.article.parse_part2(div) - if div.attrib.get('class', '') == 'comments': - comment = Comment(self.browser, div, 0) - self.article.append_comment(comment) def get_article(self): + if not self.article: + self.article = Article(self.browser, + self.url, + select(self.document.getroot(), 'article', 1)) + + try: + threads = select(self.document.getroot(), 'ul.threads', 1) + except SelectElementException: + pass # no comments + else: + for comment in threads.findall('li'): + self.article.append_comment(Comment(self.browser, comment, 0)) + return self.article + + def get_post_comment_url(self): + return select(self.document.getroot(), 'p#send-comment', 1).find('a').attrib['href'] + +class NewCommentPage(DLFPPage): + pass + +class NodePage(DLFPPage): + def get_errors(self): + try: + div = select(self.document.getroot(), 'div.errors', 1) + except SelectElementException: + return [] + + l = [] + for li in div.find('ul').findall('li'): + l.append(li.text) + return l diff --git a/weboob/backends/dlfp/tools.py b/weboob/backends/dlfp/tools.py index 7072dbb9..0d4d02d2 100644 --- a/weboob/backends/dlfp/tools.py +++ b/weboob/backends/dlfp/tools.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright(C) 2010 Romain Bignon +# Copyright(C) 2010-2011 Romain Bignon # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -18,43 +18,50 @@ import re -ID2URL_NEWSPAPER = re.compile('.*/(\d{4})/(\d{2})/(\d{2})/(\d+)\.html$') -ID2URL_TELEGRAM = re.compile('.*/~([A-Za-z0-9_]+)/(\d+)\.html$') -URL2ID_NEWSPAPER = re.compile('^N(\d{4})(\d{2})(\d{2}).(\d+)$') -URL2ID_TELEGRAM = re.compile('^T([A-Za-z0-9_]+).(\d+)$') +RSSID_RE = re.compile('tag:.*:(\w)\w+/(\d+)') +ID2URL_RE = re.compile('^(\w)([\w_]*)\.([^\.]+)$') +URL2ID_DIARY_RE = re.compile('.*/users/([\w_]+)/journaux/([^\.]+)') +URL2ID_NEWSPAPER_RE = re.compile('.*/news/(.+)') + +def rssid(entry): + m = RSSID_RE.match(entry.id) + if not m: + return None + if m.group(1) == 'D': + mm = URL2ID_DIARY_RE.match(entry.link) + if not mm: + return + return 'D%s.%s' % (mm.group(1), m.group(2)) + return '%s.%s' % (m.group(1), m.group(2)) + +def id2url(id): + m = ID2URL_RE.match(id) + if not m: + return None + + if m.group(1) == 'N': + return '/news/%s' % m.group(3) + if m.group(1) == 'D': + return '/users/%s/journaux/%s' % (m.group(2), m.group(3)) def url2id(url): - m = ID2URL_NEWSPAPER.match(url) + m = URL2ID_NEWSPAPER_RE.match(url) if m: - return 'N%04d%02d%02d.%d' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4))) - m = ID2URL_TELEGRAM.match(url) + return 'N.%s' % (m.group(1)) + m = URL2ID_DIARY_RE.match(url) if m: - return 'T%s.%d' % (m.group(1), int(m.group(2))) - return None + return 'D%s.%s' % (m.group(1), m.group(2)) -def id2url(_id): - m = URL2ID_NEWSPAPER.match(_id) +def id2threadid(id): + m = ID2URL_RE.match(id) if m: - return '/%04d/%02d/%02d/%d.html' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4))) - m = URL2ID_TELEGRAM.match(_id) - if m: - return '/~%s/%d.html' % (m.group(1), int(m.group(2))) - return None - -def id2threadid(_id): - m = URL2ID_NEWSPAPER.match(_id) - if m: - return int(m.group(4)) - m = URL2ID_TELEGRAM.match(_id) - if m: - return int(m.group(2)) - return None + return m.group(3) def id2contenttype(_id): if not _id: return None if _id[0] == 'N': return 1 - if _id[0] == 'T': + if _id[0] == 'D': return 5 return None diff --git a/weboob/capabilities/messages.py b/weboob/capabilities/messages.py index 13a99cf1..3e0d6d5e 100644 --- a/weboob/capabilities/messages.py +++ b/weboob/capabilities/messages.py @@ -91,9 +91,8 @@ class Message(CapBaseObject): return unicode(self.id) == unicode(msg.id) def __repr__(self): - result = '' % ( - self.full_id, self.title, self.date, self.sender) - return result.encode('utf-8') + return '' % ( + self.full_id, self.title, self.date, self.sender) class Thread(CapBaseObject): IS_THREADS = 0x001