works with DLFP2.0RoR-ng

2011-02-24 21:36:19 +01:00 · 2011-02-24 21:36:19 +01:00 · 144bb8a7e4
commit 144bb8a7e4
parent 09bb78258a
6 changed files with 183 additions and 190 deletions
--- a/weboob/backends/dlfp/backend.py
+++ b/weboob/backends/dlfp/backend.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -24,7 +24,7 @@ from weboob.tools.value import Value, ValueBool, ValuesDict
 from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread, CantSendMessage
 from .browser import DLFP
-from .tools import url2id
+from .tools import rssid, id2url
 __all__ = ['DLFPBackend']
@ -40,11 +40,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
    CONFIG = ValuesDict(Value('username',          label='Username', regexp='.+'),
                        Value('password',          label='Password', regexp='.+', masked=True),
                        ValueBool('get_news',      label='Get newspapers', default=True),
-                        ValueBool('get_telegrams', label='Get telegrams', default=False))
+                        ValueBool('get_diaries',   label='Get diaries', default=False))
    STORAGE = {'seen': {}}
    BROWSER = DLFP
-    RSS_TELEGRAMS= "https://linuxfr.org/backend/journaux/rss20.rss"
+    RSS_NEWSPAPERS = "https://linuxfr.org/news.atom"
-    RSS_NEWSPAPERS = "https://linuxfr.org/backend/news/rss20.rss"
+    RSS_DIARIES = "https://linuxfr.org/journaux.atom"
    def create_default_browser(self):
@ -62,12 +62,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
        whats = set()
        if self.config['get_news']:
            whats.add(self.RSS_NEWSPAPERS)
-        if self.config['get_telegrams']:
+        if self.config['get_diaries']:
-            whats.add(self.RSS_TELEGRAMS)
+            whats.add(self.RSS_DIARIES)
        for what in whats:
-            for article in Newsfeed(what, url2id).iter_entries():
+            for article in Newsfeed(what, rssid).iter_entries():
                thread = Thread(article.id)
                thread.title = article.title
                if article.datetime:
@ -84,8 +83,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
        with self.browser:
            content = self.browser.get_content(id)
        if not content:
            return None
        if not thread:
-            thread = Thread(id)
+            thread = Thread(content.id)
        flags = Message.IS_HTML
        if not thread.id in self.storage.get('seen', default={}):
@ -102,8 +104,8 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
                              receivers=None,
                              date=thread.date, #TODO XXX WTF this is None
                              parent=None,
-                              content=''.join([content.body, content.part2]),
+                              content=content.body,
-                              signature='URL: %s' % content.url,
+                              signature='URL: %s' % self.browser.absurl(id2url(content.id)),
                              children=[],
                              flags=flags)
@ -151,16 +153,15 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
    def post_message(self, message):
        if not message.parent:
-            raise CantSendMessage('Posting news and telegrams on DLFP is not supported yet')
+            raise CantSendMessage('Posting news and diaries on DLFP is not supported yet')
        assert message.thread
        with self.browser:
-            return self.browser.post_reply(message.thread.id,
+            return self.browser.post_comment(message.thread.id,
-                                           message.parent.id,
+                                             message.parent.id,
-                                           message.title,
+                                             message.title,
-                                           message.content,
+                                             message.content)
                                           message.flags & message.IS_HTML)
    def fill_thread(self, thread, fields):
        return self.get_thread(thread)
--- a/weboob/backends/dlfp/browser.py
+++ b/weboob/backends/dlfp/browser.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -17,98 +17,87 @@
 import urllib
 from cStringIO import StringIO
-from weboob.tools.browser import BaseBrowser
+from weboob.tools.browser import BaseBrowser, BrowserHTTPError, BrowserIncorrectPassword
-from weboob.tools.parsers.lxmlparser import LxmlHtmlParser
+from weboob.capabilities.messages import CantSendMessage
 from .pages.index import IndexPage, LoginPage
-from .pages.news import ContentPage
+from .pages.news import ContentPage, NewCommentPage, NodePage
-from .tools import id2url, id2threadid, id2contenttype
+from .tools import id2url, url2id
 class Parser(LxmlHtmlParser):
    """LxmlHtmlParser variant that patches the raw markup before parsing it."""
    def parse(self, data, encoding=None):
        """
        Read ``data`` fully, apply textual fix-ups, and parse the result.

        The fix-ups are applied in this order on the raw text:
        '<<' becomes '<', then every 'cite>' and every 'tt>' becomes 'i>'.
        Only the tail of a tag is matched, so both the opening and the
        closing form of those tags are rewritten.
        """
        # Want to kill templeet coders
        markup = data.read()
        for broken, fixed in (('<<', '<'), ('cite>', 'i>'), ('tt>', 'i>')):
            markup = markup.replace(broken, fixed)
        return LxmlHtmlParser.parse(self, StringIO(markup), encoding)
 # Browser
 class DLFP(BaseBrowser):
    DOMAIN = 'linuxfr.org'
    PROTOCOL = 'https'
-    PAGES = {'https://linuxfr.org/': IndexPage,
+    PAGES = {'https://linuxfr.org/?': IndexPage,
             'https://linuxfr.org/pub/': IndexPage,
             'https://linuxfr.org/my/': IndexPage,
             'https://linuxfr.org/login.html': LoginPage,
-             'https://linuxfr.org/.*/\d+.html': ContentPage
+             'https://linuxfr.org/news/[^\.]+': ContentPage,
             'https://linuxfr.org/users/[\w_]+/journaux/[^\.]+': ContentPage,
             'https://linuxfr.org/nodes/(\d+)/comments/nouveau': NewCommentPage,
             'https://linuxfr.org/nodes/(\d+)/comments$': NodePage,
            }
    def __init__(self, *args, **kwargs):
        kwargs['parser'] = Parser()
        BaseBrowser.__init__(self, *args, **kwargs)
    def home(self):
        return self.location('https://linuxfr.org')
    def get_content(self, _id):
-        self.location(id2url(_id))
+        url = id2url(_id)
-        return self.page.get_article()
+        if url is None:
            if url2id(_id) is not None:
                url = _id
                _id = url2id(url)
            else:
                return None
-    def post_reply(self, thread, reply_id, title, message, is_html=False):
+        self.location(url)
-        content_type = id2contenttype(thread)
+        content = self.page.get_article()
-        thread_id = id2threadid(thread)
+        content.id = _id
-        thread_url = '%s://%s%s' % (self.PROTOCOL, self.DOMAIN, id2url(thread))
+        return content
        reply_id = int(reply_id)
-        if not content_type or not thread_id:
+    def _is_comment_submit_form(self, form):
-            return False
+        return 'comment_new' in form.action
-        url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL,
+    def post_comment(self, thread, reply_id, title, message):
-                                                              self.DOMAIN,
+        url = id2url(thread)
-                                                              thread_id,
+        if url is None:
-                                                              reply_id,
+            raise CantSendMessage('%s is not a right ID' % thread)
                                                              content_type)
-        timestamp = ''
+        self.location(url)
-        if content_type == 1:
+        assert self.is_on_page(ContentPage)
-            res = self.openurl(url).read()
+        self.location(self.page.get_post_comment_url())
-            const = 'name="timestamp" value="'
+        assert self.is_on_page(NewCommentPage)
            i = res.find(const)
            if i >= 0:
                res = res[i + len(const):]
                timestamp = res[:res.find('"/>')]
-        if is_html:
+        self.select_form(predicate=self._is_comment_submit_form)
-            format = 1
+        self.set_all_readonly(False)
-        else:
+        if title is not None:
-            format = 3
+            self['comment[title]'] = title
        self['comment[wiki_body]'] = message
        if int(reply_id) > 0:
            self['comment[parent_id]'] = str(reply_id)
        self['commit'] = 'Poster le commentaire'
-        # Define every data fields
+        try:
-        data = {'news_id': thread_id,
+            self.submit()
-                'com_parent': reply_id,
+        except BrowserHTTPError, e:
-                'timestamp': timestamp,
+            raise CantSendMessage('Unable to send message to %s.%s: %s' % (thread, reply_id, e))
                'res_type': content_type,
                'referer': thread_url,
                'subject': unicode(title).encode('utf-8'),
                'body': unicode(message).encode('utf-8'),
                'format': format,
                'submit': 'Envoyer',
                }
-        url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL, self.DOMAIN, thread_id, reply_id, content_type)
+        if self.is_on_page(NodePage):
            errors = self.page.get_errors()
            if len(errors) > 0:
                raise CantSendMessage('Unable to send message: %s' % ', '.join(errors))
        request = self.request_class(url, urllib.urlencode(data), {'Referer': url})
        result = self.openurl(request)
        request = self.request_class(thread_url, None, {'Referer': result.geturl()})
        self.openurl(request).read()
        return None
    def login(self):
-        self.location('/login.html', 'login=%s&passwd=%s&isauto=1' % (self.username, self.password))
+        data = {'account[login]': self.username,
                'account[password]': self.password,
                'account[remember_me]': 1}
        self.location('/compte/connexion', urllib.urlencode(data), no_login=True)
        if not self.is_logged():
            raise BrowserIncorrectPassword()
    def is_logged(self):
        return (self.page and self.page.is_logged())
    def close_session(self):
-        self.openurl('/close_session.html')
+        self.openurl('/compte/deconnexion')
--- a/weboob/backends/dlfp/pages/index.py
+++ b/weboob/backends/dlfp/pages/index.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -21,7 +21,7 @@ from weboob.tools.browser import BrowserIncorrectPassword, BasePage
 class DLFPPage(BasePage):
    def is_logged(self):
        for form in self.document.getiterator('form'):
-            if form.attrib.get('id', None) == 'formulaire':
+            if form.attrib.get('id', None) == 'new_account_sidebar':
                return False
        return True
--- a/weboob/backends/dlfp/pages/news.py
+++ b/weboob/backends/dlfp/pages/news.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -17,9 +17,8 @@
 from datetime import datetime
 from logging import warning
-from weboob.tools.misc import local2utc
+from weboob.tools.parsers.lxmlparser import select, SelectElementException
 from weboob.backends.dlfp.tools import url2id
 from .index import DLFPPage
@ -37,31 +36,23 @@ class Comment(object):
        self.url = u''
        self.comments = []
-        for sub in div.getchildren():
+        self.id = div.attrib['id'].split('-')[1]
-            if sub.tag == 'a':
+        self.title = unicode(select(div.find('h2'), 'a.title', 1).text)
-                self.id = sub.attrib['name']
+        try:
-                self.url = u'https://linuxfr.org/comments/%s.html#%s' % (self.id, self.id)
+            self.author = unicode(select(div.find('p'), 'a[rel=author]', 1).text)
-            elif sub.tag == 'h1':
+        except SelectElementException:
-                try:
+            self.author = 'Anonyme'
-                    self.title = sub.find('b').text
+        self.date = datetime.strptime(select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0],
-                except UnicodeError:
+                                      '%Y-%m-%dT%H:%M:%S')
-                    warning('Bad encoded title, but DLFP sucks')
+        self.body = self.browser.parser.tostring(div.find('div'))
-            elif sub.tag == 'div' and sub.attrib.get('class', '').startswith('comment'):
+        self.score = int(select(div.find('p'), 'span.score', 1).text)
-                self.author = sub.find('a').text if sub.find('a') is not None else 'Unknown'
+        self.url = select(div.find('h2'), 'a.title', 1).attrib['href']
                self.date = self.parse_date(sub.find('i').tail)
                self.score = int(sub.findall('i')[-1].find('span').text)
                self.body = self.browser.parser.tostring(sub.find('p'))
            elif sub.attrib.get('class', '') == 'commentsul':
                comment = Comment(self.browser, sub.find('li'), self.id)
                self.comments.append(comment)
-    def parse_date(self, date_s):
+        subs = div.find('ul')
-        date_s = date_s.strip().encode('utf-8')
+        if subs is not None:
-        if not date_s:
+            for sub in subs.findall('li'):
-            date = datetime.now()
+                comment = Comment(self.browser, sub, self.id)
-        else:
+                self.comments.append(comment)
            date = datetime.strptime(date_s, u'le %d/%m/%Y \xe0 %H:%M.'.encode('utf-8'))
        return local2utc(date)
    def iter_all_comments(self):
        for comment in self.comments:
@ -70,35 +61,25 @@ class Comment(object):
                yield c
    def __repr__(self):
-        return u"<Comment id='%s' author='%s' title='%s'>" % (self.id, self.author, self.title)
+        return u"<Comment id=%r author=%r title=%r>" % (self.id, self.author, self.title)
 class Article(object):
-    def __init__(self, browser, _id, tree):
+    def __init__(self, browser, url, tree):
        self.browser = browser
-        self.id = _id
+        self.url = url
-        self.title = u''
+        self.id = url2id(self.url)
        self.author = u''
        self.body = u''
        self.part2 = u''
        self.date = None
        self.url = u''
        self.comments = []
-        for div in tree.findall('div'):
+        header = tree.find('header')
-            if div.attrib.get('class', '').startswith('titlediv '):
+        self.title = u' — '.join([a.text for a in header.find('h1').findall('a')])
-                self.author = div.find('a').text
+        try:
-                for a in div.find('h1').getiterator('a'):
+            self.author = select(header, 'a[rel=author]', 1).text
-                    if a.text: self.title += a.text
+        except SelectElementException:
-                    if a.tail: self.title += a.tail
+            self.author = 'Anonyme'
-                self.title = self.title.strip()
+        self.body = self.browser.parser.tostring(select(tree, 'div.content', 1))
-                # TODO use the date_s
+        self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0],
-                #subdivs = div.findall('a')
+                                      '%Y-%m-%dT%H:%M:%S')
-                #if len(subdivs) > 1:
+
-                #    date_s = unicode(subdivs[1].text)
+        self.comments = []
                #else:
                #    date_s = unicode(div.find('i').tail)
            if div.attrib.get('class', '').startswith('bodydiv '):
                self.body = self.browser.parser.tostring(div)
    def append_comment(self, comment):
        self.comments.append(comment)
@ -115,21 +96,37 @@ class Article(object):
 class ContentPage(DLFPPage):
    def on_loaded(self):
        self.article = None
        for div in self.document.find('body').find('div').findall('div'):
            self.parse_div(div)
            if div.attrib.get('class', '') == 'centraldiv':
                for subdiv in div.findall('div'):
                    self.parse_div(subdiv)
    def parse_div(self, div):
        if div.attrib.get('class', '') in ('newsdiv', 'centraldiv'):
            self.article = Article(self.browser, url2id(self.url), div)
            self.article.url = self.url
        if div.attrib.get('class', '') == 'articlediv':
            self.article.parse_part2(div)
        if div.attrib.get('class', '') == 'comments':
            comment = Comment(self.browser, div, 0)
            self.article.append_comment(comment)
    def get_article(self):
        """
        Return the Article for this page, building it on first use.

        The article is created from the page's 'article' element and stored
        in self.article; later calls return that same object. Every 'li'
        directly under the 'ul.threads' element is wrapped in a Comment and
        appended to the article. A page without 'ul.threads' simply yields an
        article with no comments.
        """
        if self.article:
            return self.article

        root = self.document.getroot()
        self.article = Article(self.browser, self.url, select(root, 'article', 1))

        try:
            threads = select(root, 'ul.threads', 1)
        except SelectElementException:
            # no comments
            return self.article

        for item in threads.findall('li'):
            self.article.append_comment(Comment(self.browser, item, 0))
        return self.article
    def get_post_comment_url(self):
        """Return the href of the first link inside the 'p#send-comment' element."""
        paragraph = select(self.document.getroot(), 'p#send-comment', 1)
        link = paragraph.find('a')
        return link.attrib['href']
 class NewCommentPage(DLFPPage):
    """Page type that adds no behaviour of its own on top of DLFPPage."""
    pass
 class NodePage(DLFPPage):
    """Page from which a list of error messages can be extracted."""
    def get_errors(self):
        """
        Return the text of each 'li' found in the 'ul' of the 'div.errors'
        element, or an empty list when the page has no 'div.errors'.
        """
        root = self.document.getroot()
        try:
            errors_div = select(root, 'div.errors', 1)
        except SelectElementException:
            return []
        return [item.text for item in errors_div.find('ul').findall('li')]
--- a/weboob/backends/dlfp/tools.py
+++ b/weboob/backends/dlfp/tools.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -18,43 +18,50 @@
 import re
-ID2URL_NEWSPAPER = re.compile('.*/(\d{4})/(\d{2})/(\d{2})/(\d+)\.html$')
+RSSID_RE = re.compile('tag:.*:(\w)\w+/(\d+)')
-ID2URL_TELEGRAM  = re.compile('.*/~([A-Za-z0-9_]+)/(\d+)\.html$')
+ID2URL_RE = re.compile('^(\w)([\w_]*)\.([^\.]+)$')
-URL2ID_NEWSPAPER = re.compile('^N(\d{4})(\d{2})(\d{2}).(\d+)$')
+URL2ID_DIARY_RE = re.compile('.*/users/([\w_]+)/journaux/([^\.]+)')
-URL2ID_TELEGRAM  = re.compile('^T([A-Za-z0-9_]+).(\d+)$')
+URL2ID_NEWSPAPER_RE = re.compile('.*/news/(.+)')
 def rssid(entry):
    """
    Build a content ID from a feed entry.

    entry.id is matched against RSSID_RE, which captures one leading letter
    and a run of digits. For the letter 'D' the user name is additionally
    taken from entry.link (via URL2ID_DIARY_RE) and the result is
    'D<user>.<digits>'; for any other letter it is '<letter>.<digits>'.

    Returns None when entry.id does not match, or when a 'D' entry has a
    link that URL2ID_DIARY_RE does not recognise.
    """
    tag_match = RSSID_RE.match(entry.id)
    if tag_match is None:
        return None

    kind = tag_match.group(1)
    number = tag_match.group(2)
    if kind != 'D':
        return '%s.%s' % (kind, number)

    link_match = URL2ID_DIARY_RE.match(entry.link)
    if link_match is None:
        return None
    return 'D%s.%s' % (link_match.group(1), number)
 def id2url(id):
    """
    Translate a content ID into a site-relative URL path.

    The ID is split by ID2URL_RE into a leading letter, an optional name and
    a trailing part after the dot:
      'N' -> '/news/<trailing part>'
      'D' -> '/users/<name>/journaux/<trailing part>'

    Returns None when the ID does not match ID2URL_RE or starts with any
    other letter.
    """
    parsed = ID2URL_RE.match(id)
    if parsed is None:
        return None

    kind = parsed.group(1)
    if kind == 'N':
        return '/news/%s' % parsed.group(3)
    elif kind == 'D':
        return '/users/%s/journaux/%s' % (parsed.group(2), parsed.group(3))
    return None
 def url2id(url):
-    m = ID2URL_NEWSPAPER.match(url)
+    m = URL2ID_NEWSPAPER_RE.match(url)
    if m:
-        return 'N%04d%02d%02d.%d' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)))
+        return 'N.%s' % (m.group(1))
-    m = ID2URL_TELEGRAM.match(url)
+    m = URL2ID_DIARY_RE.match(url)
    if m:
-        return 'T%s.%d' % (m.group(1), int(m.group(2)))
+        return 'D%s.%s' % (m.group(1), m.group(2))
    return None
-def id2url(_id):
+def id2threadid(id):
-    m = URL2ID_NEWSPAPER.match(_id)
+    m = ID2URL_RE.match(id)
    if m:
-        return '/%04d/%02d/%02d/%d.html' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)))
+        return m.group(3)
    m = URL2ID_TELEGRAM.match(_id)
    if m:
        return '/~%s/%d.html' % (m.group(1), int(m.group(2)))
    return None
 def id2threadid(_id):
    m = URL2ID_NEWSPAPER.match(_id)
    if m:
        return int(m.group(4))
    m = URL2ID_TELEGRAM.match(_id)
    if m:
        return int(m.group(2))
    return None
 def id2contenttype(_id):
    if not _id:
        return None
    if _id[0] == 'N':
        return 1
-    if _id[0] == 'T':
+    if _id[0] == 'D':
        return 5
    return None
--- a/weboob/capabilities/messages.py
+++ b/weboob/capabilities/messages.py
@ -91,9 +91,8 @@ class Message(CapBaseObject):
            return unicode(self.id) == unicode(msg.id)
    def __repr__(self):
-        result = '<Message id="%s" title="%s" date="%s" from="%s">' % (
+        return '<Message id=%r title=%r date=%r from=%r>' % (
-            self.full_id, self.title, self.date, self.sender)
+                   self.full_id, self.title, self.date, self.sender)
        return result.encode('utf-8')
 class Thread(CapBaseObject):
    IS_THREADS =    0x001