From 144bb8a7e43de4899e4af0035e5738dd3bc79451 Mon Sep 17 00:00:00 2001
From: Romain Bignon <romain@symlink.me>
Date: Thu, 24 Feb 2011 21:36:19 +0100
Subject: [PATCH] dlfp: works with DLFP2.0RoR-ng (the new Ruby on Rails LinuxFr.org site)

---
 weboob/backends/dlfp/backend.py     |  37 ++++----
 weboob/backends/dlfp/browser.py     | 119 ++++++++++++-------------
 weboob/backends/dlfp/pages/index.py |   4 +-
 weboob/backends/dlfp/pages/news.py  | 131 ++++++++++++++--------------
 weboob/backends/dlfp/tools.py       |  61 +++++++------
 weboob/capabilities/messages.py     |   5 +-
 6 files changed, 175 insertions(+), 182 deletions(-)

diff --git a/weboob/backends/dlfp/backend.py b/weboob/backends/dlfp/backend.py
index 2d2569fc..fc0da7d9 100644
--- a/weboob/backends/dlfp/backend.py
+++ b/weboob/backends/dlfp/backend.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -24,7 +24,7 @@ from weboob.tools.value import Value, ValueBool, ValuesDict
 from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread, CantSendMessage
 
 from .browser import DLFP
-from .tools import url2id
+from .tools import rssid, id2url
 
 
 __all__ = ['DLFPBackend']
@@ -40,11 +40,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
     CONFIG = ValuesDict(Value('username',          label='Username', regexp='.+'),
                         Value('password',          label='Password', regexp='.+', masked=True),
                         ValueBool('get_news',      label='Get newspapers', default=True),
-                        ValueBool('get_telegrams', label='Get telegrams', default=False))
+                        ValueBool('get_diaries',   label='Get diaries', default=False))
     STORAGE = {'seen': {}}
     BROWSER = DLFP
-    RSS_TELEGRAMS= "https://linuxfr.org/backend/journaux/rss20.rss"
-    RSS_NEWSPAPERS = "https://linuxfr.org/backend/news/rss20.rss"
+    RSS_NEWSPAPERS = "https://linuxfr.org/news.atom"
+    RSS_DIARIES = "https://linuxfr.org/journaux.atom"
 
 
     def create_default_browser(self):
@@ -62,12 +62,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
         whats = set()
         if self.config['get_news']:
             whats.add(self.RSS_NEWSPAPERS)
-        if self.config['get_telegrams']:
-            whats.add(self.RSS_TELEGRAMS)
-
+        if self.config['get_diaries']:
+            whats.add(self.RSS_DIARIES)
 
         for what in whats:
-            for article in Newsfeed(what, url2id).iter_entries():
+            for article in Newsfeed(what, rssid).iter_entries():
                 thread = Thread(article.id)
                 thread.title = article.title
                 if article.datetime:
@@ -84,8 +83,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
         with self.browser:
             content = self.browser.get_content(id)
 
+        if not content:
+            return None
+
         if not thread:
-            thread = Thread(id)
+            thread = Thread(content.id)
 
         flags = Message.IS_HTML
         if not thread.id in self.storage.get('seen', default={}):
@@ -102,8 +104,8 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
                               receivers=None,
                               date=thread.date, #TODO XXX WTF this is None
                               parent=None,
-                              content=''.join([content.body, content.part2]),
-                              signature='URL: %s' % content.url,
+                              content=content.body,
+                              signature='URL: %s' % self.browser.absurl(id2url(content.id)),
                               children=[],
                               flags=flags)
 
@@ -151,16 +153,15 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
 
     def post_message(self, message):
         if not message.parent:
-            raise CantSendMessage('Posting news and telegrams on DLFP is not supported yet')
+            raise CantSendMessage('Posting news and diaries on DLFP is not supported yet')
 
         assert message.thread
 
         with self.browser:
-            return self.browser.post_reply(message.thread.id,
-                                           message.parent.id,
-                                           message.title,
-                                           message.content,
-                                           message.flags & message.IS_HTML)
+            return self.browser.post_comment(message.thread.id,
+                                             message.parent.id,
+                                             message.title,
+                                             message.content)
 
     def fill_thread(self, thread, fields):
         return self.get_thread(thread)
diff --git a/weboob/backends/dlfp/browser.py b/weboob/backends/dlfp/browser.py
index 2ef2cdff..8b50d8fa 100644
--- a/weboob/backends/dlfp/browser.py
+++ b/weboob/backends/dlfp/browser.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -17,98 +17,87 @@
 
 
 import urllib
-from cStringIO import StringIO
 
-from weboob.tools.browser import BaseBrowser
-from weboob.tools.parsers.lxmlparser import LxmlHtmlParser
+from weboob.tools.browser import BaseBrowser, BrowserHTTPError, BrowserIncorrectPassword
+from weboob.capabilities.messages import CantSendMessage
 
 from .pages.index import IndexPage, LoginPage
-from .pages.news import ContentPage
-from .tools import id2url, id2threadid, id2contenttype
-
-class Parser(LxmlHtmlParser):
-    def parse(self, data, encoding=None):
-        # Want to kill templeet coders
-        data = StringIO(data.read().replace('<<', '<').replace('cite>', 'i>').replace('tt>', 'i>'))
-        return LxmlHtmlParser.parse(self, data, encoding)
+from .pages.news import ContentPage, NewCommentPage, NodePage
+from .tools import id2url, url2id
 
 # Browser
 class DLFP(BaseBrowser):
     DOMAIN = 'linuxfr.org'
     PROTOCOL = 'https'
-    PAGES = {'https://linuxfr.org/': IndexPage,
-             'https://linuxfr.org/pub/': IndexPage,
-             'https://linuxfr.org/my/': IndexPage,
+    PAGES = {'https://linuxfr.org/?': IndexPage,
              'https://linuxfr.org/login.html': LoginPage,
-             'https://linuxfr.org/.*/\d+.html': ContentPage
+             'https://linuxfr.org/news/[^\.]+': ContentPage,
+             'https://linuxfr.org/users/[\w_]+/journaux/[^\.]+': ContentPage,
+             'https://linuxfr.org/nodes/(\d+)/comments/nouveau': NewCommentPage,
+             'https://linuxfr.org/nodes/(\d+)/comments$': NodePage,
             }
 
-    def __init__(self, *args, **kwargs):
-        kwargs['parser'] = Parser()
-        BaseBrowser.__init__(self, *args, **kwargs)
-
     def home(self):
         return self.location('https://linuxfr.org')
 
     def get_content(self, _id):
-        self.location(id2url(_id))
-        return self.page.get_article()
+        """Fetch the article for a backend ID or a content URL; None if it is neither."""
+        url = id2url(_id)
+        if url is None:
+            # Not a backend ID: treat it as a URL and derive the ID from it.
+            url, _id = _id, url2id(_id)
+            if _id is None:
+                return None
 
-    def post_reply(self, thread, reply_id, title, message, is_html=False):
-        content_type = id2contenttype(thread)
-        thread_id = id2threadid(thread)
-        thread_url = '%s://%s%s' % (self.PROTOCOL, self.DOMAIN, id2url(thread))
-        reply_id = int(reply_id)
+        self.location(url)
+        article = self.page.get_article()
+        article.id = _id
+        return article
 
-        if not content_type or not thread_id:
-            return False
+    def _is_comment_submit_form(self, form):
+        return 'comment_new' in form.action
 
-        url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL,
-                                                              self.DOMAIN,
-                                                              thread_id,
-                                                              reply_id,
-                                                              content_type)
+    def post_comment(self, thread, reply_id, title, message):
+        url = id2url(thread)
+        if url is None:
+            raise CantSendMessage('%s is not a right ID' % thread)
 
-        timestamp = ''
-        if content_type == 1:
-            res = self.openurl(url).read()
-            const = 'name="timestamp" value="'
-            i = res.find(const)
-            if i >= 0:
-                res = res[i + len(const):]
-                timestamp = res[:res.find('"/>')]
+        self.location(url)
+        assert self.is_on_page(ContentPage)
+        self.location(self.page.get_post_comment_url())
+        assert self.is_on_page(NewCommentPage)
 
-        if is_html:
-            format = 1
-        else:
-            format = 3
+        self.select_form(predicate=self._is_comment_submit_form)
+        self.set_all_readonly(False)
+        if title is not None:
+            self['comment[title]'] = title
+        self['comment[wiki_body]'] = message
+        if int(reply_id) > 0:
+            self['comment[parent_id]'] = str(reply_id)
+        self['commit'] = 'Poster le commentaire'
 
-        # Define every data fields
-        data = {'news_id': thread_id,
-                'com_parent': reply_id,
-                'timestamp': timestamp,
-                'res_type': content_type,
-                'referer': thread_url,
-                'subject': unicode(title).encode('utf-8'),
-                'body': unicode(message).encode('utf-8'),
-                'format': format,
-                'submit': 'Envoyer',
-                }
+        try:
+            self.submit()
+        except BrowserHTTPError, e:
+            raise CantSendMessage('Unable to send message to %s.%s: %s' % (thread, reply_id, e))
 
-        url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL, self.DOMAIN, thread_id, reply_id, content_type)
+        if self.is_on_page(NodePage):
+            errors = self.page.get_errors()
+            if len(errors) > 0:
+                raise CantSendMessage('Unable to send message: %s' % ', '.join(errors))
 
-        request = self.request_class(url, urllib.urlencode(data), {'Referer': url})
-        result = self.openurl(request)
-        request = self.request_class(thread_url, None, {'Referer': result.geturl()})
-        self.openurl(request).read()
         return None
 
     def login(self):
-        self.location('/login.html', 'login=%s&passwd=%s&isauto=1' % (self.username, self.password))
+        """Post the credentials; raise BrowserIncorrectPassword if still logged out."""
+        credentials = {'account[login]': self.username, 'account[password]': self.password,
+                       'account[remember_me]': 1}
+        self.location('/compte/connexion', urllib.urlencode(credentials), no_login=True)
+        if not self.is_logged():
+            raise BrowserIncorrectPassword()
 
     def is_logged(self):
         return (self.page and self.page.is_logged())
 
     def close_session(self):
-        self.openurl('/close_session.html')
-
+        self.openurl('/compte/deconnexion')
diff --git a/weboob/backends/dlfp/pages/index.py b/weboob/backends/dlfp/pages/index.py
index 69ad45ba..7380bec8 100644
--- a/weboob/backends/dlfp/pages/index.py
+++ b/weboob/backends/dlfp/pages/index.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -21,7 +21,7 @@ from weboob.tools.browser import BrowserIncorrectPassword, BasePage
 class DLFPPage(BasePage):
     def is_logged(self):
         for form in self.document.getiterator('form'):
-            if form.attrib.get('id', None) == 'formulaire':
+            if form.attrib.get('id', None) == 'new_account_sidebar':
                 return False
 
         return True
diff --git a/weboob/backends/dlfp/pages/news.py b/weboob/backends/dlfp/pages/news.py
index 6c4722dd..32cf3817 100644
--- a/weboob/backends/dlfp/pages/news.py
+++ b/weboob/backends/dlfp/pages/news.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -17,9 +17,8 @@
 
 
 from datetime import datetime
-from logging import warning
 
-from weboob.tools.misc import local2utc
+from weboob.tools.parsers.lxmlparser import select, SelectElementException
 from weboob.backends.dlfp.tools import url2id
 
 from .index import DLFPPage
@@ -37,31 +36,23 @@ class Comment(object):
         self.url = u''
         self.comments = []
 
-        for sub in div.getchildren():
-            if sub.tag == 'a':
-                self.id = sub.attrib['name']
-                self.url = u'https://linuxfr.org/comments/%s.html#%s' % (self.id, self.id)
-            elif sub.tag == 'h1':
-                try:
-                    self.title = sub.find('b').text
-                except UnicodeError:
-                    warning('Bad encoded title, but DLFP sucks')
-            elif sub.tag == 'div' and sub.attrib.get('class', '').startswith('comment'):
-                self.author = sub.find('a').text if sub.find('a') is not None else 'Unknown'
-                self.date = self.parse_date(sub.find('i').tail)
-                self.score = int(sub.findall('i')[-1].find('span').text)
-                self.body = self.browser.parser.tostring(sub.find('p'))
-            elif sub.attrib.get('class', '') == 'commentsul':
-                comment = Comment(self.browser, sub.find('li'), self.id)
-                self.comments.append(comment)
+        self.id = div.attrib['id'].split('-')[1]
+        self.title = unicode(select(div.find('h2'), 'a.title', 1).text)
+        try:
+            self.author = unicode(select(div.find('p'), 'a[rel=author]', 1).text)
+        except SelectElementException:
+            self.author = 'Anonyme'
+        self.date = datetime.strptime(select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0],
+                                      '%Y-%m-%dT%H:%M:%S')
+        self.body = self.browser.parser.tostring(div.find('div'))
+        self.score = int(select(div.find('p'), 'span.score', 1).text)
+        self.url = select(div.find('h2'), 'a.title', 1).attrib['href']
 
-    def parse_date(self, date_s):
-        date_s = date_s.strip().encode('utf-8')
-        if not date_s:
-            date = datetime.now()
-        else:
-            date = datetime.strptime(date_s, u'le %d/%m/%Y \xe0 %H:%M.'.encode('utf-8'))
-        return local2utc(date)
+        subs = div.find('ul')
+        if subs is not None:
+            for sub in subs.findall('li'):
+                comment = Comment(self.browser, sub, self.id)
+                self.comments.append(comment)
 
     def iter_all_comments(self):
         for comment in self.comments:
@@ -70,35 +61,25 @@ class Comment(object):
                 yield c
 
     def __repr__(self):
-        return u"<Comment id='%s' author='%s' title='%s'>" % (self.id, self.author, self.title)
+        return u"<Comment id=%r author=%r title=%r>" % (self.id, self.author, self.title)
 
 class Article(object):
-    def __init__(self, browser, _id, tree):
+    def __init__(self, browser, url, tree):
         self.browser = browser
-        self.id = _id
-        self.title = u''
-        self.author = u''
-        self.body = u''
-        self.part2 = u''
-        self.date = None
-        self.url = u''
-        self.comments = []
+        self.url = url
+        self.id = url2id(self.url)
 
-        for div in tree.findall('div'):
-            if div.attrib.get('class', '').startswith('titlediv '):
-                self.author = div.find('a').text
-                for a in div.find('h1').getiterator('a'):
-                    if a.text: self.title += a.text
-                    if a.tail: self.title += a.tail
-                self.title = self.title.strip()
-                # TODO use the date_s
-                #subdivs = div.findall('a')
-                #if len(subdivs) > 1:
-                #    date_s = unicode(subdivs[1].text)
-                #else:
-                #    date_s = unicode(div.find('i').tail)
-            if div.attrib.get('class', '').startswith('bodydiv '):
-                self.body = self.browser.parser.tostring(div)
+        header = tree.find('header')
+        self.title = u' — '.join([a.text for a in header.find('h1').findall('a')])
+        try:
+            self.author = select(header, 'a[rel=author]', 1).text
+        except SelectElementException:
+            self.author = 'Anonyme'
+        self.body = self.browser.parser.tostring(select(tree, 'div.content', 1))
+        self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0],
+                                      '%Y-%m-%dT%H:%M:%S')
+
+        self.comments = []
 
     def append_comment(self, comment):
         self.comments.append(comment)
@@ -115,21 +96,37 @@ class Article(object):
 class ContentPage(DLFPPage):
     def on_loaded(self):
         self.article = None
-        for div in self.document.find('body').find('div').findall('div'):
-            self.parse_div(div)
-            if div.attrib.get('class', '') == 'centraldiv':
-                for subdiv in div.findall('div'):
-                    self.parse_div(subdiv)
-
-    def parse_div(self, div):
-        if div.attrib.get('class', '') in ('newsdiv', 'centraldiv'):
-            self.article = Article(self.browser, url2id(self.url), div)
-            self.article.url = self.url
-        if div.attrib.get('class', '') == 'articlediv':
-            self.article.parse_part2(div)
-        if div.attrib.get('class', '') == 'comments':
-            comment = Comment(self.browser, div, 0)
-            self.article.append_comment(comment)
 
     def get_article(self):
+        if not self.article:
+            self.article = Article(self.browser,
+                                   self.url,
+                                   select(self.document.getroot(), 'article', 1))
+
+            try:
+                threads = select(self.document.getroot(), 'ul.threads', 1)
+            except SelectElementException:
+                pass # no comments
+            else:
+                for comment in threads.findall('li'):
+                    self.article.append_comment(Comment(self.browser, comment, 0))
+
         return self.article
+
+    def get_post_comment_url(self):
+        return select(self.document.getroot(), 'p#send-comment', 1).find('a').attrib['href']
+
+class NewCommentPage(DLFPPage):
+    """Page holding the new-comment form; the class only serves to identify it."""
+
+class NodePage(DLFPPage):
+    def get_errors(self):
+        """Return the messages listed in the page's 'div.errors' box, [] if absent."""
+        try:
+            div = select(self.document.getroot(), 'div.errors', 1)
+        except SelectElementException:
+            return []
+
+        # li.text is None when an item starts with a child element; substitute
+        # u'' so that callers can still ', '.join() the result.
+        return [li.text or u'' for li in div.find('ul').findall('li')]
diff --git a/weboob/backends/dlfp/tools.py b/weboob/backends/dlfp/tools.py
index 7072dbb9..0d4d02d2 100644
--- a/weboob/backends/dlfp/tools.py
+++ b/weboob/backends/dlfp/tools.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -18,43 +18,50 @@
 
 import re
 
-ID2URL_NEWSPAPER = re.compile('.*/(\d{4})/(\d{2})/(\d{2})/(\d+)\.html$')
-ID2URL_TELEGRAM  = re.compile('.*/~([A-Za-z0-9_]+)/(\d+)\.html$')
-URL2ID_NEWSPAPER = re.compile('^N(\d{4})(\d{2})(\d{2}).(\d+)$')
-URL2ID_TELEGRAM  = re.compile('^T([A-Za-z0-9_]+).(\d+)$')
+RSSID_RE = re.compile(r'tag:.*:(\w)\w+/(\d+)')  # feed entry id -> (kind letter, number)
+ID2URL_RE = re.compile(r'^(\w)([\w_]*)\.([^\.]+)$')  # backend ID -> (kind letter, user, slug)
+URL2ID_DIARY_RE = re.compile(r'.*/users/([\w_]+)/journaux/([^\.]+)')  # diary URL -> (user, slug)
+URL2ID_NEWSPAPER_RE = re.compile(r'.*/news/(.+)')  # news URL -> slug
+
+def rssid(entry):
+    """Return '<K>.<num>' ('D<user>.<num>' for diaries) for a feed entry, else None."""
+    m = RSSID_RE.match(entry.id)
+    if not m:
+        return None
+    if m.group(1) != 'D':
+        return '%s.%s' % (m.group(1), m.group(2))
+    # Diary IDs embed the author's login, which is taken from the entry link.
+    mm = URL2ID_DIARY_RE.match(entry.link)
+    return 'D%s.%s' % (mm.group(1), m.group(2)) if mm else None
+
+def id2url(id):
+    """Return the site-relative URL for a backend ID, or None if it is not one."""
+    m = ID2URL_RE.match(id)
+    if m is None:
+        return None
+    kind, user, slug = m.groups()
+    if kind == 'N':
+        return '/news/%s' % slug
+    return '/users/%s/journaux/%s' % (user, slug) if kind == 'D' else None
 
 def url2id(url):
-    m = ID2URL_NEWSPAPER.match(url)
+    m = URL2ID_NEWSPAPER_RE.match(url)
     if m:
-        return 'N%04d%02d%02d.%d' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)))
-    m = ID2URL_TELEGRAM.match(url)
+        return 'N.%s' % (m.group(1))
+    m = URL2ID_DIARY_RE.match(url)
     if m:
-        return 'T%s.%d' % (m.group(1), int(m.group(2)))
-    return None
+        return 'D%s.%s' % (m.group(1), m.group(2))
 
-def id2url(_id):
-    m = URL2ID_NEWSPAPER.match(_id)
+def id2threadid(id):
+    m = ID2URL_RE.match(id)
     if m:
-        return '/%04d/%02d/%02d/%d.html' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)))
-    m = URL2ID_TELEGRAM.match(_id)
-    if m:
-        return '/~%s/%d.html' % (m.group(1), int(m.group(2)))
-    return None
-
-def id2threadid(_id):
-    m = URL2ID_NEWSPAPER.match(_id)
-    if m:
-        return int(m.group(4))
-    m = URL2ID_TELEGRAM.match(_id)
-    if m:
-        return int(m.group(2))
-    return None
+        return m.group(3)
 
 def id2contenttype(_id):
     if not _id:
         return None
     if _id[0] == 'N':
         return 1
-    if _id[0] == 'T':
+    if _id[0] == 'D':
         return 5
     return None
diff --git a/weboob/capabilities/messages.py b/weboob/capabilities/messages.py
index 13a99cf1..3e0d6d5e 100644
--- a/weboob/capabilities/messages.py
+++ b/weboob/capabilities/messages.py
@@ -91,9 +91,8 @@ class Message(CapBaseObject):
             return unicode(self.id) == unicode(msg.id)
 
     def __repr__(self):
-        result = '<Message id="%s" title="%s" date="%s" from="%s">' % (
-            self.full_id, self.title, self.date, self.sender)
-        return result.encode('utf-8')
+        return '<Message id=%r title=%r date=%r from=%r>' % (
+                   self.full_id, self.title, self.date, self.sender)
 
 class Thread(CapBaseObject):
     IS_THREADS =    0x001