works with DLFP2.0RoR-ng

2011-02-24 21:36:19 +01:00 · 2011-02-24 21:36:19 +01:00 · 144bb8a7e4
commit 144bb8a7e4
parent 09bb78258a
6 changed files with 183 additions and 190 deletions
--- a/weboob/backends/dlfp/backend.py
+++ b/weboob/backends/dlfp/backend.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -24,7 +24,7 @@ from weboob.tools.value import Value, ValueBool, ValuesDict
 from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread, CantSendMessage

 from .browser import DLFP
-from .tools import url2id
+from .tools import rssid, id2url


 __all__ = ['DLFPBackend']
@ -40,11 +40,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
    CONFIG = ValuesDict(Value('username',          label='Username', regexp='.+'),
                        Value('password',          label='Password', regexp='.+', masked=True),
                        ValueBool('get_news',      label='Get newspapers', default=True),
-                        ValueBool('get_telegrams', label='Get telegrams', default=False))
+                        ValueBool('get_diaries',   label='Get diaries', default=False))
    STORAGE = {'seen': {}}
    BROWSER = DLFP
-    RSS_TELEGRAMS= "https://linuxfr.org/backend/journaux/rss20.rss"
-    RSS_NEWSPAPERS = "https://linuxfr.org/backend/news/rss20.rss"
+    RSS_NEWSPAPERS = "https://linuxfr.org/news.atom"
+    RSS_DIARIES = "https://linuxfr.org/journaux.atom"


    def create_default_browser(self):
@ -62,12 +62,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
        whats = set()
        if self.config['get_news']:
            whats.add(self.RSS_NEWSPAPERS)
-        if self.config['get_telegrams']:
-            whats.add(self.RSS_TELEGRAMS)
-
+        if self.config['get_diaries']:
+            whats.add(self.RSS_DIARIES)

        for what in whats:
-            for article in Newsfeed(what, url2id).iter_entries():
+            for article in Newsfeed(what, rssid).iter_entries():
                thread = Thread(article.id)
                thread.title = article.title
                if article.datetime:
@ -84,8 +83,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
        with self.browser:
            content = self.browser.get_content(id)

+        if not content:
+            return None
+
        if not thread:
-            thread = Thread(id)
+            thread = Thread(content.id)

        flags = Message.IS_HTML
        if not thread.id in self.storage.get('seen', default={}):
@ -102,8 +104,8 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
                              receivers=None,
                              date=thread.date, #TODO XXX WTF this is None
                              parent=None,
-                              content=''.join([content.body, content.part2]),
-                              signature='URL: %s' % content.url,
+                              content=content.body,
+                              signature='URL: %s' % self.browser.absurl(id2url(content.id)),
                              children=[],
                              flags=flags)

@ -151,16 +153,15 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):

    def post_message(self, message):
        if not message.parent:
-            raise CantSendMessage('Posting news and telegrams on DLFP is not supported yet')
+            raise CantSendMessage('Posting news and diaries on DLFP is not supported yet')

        assert message.thread

        with self.browser:
-            return self.browser.post_reply(message.thread.id,
-                                           message.parent.id,
-                                           message.title,
-                                           message.content,
-                                           message.flags & message.IS_HTML)
+            return self.browser.post_comment(message.thread.id,
+                                             message.parent.id,
+                                             message.title,
+                                             message.content)

    def fill_thread(self, thread, fields):
        return self.get_thread(thread)
--- a/weboob/backends/dlfp/browser.py
+++ b/weboob/backends/dlfp/browser.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -17,98 +17,87 @@


 import urllib
-from cStringIO import StringIO

-from weboob.tools.browser import BaseBrowser
-from weboob.tools.parsers.lxmlparser import LxmlHtmlParser
+from weboob.tools.browser import BaseBrowser, BrowserHTTPError, BrowserIncorrectPassword
+from weboob.capabilities.messages import CantSendMessage

 from .pages.index import IndexPage, LoginPage
-from .pages.news import ContentPage
-from .tools import id2url, id2threadid, id2contenttype
-
-class Parser(LxmlHtmlParser):
-    def parse(self, data, encoding=None):
-        # Want to kill templeet coders
-        data = StringIO(data.read().replace('<<', '<').replace('cite>', 'i>').replace('tt>', 'i>'))
-        return LxmlHtmlParser.parse(self, data, encoding)
+from .pages.news import ContentPage, NewCommentPage, NodePage
+from .tools import id2url, url2id

 # Browser
 class DLFP(BaseBrowser):
    DOMAIN = 'linuxfr.org'
    PROTOCOL = 'https'
-    PAGES = {'https://linuxfr.org/': IndexPage,
-             'https://linuxfr.org/pub/': IndexPage,
-             'https://linuxfr.org/my/': IndexPage,
+    PAGES = {'https://linuxfr.org/?': IndexPage,
             'https://linuxfr.org/login.html': LoginPage,
-             'https://linuxfr.org/.*/\d+.html': ContentPage
+             'https://linuxfr.org/news/[^\.]+': ContentPage,
+             'https://linuxfr.org/users/[\w_]+/journaux/[^\.]+': ContentPage,
+             'https://linuxfr.org/nodes/(\d+)/comments/nouveau': NewCommentPage,
+             'https://linuxfr.org/nodes/(\d+)/comments$': NodePage,
            }

-    def __init__(self, *args, **kwargs):
-        kwargs['parser'] = Parser()
-        BaseBrowser.__init__(self, *args, **kwargs)
-
    def home(self):
        return self.location('https://linuxfr.org')

    def get_content(self, _id):
-        self.location(id2url(_id))
-        return self.page.get_article()
+        url = id2url(_id)
+        if url is None:
+            if url2id(_id) is not None:
+                url = _id
+                _id = url2id(url)
+            else:
+                return None

-    def post_reply(self, thread, reply_id, title, message, is_html=False):
-        content_type = id2contenttype(thread)
-        thread_id = id2threadid(thread)
-        thread_url = '%s://%s%s' % (self.PROTOCOL, self.DOMAIN, id2url(thread))
-        reply_id = int(reply_id)
+        self.location(url)
+        content = self.page.get_article()
+        content.id = _id
+        return content

-        if not content_type or not thread_id:
-            return False
+    def _is_comment_submit_form(self, form):
+        return 'comment_new' in form.action

-        url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL,
-                                                              self.DOMAIN,
-                                                              thread_id,
-                                                              reply_id,
-                                                              content_type)
+    def post_comment(self, thread, reply_id, title, message):
+        url = id2url(thread)
+        if url is None:
+            raise CantSendMessage('%s is not a right ID' % thread)

-        timestamp = ''
-        if content_type == 1:
-            res = self.openurl(url).read()
-            const = 'name="timestamp" value="'
-            i = res.find(const)
-            if i >= 0:
-                res = res[i + len(const):]
-                timestamp = res[:res.find('"/>')]
+        self.location(url)
+        assert self.is_on_page(ContentPage)
+        self.location(self.page.get_post_comment_url())
+        assert self.is_on_page(NewCommentPage)

-        if is_html:
-            format = 1
-        else:
-            format = 3
+        self.select_form(predicate=self._is_comment_submit_form)
+        self.set_all_readonly(False)
+        if title is not None:
+            self['comment[title]'] = title
+        self['comment[wiki_body]'] = message
+        if int(reply_id) > 0:
+            self['comment[parent_id]'] = str(reply_id)
+        self['commit'] = 'Poster le commentaire'

-        # Define every data fields
-        data = {'news_id': thread_id,
-                'com_parent': reply_id,
-                'timestamp': timestamp,
-                'res_type': content_type,
-                'referer': thread_url,
-                'subject': unicode(title).encode('utf-8'),
-                'body': unicode(message).encode('utf-8'),
-                'format': format,
-                'submit': 'Envoyer',
-                }
+        try:
+            self.submit()
+        except BrowserHTTPError, e:
+            raise CantSendMessage('Unable to send message to %s.%s: %s' % (thread, reply_id, e))

-        url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL, self.DOMAIN, thread_id, reply_id, content_type)
+        if self.is_on_page(NodePage):
+            errors = self.page.get_errors()
+            if len(errors) > 0:
+                raise CantSendMessage('Unable to send message: %s' % ', '.join(errors))

-        request = self.request_class(url, urllib.urlencode(data), {'Referer': url})
-        result = self.openurl(request)
-        request = self.request_class(thread_url, None, {'Referer': result.geturl()})
-        self.openurl(request).read()
        return None

    def login(self):
-        self.location('/login.html', 'login=%s&passwd=%s&isauto=1' % (self.username, self.password))
+        data = {'account[login]': self.username,
+                'account[password]': self.password,
+                'account[remember_me]': 1}
+        self.location('/compte/connexion', urllib.urlencode(data), no_login=True)
+        if not self.is_logged():
+            raise BrowserIncorrectPassword()

    def is_logged(self):
        return (self.page and self.page.is_logged())

    def close_session(self):
-        self.openurl('/close_session.html')
-
+        self.openurl('/compte/deconnexion')
--- a/weboob/backends/dlfp/pages/index.py
+++ b/weboob/backends/dlfp/pages/index.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -21,7 +21,7 @@ from weboob.tools.browser import BrowserIncorrectPassword, BasePage
 class DLFPPage(BasePage):
    def is_logged(self):
        for form in self.document.getiterator('form'):
-            if form.attrib.get('id', None) == 'formulaire':
+            if form.attrib.get('id', None) == 'new_account_sidebar':
                return False

        return True
--- a/weboob/backends/dlfp/pages/news.py
+++ b/weboob/backends/dlfp/pages/news.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -17,9 +17,8 @@


 from datetime import datetime
-from logging import warning

-from weboob.tools.misc import local2utc
+from weboob.tools.parsers.lxmlparser import select, SelectElementException
 from weboob.backends.dlfp.tools import url2id

 from .index import DLFPPage
@ -37,31 +36,23 @@ class Comment(object):
        self.url = u''
        self.comments = []

-        for sub in div.getchildren():
-            if sub.tag == 'a':
-                self.id = sub.attrib['name']
-                self.url = u'https://linuxfr.org/comments/%s.html#%s' % (self.id, self.id)
-            elif sub.tag == 'h1':
-                try:
-                    self.title = sub.find('b').text
-                except UnicodeError:
-                    warning('Bad encoded title, but DLFP sucks')
-            elif sub.tag == 'div' and sub.attrib.get('class', '').startswith('comment'):
-                self.author = sub.find('a').text if sub.find('a') is not None else 'Unknown'
-                self.date = self.parse_date(sub.find('i').tail)
-                self.score = int(sub.findall('i')[-1].find('span').text)
-                self.body = self.browser.parser.tostring(sub.find('p'))
-            elif sub.attrib.get('class', '') == 'commentsul':
-                comment = Comment(self.browser, sub.find('li'), self.id)
-                self.comments.append(comment)
+        self.id = div.attrib['id'].split('-')[1]
+        self.title = unicode(select(div.find('h2'), 'a.title', 1).text)
+        try:
+            self.author = unicode(select(div.find('p'), 'a[rel=author]', 1).text)
+        except SelectElementException:
+            self.author = 'Anonyme'
+        self.date = datetime.strptime(select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0],
+                                      '%Y-%m-%dT%H:%M:%S')
+        self.body = self.browser.parser.tostring(div.find('div'))
+        self.score = int(select(div.find('p'), 'span.score', 1).text)
+        self.url = select(div.find('h2'), 'a.title', 1).attrib['href']

-    def parse_date(self, date_s):
-        date_s = date_s.strip().encode('utf-8')
-        if not date_s:
-            date = datetime.now()
-        else:
-            date = datetime.strptime(date_s, u'le %d/%m/%Y \xe0 %H:%M.'.encode('utf-8'))
-        return local2utc(date)
+        subs = div.find('ul')
+        if subs is not None:
+            for sub in subs.findall('li'):
+                comment = Comment(self.browser, sub, self.id)
+                self.comments.append(comment)

    def iter_all_comments(self):
        for comment in self.comments:
@ -70,35 +61,25 @@ class Comment(object):
                yield c

    def __repr__(self):
-        return u"<Comment id='%s' author='%s' title='%s'>" % (self.id, self.author, self.title)
+        return u"<Comment id=%r author=%r title=%r>" % (self.id, self.author, self.title)

 class Article(object):
-    def __init__(self, browser, _id, tree):
+    def __init__(self, browser, url, tree):
        self.browser = browser
-        self.id = _id
-        self.title = u''
-        self.author = u''
-        self.body = u''
-        self.part2 = u''
-        self.date = None
-        self.url = u''
-        self.comments = []
+        self.url = url
+        self.id = url2id(self.url)

-        for div in tree.findall('div'):
-            if div.attrib.get('class', '').startswith('titlediv '):
-                self.author = div.find('a').text
-                for a in div.find('h1').getiterator('a'):
-                    if a.text: self.title += a.text
-                    if a.tail: self.title += a.tail
-                self.title = self.title.strip()
-                # TODO use the date_s
-                #subdivs = div.findall('a')
-                #if len(subdivs) > 1:
-                #    date_s = unicode(subdivs[1].text)
-                #else:
-                #    date_s = unicode(div.find('i').tail)
-            if div.attrib.get('class', '').startswith('bodydiv '):
-                self.body = self.browser.parser.tostring(div)
+        header = tree.find('header')
+        self.title = u' — '.join([a.text for a in header.find('h1').findall('a')])
+        try:
+            self.author = select(header, 'a[rel=author]', 1).text
+        except SelectElementException:
+            self.author = 'Anonyme'
+        self.body = self.browser.parser.tostring(select(tree, 'div.content', 1))
+        self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0],
+                                      '%Y-%m-%dT%H:%M:%S')
+
+        self.comments = []

    def append_comment(self, comment):
        self.comments.append(comment)
@ -115,21 +96,37 @@ class Article(object):
 class ContentPage(DLFPPage):
    def on_loaded(self):
        self.article = None
-        for div in self.document.find('body').find('div').findall('div'):
-            self.parse_div(div)
-            if div.attrib.get('class', '') == 'centraldiv':
-                for subdiv in div.findall('div'):
-                    self.parse_div(subdiv)
-
-    def parse_div(self, div):
-        if div.attrib.get('class', '') in ('newsdiv', 'centraldiv'):
-            self.article = Article(self.browser, url2id(self.url), div)
-            self.article.url = self.url
-        if div.attrib.get('class', '') == 'articlediv':
-            self.article.parse_part2(div)
-        if div.attrib.get('class', '') == 'comments':
-            comment = Comment(self.browser, div, 0)
-            self.article.append_comment(comment)

    def get_article(self):
+        if not self.article:
+            self.article = Article(self.browser,
+                                   self.url,
+                                   select(self.document.getroot(), 'article', 1))
+
+            try:
+                threads = select(self.document.getroot(), 'ul.threads', 1)
+            except SelectElementException:
+                pass # no comments
+            else:
+                for comment in threads.findall('li'):
+                    self.article.append_comment(Comment(self.browser, comment, 0))
+
        return self.article
+
+    def get_post_comment_url(self):
+        # Returns the href attribute of the first <a> child of the element
+        # selected by 'p#send-comment' in the loaded document.
+        return select(self.document.getroot(), 'p#send-comment', 1).find('a').attrib['href']
+
+class NewCommentPage(DLFPPage):
+    pass
+
+class NodePage(DLFPPage):
+    # Page exposing the error messages found in a 'div.errors' element.
+    def get_errors(self):
+        # Returns a list with the text of each <li> directly under the <ul>
+        # child of the 'div.errors' element, or an empty list when select()
+        # raises SelectElementException for that selector.
+        try:
+            div = select(self.document.getroot(), 'div.errors', 1)
+        except SelectElementException:
+            return []

+        # NOTE(review): div.find('ul') is assumed to be present whenever
+        # 'div.errors' exists; it would be None otherwise -- TODO confirm.
+        # NOTE(review): li.text is None when an item starts with a child
+        # element; verify callers that join these strings can cope.
+        l = []
+        for li in div.find('ul').findall('li'):
+            l.append(li.text)
+        return l
--- a/weboob/backends/dlfp/tools.py
+++ b/weboob/backends/dlfp/tools.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright(C) 2010  Romain Bignon
+# Copyright(C) 2010-2011  Romain Bignon
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@ -18,43 +18,50 @@

 import re

-ID2URL_NEWSPAPER = re.compile('.*/(\d{4})/(\d{2})/(\d{2})/(\d+)\.html$')
-ID2URL_TELEGRAM  = re.compile('.*/~([A-Za-z0-9_]+)/(\d+)\.html$')
-URL2ID_NEWSPAPER = re.compile('^N(\d{4})(\d{2})(\d{2}).(\d+)$')
-URL2ID_TELEGRAM  = re.compile('^T([A-Za-z0-9_]+).(\d+)$')
+# Patterns translating between feed entry ids, backend ids and site URLs.
+# Backend ids look like '<type letter><optional user>.<slug>'. ID2URL_RE
+# only accepts a slug without any dot, so the URL patterns below must not
+# capture one either.
+RSSID_RE = re.compile(r'tag:.*:(\w)\w+/(\d+)')
+ID2URL_RE = re.compile(r'^(\w)([\w_]*)\.([^\.]+)$')
+URL2ID_DIARY_RE = re.compile(r'.*/users/([\w_]+)/journaux/([^\.]+)')
+# Stop the slug at the first dot (like the diary pattern): with '(.+)' an URL
+# such as '/news/foo.html' produced the id 'N.foo.html', which id2url() can
+# not convert back to an URL.
+URL2ID_NEWSPAPER_RE = re.compile(r'.*/news/([^\.]+)')
+
+def rssid(entry):
+    """
+    Build a backend id from a feed entry.
+
+    entry.id is matched against RSSID_RE; the captured letter and number
+    give '<letter>.<number>'. When the letter is 'D', the user name is
+    extracted from entry.link with URL2ID_DIARY_RE and the result is
+    'D<user>.<number>'.
+
+    Returns None when entry.id does not match, or when the link of a 'D'
+    entry does not match URL2ID_DIARY_RE.
+    """
+    m = RSSID_RE.match(entry.id)
+    if not m:
+        return None
+    if m.group(1) == 'D':
+        # The feed id carries no user name, so it is read from the link.
+        mm = URL2ID_DIARY_RE.match(entry.link)
+        if not mm:
+            return
+        return 'D%s.%s' % (mm.group(1), m.group(2))
+    return '%s.%s' % (m.group(1), m.group(2))
+
+def id2url(id):
+    """
+    Convert a backend id into a site-relative URL path.
+
+    An id whose first letter is 'N' gives '/news/<slug>' (whatever stands
+    between that letter and the dot is ignored); an id of the form
+    'D<user>.<slug>' gives '/users/<user>/journaux/<slug>'.
+
+    Returns None when the id does not match ID2URL_RE, and also
+    (implicitly) when its first letter is neither 'N' nor 'D'.
+    """
+    m = ID2URL_RE.match(id)
+    if not m:
+        return None

+    if m.group(1) == 'N':
+        return '/news/%s' % m.group(3)
+    if m.group(1) == 'D':
+        return '/users/%s/journaux/%s' % (m.group(2), m.group(3))

 def url2id(url):
-    m = ID2URL_NEWSPAPER.match(url)
+    m = URL2ID_NEWSPAPER_RE.match(url)
    if m:
-        return 'N%04d%02d%02d.%d' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)))
-    m = ID2URL_TELEGRAM.match(url)
+        return 'N.%s' % (m.group(1))
+    m = URL2ID_DIARY_RE.match(url)
    if m:
-        return 'T%s.%d' % (m.group(1), int(m.group(2)))
-    return None
+        return 'D%s.%s' % (m.group(1), m.group(2))

-def id2url(_id):
-    m = URL2ID_NEWSPAPER.match(_id)
+def id2threadid(id):
+    m = ID2URL_RE.match(id)
    if m:
-        return '/%04d/%02d/%02d/%d.html' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)))
-    m = URL2ID_TELEGRAM.match(_id)
-    if m:
-        return '/~%s/%d.html' % (m.group(1), int(m.group(2)))
-    return None
-
-def id2threadid(_id):
-    m = URL2ID_NEWSPAPER.match(_id)
-    if m:
-        return int(m.group(4))
-    m = URL2ID_TELEGRAM.match(_id)
-    if m:
-        return int(m.group(2))
-    return None
+        return m.group(3)

 def id2contenttype(_id):
    if not _id:
        return None
    if _id[0] == 'N':
        return 1
-    if _id[0] == 'T':
+    if _id[0] == 'D':
        return 5
    return None
--- a/weboob/capabilities/messages.py
+++ b/weboob/capabilities/messages.py
@ -91,9 +91,8 @@ class Message(CapBaseObject):
            return unicode(self.id) == unicode(msg.id)

    def __repr__(self):
-        result = '<Message id="%s" title="%s" date="%s" from="%s">' % (
-            self.full_id, self.title, self.date, self.sender)
-        return result.encode('utf-8')
+        return '<Message id=%r title=%r date=%r from=%r>' % (
+                   self.full_id, self.title, self.date, self.sender)

 class Thread(CapBaseObject):
    IS_THREADS =    0x001