works with DLFP2.0RoR-ng

This commit is contained in:
Romain Bignon 2011-02-24 21:36:19 +01:00
commit 144bb8a7e4
6 changed files with 183 additions and 190 deletions

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Romain Bignon
# Copyright(C) 2010-2011 Romain Bignon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -24,7 +24,7 @@ from weboob.tools.value import Value, ValueBool, ValuesDict
from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread, CantSendMessage
from .browser import DLFP
from .tools import url2id
from .tools import rssid, id2url
__all__ = ['DLFPBackend']
@ -40,11 +40,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
CONFIG = ValuesDict(Value('username', label='Username', regexp='.+'),
Value('password', label='Password', regexp='.+', masked=True),
ValueBool('get_news', label='Get newspapers', default=True),
ValueBool('get_telegrams', label='Get telegrams', default=False))
ValueBool('get_diaries', label='Get diaries', default=False))
STORAGE = {'seen': {}}
BROWSER = DLFP
RSS_TELEGRAMS= "https://linuxfr.org/backend/journaux/rss20.rss"
RSS_NEWSPAPERS = "https://linuxfr.org/backend/news/rss20.rss"
RSS_NEWSPAPERS = "https://linuxfr.org/news.atom"
RSS_DIARIES = "https://linuxfr.org/journaux.atom"
def create_default_browser(self):
@ -62,12 +62,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
whats = set()
if self.config['get_news']:
whats.add(self.RSS_NEWSPAPERS)
if self.config['get_telegrams']:
whats.add(self.RSS_TELEGRAMS)
if self.config['get_diaries']:
whats.add(self.RSS_DIARIES)
for what in whats:
for article in Newsfeed(what, url2id).iter_entries():
for article in Newsfeed(what, rssid).iter_entries():
thread = Thread(article.id)
thread.title = article.title
if article.datetime:
@ -84,8 +83,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
with self.browser:
content = self.browser.get_content(id)
if not content:
return None
if not thread:
thread = Thread(id)
thread = Thread(content.id)
flags = Message.IS_HTML
if not thread.id in self.storage.get('seen', default={}):
@ -102,8 +104,8 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
receivers=None,
date=thread.date, #TODO XXX WTF this is None
parent=None,
content=''.join([content.body, content.part2]),
signature='URL: %s' % content.url,
content=content.body,
signature='URL: %s' % self.browser.absurl(id2url(content.id)),
children=[],
flags=flags)
@ -151,16 +153,15 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
def post_message(self, message):
if not message.parent:
raise CantSendMessage('Posting news and telegrams on DLFP is not supported yet')
raise CantSendMessage('Posting news and diaries on DLFP is not supported yet')
assert message.thread
with self.browser:
return self.browser.post_reply(message.thread.id,
message.parent.id,
message.title,
message.content,
message.flags & message.IS_HTML)
return self.browser.post_comment(message.thread.id,
message.parent.id,
message.title,
message.content)
def fill_thread(self, thread, fields):
return self.get_thread(thread)

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Romain Bignon
# Copyright(C) 2010-2011 Romain Bignon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -17,98 +17,87 @@
import urllib
from cStringIO import StringIO
from weboob.tools.browser import BaseBrowser
from weboob.tools.parsers.lxmlparser import LxmlHtmlParser
from weboob.tools.browser import BaseBrowser, BrowserHTTPError, BrowserIncorrectPassword
from weboob.capabilities.messages import CantSendMessage
from .pages.index import IndexPage, LoginPage
from .pages.news import ContentPage
from .tools import id2url, id2threadid, id2contenttype
class Parser(LxmlHtmlParser):
def parse(self, data, encoding=None):
# Want to kill templeet coders
data = StringIO(data.read().replace('<<', '<').replace('cite>', 'i>').replace('tt>', 'i>'))
return LxmlHtmlParser.parse(self, data, encoding)
from .pages.news import ContentPage, NewCommentPage, NodePage
from .tools import id2url, url2id
# Browser
class DLFP(BaseBrowser):
DOMAIN = 'linuxfr.org'
PROTOCOL = 'https'
PAGES = {'https://linuxfr.org/': IndexPage,
'https://linuxfr.org/pub/': IndexPage,
'https://linuxfr.org/my/': IndexPage,
PAGES = {'https://linuxfr.org/?': IndexPage,
'https://linuxfr.org/login.html': LoginPage,
'https://linuxfr.org/.*/\d+.html': ContentPage
'https://linuxfr.org/news/[^\.]+': ContentPage,
'https://linuxfr.org/users/[\w_]+/journaux/[^\.]+': ContentPage,
'https://linuxfr.org/nodes/(\d+)/comments/nouveau': NewCommentPage,
'https://linuxfr.org/nodes/(\d+)/comments$': NodePage,
}
def __init__(self, *args, **kwargs):
kwargs['parser'] = Parser()
BaseBrowser.__init__(self, *args, **kwargs)
def home(self):
return self.location('https://linuxfr.org')
def get_content(self, _id):
self.location(id2url(_id))
return self.page.get_article()
url = id2url(_id)
if url is None:
if url2id(_id) is not None:
url = _id
_id = url2id(url)
else:
return None
def post_reply(self, thread, reply_id, title, message, is_html=False):
content_type = id2contenttype(thread)
thread_id = id2threadid(thread)
thread_url = '%s://%s%s' % (self.PROTOCOL, self.DOMAIN, id2url(thread))
reply_id = int(reply_id)
self.location(url)
content = self.page.get_article()
content.id = _id
return content
if not content_type or not thread_id:
return False
def _is_comment_submit_form(self, form):
return 'comment_new' in form.action
url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL,
self.DOMAIN,
thread_id,
reply_id,
content_type)
def post_comment(self, thread, reply_id, title, message):
url = id2url(thread)
if url is None:
raise CantSendMessage('%s is not a right ID' % thread)
timestamp = ''
if content_type == 1:
res = self.openurl(url).read()
const = 'name="timestamp" value="'
i = res.find(const)
if i >= 0:
res = res[i + len(const):]
timestamp = res[:res.find('"/>')]
self.location(url)
assert self.is_on_page(ContentPage)
self.location(self.page.get_post_comment_url())
assert self.is_on_page(NewCommentPage)
if is_html:
format = 1
else:
format = 3
self.select_form(predicate=self._is_comment_submit_form)
self.set_all_readonly(False)
if title is not None:
self['comment[title]'] = title
self['comment[wiki_body]'] = message
if int(reply_id) > 0:
self['comment[parent_id]'] = str(reply_id)
self['commit'] = 'Poster le commentaire'
# Define every data fields
data = {'news_id': thread_id,
'com_parent': reply_id,
'timestamp': timestamp,
'res_type': content_type,
'referer': thread_url,
'subject': unicode(title).encode('utf-8'),
'body': unicode(message).encode('utf-8'),
'format': format,
'submit': 'Envoyer',
}
try:
self.submit()
except BrowserHTTPError, e:
raise CantSendMessage('Unable to send message to %s.%s: %s' % (thread, reply_id, e))
url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL, self.DOMAIN, thread_id, reply_id, content_type)
if self.is_on_page(NodePage):
errors = self.page.get_errors()
if len(errors) > 0:
raise CantSendMessage('Unable to send message: %s' % ', '.join(errors))
request = self.request_class(url, urllib.urlencode(data), {'Referer': url})
result = self.openurl(request)
request = self.request_class(thread_url, None, {'Referer': result.geturl()})
self.openurl(request).read()
return None
def login(self):
self.location('/login.html', 'login=%s&passwd=%s&isauto=1' % (self.username, self.password))
data = {'account[login]': self.username,
'account[password]': self.password,
'account[remember_me]': 1}
self.location('/compte/connexion', urllib.urlencode(data), no_login=True)
if not self.is_logged():
raise BrowserIncorrectPassword()
def is_logged(self):
    """Tell whether the currently loaded page reports a logged-in session.

    When no page is loaded, the falsy page value itself is returned;
    otherwise the answer is delegated to the page's own ``is_logged()``.
    """
    current = self.page
    if not current:
        return current
    return current.is_logged()
def close_session(self):
self.openurl('/close_session.html')
self.openurl('/compte/deconnexion')

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Romain Bignon
# Copyright(C) 2010-2011 Romain Bignon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -21,7 +21,7 @@ from weboob.tools.browser import BrowserIncorrectPassword, BasePage
class DLFPPage(BasePage):
def is_logged(self):
for form in self.document.getiterator('form'):
if form.attrib.get('id', None) == 'formulaire':
if form.attrib.get('id', None) == 'new_account_sidebar':
return False
return True

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Romain Bignon
# Copyright(C) 2010-2011 Romain Bignon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -17,9 +17,8 @@
from datetime import datetime
from logging import warning
from weboob.tools.misc import local2utc
from weboob.tools.parsers.lxmlparser import select, SelectElementException
from weboob.backends.dlfp.tools import url2id
from .index import DLFPPage
@ -37,31 +36,23 @@ class Comment(object):
self.url = u''
self.comments = []
for sub in div.getchildren():
if sub.tag == 'a':
self.id = sub.attrib['name']
self.url = u'https://linuxfr.org/comments/%s.html#%s' % (self.id, self.id)
elif sub.tag == 'h1':
try:
self.title = sub.find('b').text
except UnicodeError:
warning('Bad encoded title, but DLFP sucks')
elif sub.tag == 'div' and sub.attrib.get('class', '').startswith('comment'):
self.author = sub.find('a').text if sub.find('a') is not None else 'Unknown'
self.date = self.parse_date(sub.find('i').tail)
self.score = int(sub.findall('i')[-1].find('span').text)
self.body = self.browser.parser.tostring(sub.find('p'))
elif sub.attrib.get('class', '') == 'commentsul':
comment = Comment(self.browser, sub.find('li'), self.id)
self.comments.append(comment)
self.id = div.attrib['id'].split('-')[1]
self.title = unicode(select(div.find('h2'), 'a.title', 1).text)
try:
self.author = unicode(select(div.find('p'), 'a[rel=author]', 1).text)
except SelectElementException:
self.author = 'Anonyme'
self.date = datetime.strptime(select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0],
'%Y-%m-%dT%H:%M:%S')
self.body = self.browser.parser.tostring(div.find('div'))
self.score = int(select(div.find('p'), 'span.score', 1).text)
self.url = select(div.find('h2'), 'a.title', 1).attrib['href']
def parse_date(self, date_s):
date_s = date_s.strip().encode('utf-8')
if not date_s:
date = datetime.now()
else:
date = datetime.strptime(date_s, u'le %d/%m/%Y \xe0 %H:%M.'.encode('utf-8'))
return local2utc(date)
subs = div.find('ul')
if subs is not None:
for sub in subs.findall('li'):
comment = Comment(self.browser, sub, self.id)
self.comments.append(comment)
def iter_all_comments(self):
for comment in self.comments:
@ -70,35 +61,25 @@ class Comment(object):
yield c
def __repr__(self):
return u"<Comment id='%s' author='%s' title='%s'>" % (self.id, self.author, self.title)
return u"<Comment id=%r author=%r title=%r>" % (self.id, self.author, self.title)
class Article(object):
def __init__(self, browser, _id, tree):
def __init__(self, browser, url, tree):
self.browser = browser
self.id = _id
self.title = u''
self.author = u''
self.body = u''
self.part2 = u''
self.date = None
self.url = u''
self.comments = []
self.url = url
self.id = url2id(self.url)
for div in tree.findall('div'):
if div.attrib.get('class', '').startswith('titlediv '):
self.author = div.find('a').text
for a in div.find('h1').getiterator('a'):
if a.text: self.title += a.text
if a.tail: self.title += a.tail
self.title = self.title.strip()
# TODO use the date_s
#subdivs = div.findall('a')
#if len(subdivs) > 1:
# date_s = unicode(subdivs[1].text)
#else:
# date_s = unicode(div.find('i').tail)
if div.attrib.get('class', '').startswith('bodydiv '):
self.body = self.browser.parser.tostring(div)
header = tree.find('header')
self.title = u''.join([a.text for a in header.find('h1').findall('a')])
try:
self.author = select(header, 'a[rel=author]', 1).text
except SelectElementException:
self.author = 'Anonyme'
self.body = self.browser.parser.tostring(select(tree, 'div.content', 1))
self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0],
'%Y-%m-%dT%H:%M:%S')
self.comments = []
def append_comment(self, comment):
self.comments.append(comment)
@ -115,21 +96,37 @@ class Article(object):
class ContentPage(DLFPPage):
def on_loaded(self):
self.article = None
for div in self.document.find('body').find('div').findall('div'):
self.parse_div(div)
if div.attrib.get('class', '') == 'centraldiv':
for subdiv in div.findall('div'):
self.parse_div(subdiv)
def parse_div(self, div):
if div.attrib.get('class', '') in ('newsdiv', 'centraldiv'):
self.article = Article(self.browser, url2id(self.url), div)
self.article.url = self.url
if div.attrib.get('class', '') == 'articlediv':
self.article.parse_part2(div)
if div.attrib.get('class', '') == 'comments':
comment = Comment(self.browser, div, 0)
self.article.append_comment(comment)
def get_article(self):
    """Return the page's Article, building it on first call.

    The article is built from the document's ``article`` element; every
    ``li`` directly under the ``ul.threads`` element, when one exists, is
    attached to it as a top-level Comment. The result is stored on
    ``self.article`` and reused by later calls.
    """
    if self.article:
        return self.article

    self.article = Article(self.browser,
                           self.url,
                           select(self.document.getroot(), 'article', 1))
    try:
        thread_list = select(self.document.getroot(), 'ul.threads', 1)
    except SelectElementException:
        # No thread list on the page: the article has no comments.
        return self.article

    for item in thread_list.findall('li'):
        self.article.append_comment(Comment(self.browser, item, 0))
    return self.article
def get_post_comment_url(self):
    """Return the ``href`` of the link inside the ``p#send-comment`` element."""
    paragraph = select(self.document.getroot(), 'p#send-comment', 1)
    link = paragraph.find('a')
    return link.attrib['href']
class NewCommentPage(DLFPPage):
    """Page type for the new-comment form; adds nothing to DLFPPage."""
    pass
class NodePage(DLFPPage):
    """Page that may carry a ``div.errors`` block listing error messages."""

    def get_errors(self):
        """Return the error messages shown on the page.

        Returns:
            A list with the text of each ``li`` under the ``ul`` of the
            ``div.errors`` element, or an empty list when the page has no
            such element.
        """
        try:
            errors_div = select(self.document.getroot(), 'div.errors', 1)
        except SelectElementException:
            return []
        return [item.text for item in errors_div.find('ul').findall('li')]

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2010 Romain Bignon
# Copyright(C) 2010-2011 Romain Bignon
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@ -18,43 +18,50 @@
import re
ID2URL_NEWSPAPER = re.compile('.*/(\d{4})/(\d{2})/(\d{2})/(\d+)\.html$')
ID2URL_TELEGRAM = re.compile('.*/~([A-Za-z0-9_]+)/(\d+)\.html$')
URL2ID_NEWSPAPER = re.compile('^N(\d{4})(\d{2})(\d{2}).(\d+)$')
URL2ID_TELEGRAM = re.compile('^T([A-Za-z0-9_]+).(\d+)$')
# Feed entry id: "tag:", anything, ":", a word whose first letter is captured
# as the content-type letter, "/", then the captured numeric id.
RSSID_RE = re.compile(r'tag:.*:(\w)\w+/(\d+)')
# Internal id: one type letter, an optional user name, a dot, then a
# dot-free tail.
ID2URL_RE = re.compile(r'^(\w)([\w_]*)\.([^\.]+)$')
# Diary URL: captures the user name and the dot-free tail after "journaux/".
URL2ID_DIARY_RE = re.compile(r'.*/users/([\w_]+)/journaux/([^\.]+)')
# News URL: captures everything after "/news/".
URL2ID_NEWSPAPER_RE = re.compile(r'.*/news/(.+)')


def rssid(entry):
    """Build the internal id of a feed entry.

    Args:
        entry: object exposing ``id`` (the feed tag id) and ``link``
            (the entry URL) attributes.

    Returns:
        ``'D<user>.<number>'`` when the type letter is ``D`` (the user name
        is taken from ``entry.link``), ``'<letter>.<number>'`` for any other
        type letter, or ``None`` when ``entry.id`` does not match the
        expected tag format or a diary link does not match the diary URL
        pattern.
    """
    m = RSSID_RE.match(entry.id)
    if m is None:
        return None

    kind, number = m.groups()
    if kind == 'D':
        # The tag id carries no user name for diaries; read it from the link.
        diary = URL2ID_DIARY_RE.match(entry.link)
        if diary is None:
            return None
        return 'D%s.%s' % (diary.group(1), number)
    return '%s.%s' % (kind, number)


def id2url(id):
    """Convert an internal id into a site-relative URL path.

    Args:
        id: internal id string such as ``'N.<slug>'`` or
            ``'D<user>.<slug>'``. (The name shadows the builtin but is kept
            for interface compatibility.)

    Returns:
        ``'/news/<slug>'`` for type ``N``, ``'/users/<user>/journaux/<slug>'``
        for type ``D``, or ``None`` when the id is malformed or its type
        letter is not one of those two.
    """
    m = ID2URL_RE.match(id)
    if m is None:
        return None

    kind, user, slug = m.groups()
    if kind == 'N':
        return '/news/%s' % slug
    if kind == 'D':
        return '/users/%s/journaux/%s' % (user, slug)
    # Unknown content type: made explicit instead of falling off the end.
    return None
def url2id(url):
m = ID2URL_NEWSPAPER.match(url)
m = URL2ID_NEWSPAPER_RE.match(url)
if m:
return 'N%04d%02d%02d.%d' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)))
m = ID2URL_TELEGRAM.match(url)
return 'N.%s' % (m.group(1))
m = URL2ID_DIARY_RE.match(url)
if m:
return 'T%s.%d' % (m.group(1), int(m.group(2)))
return None
return 'D%s.%s' % (m.group(1), m.group(2))
def id2url(_id):
m = URL2ID_NEWSPAPER.match(_id)
def id2threadid(id):
m = ID2URL_RE.match(id)
if m:
return '/%04d/%02d/%02d/%d.html' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4)))
m = URL2ID_TELEGRAM.match(_id)
if m:
return '/~%s/%d.html' % (m.group(1), int(m.group(2)))
return None
def id2threadid(_id):
m = URL2ID_NEWSPAPER.match(_id)
if m:
return int(m.group(4))
m = URL2ID_TELEGRAM.match(_id)
if m:
return int(m.group(2))
return None
return m.group(3)
def id2contenttype(_id):
if not _id:
return None
if _id[0] == 'N':
return 1
if _id[0] == 'T':
if _id[0] == 'D':
return 5
return None

View file

@ -91,9 +91,8 @@ class Message(CapBaseObject):
return unicode(self.id) == unicode(msg.id)
def __repr__(self):
result = '<Message id="%s" title="%s" date="%s" from="%s">' % (
self.full_id, self.title, self.date, self.sender)
return result.encode('utf-8')
return '<Message id=%r title=%r date=%r from=%r>' % (
self.full_id, self.title, self.date, self.sender)
class Thread(CapBaseObject):
IS_THREADS = 0x001