works with DLFP2.0RoR-ng

This commit is contained in:
Romain Bignon 2011-02-24 21:36:19 +01:00
commit 144bb8a7e4
6 changed files with 183 additions and 190 deletions

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2010 Romain Bignon # Copyright(C) 2010-2011 Romain Bignon
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
@ -24,7 +24,7 @@ from weboob.tools.value import Value, ValueBool, ValuesDict
from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread, CantSendMessage from weboob.capabilities.messages import ICapMessages, ICapMessagesPost, Message, Thread, CantSendMessage
from .browser import DLFP from .browser import DLFP
from .tools import url2id from .tools import rssid, id2url
__all__ = ['DLFPBackend'] __all__ = ['DLFPBackend']
@ -40,11 +40,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
CONFIG = ValuesDict(Value('username', label='Username', regexp='.+'), CONFIG = ValuesDict(Value('username', label='Username', regexp='.+'),
Value('password', label='Password', regexp='.+', masked=True), Value('password', label='Password', regexp='.+', masked=True),
ValueBool('get_news', label='Get newspapers', default=True), ValueBool('get_news', label='Get newspapers', default=True),
ValueBool('get_telegrams', label='Get telegrams', default=False)) ValueBool('get_diaries', label='Get diaries', default=False))
STORAGE = {'seen': {}} STORAGE = {'seen': {}}
BROWSER = DLFP BROWSER = DLFP
RSS_TELEGRAMS= "https://linuxfr.org/backend/journaux/rss20.rss" RSS_NEWSPAPERS = "https://linuxfr.org/news.atom"
RSS_NEWSPAPERS = "https://linuxfr.org/backend/news/rss20.rss" RSS_DIARIES = "https://linuxfr.org/journaux.atom"
def create_default_browser(self): def create_default_browser(self):
@ -62,12 +62,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
whats = set() whats = set()
if self.config['get_news']: if self.config['get_news']:
whats.add(self.RSS_NEWSPAPERS) whats.add(self.RSS_NEWSPAPERS)
if self.config['get_telegrams']: if self.config['get_diaries']:
whats.add(self.RSS_TELEGRAMS) whats.add(self.RSS_DIARIES)
for what in whats: for what in whats:
for article in Newsfeed(what, url2id).iter_entries(): for article in Newsfeed(what, rssid).iter_entries():
thread = Thread(article.id) thread = Thread(article.id)
thread.title = article.title thread.title = article.title
if article.datetime: if article.datetime:
@ -84,8 +83,11 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
with self.browser: with self.browser:
content = self.browser.get_content(id) content = self.browser.get_content(id)
if not content:
return None
if not thread: if not thread:
thread = Thread(id) thread = Thread(content.id)
flags = Message.IS_HTML flags = Message.IS_HTML
if not thread.id in self.storage.get('seen', default={}): if not thread.id in self.storage.get('seen', default={}):
@ -102,8 +104,8 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
receivers=None, receivers=None,
date=thread.date, #TODO XXX WTF this is None date=thread.date, #TODO XXX WTF this is None
parent=None, parent=None,
content=''.join([content.body, content.part2]), content=content.body,
signature='URL: %s' % content.url, signature='URL: %s' % self.browser.absurl(id2url(content.id)),
children=[], children=[],
flags=flags) flags=flags)
@ -151,16 +153,15 @@ class DLFPBackend(BaseBackend, ICapMessages, ICapMessagesPost):
def post_message(self, message): def post_message(self, message):
if not message.parent: if not message.parent:
raise CantSendMessage('Posting news and telegrams on DLFP is not supported yet') raise CantSendMessage('Posting news and diaries on DLFP is not supported yet')
assert message.thread assert message.thread
with self.browser: with self.browser:
return self.browser.post_reply(message.thread.id, return self.browser.post_comment(message.thread.id,
message.parent.id, message.parent.id,
message.title, message.title,
message.content, message.content)
message.flags & message.IS_HTML)
def fill_thread(self, thread, fields): def fill_thread(self, thread, fields):
return self.get_thread(thread) return self.get_thread(thread)

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2010 Romain Bignon # Copyright(C) 2010-2011 Romain Bignon
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
@ -17,98 +17,87 @@
import urllib import urllib
from cStringIO import StringIO
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser, BrowserHTTPError, BrowserIncorrectPassword
from weboob.tools.parsers.lxmlparser import LxmlHtmlParser from weboob.capabilities.messages import CantSendMessage
from .pages.index import IndexPage, LoginPage from .pages.index import IndexPage, LoginPage
from .pages.news import ContentPage from .pages.news import ContentPage, NewCommentPage, NodePage
from .tools import id2url, id2threadid, id2contenttype from .tools import id2url, url2id
class Parser(LxmlHtmlParser):
def parse(self, data, encoding=None):
# Want to kill templeet coders
data = StringIO(data.read().replace('<<', '<').replace('cite>', 'i>').replace('tt>', 'i>'))
return LxmlHtmlParser.parse(self, data, encoding)
# Browser # Browser
class DLFP(BaseBrowser): class DLFP(BaseBrowser):
DOMAIN = 'linuxfr.org' DOMAIN = 'linuxfr.org'
PROTOCOL = 'https' PROTOCOL = 'https'
PAGES = {'https://linuxfr.org/': IndexPage, PAGES = {'https://linuxfr.org/?': IndexPage,
'https://linuxfr.org/pub/': IndexPage,
'https://linuxfr.org/my/': IndexPage,
'https://linuxfr.org/login.html': LoginPage, 'https://linuxfr.org/login.html': LoginPage,
'https://linuxfr.org/.*/\d+.html': ContentPage 'https://linuxfr.org/news/[^\.]+': ContentPage,
'https://linuxfr.org/users/[\w_]+/journaux/[^\.]+': ContentPage,
'https://linuxfr.org/nodes/(\d+)/comments/nouveau': NewCommentPage,
'https://linuxfr.org/nodes/(\d+)/comments$': NodePage,
} }
def __init__(self, *args, **kwargs):
kwargs['parser'] = Parser()
BaseBrowser.__init__(self, *args, **kwargs)
def home(self): def home(self):
return self.location('https://linuxfr.org') return self.location('https://linuxfr.org')
def get_content(self, _id): def get_content(self, _id):
self.location(id2url(_id)) url = id2url(_id)
return self.page.get_article() if url is None:
if url2id(_id) is not None:
url = _id
_id = url2id(url)
else:
return None
def post_reply(self, thread, reply_id, title, message, is_html=False): self.location(url)
content_type = id2contenttype(thread) content = self.page.get_article()
thread_id = id2threadid(thread) content.id = _id
thread_url = '%s://%s%s' % (self.PROTOCOL, self.DOMAIN, id2url(thread)) return content
reply_id = int(reply_id)
if not content_type or not thread_id: def _is_comment_submit_form(self, form):
return False return 'comment_new' in form.action
url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL, def post_comment(self, thread, reply_id, title, message):
self.DOMAIN, url = id2url(thread)
thread_id, if url is None:
reply_id, raise CantSendMessage('%s is not a right ID' % thread)
content_type)
timestamp = '' self.location(url)
if content_type == 1: assert self.is_on_page(ContentPage)
res = self.openurl(url).read() self.location(self.page.get_post_comment_url())
const = 'name="timestamp" value="' assert self.is_on_page(NewCommentPage)
i = res.find(const)
if i >= 0:
res = res[i + len(const):]
timestamp = res[:res.find('"/>')]
if is_html: self.select_form(predicate=self._is_comment_submit_form)
format = 1 self.set_all_readonly(False)
else: if title is not None:
format = 3 self['comment[title]'] = title
self['comment[wiki_body]'] = message
if int(reply_id) > 0:
self['comment[parent_id]'] = str(reply_id)
self['commit'] = 'Poster le commentaire'
# Define every data fields try:
data = {'news_id': thread_id, self.submit()
'com_parent': reply_id, except BrowserHTTPError, e:
'timestamp': timestamp, raise CantSendMessage('Unable to send message to %s.%s: %s' % (thread, reply_id, e))
'res_type': content_type,
'referer': thread_url,
'subject': unicode(title).encode('utf-8'),
'body': unicode(message).encode('utf-8'),
'format': format,
'submit': 'Envoyer',
}
url = '%s://%s/submit/comments,%d,%d,%d.html#post' % (self.PROTOCOL, self.DOMAIN, thread_id, reply_id, content_type) if self.is_on_page(NodePage):
errors = self.page.get_errors()
if len(errors) > 0:
raise CantSendMessage('Unable to send message: %s' % ', '.join(errors))
request = self.request_class(url, urllib.urlencode(data), {'Referer': url})
result = self.openurl(request)
request = self.request_class(thread_url, None, {'Referer': result.geturl()})
self.openurl(request).read()
return None return None
def login(self): def login(self):
self.location('/login.html', 'login=%s&passwd=%s&isauto=1' % (self.username, self.password)) data = {'account[login]': self.username,
'account[password]': self.password,
'account[remember_me]': 1}
self.location('/compte/connexion', urllib.urlencode(data), no_login=True)
if not self.is_logged():
raise BrowserIncorrectPassword()
def is_logged(self): def is_logged(self):
return (self.page and self.page.is_logged()) return (self.page and self.page.is_logged())
def close_session(self): def close_session(self):
self.openurl('/close_session.html') self.openurl('/compte/deconnexion')

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2010 Romain Bignon # Copyright(C) 2010-2011 Romain Bignon
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
@ -21,7 +21,7 @@ from weboob.tools.browser import BrowserIncorrectPassword, BasePage
class DLFPPage(BasePage): class DLFPPage(BasePage):
def is_logged(self): def is_logged(self):
for form in self.document.getiterator('form'): for form in self.document.getiterator('form'):
if form.attrib.get('id', None) == 'formulaire': if form.attrib.get('id', None) == 'new_account_sidebar':
return False return False
return True return True

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2010 Romain Bignon # Copyright(C) 2010-2011 Romain Bignon
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
@ -17,9 +17,8 @@
from datetime import datetime from datetime import datetime
from logging import warning
from weboob.tools.misc import local2utc from weboob.tools.parsers.lxmlparser import select, SelectElementException
from weboob.backends.dlfp.tools import url2id from weboob.backends.dlfp.tools import url2id
from .index import DLFPPage from .index import DLFPPage
@ -37,31 +36,23 @@ class Comment(object):
self.url = u'' self.url = u''
self.comments = [] self.comments = []
for sub in div.getchildren(): self.id = div.attrib['id'].split('-')[1]
if sub.tag == 'a': self.title = unicode(select(div.find('h2'), 'a.title', 1).text)
self.id = sub.attrib['name'] try:
self.url = u'https://linuxfr.org/comments/%s.html#%s' % (self.id, self.id) self.author = unicode(select(div.find('p'), 'a[rel=author]', 1).text)
elif sub.tag == 'h1': except SelectElementException:
try: self.author = 'Anonyme'
self.title = sub.find('b').text self.date = datetime.strptime(select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0],
except UnicodeError: '%Y-%m-%dT%H:%M:%S')
warning('Bad encoded title, but DLFP sucks') self.body = self.browser.parser.tostring(div.find('div'))
elif sub.tag == 'div' and sub.attrib.get('class', '').startswith('comment'): self.score = int(select(div.find('p'), 'span.score', 1).text)
self.author = sub.find('a').text if sub.find('a') is not None else 'Unknown' self.url = select(div.find('h2'), 'a.title', 1).attrib['href']
self.date = self.parse_date(sub.find('i').tail)
self.score = int(sub.findall('i')[-1].find('span').text)
self.body = self.browser.parser.tostring(sub.find('p'))
elif sub.attrib.get('class', '') == 'commentsul':
comment = Comment(self.browser, sub.find('li'), self.id)
self.comments.append(comment)
def parse_date(self, date_s): subs = div.find('ul')
date_s = date_s.strip().encode('utf-8') if subs is not None:
if not date_s: for sub in subs.findall('li'):
date = datetime.now() comment = Comment(self.browser, sub, self.id)
else: self.comments.append(comment)
date = datetime.strptime(date_s, u'le %d/%m/%Y \xe0 %H:%M.'.encode('utf-8'))
return local2utc(date)
def iter_all_comments(self): def iter_all_comments(self):
for comment in self.comments: for comment in self.comments:
@ -70,35 +61,25 @@ class Comment(object):
yield c yield c
def __repr__(self): def __repr__(self):
return u"<Comment id='%s' author='%s' title='%s'>" % (self.id, self.author, self.title) return u"<Comment id=%r author=%r title=%r>" % (self.id, self.author, self.title)
class Article(object): class Article(object):
def __init__(self, browser, _id, tree): def __init__(self, browser, url, tree):
self.browser = browser self.browser = browser
self.id = _id self.url = url
self.title = u'' self.id = url2id(self.url)
self.author = u''
self.body = u''
self.part2 = u''
self.date = None
self.url = u''
self.comments = []
for div in tree.findall('div'): header = tree.find('header')
if div.attrib.get('class', '').startswith('titlediv '): self.title = u''.join([a.text for a in header.find('h1').findall('a')])
self.author = div.find('a').text try:
for a in div.find('h1').getiterator('a'): self.author = select(header, 'a[rel=author]', 1).text
if a.text: self.title += a.text except SelectElementException:
if a.tail: self.title += a.tail self.author = 'Anonyme'
self.title = self.title.strip() self.body = self.browser.parser.tostring(select(tree, 'div.content', 1))
# TODO use the date_s self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0],
#subdivs = div.findall('a') '%Y-%m-%dT%H:%M:%S')
#if len(subdivs) > 1:
# date_s = unicode(subdivs[1].text) self.comments = []
#else:
# date_s = unicode(div.find('i').tail)
if div.attrib.get('class', '').startswith('bodydiv '):
self.body = self.browser.parser.tostring(div)
def append_comment(self, comment): def append_comment(self, comment):
self.comments.append(comment) self.comments.append(comment)
@ -115,21 +96,37 @@ class Article(object):
class ContentPage(DLFPPage): class ContentPage(DLFPPage):
def on_loaded(self): def on_loaded(self):
self.article = None self.article = None
for div in self.document.find('body').find('div').findall('div'):
self.parse_div(div)
if div.attrib.get('class', '') == 'centraldiv':
for subdiv in div.findall('div'):
self.parse_div(subdiv)
def parse_div(self, div):
if div.attrib.get('class', '') in ('newsdiv', 'centraldiv'):
self.article = Article(self.browser, url2id(self.url), div)
self.article.url = self.url
if div.attrib.get('class', '') == 'articlediv':
self.article.parse_part2(div)
if div.attrib.get('class', '') == 'comments':
comment = Comment(self.browser, div, 0)
self.article.append_comment(comment)
def get_article(self): def get_article(self):
if not self.article:
self.article = Article(self.browser,
self.url,
select(self.document.getroot(), 'article', 1))
try:
threads = select(self.document.getroot(), 'ul.threads', 1)
except SelectElementException:
pass # no comments
else:
for comment in threads.findall('li'):
self.article.append_comment(Comment(self.browser, comment, 0))
return self.article return self.article
def get_post_comment_url(self):
return select(self.document.getroot(), 'p#send-comment', 1).find('a').attrib['href']
class NewCommentPage(DLFPPage):
pass
class NodePage(DLFPPage):
def get_errors(self):
try:
div = select(self.document.getroot(), 'div.errors', 1)
except SelectElementException:
return []
l = []
for li in div.find('ul').findall('li'):
l.append(li.text)
return l

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2010 Romain Bignon # Copyright(C) 2010-2011 Romain Bignon
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
@ -18,43 +18,50 @@
import re import re
ID2URL_NEWSPAPER = re.compile('.*/(\d{4})/(\d{2})/(\d{2})/(\d+)\.html$') RSSID_RE = re.compile('tag:.*:(\w)\w+/(\d+)')
ID2URL_TELEGRAM = re.compile('.*/~([A-Za-z0-9_]+)/(\d+)\.html$') ID2URL_RE = re.compile('^(\w)([\w_]*)\.([^\.]+)$')
URL2ID_NEWSPAPER = re.compile('^N(\d{4})(\d{2})(\d{2}).(\d+)$') URL2ID_DIARY_RE = re.compile('.*/users/([\w_]+)/journaux/([^\.]+)')
URL2ID_TELEGRAM = re.compile('^T([A-Za-z0-9_]+).(\d+)$') URL2ID_NEWSPAPER_RE = re.compile('.*/news/(.+)')
def rssid(entry):
m = RSSID_RE.match(entry.id)
if not m:
return None
if m.group(1) == 'D':
mm = URL2ID_DIARY_RE.match(entry.link)
if not mm:
return
return 'D%s.%s' % (mm.group(1), m.group(2))
return '%s.%s' % (m.group(1), m.group(2))
def id2url(id):
m = ID2URL_RE.match(id)
if not m:
return None
if m.group(1) == 'N':
return '/news/%s' % m.group(3)
if m.group(1) == 'D':
return '/users/%s/journaux/%s' % (m.group(2), m.group(3))
def url2id(url): def url2id(url):
m = ID2URL_NEWSPAPER.match(url) m = URL2ID_NEWSPAPER_RE.match(url)
if m: if m:
return 'N%04d%02d%02d.%d' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4))) return 'N.%s' % (m.group(1))
m = ID2URL_TELEGRAM.match(url) m = URL2ID_DIARY_RE.match(url)
if m: if m:
return 'T%s.%d' % (m.group(1), int(m.group(2))) return 'D%s.%s' % (m.group(1), m.group(2))
return None
def id2url(_id): def id2threadid(id):
m = URL2ID_NEWSPAPER.match(_id) m = ID2URL_RE.match(id)
if m: if m:
return '/%04d/%02d/%02d/%d.html' % (int(m.group(1)), int(m.group(2)), int(m.group(3)), int(m.group(4))) return m.group(3)
m = URL2ID_TELEGRAM.match(_id)
if m:
return '/~%s/%d.html' % (m.group(1), int(m.group(2)))
return None
def id2threadid(_id):
m = URL2ID_NEWSPAPER.match(_id)
if m:
return int(m.group(4))
m = URL2ID_TELEGRAM.match(_id)
if m:
return int(m.group(2))
return None
def id2contenttype(_id): def id2contenttype(_id):
if not _id: if not _id:
return None return None
if _id[0] == 'N': if _id[0] == 'N':
return 1 return 1
if _id[0] == 'T': if _id[0] == 'D':
return 5 return 5
return None return None

View file

@ -91,9 +91,8 @@ class Message(CapBaseObject):
return unicode(self.id) == unicode(msg.id) return unicode(self.id) == unicode(msg.id)
def __repr__(self): def __repr__(self):
result = '<Message id="%s" title="%s" date="%s" from="%s">' % ( return '<Message id=%r title=%r date=%r from=%r>' % (
self.full_id, self.title, self.date, self.sender) self.full_id, self.title, self.date, self.sender)
return result.encode('utf-8')
class Thread(CapBaseObject): class Thread(CapBaseObject):
IS_THREADS = 0x001 IS_THREADS = 0x001