support wiki and forums

This commit is contained in:
Romain Bignon 2011-03-19 09:27:42 +01:00
commit 4a864a97af
3 changed files with 25 additions and 12 deletions

View file

@ -34,7 +34,9 @@ class DLFP(BaseBrowser):
PAGES = {'https://linuxfr.org/?': IndexPage, PAGES = {'https://linuxfr.org/?': IndexPage,
'https://linuxfr.org/login.html': LoginPage, 'https://linuxfr.org/login.html': LoginPage,
'https://linuxfr.org/news/[^\.]+': ContentPage, 'https://linuxfr.org/news/[^\.]+': ContentPage,
'https://linuxfr.org/wiki/[^\.]+': ContentPage,
'https://linuxfr.org/users/[\w\-_]+/journaux/[^\.]+': ContentPage, 'https://linuxfr.org/users/[\w\-_]+/journaux/[^\.]+': ContentPage,
'https://linuxfr.org/forums/[\w\-_]+/posts/[^\.]+': ContentPage,
'https://linuxfr.org/nodes/(\d+)/comments/(\d+)$': CommentPage, 'https://linuxfr.org/nodes/(\d+)/comments/(\d+)$': CommentPage,
'https://linuxfr.org/nodes/(\d+)/comments/nouveau': NewCommentPage, 'https://linuxfr.org/nodes/(\d+)/comments/nouveau': NewCommentPage,
'https://linuxfr.org/nodes/(\d+)/comments$': NodePage, 'https://linuxfr.org/nodes/(\d+)/comments$': NodePage,

View file

@ -119,9 +119,12 @@ class Article(Content):
self.author = unicode(a.text) self.author = unicode(a.text)
self.username = unicode(a.attrib['href'].split('/')[2]) self.username = unicode(a.attrib['href'].split('/')[2])
self.body = self.browser.parser.tostring(select(tree, 'div.content', 1)) self.body = self.browser.parser.tostring(select(tree, 'div.content', 1))
self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0], try:
'%Y-%m-%dT%H:%M:%S') self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0],
self.date = local2utc(self.date) '%Y-%m-%dT%H:%M:%S')
self.date = local2utc(self.date)
except SelectElementException:
pass
forms = select(tree.find('footer'), 'form.button_to') forms = select(tree.find('footer'), 'form.button_to')
if len(forms) > 0: if len(forms) > 0:
self.relevance_url = forms[0].attrib['action'].rstrip('for').rstrip('against') self.relevance_url = forms[0].attrib['action'].rstrip('for').rstrip('against')

View file

@ -22,6 +22,8 @@ RSSID_RE = re.compile('tag:.*:(\w)\w+/(\d+)')
# Patterns are raw strings so that regex escapes such as ``\w`` and ``\.``
# are not parsed as (invalid) Python string escape sequences.

# Content id: one type letter, an optional qualifier, a dot, then the slug.
ID2URL_RE = re.compile(r'^(\w)([\w\-_]*)\.([^\.]+)$')
# Diary URL: captures the user name and the diary slug.
URL2ID_DIARY_RE = re.compile(r'.*/users/([\w\-_]+)/journaux/([^\.]+)')
# News URL: captures everything after ``/news/``.
URL2ID_NEWSPAPER_RE = re.compile(r'.*/news/(.+)')
# Wiki URL: captures everything after ``/wiki/``.
URL2ID_WIKI_RE = re.compile(r'.*/wiki/(.+)')
# Forum URL: captures the forum name and the post slug.
URL2ID_FORUM_RE = re.compile(r'.*/forums/([\w\-_]+)/posts/([^\.]+)')
def rssid(entry): def rssid(entry):
m = RSSID_RE.match(entry.id) m = RSSID_RE.match(entry.id)
@ -32,6 +34,11 @@ def rssid(entry):
if not mm: if not mm:
return return
return 'D%s.%s' % (mm.group(1), m.group(2)) return 'D%s.%s' % (mm.group(1), m.group(2))
if m.group(1) == 'F':
mm = URL2ID_FORUM_RE.match(entry.link)
if not mm:
return
return 'F%s.%s' % (mm.group(1), m.group(2))
return '%s.%s' % (m.group(1), m.group(2)) return '%s.%s' % (m.group(1), m.group(2))
def id2url(id): def id2url(id):
@ -43,6 +50,10 @@ def id2url(id):
return '/news/%s' % m.group(3) return '/news/%s' % m.group(3)
if m.group(1) == 'D': if m.group(1) == 'D':
return '/users/%s/journaux/%s' % (m.group(2), m.group(3)) return '/users/%s/journaux/%s' % (m.group(2), m.group(3))
if m.group(1) == 'W':
return '/wiki/%s' % m.group(3)
if m.group(1) == 'F':
return '/forums/%s/posts/%s' % (m.group(2), m.group(3))
def url2id(url): def url2id(url):
m = URL2ID_NEWSPAPER_RE.match(url) m = URL2ID_NEWSPAPER_RE.match(url)
@ -51,17 +62,14 @@ def url2id(url):
m = URL2ID_DIARY_RE.match(url) m = URL2ID_DIARY_RE.match(url)
if m: if m:
return 'D%s.%s' % (m.group(1), m.group(2)) return 'D%s.%s' % (m.group(1), m.group(2))
m = URL2ID_WIKI_RE.match(url)
if m:
return 'W.%s' % (m.group(1))
m = URL2ID_FORUM_RE.match(url)
if m:
return 'F%s.%s' % (m.group(1), m.group(2))
def id2threadid(id):
    """Return the third group of ``ID2URL_RE`` matched against *id*.

    Returns ``None`` when *id* does not match the pattern.
    """
    match = ID2URL_RE.match(id)
    if not match:
        return None
    return match.group(3)
def id2contenttype(_id):
    """Map the first character of *_id* to a numeric content type.

    ``'N'`` maps to 1 and ``'D'`` maps to 5.  An empty or ``None`` id, or
    any other leading character, yields ``None``.
    """
    # NOTE(review): 'N' / 'D' presumably stand for news / diary -- confirm.
    if _id:
        prefix = _id[0]
        for letter, content_type in (('N', 1), ('D', 5)):
            if prefix == letter:
                return content_type
    return None