more flexible code

This commit is contained in:
Romain Bignon 2011-08-24 17:19:37 +02:00
commit c0773c1f92
4 changed files with 69 additions and 22 deletions

View file

@ -19,7 +19,8 @@
from urlparse import urlsplit, parse_qs
from datetime import datetime
from weboob.tools.browser import BrokenPageError
from .index import PhpBBPage
from ..tools import parse_date
@ -51,7 +52,16 @@ class ForumPage(PhpBBPage):
title = li.cssselect('a.topictitle')[0]
link = Link(Link.TOPIC, title.attrib['href'])
link.title = title.text.strip()
link.date = parse_date(li.find('dl').find('dt').findall('a')[-1].tail.strip(u'» \r\n'))
for a in li.find('dl').find('dt').findall('a'):
for text in (a.text, a.tail):
if text is None:
continue
try:
link.date = parse_date(text.strip(u'» \r\n'))
except ValueError:
continue
else:
break
# it only lists number of answers, so we add 1.
link.nb_messages = int(li.cssselect('dd.posts')[0].text.strip()) + 1
yield link
@ -78,13 +88,13 @@ class TopicPage(PhpBBPage):
args = parse_qs(v.query)
self.topic_id = int(args['t'][0])
nav = self.parser.select(self.document.getroot(), 'li.icon-home', 1)
self.forum_title = u''
for a in nav.findall('a')[1:]:
text = a.text.strip()
nav = self.parser.select(self.document.getroot(), 'li.icon-home')
if len(nav) > 0:
text = nav[0].findall('a')[-1].text.strip()
if len(text) >= 20:
text = text[:20] + u''
self.forum_title = '[%s]' % text
self.forum_title = '[%s] ' % text
def next_page_url(self):
    """
    Return the URL of the next page.

    The value is the ``href`` attribute of the first element matching the
    ``a.right-box`` CSS selector under the document root. No fallback is
    provided: if nothing matches, indexing the empty result raises.
    """
    root = self.document.getroot()
    candidates = root.cssselect('a.right-box')
    # Only the first match is used.
    first_link = candidates[0]
    return first_link.attrib['href']
@ -116,9 +126,39 @@ class TopicPage(PhpBBPage):
id = div.attrib['id'][1:]
post = Post(self.topic_id, id)
post.title = '%s %s' % (self.forum_title, body.cssselect('h3 a')[0].text.strip())
post.author = profile.cssselect('dt a')[-1].text.strip()
post.date = parse_date(body.cssselect('p.author')[0].find('strong').tail.strip(u'» \n\r'))
title_tags = body.cssselect('h3 a')
if len(title_tags) == 0:
title_tags = self.document.getroot().cssselect('h2 a')
if len(title_tags) == 0:
title = u''
self.logger.warning('Unable to parse title')
else:
title = title_tags[0].text.strip()
post.title = self.forum_title + title
for a in profile.cssselect('dt a'):
if a.text:
post.author = a.text.strip()
p_tags = body.cssselect('p.author')
if len(p_tags) == 0:
p_tags = body.find('p')
if len(p_tags) == 0:
post.date = None
self.logger.warning('Unable to parse datetime')
else:
p = p_tags[0]
text = p.find('strong') and p.find('strong').tail
if not text:
text = p.text[4:]
text = text.strip(u'» \n\r')
try:
post.date = parse_date(text)
except ValueError:
self.logger.warning(u'Unable to parse datetime "%s"' % text)
post.content = self.parser.tostring(body.cssselect('div.content')[0])
signature = body.cssselect('div.signature')