more flexible code

2011-08-24 17:19:37 +02:00 · 2011-08-24 17:19:37 +02:00 · c0773c1f92
commit c0773c1f92
parent 2184951c4a
4 changed files with 69 additions and 22 deletions
--- a/weboob/backends/phpbb/pages/forum.py
+++ b/weboob/backends/phpbb/pages/forum.py
@@ -19,7 +19,8 @@


 from urlparse import urlsplit, parse_qs
-from datetime import datetime
+
+from weboob.tools.browser import BrokenPageError

 from .index import PhpBBPage
 from ..tools import parse_date
@@ -51,7 +52,16 @@ class ForumPage(PhpBBPage):
            title = li.cssselect('a.topictitle')[0]
            link = Link(Link.TOPIC, title.attrib['href'])
            link.title = title.text.strip()
-            link.date = parse_date(li.find('dl').find('dt').findall('a')[-1].tail.strip(u'» \r\n'))
+            for a in li.find('dl').find('dt').findall('a'):
+                for text in (a.text, a.tail):
+                    if text is None:
+                        continue
+                    try:
+                        link.date = parse_date(text.strip(u'» \r\n'))
+                    except ValueError:
+                        continue
+                    else:
+                        break
            # it only lists number of answers, so we add 1.
            link.nb_messages = int(li.cssselect('dd.posts')[0].text.strip()) + 1
            yield link
@@ -78,13 +88,13 @@ class TopicPage(PhpBBPage):
        args = parse_qs(v.query)
        self.topic_id = int(args['t'][0])

-        nav = self.parser.select(self.document.getroot(), 'li.icon-home', 1)
        self.forum_title = u''
-        for a in nav.findall('a')[1:]:
-            text = a.text.strip()
+        nav = self.parser.select(self.document.getroot(), 'li.icon-home')
+        if len(nav) > 0:
+            text = nav[0].findall('a')[-1].text.strip()
            if len(text) >= 20:
                text = text[:20] + u'…'
-            self.forum_title = '[%s]' % text
+            self.forum_title = '[%s] ' % text

    def next_page_url(self):
        return self.document.getroot().cssselect('a.right-box')[0].attrib['href']
@@ -116,9 +126,39 @@ class TopicPage(PhpBBPage):

        id = div.attrib['id'][1:]
        post = Post(self.topic_id, id)
-        post.title = '%s %s' % (self.forum_title, body.cssselect('h3 a')[0].text.strip())
-        post.author = profile.cssselect('dt a')[-1].text.strip()
-        post.date = parse_date(body.cssselect('p.author')[0].find('strong').tail.strip(u'» \n\r'))
+
+        title_tags = body.cssselect('h3 a')
+        if len(title_tags) == 0:
+            title_tags = self.document.getroot().cssselect('h2 a')
+        if len(title_tags) == 0:
+            title = u''
+            self.logger.warning('Unable to parse title')
+        else:
+            title = title_tags[0].text.strip()
+
+        post.title = self.forum_title + title
+        for a in profile.cssselect('dt a'):
+            if a.text:
+                post.author = a.text.strip()
+
+        p_tags = body.cssselect('p.author')
+        if len(p_tags) == 0:
+            p_tags = body.find('p')
+        if len(p_tags) == 0:
+            post.date = None
+            self.logger.warning('Unable to parse datetime')
+        else:
+            p = p_tags[0]
+            text = p.find('strong') and p.find('strong').tail
+            if not text:
+                text = p.text[4:]
+
+            text = text.strip(u'» \n\r')
+            try:
+                post.date = parse_date(text)
+            except ValueError:
+                self.logger.warning(u'Unable to parse datetime "%s"' % text)
+
        post.content = self.parser.tostring(body.cssselect('div.content')[0])

        signature = body.cssselect('div.signature')