From 87f89178fcc81f018c9140a251b78848095dc540 Mon Sep 17 00:00:00 2001 From: Bezleputh Date: Mon, 24 Aug 2015 13:36:58 +0200 Subject: [PATCH] [twitter] fix comments parsing --- modules/twitter/pages.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/modules/twitter/pages.py b/modules/twitter/pages.py index ee7986fc..37f6b533 100644 --- a/modules/twitter/pages.py +++ b/modules/twitter/pages.py @@ -104,18 +104,22 @@ class ThreadPage(HTMLPage): @method class iter_comments(ListElement): - item_xpath = '//ol[@id="stream-items-id"]/li/div' + item_xpath = '//ol[@id="stream-items-id"]/li/ol/div/li/div' class item(ItemElement): klass = Message - obj_id = Regexp(Link('./div/div/a[@class="details with-icn js-details"]'), '/.+/status/(.+)') + obj_id = Regexp(Link('./div/div/small/a', default=''), '/.+/status/(.+)', default=None) + obj_title = Regexp(CleanText('./div/p', replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]), '(.{50}|.+).+') obj_content = CleanText('./div/p', replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]) - obj_sender = Regexp(Link('./div/div/a[@class="details with-icn js-details"]'), '/(.+)/status/.+') + obj_sender = Regexp(Link('./div/div/small/a', default=''), '/(.+)/status/.+', default=None) obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span | ./div/div[@class="ProfileTweet-authorDetails"]/span/a/span', 'data-time')) + def validate(self, obj): + return obj.id is not None + class SearchPage(HTMLPage): def get_trends_token(self):