From a8fad76245f2470beb0c551e0f27a14153e6f8e8 Mon Sep 17 00:00:00 2001
From: Bezleputh
Date: Sun, 1 Jun 2014 16:37:37 +0200
Subject: [PATCH] [twitter] follow timelines in search requests

---
 modules/twitter/browser.py | 12 +++++---
 modules/twitter/pages.py   | 62 ++++++++++++++++++++++++++------------
 2 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/modules/twitter/browser.py b/modules/twitter/browser.py
index 6a5f9992..a2a40184 100644
--- a/modules/twitter/browser.py
+++ b/modules/twitter/browser.py
@@ -20,7 +20,8 @@
 from weboob.tools.browser2 import LoginBrowser, URL, need_login
 from weboob.tools.browser import BrowserIncorrectPassword
 from weboob.capabilities.messages import Message
-from .pages import LoginPage, LoginErrorPage, ThreadPage, TwitterBasePage, Tweet, TrendsPage, TimelinePage, HomeTimelinePage
+from .pages import LoginPage, LoginErrorPage, ThreadPage, TwitterBasePage, Tweet, TrendsPage,\
+    TimelinePage, HomeTimelinePage, SearchTimelinePage
 
 
 __all__ = ['TwitterBrowser']
@@ -34,8 +35,8 @@ class TwitterBrowser(LoginBrowser):
     tweet = URL(u'i/tweet/create', Tweet)
     trends = URL(u'trends', TrendsPage)
     hashtag = URL(u'hashtag/(?P<path>.+)\?f=realtime', TwitterBasePage)
-    search = URL(u'search\?q="(?P<path>.+)&f=realtime&src=typd"', TwitterBasePage)
-    profil = URL(u'i/profiles/show/(?P<path>.+)/timeline', HomeTimelinePage)
+    search = URL(u'i/search/timeline', SearchTimelinePage)
+    profil = URL(u'i/profiles/show/(?P<path>.+)/timeline/with_replies', HomeTimelinePage)
     timeline = URL(u'i/timeline', TimelinePage)
     login = URL(u'', LoginPage)
 
@@ -117,4 +118,7 @@ class TwitterBrowser(LoginBrowser):
         return self.hashtag.go(path=path.lstrip('#')).iter_threads()
 
     def get_tweets_from_search(self, path):
-        return self.search.go(path=path).iter_threads()
+        params = {'q': "%s" % path,
+                  'src': 'typd',
+                  'f': 'realtime'}
+        return self.search.go(params=params).iter_threads(params=params)
diff --git a/modules/twitter/pages.py b/modules/twitter/pages.py
index f292fad5..92444570 100644
--- a/modules/twitter/pages.py
+++ b/modules/twitter/pages.py
@@ -21,12 +21,13 @@ from datetime import datetime
 from weboob.tools.date import DATE_TRANSLATE_FR
 from io import StringIO
 import lxml.html as html
+import urllib
 from weboob.tools.browser2.page import HTMLPage, JsonPage, method, ListElement, ItemElement, FormNotFound, pagination
 from weboob.tools.browser2.filters import CleanText, Format, Link, Regexp, Env, DateTime, Attr, Filter
 from weboob.capabilities.messages import Thread, Message
 from weboob.capabilities.base import CapBaseObject
 
-__all__ = ['LoginPage', 'LoginErrorPage', 'ThreadPage', 'TwitterBasePage', 'Tweet', 'TrendsPage', 'TimelinePage', 'HomeTimelinePage']
+__all__ = ['LoginPage', 'LoginErrorPage', 'ThreadPage', 'TwitterBasePage', 'Tweet', 'TrendsPage', 'TimelinePage', 'HomeTimelinePage', 'SearchTimelinePage']
 
 
 class DatetimeFromTimestamp(Filter):
@@ -38,14 +39,18 @@ class TwitterJsonHTMLPage(JsonPage):
 
     ENCODING = None
     has_next = None
+    scroll_cursor = None
 
     def __init__(self, browser, response, *args, **kwargs):
         super(TwitterJsonHTMLPage, self).__init__(browser, response, *args, **kwargs)
         self.encoding = self.ENCODING or response.encoding
         parser = html.HTMLParser(encoding=self.encoding)
-        if hasattr(self.doc, 'module_html'):
+        if 'module_html' in self.doc:
             self.doc = html.parse(StringIO(self.doc['module_html']), parser)
         else:
+            if 'scroll_cursor' in self.doc:
+                self.scroll_cursor = self.doc['scroll_cursor']
+
             self.has_next = self.doc['has_more_items']
             self.doc = html.parse(StringIO(self.doc['items_html']), parser)
 
@@ -131,38 +136,57 @@ class TrendsPage(TwitterJsonHTMLPage):
             obj_id = Attr('.', 'data-trend-name')
 
 
+class TimelineListElement(ListElement):
+    item_xpath = '//*[@data-item-type="tweet"]/div'
+
+    def get_last_id(self):
+        _el = self.page.doc.xpath('//*[@data-item-type="tweet"]/div')[-1]
+        return Regexp(Link('./div/div/a[@class="details with-icn js-details"]|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]'), '/.+/status/(.+)')(_el)
+
+    class item(ItemElement):
+        klass = Thread
+
+        obj_id = Regexp(Link('./div/div/a[@class="details with-icn js-details"]|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]'), '/(.+)/status/(.+)', '\\1#\\2')
+        obj_title = Format('%s \n\t %s',
+                           CleanText('./div/div[@class="stream-item-header"]/a|./div/div[@class="ProfileTweet-authorDetails"]/a',
+                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
+                           CleanText('./div/p',
+                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
+        obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]/span', 'data-time'))
+
+
 class TimelinePage(TwitterJsonHTMLPage):
     @pagination
     @method
-    class iter_threads(ListElement):
-        item_xpath = '//*[@data-item-type="tweet"]/div'
+    class iter_threads(TimelineListElement):
 
         def next_page(self):
             if self.page.has_next:
                 return u'%s?max_position=%s' % (self.page.url.split('?')[0], self.get_last_id())
 
-        def get_last_id(self):
-            _el = self.page.doc.xpath('//*[@data-item-type="tweet"]/div')[-1]
-            return Regexp(Link('./div/div/a[@class="details with-icn js-details"]|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]'), '/.+/status/(.+)')(_el)
 
-        class item(ItemElement):
-            klass = Thread
+class HomeTimelinePage(TwitterJsonHTMLPage):
+    @pagination
+    @method
+    class iter_threads(TimelineListElement):
 
-            obj_id = Regexp(Link('./div/div/a[@class="details with-icn js-details"]|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]'), '/(.+)/status/(.+)', '\\1#\\2')
-            obj_title = Format('%s \n\t %s',
-                               CleanText('./div/div[@class="stream-item-header"]/a|./div/div[@class="ProfileTweet-authorDetails"]/a',
-                                         replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
-                               CleanText('./div/p',
-                                         replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
-            obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]/span', 'data-time'))
-
-
-class HomeTimelinePage(TimelinePage):
         def next_page(self):
             if self.page.has_next:
                 return u'%s?max_id=%s' % (self.page.url.split('?')[0], self.get_last_id())
 
 
+class SearchTimelinePage(TwitterJsonHTMLPage):
+    @pagination
+    @method
+    class iter_threads(TimelineListElement):
+
+        def next_page(self):
+            params = self.env['params']
+            params['scroll_cursor'] = self.page.scroll_cursor
+            if self.page.has_next:
+                return u'%s?%s' % (self.page.url.split('?')[0], urllib.urlencode(params))
+
+
 class LoginErrorPage(HTMLPage):
     pass
 