[twitter] follow timelines in search requests

2014-06-01 16:37:37 +02:00 · 2014-06-01 16:37:37 +02:00 · a8fad76245
commit a8fad76245
parent c20e6123fc
2 changed files with 50 additions and 22 deletions
--- a/modules/twitter/browser.py
+++ b/modules/twitter/browser.py
@ -20,7 +20,8 @@
 from weboob.tools.browser2 import LoginBrowser, URL, need_login
 from weboob.tools.browser import BrowserIncorrectPassword
 from weboob.capabilities.messages import Message
-from .pages import LoginPage, LoginErrorPage, ThreadPage, TwitterBasePage, Tweet, TrendsPage, TimelinePage, HomeTimelinePage
+from .pages import LoginPage, LoginErrorPage, ThreadPage, TwitterBasePage, Tweet, TrendsPage,\
+                   TimelinePage, HomeTimelinePage, SearchTimelinePage


 __all__ = ['TwitterBrowser']
@ -34,8 +35,8 @@ class TwitterBrowser(LoginBrowser):
    tweet = URL(u'i/tweet/create', Tweet)
    trends = URL(u'trends', TrendsPage)
    hashtag = URL(u'hashtag/(?P<path>.+)\?f=realtime', TwitterBasePage)
-    search = URL(u'search\?q="(?P<path>.+)&f=realtime&src=typd"', TwitterBasePage)
-    profil = URL(u'i/profiles/show/(?P<path>.+)/timeline', HomeTimelinePage)
+    search = URL(u'i/search/timeline', SearchTimelinePage)
+    profil = URL(u'i/profiles/show/(?P<path>.+)/timeline/with_replies', HomeTimelinePage)
    timeline = URL(u'i/timeline', TimelinePage)
    login = URL(u'', LoginPage)

@ -117,4 +118,7 @@ class TwitterBrowser(LoginBrowser):
        return self.hashtag.go(path=path.lstrip('#')).iter_threads()

    def get_tweets_from_search(self, path):
-        return self.search.go(path=path).iter_threads()
+        params = {'q': "%s" % path,
+                  'src': 'typd',
+                  'f': 'realtime'}
+        return self.search.go(params=params).iter_threads(params=params)
--- a/modules/twitter/pages.py
+++ b/modules/twitter/pages.py
@ -21,12 +21,13 @@ from datetime import datetime
 from weboob.tools.date import DATE_TRANSLATE_FR
 from io import StringIO
 import lxml.html as html
+import urllib

 from weboob.tools.browser2.page import HTMLPage, JsonPage, method, ListElement, ItemElement, FormNotFound, pagination
 from weboob.tools.browser2.filters import CleanText, Format, Link, Regexp, Env, DateTime, Attr, Filter
 from weboob.capabilities.messages import Thread, Message
 from weboob.capabilities.base import CapBaseObject
-__all__ = ['LoginPage', 'LoginErrorPage', 'ThreadPage', 'TwitterBasePage', 'Tweet', 'TrendsPage', 'TimelinePage', 'HomeTimelinePage']
+__all__ = ['LoginPage', 'LoginErrorPage', 'ThreadPage', 'TwitterBasePage', 'Tweet', 'TrendsPage', 'TimelinePage', 'HomeTimelinePage', 'SearchTimelinePage']


 class DatetimeFromTimestamp(Filter):
@ -38,14 +39,18 @@ class TwitterJsonHTMLPage(JsonPage):

    ENCODING = None
    has_next = None
+    scroll_cursor = None

    def __init__(self, browser, response, *args, **kwargs):
        super(TwitterJsonHTMLPage, self).__init__(browser, response, *args, **kwargs)
        self.encoding = self.ENCODING or response.encoding
        parser = html.HTMLParser(encoding=self.encoding)
-        if hasattr(self.doc, 'module_html'):
+        if 'module_html' in self.doc:
            self.doc = html.parse(StringIO(self.doc['module_html']), parser)
        else:
+            if 'scroll_cursor' in self.doc:
+                self.scroll_cursor = self.doc['scroll_cursor']
+
            self.has_next = self.doc['has_more_items']
            self.doc = html.parse(StringIO(self.doc['items_html']), parser)

@ -131,38 +136,57 @@ class TrendsPage(TwitterJsonHTMLPage):
            obj_id = Attr('.', 'data-trend-name')


+class TimelineListElement(ListElement):
+    item_xpath = '//*[@data-item-type="tweet"]/div'
+
+    def get_last_id(self):
+        _el = self.page.doc.xpath('//*[@data-item-type="tweet"]/div')[-1]
+        return Regexp(Link('./div/div/a[@class="details with-icn js-details"]|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]'), '/.+/status/(.+)')(_el)
+
+    class item(ItemElement):
+        klass = Thread
+
+        obj_id = Regexp(Link('./div/div/a[@class="details with-icn js-details"]|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]'), '/(.+)/status/(.+)', '\\1#\\2')
+        obj_title = Format('%s \n\t %s',
+                           CleanText('./div/div[@class="stream-item-header"]/a|./div/div[@class="ProfileTweet-authorDetails"]/a',
+                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
+                           CleanText('./div/p',
+                                     replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
+        obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]/span', 'data-time'))
+
+
 class TimelinePage(TwitterJsonHTMLPage):
    @pagination
    @method
-    class iter_threads(ListElement):
-        item_xpath = '//*[@data-item-type="tweet"]/div'
+    class iter_threads(TimelineListElement):

        def next_page(self):
            if self.page.has_next:
                return u'%s?max_position=%s' % (self.page.url.split('?')[0], self.get_last_id())

-        def get_last_id(self):
-            _el = self.page.doc.xpath('//*[@data-item-type="tweet"]/div')[-1]
-            return Regexp(Link('./div/div/a[@class="details with-icn js-details"]|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]'), '/.+/status/(.+)')(_el)

-        class item(ItemElement):
-            klass = Thread
+class HomeTimelinePage(TwitterJsonHTMLPage):
+    @pagination
+    @method
+    class iter_threads(TimelineListElement):

-            obj_id = Regexp(Link('./div/div/a[@class="details with-icn js-details"]|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]'), '/(.+)/status/(.+)', '\\1#\\2')
-            obj_title = Format('%s \n\t %s',
-                               CleanText('./div/div[@class="stream-item-header"]/a|./div/div[@class="ProfileTweet-authorDetails"]/a',
-                                         replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]),
-                               CleanText('./div/p',
-                                         replace=[('@ ', '@'), ('# ', '#'), ('http:// ', 'http://')]))
-            obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]/span', 'data-time'))
-
-
-class HomeTimelinePage(TimelinePage):
        def next_page(self):
            if self.page.has_next:
                return u'%s?max_id=%s' % (self.page.url.split('?')[0], self.get_last_id())


+class SearchTimelinePage(TwitterJsonHTMLPage):
+    @pagination
+    @method
+    class iter_threads(TimelineListElement):
+
+        def next_page(self):
+            params = self.env['params']
+            params['scroll_cursor'] = self.page.scroll_cursor
+            if self.page.has_next:
+                return u'%s?%s' % (self.page.url.split('?')[0], urllib.urlencode(params))
+
+
 class LoginErrorPage(HTMLPage):
    pass