[twitter] add pagination on timeline

This commit is contained in:
Bezleputh 2014-05-30 22:54:44 +02:00
commit 169b75b72f
2 changed files with 34 additions and 21 deletions

View file

@ -20,7 +20,7 @@
from weboob.tools.browser2 import LoginBrowser, URL, need_login from weboob.tools.browser2 import LoginBrowser, URL, need_login
from weboob.tools.browser import BrowserIncorrectPassword from weboob.tools.browser import BrowserIncorrectPassword
from weboob.capabilities.messages import Message from weboob.capabilities.messages import Message
from .pages import LoginPage, LoginErrorPage, ThreadPage, TwitterBasePage, Tweet, TrendsPage from .pages import LoginPage, LoginErrorPage, ThreadPage, TwitterBasePage, Tweet, TrendsPage, TimelinePage
__all__ = ['TwitterBrowser'] __all__ = ['TwitterBrowser']
@ -36,6 +36,7 @@ class TwitterBrowser(LoginBrowser):
hashtag = URL(u'hashtag/(?P<path>.+)', TwitterBasePage) hashtag = URL(u'hashtag/(?P<path>.+)', TwitterBasePage)
search = URL(u'search\?q="(?P<path>.+)"', TwitterBasePage) search = URL(u'search\?q="(?P<path>.+)"', TwitterBasePage)
profil = URL(u'(?P<path>.+)/with_replies', TwitterBasePage) profil = URL(u'(?P<path>.+)/with_replies', TwitterBasePage)
timeline = URL(u'i/timeline', TimelinePage)
login = URL(u'', LoginPage) login = URL(u'', LoginPage)
def do_login(self): def do_login(self):
@ -53,7 +54,7 @@ class TwitterBrowser(LoginBrowser):
@need_login @need_login
def iter_threads(self): def iter_threads(self):
return self.login.stay_or_go().iter_threads() return self.timeline.go().iter_threads()
def get_trendy_subjects(self): def get_trendy_subjects(self):
if self.username: if self.username:

View file

@ -22,11 +22,11 @@ from weboob.tools.date import DATE_TRANSLATE_FR
from io import StringIO from io import StringIO
import lxml.html as html import lxml.html as html
from weboob.tools.browser2.page import HTMLPage, JsonPage, method, ListElement, ItemElement, FormNotFound from weboob.tools.browser2.page import HTMLPage, JsonPage, method, ListElement, ItemElement, FormNotFound, pagination
from weboob.tools.browser2.filters import CleanText, Format, Link, Regexp, Env, DateTime, Attr, Filter from weboob.tools.browser2.filters import CleanText, Format, Link, Regexp, Env, DateTime, Attr, Filter
from weboob.capabilities.messages import Thread, Message from weboob.capabilities.messages import Thread, Message
from weboob.capabilities.base import CapBaseObject from weboob.capabilities.base import CapBaseObject
__all__ = ['LoginPage', 'LoginErrorPage', 'ThreadPage', 'TwitterBasePage', 'Tweet', 'TrendsPage'] __all__ = ['LoginPage', 'LoginErrorPage', 'ThreadPage', 'TwitterBasePage', 'Tweet', 'TrendsPage', 'TimelinePage']
class DatetimeFromTimestamp(Filter): class DatetimeFromTimestamp(Filter):
@ -34,15 +34,20 @@ class DatetimeFromTimestamp(Filter):
return datetime.fromtimestamp(float(el)) return datetime.fromtimestamp(float(el))
class TwitterJsonHTMLPage(JsonPage):
    """JSON page whose payload carries an HTML fragment.

    Once the parent constructor has run, ``self.doc`` is read by key and
    then replaced with the lxml tree parsed from the embedded HTML.  Two
    payload shapes are handled:

    * a ``module_html`` key: that fragment is parsed;
    * otherwise ``items_html`` is parsed and the ``has_more_items`` value
      is kept in ``has_next``.
    """

    ENCODING = None
    # Value of ``has_more_items`` from the payload; stays None when the
    # payload is of the ``module_html`` kind.
    has_next = None

    def __init__(self, browser, response, *args, **kwargs):
        super(TwitterJsonHTMLPage, self).__init__(browser, response, *args, **kwargs)
        self.encoding = self.ENCODING or response.encoding
        parser = html.HTMLParser(encoding=self.encoding)

        # ``self.doc`` is subscripted by key below, so the branch has to be
        # a key-membership test: hasattr() looks up attributes and never
        # matches a mapping key, which made the first branch unreachable.
        if 'module_html' in self.doc:
            fragment = self.doc['module_html']
        else:
            self.has_next = self.doc['has_more_items']
            fragment = self.doc['items_html']

        self.doc = html.parse(StringIO(fragment), parser)
class TwitterBasePage(HTMLPage): class TwitterBasePage(HTMLPage):
@ -79,19 +84,6 @@ class LoginPage(TwitterBasePage):
def get_me(self): def get_me(self):
return Regexp(Link('//a[@data-nav="profile"]'), '/(.+)')(self.doc) return Regexp(Link('//a[@data-nav="profile"]'), '/(.+)')(self.doc)
@method
class iter_threads(ListElement):
item_xpath = '//li[@data-item-type="tweet"]/div'
class item(ItemElement):
klass = Thread
obj_id = Regexp(Link('./div/div/a[@class="details with-icn js-details"]'), '/(.+)/status/(.+)', '\\1#\\2')
obj_title = Format('%s \n\t %s',
CleanText('./div/div[@class="stream-item-header"]/a'),
CleanText('./div/p'))
obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span', 'data-time'))
class ThreadPage(HTMLPage): class ThreadPage(HTMLPage):
@ -122,7 +114,7 @@ class ThreadPage(HTMLPage):
obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span', 'data-time')) obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span', 'data-time'))
class TrendsPage(TwitterJsonHMLPage): class TrendsPage(TwitterJsonHTMLPage):
@method @method
class get_trendy_subjects(ListElement): class get_trendy_subjects(ListElement):
@ -134,6 +126,26 @@ class TrendsPage(TwitterJsonHMLPage):
obj_id = Attr('.', 'data-trend-name') obj_id = Attr('.', 'data-trend-name')
class TimelinePage(TwitterJsonHTMLPage):
    """Timeline page listing tweets as ``Thread`` objects, with pagination."""

    @pagination
    @method
    class iter_threads(ListElement):
        item_xpath = '//*[@data-item-type="tweet"]/div'

        def next_page(self):
            """Return the URL of the next timeline chunk, or None to stop.

            The URL is built from the part after the last ``#`` in the key
            of the last collected object.
            """
            # Stop when the page reports nothing more, and also when it
            # yielded no object at all: there is then no key to build the
            # next URL from (indexing an empty key list raised IndexError).
            if not self.page.has_next or not self.objects:
                return None

            # list() keeps the lookup valid when keys() is a non-indexable view.
            last_key = list(self.objects.keys())[-1]
            return u'https://twitter.com/i/timeline?max_position=%s' % last_key.split('#')[-1]

        class item(ItemElement):
            klass = Thread

            # Each XPath is a union of two alternative markups
            # (``stream-item-header`` / ``ProfileTweet-*``).
            obj_id = Regexp(Link('./div/div/a[@class="details with-icn js-details"]|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]'), '/(.+)/status/(.+)', '\\1#\\2')
            obj_title = Format('%s \n\t %s',
                               CleanText('./div/div[@class="stream-item-header"]/a|./div/div[@class="ProfileTweet-authorDetails"]/a'),
                               CleanText('./div/p'))
            obj_date = DatetimeFromTimestamp(Attr('./div/div[@class="stream-item-header"]/small/a/span|./div/div/span/a[@class="ProfileTweet-timestamp js-permalink js-nav js-tooltip"]/span', 'data-time'))
class LoginErrorPage(HTMLPage): class LoginErrorPage(HTMLPage):
pass pass