pastealacon: Port to Browser2

It works, but there are a lot of things that could be improved.
This commit is contained in:
Laurent Bachelier 2014-03-22 22:31:01 +01:00
commit 22ce4438e5
5 changed files with 88 additions and 147 deletions

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2011 Laurent Bachelier # Copyright(C) 2011-2014 Laurent Bachelier
# #
# This file is part of weboob. # This file is part of weboob.
# #
@ -18,19 +18,13 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re import re
from weboob.tools.capabilities.paste import BasePasteBackend from weboob.tools.capabilities.paste import BasePasteBackend
from weboob.tools.backend import BaseBackend from weboob.tools.backend import BaseBackend
from weboob.capabilities.base import NotLoaded from weboob.capabilities.base import NotLoaded
from .browser import PastealaconBrowser from .browser import PastealaconBrowser, PastealaconPaste
from .paste import PastealaconPaste
__all__ = ['PastealaconBackend']
class PastealaconBackend(BaseBackend, BasePasteBackend): class PastealaconBackend(BaseBackend, BasePasteBackend):
@ -53,7 +47,7 @@ class PastealaconBackend(BaseBackend, BasePasteBackend):
def can_post(self, contents, title=None, public=None, max_age=None): def can_post(self, contents, title=None, public=None, max_age=None):
try: try:
contents.encode(self.browser.ENCODING) contents.encode('ISO-8859-1')
except UnicodeEncodeError: except UnicodeEncodeError:
return 0 return 0
if public is False: if public is False:
@ -67,20 +61,17 @@ class PastealaconBackend(BaseBackend, BasePasteBackend):
return 1 return 1
def get_paste(self, _id): def get_paste(self, _id):
with self.browser: return self.browser.get_paste(_id)
return self.browser.get_paste(_id)
def fill_paste(self, paste, fields): def fill_paste(self, paste, fields):
# if we only want the contents # if we only want the contents
if fields == ['contents']: if fields == ['contents']:
if paste.contents is NotLoaded: if paste.contents is NotLoaded:
with self.browser: contents = self.browser.get_contents(paste.id)
contents = self.browser.get_contents(paste.id) paste.contents = contents
paste.contents = contents
# get all fields # get all fields
elif fields is None or len(fields): elif fields is None or len(fields):
with self.browser: self.browser.fill_paste(paste)
self.browser.fill_paste(paste)
return paste return paste
def post_paste(self, paste, max_age=None): def post_paste(self, paste, max_age=None):
@ -88,7 +79,6 @@ class PastealaconBackend(BaseBackend, BasePasteBackend):
expiration = self.get_closest_expiration(max_age) expiration = self.get_closest_expiration(max_age)
else: else:
expiration = None expiration = None
with self.browser: self.browser.post_paste(paste, expiration=self.EXPIRATIONS.get(expiration))
self.browser.post_paste(paste, expiration=self.EXPIRATIONS.get(expiration))
OBJECTS = {PastealaconPaste: fill_paste} OBJECTS = {PastealaconPaste: fill_paste}

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2011 Laurent Bachelier # Copyright(C) 2011-2014 Laurent Bachelier
# #
# This file is part of weboob. # This file is part of weboob.
# #
@ -17,43 +17,84 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from mechanize import RobustFactory
import re import re
from weboob.tools.browser import BaseBrowser, BrowserUnavailable, BrowserHTTPNotFound import requests
from weboob.capabilities.paste import PasteNotFound from weboob.capabilities.paste import BasePaste, PasteNotFound
from weboob.tools.browser.decorators import id2url, check_url from weboob.tools.browser2 import HTMLPage, PagesBrowser, URL
from .pages import PastePage, CaptchaPage, PostPage
from .paste import PastealaconPaste
__all__ = ['PastealaconBrowser']
class PastealaconBrowser(BaseBrowser): class Spam(Exception):
DOMAIN = 'pastealacon.com' def __init__(self):
ENCODING = 'ISO-8859-1' super(Spam, self).__init__("Detected as spam and unable to handle the captcha")
PASTE_URL = 'http://%s/(?P<id>\d+)' % DOMAIN
PAGES = {PASTE_URL: PastePage,
'http://%s/%s' % (DOMAIN, re.escape('pastebin.php?captcha=1')): CaptchaPage,
'http://%s/' % DOMAIN: PostPage}
def __init__(self, *args, **kwargs):
kwargs['factory'] = RobustFactory()
BaseBrowser.__init__(self, *args, **kwargs)
@id2url(PastealaconPaste.id2url) class PastealaconPaste(BasePaste):
@check_url(PASTE_URL) # all pastes are public
public = True
# TODO perhaps move this logic elsewhere, remove this and id2url from capability
# (page_url is required by pastoob)
@property
def page_url(self):
return '%s%s' % (PastealaconBrowser.BASEURL, self.id)
class PastePage(HTMLPage):
    """
    Page displaying a single paste.
    """
    # TODO use magic Browser2 methods (if possible)

    def fill_paste(self, paste):
        """
        Fill the title and the contents of a paste from this page.

        The name found in the "Posted by ..." heading is used as the title.

        :param paste: paste object, updated in place
        :returns: the same paste object
        :raises PasteNotFound: if the page does not hold exactly one
                               code listing
        """
        # there is no 404, try to detect if there really is a content
        if len(self.doc.xpath('id("content")/div[@class="syntax"]//ol')) != 1:
            raise PasteNotFound()

        header = self.doc.xpath('id("content")/h3')[0]
        # header.text is None when the heading has no leading text;
        # match against an empty string instead of raising a TypeError
        matches = re.match(r'Posted by (?P<author>.+) on (?P<date>.+) \(', header.text or '')
        # leave the title untouched when the heading does not have the
        # expected layout, rather than failing on a None match
        if matches is not None:
            paste.title = matches.group('author')

        # the text of an empty <textarea> is None: store an empty string,
        # not the literal u'None' that unicode(None) would produce
        code = self.doc.xpath('//textarea[@id="code"]')[0].text
        paste.contents = unicode(code) if code is not None else u''
        return paste

    def get_id(self):
        """
        Return the 'id' entry of the parameters of this page.
        """
        # presumably the groups matched by the URL pattern -- TODO confirm
        return self.params['id']
class CaptchaPage(HTMLPage):
    """
    Marker page for the captcha URL; it has no behaviour of its own.
    """
class PostPage(HTMLPage):
    """
    Page holding the form used to submit a new paste.
    """
    # TODO handle encoding in Browser2

    def post(self, paste, expiration=None):
        """
        Fill the 'editor' form with the given paste and submit it.

        The contents and the title are encoded as ISO-8859-1 before being
        stored in the form. The expiry field is only set when a truthy
        expiration is given.
        """
        charset = 'ISO-8859-1'
        editor = self.get_form(name='editor')
        # form field name -> paste attribute providing its value
        for field, attr in (('code2', 'contents'), ('poster', 'title')):
            editor[field] = getattr(paste, attr).encode(charset)
        if expiration:
            editor['expiry'] = expiration
        editor.submit()
class PastealaconBrowser(PagesBrowser):
BASEURL = 'http://pastealacon.com/'
paste = URL(r'(?P<id>\d+)', PastePage)
captcha = URL(r'%s' % re.escape('pastebin.php?captcha=1'), CaptchaPage)
raw = URL(r'%s(?P<id>\d+)' % re.escape('pastebin.php?dl='))
post = URL(r'$', PostPage)
@paste.id2url
def get_paste(self, url): def get_paste(self, url):
_id = re.match('^%s$' % self.PASTE_URL, url).groupdict()['id'] url = self.absurl(url, base=True)
return PastealaconPaste(_id) m = self.paste.match(url)
if m:
return PastealaconPaste(m.groupdict()['id'])
def fill_paste(self, paste): def fill_paste(self, paste):
""" """
Get as much as information possible from the paste page Get as much as information possible from the paste page
""" """
self.location(paste.page_url) self.paste.stay_or_go(id=paste.id)
return self.page.fill_paste(paste) return self.page.fill_paste(paste)
def get_contents(self, _id): def get_contents(self, _id):
@ -63,13 +104,16 @@ class PastealaconBrowser(BaseBrowser):
Returns unicode. Returns unicode.
""" """
try: try:
return self.readurl('http://%s/pastebin.php?dl=%s' % (self.DOMAIN, _id), if_fail='raise').decode(self.ENCODING) return self.raw.open(id=_id).text
except BrowserHTTPNotFound: # TODO maybe have Browser2 raise a specialized exception
raise PasteNotFound() except requests.exceptions.HTTPError as e:
if e.response.status_code == requests.codes.not_found:
raise PasteNotFound()
else:
raise e
def post_paste(self, paste, expiration=None): def post_paste(self, paste, expiration=None):
self.home() self.post.stay_or_go().post(paste, expiration=expiration)
self.page.post(paste, expiration=expiration) if self.captcha.is_here():
if self.is_on_page(CaptchaPage): raise Spam()
raise BrowserUnavailable("Detected as spam and unable to handle the captcha")
paste.id = self.page.get_id() paste.id = self.page.get_id()

View file

@ -1,62 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Laurent Bachelier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import re
from weboob.tools.browser import BasePage, BrokenPageError
from weboob.capabilities.paste import PasteNotFound
__all__ = ['PastePage', 'PostPage', 'CaptchaPage']
class PastePage(BasePage):
    """
    Page showing an existing paste.
    """

    def fill_paste(self, paste):
        """
        Copy the author (used as title) and the code found on the page
        into the given paste object, then return it.

        Raises PasteNotFound when the code listing cannot be selected.
        """
        root = self.document.getroot()
        select = self.parser.select
        try:
            # there is no 404, so a missing listing means there is no paste
            select(root, 'id("content")/div[@class="syntax"]//ol', 1, 'xpath')
        except BrokenPageError:
            raise PasteNotFound()

        heading = select(root, 'id("content")/h3', 1, 'xpath')
        found = re.match(r'Posted by (?P<author>.+) on (?P<date>.+) \(', heading.text)
        paste.title = found.group('author')

        textarea = select(root, '//textarea[@id="code"]', 1, 'xpath')
        paste.contents = textarea.text
        return paste

    def get_id(self):
        """
        Find out the ID from the URL
        """
        groups = self.group_dict
        return groups['id']
class PostPage(BasePage):
    """
    Page holding the form used to submit a new paste.
    """

    def post(self, paste, expiration=None):
        """
        Select the 'editor' form, fill it with the paste and submit it.

        The contents and the title are encoded with the ENCODING of the
        browser. The expiry field is only set when a truthy expiration
        is given.
        """
        self.browser.select_form(name='editor')
        # form field name -> paste attribute providing its value
        for field, attr in (('code2', 'contents'), ('poster', 'title')):
            self.browser[field] = getattr(paste, attr).encode(self.browser.ENCODING)
        if expiration:
            self.browser['expiry'] = [expiration]
        self.browser.submit()
class CaptchaPage(BasePage):
    """
    Marker page for the captcha URL; it has no behaviour of its own.
    """

View file

@ -1,33 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright(C) 2011 Laurent Bachelier
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.paste import BasePaste
__all__ = ['PastealaconPaste']
class PastealaconPaste(BasePaste):
    """
    Paste stored on pastealacon.com.
    """
    # all pastes are public
    public = True

    @classmethod
    def id2url(cls, _id):
        """
        Return the URL of the page of the paste with the given ID.
        """
        url_format = 'http://pastealacon.com/%s'
        return url_format % _id

View file

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright(C) 2011 Laurent Bachelier # Copyright(C) 2011-2014 Laurent Bachelier
# #
# This file is part of weboob. # This file is part of weboob.
# #
@ -20,10 +20,11 @@
from weboob.tools.test import BackendTest from weboob.tools.test import BackendTest
from weboob.capabilities.base import NotLoaded from weboob.capabilities.base import NotLoaded
from weboob.tools.browser import BrowserUnavailable
from weboob.capabilities.paste import PasteNotFound from weboob.capabilities.paste import PasteNotFound
from .browser import Spam
class PastealaconTest(BackendTest): class PastealaconTest(BackendTest):
BACKEND = 'pastealacon' BACKEND = 'pastealacon'
@ -62,10 +63,11 @@ class PastealaconTest(BackendTest):
def test_spam(self): def test_spam(self):
p = self.backend.new_paste(None, title=u'viagra', contents=u'http://example.com/') p = self.backend.new_paste(None, title=u'viagra', contents=u'http://example.com/')
self.assertRaises(BrowserUnavailable, self.backend.post_paste, p) self.assertRaises(Spam, self.backend.post_paste, p)
def test_notfound(self): def test_notfound(self):
for _id in ('424242424242424242424242424242424242', 'http://pastealacon.com/424242424242424242424242424242424242'): for _id in ('424242424242424242424242424242424242',
'http://pastealacon.com/424242424242424242424242424242424242'):
# html method # html method
p = self.backend.get_paste(_id) p = self.backend.get_paste(_id)
self.assertRaises(PasteNotFound, self.backend.fillobj, p, ['title']) self.assertRaises(PasteNotFound, self.backend.fillobj, p, ['title'])