From 90641ef956e18ba00d59facfdbae66cb5e6f571d Mon Sep 17 00:00:00 2001 From: Roger Philibert Date: Thu, 14 Apr 2011 03:12:11 +0200 Subject: [PATCH] E-Hentai backend --- weboob/backends/ehentai/__init__.py | 22 ++++++ weboob/backends/ehentai/backend.py | 79 +++++++++++++++++++++ weboob/backends/ehentai/browser.py | 98 ++++++++++++++++++++++++++ weboob/backends/ehentai/gallery.py | 35 ++++++++++ weboob/backends/ehentai/pages.py | 103 ++++++++++++++++++++++++++++ weboob/backends/ehentai/test.py | 38 ++++++++++ 6 files changed, 375 insertions(+) create mode 100644 weboob/backends/ehentai/__init__.py create mode 100644 weboob/backends/ehentai/backend.py create mode 100644 weboob/backends/ehentai/browser.py create mode 100644 weboob/backends/ehentai/gallery.py create mode 100644 weboob/backends/ehentai/pages.py create mode 100644 weboob/backends/ehentai/test.py diff --git a/weboob/backends/ehentai/__init__.py b/weboob/backends/ehentai/__init__.py new file mode 100644 index 00000000..cc134689 --- /dev/null +++ b/weboob/backends/ehentai/__init__.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010-2011 Roger Philibert +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . 
+ +from .backend import EHentaiBackend + +__all__ = ['EHentaiBackend'] diff --git a/weboob/backends/ehentai/backend.py b/weboob/backends/ehentai/backend.py new file mode 100644 index 00000000..99657781 --- /dev/null +++ b/weboob/backends/ehentai/backend.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010-2011 Roger Philibert +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . + +from __future__ import with_statement + +import datetime +import re +import urllib + +from weboob.capabilities.gallery import ICapGallery +from weboob.tools.backend import BaseBackend +from weboob.tools.misc import to_unicode +from weboob.tools.value import Value, ValuesDict + +from .browser import EHentaiBrowser +from .gallery import EHentaiGallery, EHentaiImage + + +__all__ = ['EHentaiBackend'] + + +class EHentaiBackend(BaseBackend, ICapGallery): + NAME = 'ehentai' + MAINTAINER = 'Roger Philibert' + EMAIL = 'roger.philibert@gmail.com' + VERSION = '0.7' + DESCRIPTION = 'E-hentai galleries' + LICENSE = 'GPLv3' + BROWSER = EHentaiBrowser + CONFIG = ValuesDict( + Value('domain', label='Domain', default='g.e-hentai.org'), + Value('username', label='Username', default=None), + Value('password', label='Password', default=None, masked=True)) + + def create_default_browser(self): + return self.create_browser( + self.config['domain'], + self.config['username'], + self.config['password']) + + 
def iter_search_results(self, pattern=None, sortby=None, max_results=None): + with self.browser: + return self.browser.iter_search_results(pattern) + + def iter_gallery_images(self, gallery): + self.fillobj(gallery, ('url',)) + with self.browser: + return self.browser.iter_gallery_images(gallery) + + def get_gallery(self, _id): + return EHentaiGallery(_id) + + def fill_gallery(self, gallery, fields): + with self.browser: + self.browser.fill_gallery(gallery, fields) + + def fill_image(self, image, fields): + with self.browser: + image.url = self.browser.get_image_url(image) + + OBJECTS = { + EHentaiGallery: fill_gallery, + EHentaiImage: fill_image } diff --git a/weboob/backends/ehentai/browser.py b/weboob/backends/ehentai/browser.py new file mode 100644 index 00000000..aeb44a65 --- /dev/null +++ b/weboob/backends/ehentai/browser.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2010-2011 Roger Philibert +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see . 

from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword
from urllib import urlencode

from .pages import IndexPage, GalleryPage, ImagePage, HomePage, LoginPage
from .gallery import EHentaiImage


__all__ = ['EHentaiBrowser']


class EHentaiBrowser(BaseBrowser):
    """Browser that navigates E-Hentai index, gallery and image pages."""

    ENCODING = None
    # URL regular expression -> page class handling the matching documents.
    PAGES = {
        r'http://[^/]+/': IndexPage,
        r'http://[^/]+/\?.*': IndexPage,
        r'http://[^/]+/g/.+': GalleryPage,
        r'http://[^/]+/s/.*': ImagePage,
        r'http://[^/]+/home\.php': HomePage,
        r'http://e-hentai\.org/bounce_login\.php': LoginPage,
    }

    def __init__(self, domain, username, password, *args, **kwargs):
        """Set up the browser for `domain`.

        A login is attempted right away when `password` is not None;
        remaining arguments are passed through to BaseBrowser.
        """
        self.DOMAIN = domain
        self.logged = False
        BaseBrowser.__init__(self, *args, **kwargs)
        if password is not None:
            self.login(username, password)

    def _gallery_page(self, gallery, n):
        """Return the URL of page number `n` of `gallery`."""
        return gallery.url + ('?p=%d' % n)

    def iter_search_results(self, pattern):
        """Search the index for `pattern` and return the galleries found."""
        self.location(self.buildurl('/', f_search=pattern))
        assert self.is_on_page(IndexPage)
        return self.page.iter_galleries()

    def iter_gallery_images(self, gallery):
        """Yield an EHentaiImage for every image link, across all pages.

        Starts at `gallery.url` and keeps loading `?p=<n>` pages for as long
        as the current page exposes a "next page" link.
        """
        self.location(gallery.url)
        assert self.is_on_page(GalleryPage)
        page_index = 0
        while True:
            # Looked up before yielding anything.
            # NOTE(review): presumably because the consumer may navigate the
            # browser elsewhere between two yields - confirm.
            next_link = self.page._next_page_link()

            for img in self.page.image_pages():
                yield EHentaiImage(img)

            if next_link is None:
                break

            page_index += 1
            self.location(self._gallery_page(gallery, page_index))
            assert self.is_on_page(GalleryPage)

    def get_image_url(self, image):
        """Open the page at `image.id` and return the image URL found there."""
        self.location(image.id)
        assert self.is_on_page(ImagePage)
        return self.page.get_url()

    def fill_gallery(self, gallery, fields):
        """Open the page at `gallery.id` and fill `gallery` from it.

        `gallery.url` is set to `gallery.id`; `fields` is not consulted.
        """
        self.location(gallery.id)
        assert self.is_on_page(GalleryPage)
        gallery.url = gallery.id
        self.page.fill_gallery(gallery)

    def login(self, username, password):
        """Post the credentials to the login form.

        Raises BrowserIncorrectPassword when the resulting page does not
        report a successful login.
        """
        assert isinstance(username, basestring)
        assert isinstance(password, basestring)

        data = {'ipb_login_username': username,
                'ipb_login_password': password}
        self.location('http://e-hentai.org/bounce_login.php', urlencode(data), no_login=True)

        assert self.is_on_page(LoginPage)
        if not self.page.is_logged():
            raise BrowserIncorrectPassword()

        # necessary in order to reach the fjords
        self.home()

# diff --git a/weboob/backends/ehentai/gallery.py b/weboob/backends/ehentai/gallery.py
# new file mode 100644
# index 00000000..dcbbc85b
# --- /dev/null
# +++ b/weboob/backends/ehentai/gallery.py
# @@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2010-2011 Roger Philibert
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see .

from weboob.capabilities.gallery import BaseGallery, BaseImage

# Was spelled `__all_`, which Python does not treat as the export list.
__all__ = ['EHentaiGallery', 'EHentaiImage']


class EHentaiGallery(BaseGallery):
    """Gallery object that is always flagged as NSFW."""

    def __init__(self, *args, **kwargs):
        BaseGallery.__init__(self, *args, **kwargs)
        self.nsfw = True

    def iter_image(self):
        """Return the browser's iterator over this gallery's images.

        The previous version called iter_gallery_images() without the
        gallery argument it requires and dropped the result.
        """
        # NOTE(review): nothing in this module assigns `self.browser`;
        # confirm that BaseGallery or the backend provides it.
        return self.browser.iter_gallery_images(self)


class EHentaiImage(BaseImage):
    """Image object; adds nothing to BaseImage for now."""

    def __init__(self, *args, **kwargs):
        BaseImage.__init__(self, *args, **kwargs)

# diff --git a/weboob/backends/ehentai/pages.py b/weboob/backends/ehentai/pages.py
# new file mode 100644
# index 00000000..d3cda26a
# --- /dev/null
# +++ b/weboob/backends/ehentai/pages.py
# @@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2010-2011 Roger Philibert
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see .

from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlsoupparser import LxmlSoupParser
from weboob.tools.misc import html2text
from weboob.capabilities.gallery import Thumbnail
from datetime import datetime
import re

from .gallery import EHentaiGallery, EHentaiImage

__all__ = ['GalleryPage', 'ImagePage', 'IndexPage', 'HomePage', 'LoginPage']


class LoginPage(BasePage):
    """Page displayed after the login form has been submitted."""

    def is_logged(self):
        """Return True when the page shows the "Login Successful" message."""
        success_p = self.document.xpath(
            '//p[text() = "Login Successful. You will be returned momentarily."]')
        # The debugging `print` statements that used to live here were
        # dropped: callers only need the boolean.
        return bool(len(success_p))


class HomePage(BasePage):
    """Home page; nothing is extracted from it."""
    pass


class IndexPage(BasePage):
    """Gallery listing, as shown for the front page and search results."""

    def iter_galleries(self):
        """Yield one EHentaiGallery (URL as id, plus title) per listed row."""
        lines = self.document.xpath('//table[@class="itg"]//tr[@class="gtr0" or @class="gtr1"]')
        for line in lines:
            # The last matching link of the row is the one that is used.
            a = line.xpath('.//div[@class="it3"]/a')[-1]
            url = a.attrib["href"]
            title = a.text.strip()
            yield EHentaiGallery(url, title=title)


class GalleryPage(BasePage):
    """One page of a gallery: metadata plus links to image pages."""

    def image_pages(self):
        """Return the href of every image-page link on this page."""
        return self.document.xpath('//div[@class="gdtm"]//a/attribute::href')

    def _next_page_link(self):
        """Return the '>' pager link element, or None when there is none."""
        try:
            return self.document.xpath("//table[@class='ptt']//a[text()='>']")[0]
        except IndexError:
            return None

    def fill_gallery(self, gallery):
        """Populate `gallery` with the metadata found on this page.

        Sets title, original_title, description, cardinality, date, rating,
        rating_max and thumbnail. Elements other than the original title and
        the rating value are required: a missing one raises IndexError or
        AttributeError.
        """
        gallery.title = self.document.xpath("//h1[@id='gn']/text()")[0]
        try:
            gallery.original_title = self.document.xpath("//h1[@id='gj']/text()")[0]
        except IndexError:
            # Was misspelled `orginal_title`, which left `original_title`
            # unset whenever the element is absent.
            gallery.original_title = None

        description_div = self.document.xpath("//div[@id='gds']/div")[0]
        description_html = self.parser.tostring(description_div)
        gallery.description = html2text(description_html)

        cardinality_string = self.document.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Images:']/td[@class='gdt2']/text()")[0]
        # Only the leading digits of the cell are kept.
        gallery.cardinality = int(re.match(r"\d+", cardinality_string).group(0))

        date_string = self.document.xpath("//div[@id='gdd']//tr[td[@class='gdt1']/text()='Posted:']/td[@class='gdt2']/text()")[0]
        gallery.date = datetime.strptime(date_string, "%Y-%m-%d %H:%M")

        rating_string = self.document.xpath("//td[@id='rating_label']/text()")[0]
        rating_match = re.search(r"\d+\.\d+", rating_string)
        if rating_match is None:
            gallery.rating = None
        else:
            gallery.rating = float(rating_match.group(0))

        gallery.rating_max = 5

        try:
            thumbnail_url = self.document.xpath("//div[@class='gdtm']/a/img/attribute::src")[0]
        except IndexError:
            # No <img> thumbnail: fall back to the URL embedded in the
            # CSS background of the first thumbnail <div>.
            thumbnail_style = self.document.xpath("//div[@class='gdtm']/div/attribute::style")[0]
            thumbnail_url = re.search(r"background:[^;]+url\((.+?)\)", thumbnail_style).group(1)

        gallery.thumbnail = Thumbnail(thumbnail_url)

    def _prev_page_link(self):
        """Return the '<' pager link element, or None when there is none."""
        try:
            return self.document.xpath("//table[@class='ptt']//a[text()='<']")[0]
        except IndexError:
            return None


class ImagePage(BasePage):
    """Page displaying a single image of a gallery."""

    def get_url(self):
        """Return the `src` of the displayed image."""
        return self.document.xpath('//div[@class="sni"]/a/img/attribute::src')[0]

# diff --git a/weboob/backends/ehentai/test.py b/weboob/backends/ehentai/test.py
# new file mode 100644
# index 00000000..014af2c4
# --- /dev/null
# +++ b/weboob/backends/ehentai/test.py
# @@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2010-2011 Roger Philibert
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see .


from weboob.tools.test import BackendTest


class EHentaiTest(BackendTest):
    """Live test of the 'ehentai' backend: search, gallery, first image."""

    BACKEND = 'ehentai'

    def test_ehentai(self):
        """Search for 'lol', then check the first gallery and its first image."""
        l = list(self.backend.iter_search_results('lol'))
        self.assertTrue(len(l) > 0)
        v = l[0]
        self.backend.fillobj(v, ('url',))
        self.assertTrue(v.url and v.url.startswith('http://'), 'URL for gallery "%s" not found: %s' % (v.id, v.url))
        self.backend.browser.openurl(v.url)

        img = self.backend.iter_gallery_images(v).next()
        self.backend.fillobj(img, ('url',))
        # Validate the image URL itself; this assertion used to re-check the
        # gallery URL (`v.url`), so a missing image URL went unnoticed.
        self.assertTrue(img.url and img.url.startswith('http://'), 'URL for first image in gallery "%s" not found: %s' % (v.id, img.url))
        self.backend.browser.openurl(img.url)
