From b4c672fa464a883b484079014f3329c4baeee5e4 Mon Sep 17 00:00:00 2001
From: Christophe Benz
Date: Wed, 14 Jul 2010 02:27:40 +0200
Subject: [PATCH] new select() helper

---
Review notes (free-form commentary, ignored when the patch is applied):

 * This patch was re-flowed from a copy whose newlines and indentation had
   been collapsed.  Indentation of context lines and the position of blank
   context lines are a best-effort reconstruction -- TODO confirm against
   the target tree before applying.
 * The blob ids on the "index" lines are the ones from the original patch
   and do not reflect the review changes listed below.
 * aum/pages/home.py: the visible import context only has
   "from logging import error", so "logging.warning(...)" would presumably
   raise NameError; "warning" is now imported and called directly.
 * youjizz/pages/index.py: dropped the unused "selector" local.
 * tools/parsers/lxmlparser.py, select():
    - nb='many' is documented as "1 to infinite" but rejected a single
      match; one match is now accepted.
    - removed "results is None" checks (cssselect() returns a list).
    - an invalid "nb" raises ValueError instead of a bare Exception.
    - the docstring no longer advertises the unimplemented xpath method.

 weboob/backends/aum/pages/home.py      |  6 ++--
 weboob/backends/youjizz/pages/index.py | 28 +++++----------
 weboob/backends/youporn/pages/video.py |  8 ++---
 weboob/backends/youtube/pages.py       | 21 +++--------
 weboob/tools/browser/browser.py        |  6 +---
 weboob/tools/parsers/lxmlparser.py     | 45 ++++++++++++++++++++++++--
 6 files changed, 63 insertions(+), 51 deletions(-)

diff --git a/weboob/backends/aum/pages/home.py b/weboob/backends/aum/pages/home.py
index f4c6569d..46454229 100644
--- a/weboob/backends/aum/pages/home.py
+++ b/weboob/backends/aum/pages/home.py
@@ -18,7 +18,6 @@
 
 import re
 
-from weboob.tools.browser import ExpectedElementNotFound
 from weboob.backends.aum.pages.base import PageBase
-from logging import error
+from logging import error, warning
 
@@ -47,4 +46,5 @@ class HomePage(PageBase):
                         i += 1
                         if i == 3:
                             return int(font.firstChild.data)
-        raise ExpectedElementNotFound(u'Could not parse number of charms available')
+        warning(u'Could not parse number of charms available')
+        return 0
diff --git a/weboob/backends/youjizz/pages/index.py b/weboob/backends/youjizz/pages/index.py
index 288a14eb..d514acd7 100644
--- a/weboob/backends/youjizz/pages/index.py
+++ b/weboob/backends/youjizz/pages/index.py
@@ -16,9 +16,11 @@
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 
 
+import datetime
 import re
 
-from weboob.tools.browser import BasePage, ExpectedElementNotFound
+from weboob.tools.browser import BasePage
+from weboob.tools.parsers.lxmlparser import select
 
 from ..video import YoujizzVideo
 
@@ -28,31 +30,19 @@ __all__ = ['IndexPage']
 
 class IndexPage(BasePage):
     def iter_videos(self):
-        div_id = 'span#miniatura'
-        span_list = self.document.getroot().cssselect(div_id)
-        if not span_list:
-            raise ExpectedElementNotFound(div_id)
-
+        span_list = select(self.document.getroot(), 'span#miniatura')
         for span in span_list:
-            a = span.find('.//a')
-            if a is None:
-                raise ExpectedElementNotFound('%s.//a' % span)
+            a = select(span, 'a', 1)
             url = a.attrib['href']
             _id = re.sub(r'/videos/(.+)\.html', r'\1', url)
 
             thumbnail_url = span.find('.//img').attrib['src']
 
-            title1_selector = 'span#title1'
-            title1 = span.cssselect(title1_selector)
-            if title1 is None:
-                raise ExpectedElementNotFound(title1_selector)
-            title = title1[0].text.strip()
+            title_el = select(span, 'span#title1', 1)
+            title = title_el.text.strip()
 
-            thumbtime = span.cssselect('span.thumbtime')
-            minutes = seconds = 0
-            if thumbtime is not None:
-                time_span = thumbtime[0].find('span')
-                minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
+            time_span = select(span, 'span.thumbtime span', 1)
+            minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
 
             yield YoujizzVideo(_id,
                                title=title,
diff --git a/weboob/backends/youporn/pages/video.py b/weboob/backends/youporn/pages/video.py
index 1f651340..0a7cc21f 100644
--- a/weboob/backends/youporn/pages/video.py
+++ b/weboob/backends/youporn/pages/video.py
@@ -20,7 +20,7 @@ import re
 import datetime
 from logging import warning
 
-from weboob.tools.browser import ExpectedElementNotFound
+from weboob.tools.parsers.lxmlparser import select
 
 from .base import PornPage
 from ..video import YoupornVideo
@@ -42,11 +42,7 @@ class VideoPage(PornPage):
         return el[0].cssselect('a')[0].attrib['href']
 
     def get_title(self):
-        selector = '#videoArea h1'
-        try:
-            element = self.document.getroot().cssselect(selector)[0]
-        except IndexError:
-            raise ExpectedElementNotFound(selector)
+        element = select(self.document.getroot(), '#videoArea h1', 1)
         return unicode(element.getchildren()[0].tail).strip()
 
     DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)")
diff --git a/weboob/backends/youtube/pages.py b/weboob/backends/youtube/pages.py
index bace372e..6fc523c4 100644
--- a/weboob/backends/youtube/pages.py
+++ b/weboob/backends/youtube/pages.py
@@ -18,7 +18,8 @@
 
 import re
 
-from weboob.tools.browser import BasePage, ExpectedElementNotFound
+from weboob.tools.browser import BasePage
+from weboob.tools.parsers.lxmlparser import select
 
 from .video import YoutubeVideo
 
@@ -32,11 +33,7 @@ class ForbiddenVideo(Exception):
 
 class ForbiddenVideoPage(BasePage):
     def on_loaded(self):
-        selector = '.yt-alert-content'
-        try:
-            element = self.document.getroot().cssselect(selector)[0]
-        except IndexError:
-            raise ExpectedElementNotFound(selector)
+        element = select(self.document.getroot(), '.yt-alert-content', 1)
         raise ForbiddenVideo(element.text.strip())
 
 
@@ -57,19 +54,11 @@ class VideoPage(BasePage):
         )
 
     def get_author(self):
-        selector = 'a.watch-description-username strong'
-        try:
-            element = self.document.getroot().cssselect(selector)[0]
-        except IndexError:
-            raise ExpectedElementNotFound(selector)
+        element = select(self.document.getroot(), 'a.watch-description-username strong', 1)
         return element.text.strip()
 
     def get_title(self):
-        selector = 'meta[name=title]'
-        try:
-            element = self.document.getroot().cssselect(selector)[0]
-        except IndexError:
-            raise ExpectedElementNotFound(selector)
+        element = select(self.document.getroot(), 'meta[name=title]', 1)
         return unicode(element.attrib['content']).strip()
 
     def get_url(self, _id):
diff --git a/weboob/tools/browser/browser.py b/weboob/tools/browser/browser.py
index 81322d92..20a783c0 100644
--- a/weboob/tools/browser/browser.py
+++ b/weboob/tools/browser/browser.py
@@ -43,7 +43,7 @@ else:
 
 
 __all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
-           'BasePage', 'BaseBrowser', 'ExpectedElementNotFound']
+           'BasePage', 'BaseBrowser']
 
 
 # Exceptions
@@ -63,10 +63,6 @@ class BrowserRetry(Exception):
     pass
 
 
-class ExpectedElementNotFound(Exception):
-    pass
-
-
 class NoHistory(object):
     """
     We don't want to fill memory with history
diff --git a/weboob/tools/parsers/lxmlparser.py b/weboob/tools/parsers/lxmlparser.py
index 05783645..34ffaee6 100644
--- a/weboob/tools/parsers/lxmlparser.py
+++ b/weboob/tools/parsers/lxmlparser.py
@@ -21,7 +21,45 @@ import lxml.html
 
 from .iparser import IParser
 
-__all__ = ['LxmlHtmlParser']
+__all__ = ['LxmlHtmlParser', 'select', 'SelectElementException']
+
+
+class SelectElementException(Exception):
+    pass
+
+
+def select(element, selector, nb=None, method='cssselect'):
+    """
+    Select one or many elements from an element, using lxml cssselect by default.
+
+    Raises SelectElementException if too few elements are found.
+
+    @param element [obj]  element on which to apply selector
+    @param selector [str]  CSS expression
+    @param nb [int]  minimum number of elements expected to be found.
+                     Use None for undefined number, and 'many' for 1 to infinite.
+    @param method [str]  only 'cssselect' is implemented for the moment
+    @return  a single Element if nb is 1, else a list of Elements
+    """
+    if method != 'cssselect':
+        raise NotImplementedError('Only cssselect method is implemented for the moment')
+
+    results = element.cssselect(selector)
+    if nb is None:
+        return results
+    if nb == 'many':
+        expected = 1
+    elif isinstance(nb, int) and nb > 0:
+        expected = nb
+    else:
+        raise ValueError('Unhandled value for kwarg "nb": %s' % nb)
+
+    # cssselect() returns a (possibly empty) list, never None
+    if not results:
+        raise SelectElementException('Element not found with selector "%s"' % selector)
+    if len(results) < expected:
+        raise SelectElementException('Not enough elements found (%d expected) with selector "%s"' % (expected, selector))
+    return results[0] if nb == 1 else results
 
 
 class LxmlHtmlParser(IParser):
@@ -32,7 +70,10 @@ class LxmlHtmlParser(IParser):
     """
 
     def parse(self, data, encoding=None):
-        parser = lxml.html.HTMLParser(encoding=encoding)
+        if encoding is None:
+            parser = None
+        else:
+            parser = lxml.html.HTMLParser(encoding=encoding)
         return lxml.html.parse(data, parser)
 
     def tostring(self, element):