new select() helper

2010-07-14 02:27:40 +02:00 · 2010-07-14 02:27:40 +02:00 · b4c672fa46
commit b4c672fa46
parent eb026b7c3c
6 changed files with 67 additions and 51 deletions
--- a/weboob/backends/aum/pages/home.py
+++ b/weboob/backends/aum/pages/home.py
@ -18,7 +18,6 @@
 import re
 from weboob.tools.browser import ExpectedElementNotFound
 from weboob.backends.aum.pages.base import PageBase
 from logging import error
@ -47,4 +46,5 @@ class HomePage(PageBase):
                        i += 1
                        if i == 3:
                            return int(font.firstChild.data)
-        raise ExpectedElementNotFound(u'Could not parse number of charms available')
+        logging.warning(u'Could not parse number of charms available')
        return 0
--- a/weboob/backends/youjizz/pages/index.py
+++ b/weboob/backends/youjizz/pages/index.py
@ -16,9 +16,11 @@
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 import datetime
 import re
-from weboob.tools.browser import BasePage, ExpectedElementNotFound
+from weboob.tools.browser import BasePage
 from weboob.tools.parsers.lxmlparser import select
 from ..video import YoujizzVideo
@ -28,31 +30,20 @@ __all__ = ['IndexPage']
 class IndexPage(BasePage):
    def iter_videos(self):
-        div_id = 'span#miniatura'
+        span_list = select(self.document.getroot(), 'span#miniatura')
        span_list = self.document.getroot().cssselect(div_id)
        if not span_list:
            raise ExpectedElementNotFound(div_id)
        for span in span_list:
-            a = span.find('.//a')
+            a = select(span, 'a', 1)
            if a is None:
                raise ExpectedElementNotFound('%s.//a' % span)
            url = a.attrib['href']
            _id = re.sub(r'/videos/(.+)\.html', r'\1', url)
            thumbnail_url = span.find('.//img').attrib['src']
-            title1_selector = 'span#title1'
+            selector = 'span#title1'
-            title1 = span.cssselect(title1_selector)
+            title_el = select(span, 'span#title1', 1)
-            if title1 is None:
+            title = title_el.text.strip()
                raise ExpectedElementNotFound(title1_selector)
            title = title1[0].text.strip()
-            thumbtime = span.cssselect('span.thumbtime')
+            time_span = select(span, 'span.thumbtime span', 1)
-            minutes = seconds = 0
+            minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
            if thumbtime is not None:
                time_span = thumbtime[0].find('span')
                minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
            yield YoujizzVideo(_id,
                               title=title,
--- a/weboob/backends/youporn/pages/video.py
+++ b/weboob/backends/youporn/pages/video.py
@ -20,7 +20,7 @@ import re
 import datetime
 from logging import warning
-from weboob.tools.browser import ExpectedElementNotFound
+from weboob.tools.parsers.lxmlparser import select
 from .base import PornPage
 from ..video import YoupornVideo
@ -42,11 +42,7 @@ class VideoPage(PornPage):
            return el[0].cssselect('a')[0].attrib['href']
    def get_title(self):
-        selector = '#videoArea h1'
+        element = select(self.document.getroot(), '#videoArea h1', 1)
        try:
            element = self.document.getroot().cssselect(selector)[0]
        except IndexError:
            raise ExpectedElementNotFound(selector)
        return unicode(element.getchildren()[0].tail).strip()
    DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)")
--- a/weboob/backends/youtube/pages.py
+++ b/weboob/backends/youtube/pages.py
@ -18,7 +18,8 @@
 import re
-from weboob.tools.browser import BasePage, ExpectedElementNotFound
+from weboob.tools.browser import BasePage
 from weboob.tools.parsers.lxmlparser import select
 from .video import YoutubeVideo
@ -32,11 +33,7 @@ class ForbiddenVideo(Exception):
 class ForbiddenVideoPage(BasePage):
    def on_loaded(self):
-        selector = '.yt-alert-content'
+        element = select(self.document.getroot(), '.yt-alert-content', 1)
        try:
            element = self.document.getroot().cssselect(selector)[0]
        except IndexError:
            raise ExpectedElementNotFound(selector)
        raise ForbiddenVideo(element.text.strip())
@ -57,19 +54,11 @@ class VideoPage(BasePage):
                                  )
    def get_author(self):
-        selector = 'a.watch-description-username strong'
+        element = select(self.document.getroot(), 'a.watch-description-username strong', 1)
        try:
            element = self.document.getroot().cssselect(selector)[0]
        except IndexError:
            raise ExpectedElementNotFound(selector)
        return element.text.strip()
    def get_title(self):
-        selector = 'meta[name=title]'
+        element = select(self.document.getroot(), 'meta[name=title]', 1)
        try:
            element = self.document.getroot().cssselect(selector)[0]
        except IndexError:
            raise ExpectedElementNotFound(selector)
        return unicode(element.attrib['content']).strip()
    def get_url(self, _id):
--- a/weboob/tools/browser/browser.py
+++ b/weboob/tools/browser/browser.py
@ -43,7 +43,7 @@ else:
 __all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
-           'BasePage', 'BaseBrowser', 'ExpectedElementNotFound']
+           'BasePage', 'BaseBrowser']
 # Exceptions
@ -63,10 +63,6 @@ class BrowserRetry(Exception):
    pass
 class ExpectedElementNotFound(Exception):
    pass
 class NoHistory(object):
    """
    We don't want to fill memory with history
--- a/weboob/tools/parsers/lxmlparser.py
+++ b/weboob/tools/parsers/lxmlparser.py
@ -21,7 +21,48 @@ import lxml.html
 from .iparser import IParser
-__all__ = ['LxmlHtmlParser']
+__all__ = ['LxmlHtmlParser', 'select', 'SelectElementException']
 class SelectElementException(Exception):
    """Raised by select() when a selector does not match the expected number of elements."""
 def select(element, selector, nb=None, method='cssselect'):
    """
    Select one or many elements from an element, using lxml cssselect.

    @param element [obj]  element on which to apply selector
    @param selector [str]  CSS expression
    @param method [str]  only 'cssselect' is implemented for the moment
    @param nb [int|str]  number of elements expected to be found.
                     None: no check, the raw result of cssselect() is returned
                     (it may be empty).
                     'many': at least two elements are required, the whole
                     result list is returned.
                     positive int: at least nb elements are required; the first
                     element is returned when nb is 1, otherwise the whole
                     result list (which may hold more than nb elements).
    @return  one Element when nb is 1, otherwise a list of Element
    @raise SelectElementException  fewer elements than required were found
    @raise ValueError  nb is not None, 'many' or a positive integer
    @raise NotImplementedError  method is not 'cssselect'
    """
    if method != 'cssselect':
        raise NotImplementedError('Only cssselect method is implemented for the moment')

    results = element.cssselect(selector)
    if nb is None:
        return results

    # Plain equality gives the same answer as isinstance(nb, basestring) for
    # str and unicode values, without depending on a Python 2-only builtin.
    if nb == 'many':
        if not results:
            raise SelectElementException('Element not found with selector "%s"' % selector)
        if len(results) == 1:
            raise SelectElementException('Only one element found with selector "%s"' % selector)
        return results

    if isinstance(nb, int) and nb > 0:
        if results is None:
            raise SelectElementException('Element not found with selector "%s"' % selector)
        if len(results) < nb:
            raise SelectElementException('Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
        return results[0] if nb == 1 else results

    # ValueError is a subclass of Exception, so callers that caught the former
    # bare Exception keep working. The 1-tuple keeps the formatting safe when
    # nb itself is a tuple.
    raise ValueError('Unhandled value for kwarg "nb": %s' % (nb,))
 class LxmlHtmlParser(IParser):
@ -32,7 +73,10 @@ class LxmlHtmlParser(IParser):
    """
    def parse(self, data, encoding=None):
-        parser = lxml.html.HTMLParser(encoding=encoding)
+        if encoding is None:
            parser = None
        else:
            parser = lxml.html.HTMLParser(encoding=encoding)
        return lxml.html.parse(data, parser)
    def tostring(self, element):