new select() helper

2010-07-14 02:27:40 +02:00 · 2010-07-14 02:27:40 +02:00 · b4c672fa46
commit b4c672fa46
parent eb026b7c3c
6 changed files with 67 additions and 51 deletions
--- a/weboob/backends/aum/pages/home.py
+++ b/weboob/backends/aum/pages/home.py
@ -18,7 +18,6 @@

 import re

-from weboob.tools.browser import ExpectedElementNotFound
 from weboob.backends.aum.pages.base import PageBase
 from logging import error

@ -47,4 +46,5 @@ class HomePage(PageBase):
                        i += 1
                        if i == 3:
                            return int(font.firstChild.data)
-        raise ExpectedElementNotFound(u'Could not parse number of charms available')
+        error(u'Could not parse number of charms available')  # only `error` is imported from logging here; the bare `logging` name is not in scope
+        return 0  # best-effort fallback: report zero charms instead of raising
--- a/weboob/backends/youjizz/pages/index.py
+++ b/weboob/backends/youjizz/pages/index.py
@ -16,9 +16,11 @@
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.


+import datetime
 import re

-from weboob.tools.browser import BasePage, ExpectedElementNotFound
+from weboob.tools.browser import BasePage
+from weboob.tools.parsers.lxmlparser import select

 from ..video import YoujizzVideo

@ -28,31 +30,20 @@ __all__ = ['IndexPage']

 class IndexPage(BasePage):
    def iter_videos(self):
-        div_id = 'span#miniatura'
-        span_list = self.document.getroot().cssselect(div_id)
-        if not span_list:
-            raise ExpectedElementNotFound(div_id)
-
+        span_list = select(self.document.getroot(), 'span#miniatura')
        for span in span_list:
-            a = span.find('.//a')
-            if a is None:
-                raise ExpectedElementNotFound('%s.//a' % span)
+            a = select(span, 'a', 1)
            url = a.attrib['href']
            _id = re.sub(r'/videos/(.+)\.html', r'\1', url)

            thumbnail_url = span.find('.//img').attrib['src']

-            title1_selector = 'span#title1'
-            title1 = span.cssselect(title1_selector)
-            if title1 is None:
-                raise ExpectedElementNotFound(title1_selector)
-            title = title1[0].text.strip()
+            selector = 'span#title1'
+            title_el = select(span, selector, 1)  # nb=1: first match is returned, raises if there is none
+            title = title_el.text.strip()

-            thumbtime = span.cssselect('span.thumbtime')
-            minutes = seconds = 0
-            if thumbtime is not None:
-                time_span = thumbtime[0].find('span')
-                minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
+            time_span = select(span, 'span.thumbtime span', 1)
+            minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))

            yield YoujizzVideo(_id,
                               title=title,
--- a/weboob/backends/youporn/pages/video.py
+++ b/weboob/backends/youporn/pages/video.py
@ -20,7 +20,7 @@ import re
 import datetime
 from logging import warning

-from weboob.tools.browser import ExpectedElementNotFound
+from weboob.tools.parsers.lxmlparser import select

 from .base import PornPage
 from ..video import YoupornVideo
@ -42,11 +42,7 @@ class VideoPage(PornPage):
            return el[0].cssselect('a')[0].attrib['href']

    def get_title(self):
-        selector = '#videoArea h1'
-        try:
-            element = self.document.getroot().cssselect(selector)[0]
-        except IndexError:
-            raise ExpectedElementNotFound(selector)
+        element = select(self.document.getroot(), '#videoArea h1', 1)
        return unicode(element.getchildren()[0].tail).strip()

    DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)")
--- a/weboob/backends/youtube/pages.py
+++ b/weboob/backends/youtube/pages.py
@ -18,7 +18,8 @@

 import re

-from weboob.tools.browser import BasePage, ExpectedElementNotFound
+from weboob.tools.browser import BasePage
+from weboob.tools.parsers.lxmlparser import select

 from .video import YoutubeVideo

@ -32,11 +33,7 @@ class ForbiddenVideo(Exception):

 class ForbiddenVideoPage(BasePage):
    def on_loaded(self):
-        selector = '.yt-alert-content'
-        try:
-            element = self.document.getroot().cssselect(selector)[0]
-        except IndexError:
-            raise ExpectedElementNotFound(selector)
+        element = select(self.document.getroot(), '.yt-alert-content', 1)
        raise ForbiddenVideo(element.text.strip())


@ -57,19 +54,11 @@ class VideoPage(BasePage):
                                  )

    def get_author(self):
-        selector = 'a.watch-description-username strong'
-        try:
-            element = self.document.getroot().cssselect(selector)[0]
-        except IndexError:
-            raise ExpectedElementNotFound(selector)
+        element = select(self.document.getroot(), 'a.watch-description-username strong', 1)
        return element.text.strip()

    def get_title(self):
-        selector = 'meta[name=title]'
-        try:
-            element = self.document.getroot().cssselect(selector)[0]
-        except IndexError:
-            raise ExpectedElementNotFound(selector)
+        element = select(self.document.getroot(), 'meta[name=title]', 1)
        return unicode(element.attrib['content']).strip()

    def get_url(self, _id):
--- a/weboob/tools/browser/browser.py
+++ b/weboob/tools/browser/browser.py
@ -43,7 +43,7 @@ else:


 __all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
-           'BasePage', 'BaseBrowser', 'ExpectedElementNotFound']
+           'BasePage', 'BaseBrowser']


 # Exceptions
@ -63,10 +63,6 @@ class BrowserRetry(Exception):
    pass


-class ExpectedElementNotFound(Exception):
-    pass
-
-
 class NoHistory(object):
    """
    We don't want to fill memory with history
--- a/weboob/tools/parsers/lxmlparser.py
+++ b/weboob/tools/parsers/lxmlparser.py
@ -21,7 +21,48 @@ import lxml.html
 from .iparser import IParser


-__all__ = ['LxmlHtmlParser']
+__all__ = ['LxmlHtmlParser', 'select', 'SelectElementException']
+
+
+class SelectElementException(Exception):
+    """Raised by select() when a selector matches fewer elements than expected."""
+
+
+def select(element, selector, nb=None, method='cssselect'):
+    """
+    Select one or many elements from an element, using lxml cssselect by default.
+
+    Raises SelectElementException if fewer elements than expected are found.
+
+    @param element [obj]  element on which to apply selector
+    @param selector [str]  CSS or XPath expression
+    @param nb [int]  number of elements expected to be found.
+                     Use None for undefined number, and 'many' for 1 to infinite.
+    @param method [str]  (cssselect|xpath); only cssselect is implemented
+    @return  the first Element if nb == 1, otherwise the list of matches
+    @raise ValueError  if nb is not None, 'many' or a positive integer
+    @raise NotImplementedError  if method is not 'cssselect'
+    """
+    if method != 'cssselect':
+        raise NotImplementedError('Only cssselect method is implemented for the moment')
+    results = element.cssselect(selector)
+    if nb is None:
+        # no expectation: hand back whatever matched, possibly an empty list
+        return results
+    if nb == 'many':
+        # 'many' means one or more, so a single match is a valid result
+        if not results:
+            raise SelectElementException('Element not found with selector "%s"' % selector)
+        return results
+    if isinstance(nb, int) and nb > 0:
+        # cssselect yields a list rather than None, so emptiness is the "not found" case
+        if not results:
+            raise SelectElementException('Element not found with selector "%s"' % selector)
+        if len(results) < nb:
+            raise SelectElementException('Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
+        return results[0] if nb == 1 else results
+    # ValueError is still an Exception, so callers catching Exception keep working
+    raise ValueError('Unhandled value for kwarg "nb": %s' % nb)


 class LxmlHtmlParser(IParser):
@ -32,7 +73,10 @@ class LxmlHtmlParser(IParser):
    """

    def parse(self, data, encoding=None):
-        parser = lxml.html.HTMLParser(encoding=encoding)
+        if encoding is None:
+            parser = None
+        else:
+            parser = lxml.html.HTMLParser(encoding=encoding)
        return lxml.html.parse(data, parser)

    def tostring(self, element):