new select() helper

This commit is contained in:
Christophe Benz 2010-07-14 02:27:40 +02:00 committed by Romain Bignon
commit b4c672fa46
6 changed files with 67 additions and 51 deletions

View file

@ -18,7 +18,6 @@
import re import re
from weboob.tools.browser import ExpectedElementNotFound
from weboob.backends.aum.pages.base import PageBase from weboob.backends.aum.pages.base import PageBase
from logging import error from logging import error
@ -47,4 +46,5 @@ class HomePage(PageBase):
i += 1 i += 1
if i == 3: if i == 3:
return int(font.firstChild.data) return int(font.firstChild.data)
raise ExpectedElementNotFound(u'Could not parse number of charms available') logging.warning(u'Could not parse number of charms available')
return 0

View file

@ -16,9 +16,11 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import datetime
import re import re
from weboob.tools.browser import BasePage, ExpectedElementNotFound from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
from ..video import YoujizzVideo from ..video import YoujizzVideo
@ -28,31 +30,20 @@ __all__ = ['IndexPage']
class IndexPage(BasePage): class IndexPage(BasePage):
def iter_videos(self): def iter_videos(self):
div_id = 'span#miniatura' span_list = select(self.document.getroot(), 'span#miniatura')
span_list = self.document.getroot().cssselect(div_id)
if not span_list:
raise ExpectedElementNotFound(div_id)
for span in span_list: for span in span_list:
a = span.find('.//a') a = select(span, 'a', 1)
if a is None:
raise ExpectedElementNotFound('%s.//a' % span)
url = a.attrib['href'] url = a.attrib['href']
_id = re.sub(r'/videos/(.+)\.html', r'\1', url) _id = re.sub(r'/videos/(.+)\.html', r'\1', url)
thumbnail_url = span.find('.//img').attrib['src'] thumbnail_url = span.find('.//img').attrib['src']
title1_selector = 'span#title1' selector = 'span#title1'
title1 = span.cssselect(title1_selector) title_el = select(span, 'span#title1', 1)
if title1 is None: title = title_el.text.strip()
raise ExpectedElementNotFound(title1_selector)
title = title1[0].text.strip()
thumbtime = span.cssselect('span.thumbtime') time_span = select(span, 'span.thumbtime span', 1)
minutes = seconds = 0 minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
if thumbtime is not None:
time_span = thumbtime[0].find('span')
minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
yield YoujizzVideo(_id, yield YoujizzVideo(_id,
title=title, title=title,

View file

@ -20,7 +20,7 @@ import re
import datetime import datetime
from logging import warning from logging import warning
from weboob.tools.browser import ExpectedElementNotFound from weboob.tools.parsers.lxmlparser import select
from .base import PornPage from .base import PornPage
from ..video import YoupornVideo from ..video import YoupornVideo
@ -42,11 +42,7 @@ class VideoPage(PornPage):
return el[0].cssselect('a')[0].attrib['href'] return el[0].cssselect('a')[0].attrib['href']
def get_title(self): def get_title(self):
selector = '#videoArea h1' element = select(self.document.getroot(), '#videoArea h1', 1)
try:
element = self.document.getroot().cssselect(selector)[0]
except IndexError:
raise ExpectedElementNotFound(selector)
return unicode(element.getchildren()[0].tail).strip() return unicode(element.getchildren()[0].tail).strip()
DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)") DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)")

View file

@ -18,7 +18,8 @@
import re import re
from weboob.tools.browser import BasePage, ExpectedElementNotFound from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
from .video import YoutubeVideo from .video import YoutubeVideo
@ -32,11 +33,7 @@ class ForbiddenVideo(Exception):
class ForbiddenVideoPage(BasePage): class ForbiddenVideoPage(BasePage):
def on_loaded(self): def on_loaded(self):
selector = '.yt-alert-content' element = select(self.document.getroot(), '.yt-alert-content', 1)
try:
element = self.document.getroot().cssselect(selector)[0]
except IndexError:
raise ExpectedElementNotFound(selector)
raise ForbiddenVideo(element.text.strip()) raise ForbiddenVideo(element.text.strip())
@ -57,19 +54,11 @@ class VideoPage(BasePage):
) )
def get_author(self): def get_author(self):
selector = 'a.watch-description-username strong' element = select(self.document.getroot(), 'a.watch-description-username strong', 1)
try:
element = self.document.getroot().cssselect(selector)[0]
except IndexError:
raise ExpectedElementNotFound(selector)
return element.text.strip() return element.text.strip()
def get_title(self): def get_title(self):
selector = 'meta[name=title]' element = select(self.document.getroot(), 'meta[name=title]', 1)
try:
element = self.document.getroot().cssselect(selector)[0]
except IndexError:
raise ExpectedElementNotFound(selector)
return unicode(element.attrib['content']).strip() return unicode(element.attrib['content']).strip()
def get_url(self, _id): def get_url(self, _id):

View file

@ -43,7 +43,7 @@ else:
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry', __all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
'BasePage', 'BaseBrowser', 'ExpectedElementNotFound'] 'BasePage', 'BaseBrowser']
# Exceptions # Exceptions
@ -63,10 +63,6 @@ class BrowserRetry(Exception):
pass pass
class ExpectedElementNotFound(Exception):
pass
class NoHistory(object): class NoHistory(object):
""" """
We don't want to fill memory with history We don't want to fill memory with history

View file

@ -21,7 +21,48 @@ import lxml.html
from .iparser import IParser from .iparser import IParser
__all__ = ['LxmlHtmlParser'] __all__ = ['LxmlHtmlParser', 'select', 'SelectElementException']
class SelectElementException(Exception):
    """Error raised by select() when the expected elements are not found."""
def select(element, selector, nb=None, method='cssselect'):
    """
    Select one or many elements from an element, using lxml cssselect by default.

    @param element   [obj]  element on which to apply selector
    @param selector  [str]  CSS or XPath expression
    @param nb        [int or str]  number of elements expected to be found:
                     None returns whatever matched (possibly an empty list),
                     'many' requires at least two matches,
                     a positive integer requires at least that many matches.
    @param method    [str]  (cssselect|xpath); only cssselect is implemented
    @return  the first matching element when nb == 1, otherwise the list of
             all matching elements
    @raise SelectElementException  if fewer elements than required are found
    @raise ValueError  if nb is not None, 'many' or a positive integer
    @raise NotImplementedError  if method is not 'cssselect'
    """
    if method != 'cssselect':
        raise NotImplementedError('Only cssselect method is implemented for the moment')

    results = element.cssselect(selector)
    if nb is None:
        return results

    # Comparing directly against the literal works for both byte and unicode
    # strings and is simply False for integers, so no type check is needed.
    if nb == 'many':
        if not results:
            raise SelectElementException('Element not found with selector "%s"' % selector)
        if len(results) == 1:
            raise SelectElementException('Only one element found with selector "%s"' % selector)
        return results

    if isinstance(nb, int) and nb > 0:
        # cssselect returns a list, never None: test for emptiness so that a
        # selector matching nothing is reported as "not found".
        if not results:
            raise SelectElementException('Element not found with selector "%s"' % selector)
        if len(results) < nb:
            raise SelectElementException(
                'Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
        # nb is a minimum: extra matches are tolerated and, except for the
        # single-element case, returned to the caller.
        return results[0] if nb == 1 else results

    # Wrap nb in a tuple so that tuple values do not break the formatting.
    raise ValueError('Unhandled value for kwarg "nb": %s' % (nb,))
class LxmlHtmlParser(IParser): class LxmlHtmlParser(IParser):
@ -32,7 +73,10 @@ class LxmlHtmlParser(IParser):
""" """
def parse(self, data, encoding=None): def parse(self, data, encoding=None):
parser = lxml.html.HTMLParser(encoding=encoding) if encoding is None:
parser = None
else:
parser = lxml.html.HTMLParser(encoding=encoding)
return lxml.html.parse(data, parser) return lxml.html.parse(data, parser)
def tostring(self, element): def tostring(self, element):