new select() helper

This commit is contained in:
Christophe Benz 2010-07-14 02:27:40 +02:00 committed by Romain Bignon
commit b4c672fa46
6 changed files with 67 additions and 51 deletions

View file

@ -18,7 +18,6 @@
import re
from weboob.tools.browser import ExpectedElementNotFound
from weboob.backends.aum.pages.base import PageBase
from logging import error
@ -47,4 +46,5 @@ class HomePage(PageBase):
i += 1
if i == 3:
return int(font.firstChild.data)
raise ExpectedElementNotFound(u'Could not parse number of charms available')
logging.warning(u'Could not parse number of charms available')
return 0

View file

@ -16,9 +16,11 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import datetime
import re
from weboob.tools.browser import BasePage, ExpectedElementNotFound
from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
from ..video import YoujizzVideo
@ -28,31 +30,20 @@ __all__ = ['IndexPage']
class IndexPage(BasePage):
def iter_videos(self):
div_id = 'span#miniatura'
span_list = self.document.getroot().cssselect(div_id)
if not span_list:
raise ExpectedElementNotFound(div_id)
span_list = select(self.document.getroot(), 'span#miniatura')
for span in span_list:
a = span.find('.//a')
if a is None:
raise ExpectedElementNotFound('%s.//a' % span)
a = select(span, 'a', 1)
url = a.attrib['href']
_id = re.sub(r'/videos/(.+)\.html', r'\1', url)
thumbnail_url = span.find('.//img').attrib['src']
title1_selector = 'span#title1'
title1 = span.cssselect(title1_selector)
if title1 is None:
raise ExpectedElementNotFound(title1_selector)
title = title1[0].text.strip()
selector = 'span#title1'
title_el = select(span, 'span#title1', 1)
title = title_el.text.strip()
thumbtime = span.cssselect('span.thumbtime')
minutes = seconds = 0
if thumbtime is not None:
time_span = thumbtime[0].find('span')
minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
time_span = select(span, 'span.thumbtime span', 1)
minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
yield YoujizzVideo(_id,
title=title,

View file

@ -20,7 +20,7 @@ import re
import datetime
from logging import warning
from weboob.tools.browser import ExpectedElementNotFound
from weboob.tools.parsers.lxmlparser import select
from .base import PornPage
from ..video import YoupornVideo
@ -42,11 +42,7 @@ class VideoPage(PornPage):
return el[0].cssselect('a')[0].attrib['href']
def get_title(self):
selector = '#videoArea h1'
try:
element = self.document.getroot().cssselect(selector)[0]
except IndexError:
raise ExpectedElementNotFound(selector)
element = select(self.document.getroot(), '#videoArea h1', 1)
return unicode(element.getchildren()[0].tail).strip()
DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)")

View file

@ -18,7 +18,8 @@
import re
from weboob.tools.browser import BasePage, ExpectedElementNotFound
from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
from .video import YoutubeVideo
@ -32,11 +33,7 @@ class ForbiddenVideo(Exception):
class ForbiddenVideoPage(BasePage):
def on_loaded(self):
selector = '.yt-alert-content'
try:
element = self.document.getroot().cssselect(selector)[0]
except IndexError:
raise ExpectedElementNotFound(selector)
element = select(self.document.getroot(), '.yt-alert-content', 1)
raise ForbiddenVideo(element.text.strip())
@ -57,19 +54,11 @@ class VideoPage(BasePage):
)
def get_author(self):
selector = 'a.watch-description-username strong'
try:
element = self.document.getroot().cssselect(selector)[0]
except IndexError:
raise ExpectedElementNotFound(selector)
element = select(self.document.getroot(), 'a.watch-description-username strong', 1)
return element.text.strip()
def get_title(self):
selector = 'meta[name=title]'
try:
element = self.document.getroot().cssselect(selector)[0]
except IndexError:
raise ExpectedElementNotFound(selector)
element = select(self.document.getroot(), 'meta[name=title]', 1)
return unicode(element.attrib['content']).strip()
def get_url(self, _id):

View file

@ -43,7 +43,7 @@ else:
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
'BasePage', 'BaseBrowser', 'ExpectedElementNotFound']
'BasePage', 'BaseBrowser']
# Exceptions
@ -63,10 +63,6 @@ class BrowserRetry(Exception):
pass
class ExpectedElementNotFound(Exception):
pass
class NoHistory(object):
"""
We don't want to fill memory with history

View file

@ -21,7 +21,48 @@ import lxml.html
from .iparser import IParser
__all__ = ['LxmlHtmlParser']
__all__ = ['LxmlHtmlParser', 'select', 'SelectElementException']
class SelectElementException(Exception):
    """Error raised by select() when a selector does not match the expected number of elements."""
def select(element, selector, nb=None, method='cssselect'):
    """
    Select one or many elements from an element, using lxml cssselect by default.

    Raises SelectElementException if the expected elements are not found.

    @param element [obj]  element on which to apply selector
    @param selector [str]  CSS or XPath expression
    @param nb [int]  number of elements expected to be found.
                     Use None to get every match back unchecked (possibly an
                     empty list), and 'many' to require at least two matches.
                     A positive integer requires at least that many matches.
    @param method [str]  (cssselect|xpath); only cssselect is implemented,
                         anything else raises NotImplementedError
    @return  the first Element when nb is 1, otherwise the list of all matches
    """
    if method != 'cssselect':
        raise NotImplementedError('Only cssselect method is implemented for the moment')

    results = element.cssselect(selector)
    if nb is None:
        return results

    # Plain equality is enough here: an int never equals 'many', and it avoids
    # the Python-2-only ``basestring`` builtin.
    if nb == 'many':
        if results is None or len(results) == 0:
            raise SelectElementException('Element not found with selector "%s"' % selector)
        if len(results) == 1:
            raise SelectElementException('Only one element found with selector "%s"' % selector)
        return results

    if isinstance(nb, int) and nb > 0:
        if results is None:
            raise SelectElementException('Element not found with selector "%s"' % selector)
        if len(results) < nb:
            raise SelectElementException(
                'Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
        # A single expected element is returned unwrapped; otherwise all
        # matches are returned, even when there are more than nb of them.
        return results[0] if nb == 1 else results

    # Wrap nb in a tuple so that tuple values do not break the % formatting.
    raise Exception('Unhandled value for kwarg "nb": %s' % (nb,))
class LxmlHtmlParser(IParser):
@ -32,7 +73,10 @@ class LxmlHtmlParser(IParser):
"""
def parse(self, data, encoding=None):
    """
    Parse a document with lxml.html.

    @param data  source handed unchanged to lxml.html.parse
    @param encoding [str]  encoding forced on the HTML parser; when None,
                           no explicit parser is given to lxml.html.parse
    @return  whatever lxml.html.parse returns for data
    """
    # Build a dedicated parser only when an encoding is forced; the parser
    # previously created up front was always overwritten before use.
    parser = None if encoding is None else lxml.html.HTMLParser(encoding=encoding)
    return lxml.html.parse(data, parser)
def tostring(self, element):