new select() helper
This commit is contained in:
parent
eb026b7c3c
commit
b4c672fa46
6 changed files with 67 additions and 51 deletions
|
|
@ -18,7 +18,6 @@
|
|||
|
||||
import re
|
||||
|
||||
from weboob.tools.browser import ExpectedElementNotFound
|
||||
from weboob.backends.aum.pages.base import PageBase
|
||||
from logging import error
|
||||
|
||||
|
|
@ -47,4 +46,5 @@ class HomePage(PageBase):
|
|||
i += 1
|
||||
if i == 3:
|
||||
return int(font.firstChild.data)
|
||||
raise ExpectedElementNotFound(u'Could not parse number of charms available')
|
||||
logging.warning(u'Could not parse number of charms available')
|
||||
return 0
|
||||
|
|
|
|||
|
|
@ -16,9 +16,11 @@
|
|||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||
|
||||
|
||||
import datetime
|
||||
import re
|
||||
|
||||
from weboob.tools.browser import BasePage, ExpectedElementNotFound
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
from ..video import YoujizzVideo
|
||||
|
||||
|
|
@ -28,31 +30,20 @@ __all__ = ['IndexPage']
|
|||
|
||||
class IndexPage(BasePage):
|
||||
def iter_videos(self):
|
||||
div_id = 'span#miniatura'
|
||||
span_list = self.document.getroot().cssselect(div_id)
|
||||
if not span_list:
|
||||
raise ExpectedElementNotFound(div_id)
|
||||
|
||||
span_list = select(self.document.getroot(), 'span#miniatura')
|
||||
for span in span_list:
|
||||
a = span.find('.//a')
|
||||
if a is None:
|
||||
raise ExpectedElementNotFound('%s.//a' % span)
|
||||
a = select(span, 'a', 1)
|
||||
url = a.attrib['href']
|
||||
_id = re.sub(r'/videos/(.+)\.html', r'\1', url)
|
||||
|
||||
thumbnail_url = span.find('.//img').attrib['src']
|
||||
|
||||
title1_selector = 'span#title1'
|
||||
title1 = span.cssselect(title1_selector)
|
||||
if title1 is None:
|
||||
raise ExpectedElementNotFound(title1_selector)
|
||||
title = title1[0].text.strip()
|
||||
selector = 'span#title1'
|
||||
title_el = select(span, 'span#title1', 1)
|
||||
title = title_el.text.strip()
|
||||
|
||||
thumbtime = span.cssselect('span.thumbtime')
|
||||
minutes = seconds = 0
|
||||
if thumbtime is not None:
|
||||
time_span = thumbtime[0].find('span')
|
||||
minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
|
||||
time_span = select(span, 'span.thumbtime span', 1)
|
||||
minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
|
||||
|
||||
yield YoujizzVideo(_id,
|
||||
title=title,
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ import re
|
|||
import datetime
|
||||
from logging import warning
|
||||
|
||||
from weboob.tools.browser import ExpectedElementNotFound
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
from .base import PornPage
|
||||
from ..video import YoupornVideo
|
||||
|
|
@ -42,11 +42,7 @@ class VideoPage(PornPage):
|
|||
return el[0].cssselect('a')[0].attrib['href']
|
||||
|
||||
def get_title(self):
|
||||
selector = '#videoArea h1'
|
||||
try:
|
||||
element = self.document.getroot().cssselect(selector)[0]
|
||||
except IndexError:
|
||||
raise ExpectedElementNotFound(selector)
|
||||
element = select(self.document.getroot(), '#videoArea h1', 1)
|
||||
return unicode(element.getchildren()[0].tail).strip()
|
||||
|
||||
DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)")
|
||||
|
|
|
|||
|
|
@ -18,7 +18,8 @@
|
|||
|
||||
import re
|
||||
|
||||
from weboob.tools.browser import BasePage, ExpectedElementNotFound
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
from .video import YoutubeVideo
|
||||
|
||||
|
|
@ -32,11 +33,7 @@ class ForbiddenVideo(Exception):
|
|||
|
||||
class ForbiddenVideoPage(BasePage):
|
||||
def on_loaded(self):
|
||||
selector = '.yt-alert-content'
|
||||
try:
|
||||
element = self.document.getroot().cssselect(selector)[0]
|
||||
except IndexError:
|
||||
raise ExpectedElementNotFound(selector)
|
||||
element = select(self.document.getroot(), '.yt-alert-content', 1)
|
||||
raise ForbiddenVideo(element.text.strip())
|
||||
|
||||
|
||||
|
|
@ -57,19 +54,11 @@ class VideoPage(BasePage):
|
|||
)
|
||||
|
||||
def get_author(self):
|
||||
selector = 'a.watch-description-username strong'
|
||||
try:
|
||||
element = self.document.getroot().cssselect(selector)[0]
|
||||
except IndexError:
|
||||
raise ExpectedElementNotFound(selector)
|
||||
element = select(self.document.getroot(), 'a.watch-description-username strong', 1)
|
||||
return element.text.strip()
|
||||
|
||||
def get_title(self):
|
||||
selector = 'meta[name=title]'
|
||||
try:
|
||||
element = self.document.getroot().cssselect(selector)[0]
|
||||
except IndexError:
|
||||
raise ExpectedElementNotFound(selector)
|
||||
element = select(self.document.getroot(), 'meta[name=title]', 1)
|
||||
return unicode(element.attrib['content']).strip()
|
||||
|
||||
def get_url(self, _id):
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ else:
|
|||
|
||||
|
||||
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
|
||||
'BasePage', 'BaseBrowser', 'ExpectedElementNotFound']
|
||||
'BasePage', 'BaseBrowser']
|
||||
|
||||
|
||||
# Exceptions
|
||||
|
|
@ -63,10 +63,6 @@ class BrowserRetry(Exception):
|
|||
pass
|
||||
|
||||
|
||||
class ExpectedElementNotFound(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class NoHistory(object):
|
||||
"""
|
||||
We don't want to fill memory with history
|
||||
|
|
|
|||
|
|
@ -21,7 +21,48 @@ import lxml.html
|
|||
from .iparser import IParser
|
||||
|
||||
|
||||
__all__ = ['LxmlHtmlParser']
|
||||
__all__ = ['LxmlHtmlParser', 'select', 'SelectElementException']
|
||||
|
||||
|
||||
class SelectElementException(Exception):
    """Raised by select() when too few elements match the selector."""
    pass


def select(element, selector, nb=None, method='cssselect'):
    """
    Select one or many elements from an element, using lxml cssselect by default.

    Raises SelectElementException when fewer elements than requested are found.
    Raises ValueError when nb is not None, 'many' or a positive integer.
    Raises NotImplementedError when method is anything other than 'cssselect'.

    @param element [obj] element on which to apply selector
    @param selector [str] CSS or XPath expression
    @param nb [int] number of elements expected to be found.
                    Use None for undefined number (the raw result list is
                    returned, possibly empty), and 'many' for 1 to infinite.
                    With a positive integer, at least that many elements
                    must match; extra matches are not an error.
    @param method [str] (cssselect|xpath) -- only cssselect is implemented
    @return a single Element when nb == 1 (the first match), otherwise the
            list of all matching Elements
    """
    if method != 'cssselect':
        raise NotImplementedError('Only cssselect method is implemented for the moment')

    results = element.cssselect(selector)

    # No expectation on the count: hand back whatever was matched.
    if nb is None:
        return results

    if nb == 'many':
        # 'many' is documented as "1 to infinite": a single match is valid,
        # only an empty result is an error.
        if results is None or len(results) == 0:
            raise SelectElementException('Element not found with selector "%s"' % selector)
        return results

    if isinstance(nb, int) and nb > 0:
        if results is None:
            raise SelectElementException('Element not found with selector "%s"' % selector)
        if len(results) < nb:
            raise SelectElementException(
                'Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
        # nb == 1 unwraps the first match; larger counts return every match.
        return results[0] if nb == 1 else results

    # Wrap nb in a tuple so that a tuple value cannot break the % formatting.
    raise ValueError('Unhandled value for kwarg "nb": %s' % (nb,))
|
||||
|
||||
|
||||
class LxmlHtmlParser(IParser):
|
||||
|
|
@ -32,7 +73,10 @@ class LxmlHtmlParser(IParser):
|
|||
"""
|
||||
|
||||
def parse(self, data, encoding=None):
|
||||
parser = lxml.html.HTMLParser(encoding=encoding)
|
||||
if encoding is None:
|
||||
parser = None
|
||||
else:
|
||||
parser = lxml.html.HTMLParser(encoding=encoding)
|
||||
return lxml.html.parse(data, parser)
|
||||
|
||||
def tostring(self, element):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue