new select() helper
This commit is contained in:
parent
eb026b7c3c
commit
b4c672fa46
6 changed files with 67 additions and 51 deletions
|
|
@ -18,7 +18,6 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from weboob.tools.browser import ExpectedElementNotFound
|
|
||||||
from weboob.backends.aum.pages.base import PageBase
|
from weboob.backends.aum.pages.base import PageBase
|
||||||
from logging import error
|
from logging import error
|
||||||
|
|
||||||
|
|
@ -47,4 +46,5 @@ class HomePage(PageBase):
|
||||||
i += 1
|
i += 1
|
||||||
if i == 3:
|
if i == 3:
|
||||||
return int(font.firstChild.data)
|
return int(font.firstChild.data)
|
||||||
raise ExpectedElementNotFound(u'Could not parse number of charms available')
|
logging.warning(u'Could not parse number of charms available')
|
||||||
|
return 0
|
||||||
|
|
|
||||||
|
|
@ -16,9 +16,11 @@
|
||||||
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
|
||||||
|
import datetime
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from weboob.tools.browser import BasePage, ExpectedElementNotFound
|
from weboob.tools.browser import BasePage
|
||||||
|
from weboob.tools.parsers.lxmlparser import select
|
||||||
|
|
||||||
from ..video import YoujizzVideo
|
from ..video import YoujizzVideo
|
||||||
|
|
||||||
|
|
@ -28,30 +30,19 @@ __all__ = ['IndexPage']
|
||||||
|
|
||||||
class IndexPage(BasePage):
|
class IndexPage(BasePage):
|
||||||
def iter_videos(self):
|
def iter_videos(self):
|
||||||
div_id = 'span#miniatura'
|
span_list = select(self.document.getroot(), 'span#miniatura')
|
||||||
span_list = self.document.getroot().cssselect(div_id)
|
|
||||||
if not span_list:
|
|
||||||
raise ExpectedElementNotFound(div_id)
|
|
||||||
|
|
||||||
for span in span_list:
|
for span in span_list:
|
||||||
a = span.find('.//a')
|
a = select(span, 'a', 1)
|
||||||
if a is None:
|
|
||||||
raise ExpectedElementNotFound('%s.//a' % span)
|
|
||||||
url = a.attrib['href']
|
url = a.attrib['href']
|
||||||
_id = re.sub(r'/videos/(.+)\.html', r'\1', url)
|
_id = re.sub(r'/videos/(.+)\.html', r'\1', url)
|
||||||
|
|
||||||
thumbnail_url = span.find('.//img').attrib['src']
|
thumbnail_url = span.find('.//img').attrib['src']
|
||||||
|
|
||||||
title1_selector = 'span#title1'
|
selector = 'span#title1'
|
||||||
title1 = span.cssselect(title1_selector)
|
title_el = select(span, 'span#title1', 1)
|
||||||
if title1 is None:
|
title = title_el.text.strip()
|
||||||
raise ExpectedElementNotFound(title1_selector)
|
|
||||||
title = title1[0].text.strip()
|
|
||||||
|
|
||||||
thumbtime = span.cssselect('span.thumbtime')
|
time_span = select(span, 'span.thumbtime span', 1)
|
||||||
minutes = seconds = 0
|
|
||||||
if thumbtime is not None:
|
|
||||||
time_span = thumbtime[0].find('span')
|
|
||||||
minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
|
minutes, seconds = (int(v) for v in time_span.text.strip().split(':'))
|
||||||
|
|
||||||
yield YoujizzVideo(_id,
|
yield YoujizzVideo(_id,
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ import re
|
||||||
import datetime
|
import datetime
|
||||||
from logging import warning
|
from logging import warning
|
||||||
|
|
||||||
from weboob.tools.browser import ExpectedElementNotFound
|
from weboob.tools.parsers.lxmlparser import select
|
||||||
|
|
||||||
from .base import PornPage
|
from .base import PornPage
|
||||||
from ..video import YoupornVideo
|
from ..video import YoupornVideo
|
||||||
|
|
@ -42,11 +42,7 @@ class VideoPage(PornPage):
|
||||||
return el[0].cssselect('a')[0].attrib['href']
|
return el[0].cssselect('a')[0].attrib['href']
|
||||||
|
|
||||||
def get_title(self):
|
def get_title(self):
|
||||||
selector = '#videoArea h1'
|
element = select(self.document.getroot(), '#videoArea h1', 1)
|
||||||
try:
|
|
||||||
element = self.document.getroot().cssselect(selector)[0]
|
|
||||||
except IndexError:
|
|
||||||
raise ExpectedElementNotFound(selector)
|
|
||||||
return unicode(element.getchildren()[0].tail).strip()
|
return unicode(element.getchildren()[0].tail).strip()
|
||||||
|
|
||||||
DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)")
|
DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)")
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,8 @@
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from weboob.tools.browser import BasePage, ExpectedElementNotFound
|
from weboob.tools.browser import BasePage
|
||||||
|
from weboob.tools.parsers.lxmlparser import select
|
||||||
|
|
||||||
from .video import YoutubeVideo
|
from .video import YoutubeVideo
|
||||||
|
|
||||||
|
|
@ -32,11 +33,7 @@ class ForbiddenVideo(Exception):
|
||||||
|
|
||||||
class ForbiddenVideoPage(BasePage):
|
class ForbiddenVideoPage(BasePage):
|
||||||
def on_loaded(self):
|
def on_loaded(self):
|
||||||
selector = '.yt-alert-content'
|
element = select(self.document.getroot(), '.yt-alert-content', 1)
|
||||||
try:
|
|
||||||
element = self.document.getroot().cssselect(selector)[0]
|
|
||||||
except IndexError:
|
|
||||||
raise ExpectedElementNotFound(selector)
|
|
||||||
raise ForbiddenVideo(element.text.strip())
|
raise ForbiddenVideo(element.text.strip())
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -57,19 +54,11 @@ class VideoPage(BasePage):
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_author(self):
|
def get_author(self):
|
||||||
selector = 'a.watch-description-username strong'
|
element = select(self.document.getroot(), 'a.watch-description-username strong', 1)
|
||||||
try:
|
|
||||||
element = self.document.getroot().cssselect(selector)[0]
|
|
||||||
except IndexError:
|
|
||||||
raise ExpectedElementNotFound(selector)
|
|
||||||
return element.text.strip()
|
return element.text.strip()
|
||||||
|
|
||||||
def get_title(self):
|
def get_title(self):
|
||||||
selector = 'meta[name=title]'
|
element = select(self.document.getroot(), 'meta[name=title]', 1)
|
||||||
try:
|
|
||||||
element = self.document.getroot().cssselect(selector)[0]
|
|
||||||
except IndexError:
|
|
||||||
raise ExpectedElementNotFound(selector)
|
|
||||||
return unicode(element.attrib['content']).strip()
|
return unicode(element.attrib['content']).strip()
|
||||||
|
|
||||||
def get_url(self, _id):
|
def get_url(self, _id):
|
||||||
|
|
|
||||||
|
|
@ -43,7 +43,7 @@ else:
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
|
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
|
||||||
'BasePage', 'BaseBrowser', 'ExpectedElementNotFound']
|
'BasePage', 'BaseBrowser']
|
||||||
|
|
||||||
|
|
||||||
# Exceptions
|
# Exceptions
|
||||||
|
|
@ -63,10 +63,6 @@ class BrowserRetry(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ExpectedElementNotFound(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class NoHistory(object):
|
class NoHistory(object):
|
||||||
"""
|
"""
|
||||||
We don't want to fill memory with history
|
We don't want to fill memory with history
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,48 @@ import lxml.html
|
||||||
from .iparser import IParser
|
from .iparser import IParser
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['LxmlHtmlParser']
|
__all__ = ['LxmlHtmlParser', 'select', 'SelectElementException']
|
||||||
|
|
||||||
|
|
||||||
|
class SelectElementException(Exception):
    """Raised by select() when the matched elements do not meet the expected count."""


def select(element, selector, nb=None, method='cssselect'):
    """
    Select one or many elements from an element, using lxml cssselect by default.

    @param element [obj]  element on which to apply selector
    @param selector [str]  CSS or XPath expression
    @param nb [int]  number of elements expected to be found.
                     Use None for an undefined number (the raw result list is
                     returned, even when empty), and 'many' for at least two
                     elements.
    @param method [str]  (cssselect|xpath); only cssselect is implemented
    @return  a single Element when nb is 1, otherwise the list of matched Elements
    @raise SelectElementException  if nothing matches, if 'many' matches a
                                   single element, or if fewer than nb match
    @raise ValueError  if nb is not None, 'many' or a positive integer
    @raise NotImplementedError  if method is not 'cssselect'
    """
    if method != 'cssselect':
        raise NotImplementedError('Only cssselect method is implemented for the moment')

    results = element.cssselect(selector)
    if nb is None:
        return results

    # A plain equality test is enough to recognise the 'many' marker and
    # avoids relying on the Python 2 only `basestring` builtin.
    expect_many = nb == 'many'
    if not expect_many and not (isinstance(nb, int) and nb > 0):
        # ValueError is still an Exception, so existing handlers keep working.
        # The 1-tuple keeps the formatting safe when nb is itself a tuple.
        raise ValueError('Unhandled value for kwarg "nb": %s' % (nb,))

    # cssselect returns a list, so "nothing found" shows up as an empty
    # result rather than None.
    if not results:
        raise SelectElementException('Element not found with selector "%s"' % selector)

    if expect_many:
        if len(results) == 1:
            raise SelectElementException('Only one element found with selector "%s"' % selector)
        return results

    if len(results) < nb:
        raise SelectElementException(
            'Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
    # Extra matches beyond nb are tolerated; only a shortfall is an error.
    return results[0] if nb == 1 else results
|
||||||
|
|
||||||
|
|
||||||
class LxmlHtmlParser(IParser):
|
class LxmlHtmlParser(IParser):
|
||||||
|
|
@ -32,6 +73,9 @@ class LxmlHtmlParser(IParser):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def parse(self, data, encoding=None):
|
def parse(self, data, encoding=None):
|
||||||
|
if encoding is None:
|
||||||
|
parser = None
|
||||||
|
else:
|
||||||
parser = lxml.html.HTMLParser(encoding=encoding)
|
parser = lxml.html.HTMLParser(encoding=encoding)
|
||||||
return lxml.html.parse(data, parser)
|
return lxml.html.parse(data, parser)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue