move select() in parser
This commit is contained in:
parent
cf2dca7520
commit
9afb301ebe
30 changed files with 197 additions and 197 deletions
|
|
@ -22,7 +22,7 @@ import re
|
|||
import urllib
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
|
||||
from .video import ArteVideo
|
||||
|
||||
|
|
@ -40,13 +40,13 @@ class IndexPage(BasePage):
|
|||
if m:
|
||||
_id = m.group(1)
|
||||
rating = rating_max = 0
|
||||
rates = select(div, 'div[class=rateContainer]', 1)
|
||||
rates = self.parser.select(div, 'div[class=rateContainer]', 1)
|
||||
for r in rates.findall('div'):
|
||||
if 'star-rating-on' in r.attrib['class']:
|
||||
rating += 1
|
||||
rating_max += 1
|
||||
|
||||
thumb = select(div, 'img[class=thumbnail]', 1)
|
||||
thumb = self.parser.select(div, 'img[class=thumbnail]', 1)
|
||||
thumbnail_url = 'http://videos.arte.tv' + thumb.attrib['src']
|
||||
|
||||
yield ArteVideo(_id,
|
||||
|
|
@ -67,12 +67,12 @@ class VideoPage(BasePage):
|
|||
return self.document.getroot().cssselect('h2')[0].text
|
||||
|
||||
def get_url(self, lang, quality):
|
||||
obj = select(self.document.getroot(), 'object', 1)
|
||||
movie_url = select(obj, 'param[name=movie]', 1)
|
||||
obj = self.parser.select(self.document.getroot(), 'object', 1)
|
||||
movie_url = self.parser.select(obj, 'param[name=movie]', 1)
|
||||
xml_url = urllib.unquote(movie_url.attrib['value'].split('videorefFileUrl=')[-1])
|
||||
|
||||
doc = self.browser.get_document(self.browser.openurl(xml_url))
|
||||
videos_list = select(doc.getroot(), 'video')
|
||||
videos_list = self.parser.select(doc.getroot(), 'video')
|
||||
videos = {}
|
||||
for v in videos_list:
|
||||
videos[v.attrib['lang']] = v.attrib['ref']
|
||||
|
|
@ -84,8 +84,8 @@ class VideoPage(BasePage):
|
|||
|
||||
doc = self.browser.get_document(self.browser.openurl(xml_url))
|
||||
|
||||
obj = select(doc.getroot(), 'urls', 1)
|
||||
videos_list = select(obj, 'url')
|
||||
obj = self.parser.select(doc.getroot(), 'urls', 1)
|
||||
videos_list = self.parser.select(obj, 'url')
|
||||
urls = {}
|
||||
for v in videos_list:
|
||||
urls[v.attrib['quality']] = v.text
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ class AccountHistory(BasePage):
|
|||
operation = Operation(len(operations))
|
||||
operation.date = mvt.xpath("./td/span")[0].text
|
||||
tmp = mvt.xpath("./td/span")[1]
|
||||
operation.label = remove_extra_spaces(remove_html_tags(self.browser.parser.tostring(tmp)))
|
||||
operation.label = remove_extra_spaces(remove_html_tags(self.parser.tostring(tmp)))
|
||||
|
||||
r = re.compile(r'\d+')
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ from datetime import datetime, date, time
|
|||
|
||||
from weboob.tools.browser import BaseBrowser
|
||||
from weboob.tools.misc import to_unicode
|
||||
from weboob.tools.parsers.lxmlparser import SelectElementException
|
||||
from weboob.tools.browser import BrokenPageError
|
||||
|
||||
|
||||
__all__ = ['CanalTP']
|
||||
|
|
@ -52,7 +52,7 @@ class CanalTP(BaseBrowser):
|
|||
departure = ''
|
||||
for line in result.split('&'):
|
||||
if not '=' in line:
|
||||
raise SelectElementException('Unable to parse result: %s' % line)
|
||||
raise BrokenPageError('Unable to parse result: %s' % line)
|
||||
key, value = line.split('=', 1)
|
||||
if key == 'nomgare':
|
||||
departure = value
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ import re
|
|||
from weboob.capabilities.video import VideoThumbnail
|
||||
from weboob.tools.misc import html2text
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
|
||||
from .video import DailymotionVideo
|
||||
|
||||
|
|
@ -34,7 +34,7 @@ __all__ = ['IndexPage', 'VideoPage']
|
|||
|
||||
class IndexPage(BasePage):
|
||||
def iter_videos(self):
|
||||
for div in select(self.document.getroot(), 'div.dmpi_video_item'):
|
||||
for div in self.parser.select(self.document.getroot(), 'div.dmpi_video_item'):
|
||||
_id = 0
|
||||
for cls in div.attrib['class'].split():
|
||||
if cls.startswith('id_'):
|
||||
|
|
@ -46,15 +46,15 @@ class IndexPage(BasePage):
|
|||
continue
|
||||
|
||||
video = DailymotionVideo(int(_id))
|
||||
video.title = select(div, 'h3 a', 1).text
|
||||
video.author = select(div, 'div.dmpi_user_login', 1).find('a').text
|
||||
video.description = html2text(self.browser.parser.tostring(select(div, 'div.dmpi_video_description', 1))).strip()
|
||||
minutes, seconds = select(div, 'div.duration', 1).text.split(':')
|
||||
video.title = self.parser.select(div, 'h3 a', 1).text
|
||||
video.author = self.parser.select(div, 'div.dmpi_user_login', 1).find('a').text
|
||||
video.description = html2text(self.parser.tostring(self.parser.select(div, 'div.dmpi_video_description', 1))).strip()
|
||||
minutes, seconds = self.parser.select(div, 'div.duration', 1).text.split(':')
|
||||
video.duration = datetime.timedelta(minutes=int(minutes), seconds=int(seconds))
|
||||
url = select(div, 'img.dmco_image', 1).attrib['src']
|
||||
url = self.parser.select(div, 'img.dmco_image', 1).attrib['src']
|
||||
video.thumbnail = VideoThumbnail(url)
|
||||
|
||||
rating_div = select(div, 'div.small_stars', 1)
|
||||
rating_div = self.parser.select(div, 'div.small_stars', 1)
|
||||
video.rating_max = self.get_rate(rating_div)
|
||||
video.rating = self.get_rate(rating_div.find('div'))
|
||||
# XXX missing date
|
||||
|
|
@ -73,12 +73,12 @@ class VideoPage(BasePage):
|
|||
if video is None:
|
||||
video = DailymotionVideo(self.group_dict['id'])
|
||||
|
||||
div = select(self.document.getroot(), 'div#content', 1)
|
||||
div = self.parser.select(self.document.getroot(), 'div#content', 1)
|
||||
|
||||
video.title = select(div, 'span.title', 1).text
|
||||
video.author = select(div, 'a.name', 1).text
|
||||
video.description = select(div, 'div#video_description', 1).text
|
||||
for script in select(self.document.getroot(), 'div.dmco_html'):
|
||||
video.title = self.parser.select(div, 'span.title', 1).text
|
||||
video.author = self.parser.select(div, 'a.name', 1).text
|
||||
video.description = self.parser.select(div, 'div#video_description', 1).text
|
||||
for script in self.parser.select(self.document.getroot(), 'div.dmco_html'):
|
||||
if 'id' in script.attrib and script.attrib['id'].startswith('container_player_'):
|
||||
text = script.find('script').text
|
||||
mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', text)
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@
|
|||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
|
||||
class Message(object):
|
||||
|
|
@ -34,7 +34,7 @@ class BoardIndexPage(BasePage):
|
|||
|
||||
def get_messages(self, last=None):
|
||||
msgs = []
|
||||
for post in select(self.document.getroot(), 'post'):
|
||||
for post in self.parser.select(self.document.getroot(), 'post'):
|
||||
m = Message(int(post.attrib['id']),
|
||||
post.attrib['time'],
|
||||
post.find('login').text,
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@
|
|||
|
||||
from datetime import datetime
|
||||
|
||||
from weboob.tools.parsers.lxmlparser import select, SelectElementException
|
||||
from weboob.tools.browser import BrokenPageError
|
||||
from weboob.tools.misc import local2utc
|
||||
from weboob.backends.dlfp.tools import url2id
|
||||
|
||||
|
|
@ -54,23 +54,23 @@ class Comment(Content):
|
|||
|
||||
self.id = div.attrib['id'].split('-')[1]
|
||||
self.url = '%s#%s' % (article.url, div.attrib['id'])
|
||||
self.title = unicode(select(div.find('h2'), 'a.title', 1).text)
|
||||
self.title = unicode(self.browser.parser.select(div.find('h2'), 'a.title', 1).text)
|
||||
try:
|
||||
a = select(div.find('p'), 'a[rel=author]', 1)
|
||||
except SelectElementException:
|
||||
a = self.browser.parser.select(div.find('p'), 'a[rel=author]', 1)
|
||||
except BrokenPageError:
|
||||
self.author = 'Anonyme'
|
||||
self.username = None
|
||||
else:
|
||||
self.author = unicode(a.text)
|
||||
self.username = unicode(a.attrib['href'].split('/')[2])
|
||||
self.date = datetime.strptime(select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0],
|
||||
self.date = datetime.strptime(self.browser.parser.select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0],
|
||||
'%Y-%m-%dT%H:%M:%S')
|
||||
self.date = local2utc(self.date)
|
||||
|
||||
content = div.find('div')
|
||||
try:
|
||||
signature = select(content, 'p.signature', 1)
|
||||
except SelectElementException:
|
||||
signature = self.browser.parser.select(content, 'p.signature', 1)
|
||||
except BrokenPageError:
|
||||
# No signature.
|
||||
pass
|
||||
else:
|
||||
|
|
@ -78,11 +78,11 @@ class Comment(Content):
|
|||
self.signature = self.browser.parser.tostring(signature)
|
||||
self.body = self.browser.parser.tostring(content)
|
||||
|
||||
self.score = int(select(div.find('p'), 'span.score', 1).text)
|
||||
forms = select(div.find('footer'), 'form.button_to')
|
||||
self.score = int(self.browser.parser.select(div.find('p'), 'span.score', 1).text)
|
||||
forms = self.browser.parser.select(div.find('footer'), 'form.button_to')
|
||||
if len(forms) > 0:
|
||||
self.relevance_url = forms[0].attrib['action'].rstrip('for').rstrip('against')
|
||||
self.relevance_token = select(forms[0], 'input[name=authenticity_token]', 1).attrib['value']
|
||||
self.relevance_token = self.browser.parser.select(forms[0], 'input[name=authenticity_token]', 1).attrib['value']
|
||||
|
||||
subs = div.find('ul')
|
||||
if subs is not None:
|
||||
|
|
@ -113,26 +113,26 @@ class Article(Content):
|
|||
header = tree.find('header')
|
||||
self.title = u' — '.join([a.text for a in header.find('h1').findall('a')])
|
||||
try:
|
||||
a = select(header, 'a[rel=author]', 1)
|
||||
except SelectElementException:
|
||||
a = self.browser.parser.select(header, 'a[rel=author]', 1)
|
||||
except BrokenPageError:
|
||||
self.author = 'Anonyme'
|
||||
self.username = None
|
||||
else:
|
||||
self.author = unicode(a.text)
|
||||
self.username = unicode(a.attrib['href'].split('/')[2])
|
||||
self.body = self.browser.parser.tostring(select(tree, 'div.content', 1))
|
||||
self.body = self.browser.parser.tostring(self.browser.parser.select(tree, 'div.content', 1))
|
||||
try:
|
||||
self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0],
|
||||
self.date = datetime.strptime(self.browser.parser.select(header, 'time', 1).attrib['datetime'].split('+')[0],
|
||||
'%Y-%m-%dT%H:%M:%S')
|
||||
self.date = local2utc(self.date)
|
||||
except SelectElementException:
|
||||
except BrokenPageError:
|
||||
pass
|
||||
forms = select(tree.find('footer'), 'form.button_to')
|
||||
forms = self.browser.parser.select(tree.find('footer'), 'form.button_to')
|
||||
if len(forms) > 0:
|
||||
self.relevance_url = forms[0].attrib['action'].rstrip('for').rstrip('against')
|
||||
self.relevance_token = select(forms[0], 'input[name=authenticity_token]', 1).attrib['value']
|
||||
self.relevance_token = self.browser.parser.select(forms[0], 'input[name=authenticity_token]', 1).attrib['value']
|
||||
|
||||
self.score = int(select(tree, 'div.figures figure.score', 1).text)
|
||||
self.score = int(self.browser.parser.select(tree, 'div.figures figure.score', 1).text)
|
||||
|
||||
def append_comment(self, comment):
|
||||
self.comments.append(comment)
|
||||
|
|
@ -146,7 +146,7 @@ class Article(Content):
|
|||
class CommentPage(DLFPPage):
|
||||
def get_comment(self):
|
||||
article = Article(self.browser, self.url, None)
|
||||
return Comment(article, select(self.document.getroot(), 'li.comment', 1), 0)
|
||||
return Comment(article, self.parser.select(self.document.getroot(), 'li.comment', 1), 0)
|
||||
|
||||
class ContentPage(DLFPPage):
|
||||
def on_loaded(self):
|
||||
|
|
@ -158,8 +158,8 @@ class ContentPage(DLFPPage):
|
|||
def get_comment(self, id):
|
||||
article = Article(self.browser, self.url, None)
|
||||
try:
|
||||
li = select(self.document.getroot(), 'li#comment-%s' % id, 1)
|
||||
except SelectElementException:
|
||||
li = self.parser.select(self.document.getroot(), 'li#comment-%s' % id, 1)
|
||||
except BrokenPageError:
|
||||
return None
|
||||
else:
|
||||
return Comment(article, li, 0)
|
||||
|
|
@ -168,11 +168,11 @@ class ContentPage(DLFPPage):
|
|||
if not self.article:
|
||||
self.article = Article(self.browser,
|
||||
self.url,
|
||||
select(self.document.getroot(), 'div#contents article', 1))
|
||||
self.parser.select(self.document.getroot(), 'div#contents article', 1))
|
||||
|
||||
try:
|
||||
threads = select(self.document.getroot(), 'ul.threads', 1)
|
||||
except SelectElementException:
|
||||
threads = self.parser.select(self.document.getroot(), 'ul.threads', 1)
|
||||
except BrokenPageError:
|
||||
pass # no comments
|
||||
else:
|
||||
for comment in threads.findall('li'):
|
||||
|
|
@ -181,10 +181,10 @@ class ContentPage(DLFPPage):
|
|||
return self.article
|
||||
|
||||
def get_post_comment_url(self):
|
||||
return select(self.document.getroot(), 'p#send-comment', 1).find('a').attrib['href']
|
||||
return self.parser.select(self.document.getroot(), 'p#send-comment', 1).find('a').attrib['href']
|
||||
|
||||
def get_tag_url(self):
|
||||
return select(self.document.getroot(), 'div.tag_in_place', 1).find('a').attrib['href']
|
||||
return self.parser.select(self.document.getroot(), 'div.tag_in_place', 1).find('a').attrib['href']
|
||||
|
||||
class NewCommentPage(DLFPPage):
|
||||
pass
|
||||
|
|
@ -201,8 +201,8 @@ class NewTagPage(DLFPPage):
|
|||
class NodePage(DLFPPage):
|
||||
def get_errors(self):
|
||||
try:
|
||||
div = select(self.document.getroot(), 'div.errors', 1)
|
||||
except SelectElementException:
|
||||
div = self.parser.select(self.document.getroot(), 'div.errors', 1)
|
||||
except BrokenPageError:
|
||||
return []
|
||||
|
||||
l = []
|
||||
|
|
|
|||
|
|
@ -17,15 +17,15 @@
|
|||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from weboob.tools.parsers.lxmlparser import select, SelectElementException
|
||||
from weboob.tools.browser import BrokenPageError
|
||||
|
||||
from .index import DLFPPage
|
||||
|
||||
class WikiEditPage(DLFPPage):
|
||||
def get_body(self):
|
||||
try:
|
||||
return select(self.document.getroot(), 'textarea#wiki_page_wiki_body', 1).text
|
||||
except SelectElementException:
|
||||
return self.parser.select(self.document.getroot(), 'textarea#wiki_page_wiki_body', 1).text
|
||||
except BrokenPageError:
|
||||
return ''
|
||||
|
||||
def _is_wiki_form(self, form):
|
||||
|
|
@ -52,5 +52,5 @@ class WikiEditPage(DLFPPage):
|
|||
self.browser.submit()
|
||||
|
||||
def get_preview_html(self):
|
||||
body = select(self.document.getroot(), 'article.wikipage div.content', 1)
|
||||
return self.browser.parser.tostring(body)
|
||||
body = self.parser.select(self.document.getroot(), 'article.wikipage div.content', 1)
|
||||
return self.parser.tostring(body)
|
||||
|
|
|
|||
|
|
@ -29,9 +29,9 @@ class ArticlePage(GenericNewsPage):
|
|||
|
||||
def get_body(self):
|
||||
element_body = self.get_element_body()
|
||||
remove_from_selector_list(element_body, ["p.auteur", "h4" ])
|
||||
try_remove_from_selector_list(element_body, ["p.tag", "div.alire", self.element_title_selector, "h4"])
|
||||
try_drop_tree(element_body, "script")
|
||||
remove_from_selector_list(self.parser, element_body, ["p.auteur", "h4" ])
|
||||
try_remove_from_selector_list(self.parser, element_body, ["p.tag", "div.alire", self.element_title_selector, "h4"])
|
||||
try_drop_tree(self.parser, element_body, "script")
|
||||
|
||||
return self.browser.parser.tostring(element_body)
|
||||
return self.parser.tostring(element_body)
|
||||
|
||||
|
|
|
|||
|
|
@ -87,7 +87,7 @@ class BoardPage(BasePage):
|
|||
if div.tag == 'input' and div.attrib.get('type', 'checkbox') and div.attrib.get('value', 'delete'):
|
||||
article.id = int(div.attrib.get('name', '0'))
|
||||
if div.tag == 'blockquote':
|
||||
article.text = self.browser.parser.tostring(div)
|
||||
article.text = self.parser.tostring(div)
|
||||
if div.tag == 'table':
|
||||
tags = div.cssselect('td.reply')
|
||||
if tags:
|
||||
|
|
|
|||
|
|
@ -168,7 +168,7 @@ class TorrentsPage(BasePage):
|
|||
title = title_t[0].find('strong').text.strip()
|
||||
body_t = box.cssselect('div.body')
|
||||
if body_t:
|
||||
body = html2text(self.browser.parser.tostring(body_t[0])).strip()
|
||||
body = html2text(self.parser.tostring(body_t[0])).strip()
|
||||
|
||||
if title and body:
|
||||
if torrent.description is NotLoaded:
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ import datetime
|
|||
import re
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select, SelectElementException
|
||||
from weboob.tools.browser import BrokenPageError
|
||||
|
||||
from ..video import InaVideo
|
||||
|
||||
|
|
@ -35,8 +35,8 @@ class SearchPage(BasePage):
|
|||
|
||||
def iter_videos(self):
|
||||
try:
|
||||
ul = select(self.document.getroot(), 'div.container-videos ul', 1)
|
||||
except SelectElementException:
|
||||
ul = self.parser.select(self.document.getroot(), 'div.container-videos ul', 1)
|
||||
except BrokenPageError:
|
||||
# It means there are no results.
|
||||
return
|
||||
for li in ul.findall('li'):
|
||||
|
|
@ -44,18 +44,18 @@ class SearchPage(BasePage):
|
|||
|
||||
thumbnail = 'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src']
|
||||
|
||||
title = select(li, 'p.titre', 1).text
|
||||
title = self.parser.select(li, 'p.titre', 1).text
|
||||
|
||||
date = select(li, 'p.date', 1).text
|
||||
date = self.parser.select(li, 'p.date', 1).text
|
||||
day, month, year = [int(s) for s in date.split('/')]
|
||||
date = datetime.datetime(year, month, day)
|
||||
|
||||
duration = select(li, 'p.duree', 1).text
|
||||
duration = self.parser.select(li, 'p.duree', 1).text
|
||||
m = re.match(r'((\d+)h)?((\d+)min)?(\d+)s', duration)
|
||||
if m:
|
||||
duration = datetime.timedelta(hours=int(m.group(2) or 0), minutes=int(m.group(4) or 0), seconds=int(m.group(5)))
|
||||
else:
|
||||
raise SelectElementException('Unable to match duration (%r)' % duration)
|
||||
raise BrokenPageError('Unable to match duration (%r)' % duration)
|
||||
|
||||
yield InaVideo(id,
|
||||
title=title,
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ except ImportError:
|
|||
from cgi import parse_qs
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import SelectElementException
|
||||
from weboob.tools.browser import BrokenPageError
|
||||
|
||||
from ..video import InaVideo
|
||||
|
||||
|
|
@ -75,9 +75,9 @@ class VideoPage(BasePage):
|
|||
seconds=int(m.group(6)))
|
||||
return date, duration
|
||||
else:
|
||||
raise SelectElementException('Unable to parse date and duration')
|
||||
raise BrokenPageError('Unable to parse date and duration')
|
||||
else:
|
||||
raise SelectElementException('Unable to find date and duration element')
|
||||
raise BrokenPageError('Unable to find date and duration element')
|
||||
|
||||
def get_title(self):
|
||||
el = self.document.getroot().cssselect('div.bloc-produit-haut h1')[0]
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@
|
|||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from weboob.tools.parsers.lxmlparser import select, SelectElementException
|
||||
from weboob.tools.browser import BrokenPageError
|
||||
from weboob.tools.genericArticle import GenericNewsPage, try_remove, \
|
||||
try_remove_from_selector_list, \
|
||||
drop_comments, NoneMainDiv
|
||||
|
|
@ -35,26 +35,28 @@ class ArticlePage(GenericNewsPage):
|
|||
try :
|
||||
element_body = self.get_element_body()
|
||||
except NoneMainDiv:
|
||||
return None
|
||||
return None
|
||||
else:
|
||||
div_header_element = select(element_body, "div.header", 1)
|
||||
element_detail = select(element_body, "div.details", 1)
|
||||
div_content_element = select(element_body, "div.content", 1)
|
||||
div_header_element = self.parser.select(element_body, "div.header", 1)
|
||||
element_detail = self.parser.select(element_body, "div.details", 1)
|
||||
div_content_element = self.parser.select(element_body, "div.content", 1)
|
||||
|
||||
drop_comments(element_body)
|
||||
try_remove(element_body, "div.sidebar")
|
||||
try_remove(element_detail, "div.footer")
|
||||
try_remove_from_selector_list(div_header_element,
|
||||
["h1", "div.picture", "div.date",
|
||||
"div.news-single-img",
|
||||
try_remove(self.parser, element_body, "div.sidebar")
|
||||
try_remove(self.parser, element_detail, "div.footer")
|
||||
try_remove_from_selector_list(self.parser,
|
||||
div_header_element,
|
||||
["h1", "div.picture", "div.date",
|
||||
"div.news-single-img",
|
||||
"div.metas_img", "strong"])
|
||||
try_remove_from_selector_list(div_content_element,
|
||||
try_remove_from_selector_list(self.parser,
|
||||
div_content_element,
|
||||
["div.tw_button", "div.wpfblike"])
|
||||
|
||||
try :
|
||||
description_element = select(div_header_element,
|
||||
description_element = self.parser.select(div_header_element,
|
||||
"div.description", 1)
|
||||
except SelectElementException:
|
||||
except BrokenPageError:
|
||||
pass
|
||||
else:
|
||||
text_content = description_element.text_content()
|
||||
|
|
@ -75,6 +77,4 @@ class ArticlePage(GenericNewsPage):
|
|||
|
||||
div_content_element.drop_tag()
|
||||
|
||||
return self.browser.parser.tostring(element_body)
|
||||
|
||||
|
||||
return self.parser.tostring(element_body)
|
||||
|
|
|
|||
|
|
@ -30,4 +30,4 @@ class InrocksTvPage(GenericNewsPage):
|
|||
|
||||
def get_body(self):
|
||||
element_body = self.get_element_body()
|
||||
return self.browser.parser.tostring(element_body)
|
||||
return self.parser.tostring(element_body)
|
||||
|
|
|
|||
|
|
@ -30,13 +30,13 @@ class ArticlePage(GenericNewsPage):
|
|||
|
||||
def get_body(self):
|
||||
element_body = self.get_element_body()
|
||||
remove_from_selector_list(element_body, [self.element_title_selector])
|
||||
remove_from_selector_list(self.parser, element_body, [self.element_title_selector])
|
||||
drop_comments(element_body)
|
||||
try_drop_tree(element_body, "script")
|
||||
try_drop_tree(self.parser, element_body, "script")
|
||||
|
||||
try_remove_from_selector_list(element_body, ["div.infos", "div.photo", "div.art_bandeau_bottom", "div.view", "span.auteur_long", "#toolsbar", 'link'])
|
||||
try_remove_from_selector_list(self.parser, element_body, ["div.infos", "div.photo", "div.art_bandeau_bottom", "div.view", "span.auteur_long", "#toolsbar", 'link'])
|
||||
|
||||
element_body.find_class("texte")[0].drop_tag()
|
||||
element_body.tag = "div"
|
||||
return self.browser.parser.tostring(element_body)
|
||||
return self.parser.tostring(element_body)
|
||||
|
||||
|
|
|
|||
|
|
@ -31,5 +31,5 @@ class FlashActuPage(GenericNewsPage):
|
|||
def get_body(self):
|
||||
element_body = self.get_element_body()
|
||||
element_body.tag = "div"
|
||||
return self.browser.parser.tostring(element_body)
|
||||
return self.parser.tostring(element_body)
|
||||
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@
|
|||
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
|
||||
|
||||
__all__ = ['XMLinfos']
|
||||
|
|
@ -28,7 +28,7 @@ __all__ = ['XMLinfos']
|
|||
class XMLinfos(BasePage):
|
||||
def get_current(self):
|
||||
try:
|
||||
for channel in select(self.document.getroot(), 'channel'):
|
||||
for channel in self.parser.select(self.document.getroot(), 'channel'):
|
||||
title = channel.find('item/song_title').text
|
||||
artist = channel.find('item/artist_name').text
|
||||
except AttributeError:
|
||||
|
|
|
|||
|
|
@ -35,11 +35,11 @@ class ArticlePage(SimplePage):
|
|||
except NoneMainDiv:
|
||||
return None
|
||||
else:
|
||||
try_remove(element_body, "div.mna-tools")
|
||||
try_remove(element_body, "div.mna-comment-call")
|
||||
try_remove(self.parser, element_body, "div.mna-tools")
|
||||
try_remove(self.parser, element_body, "div.mna-comment-call")
|
||||
try :
|
||||
element_body.remove(self.get_element_author())
|
||||
except NoAuthorElement:
|
||||
pass
|
||||
return self.browser.parser.tostring(element_body)
|
||||
return self.parser.tostring(element_body)
|
||||
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@
|
|||
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
|
||||
|
||||
__all__ = ['PlayerPage']
|
||||
|
|
@ -27,6 +27,6 @@ __all__ = ['PlayerPage']
|
|||
|
||||
class PlayerPage(BasePage):
|
||||
def get_current(self):
|
||||
title = select(self.document.getroot(), 'span.titre_en_cours', 1).text
|
||||
artist = select(self.document.getroot(), 'span.artiste_en_cours', 1).text
|
||||
title = self.parser.select(self.document.getroot(), 'span.titre_en_cours', 1).text
|
||||
artist = self.parser.select(self.document.getroot(), 'span.artiste_en_cours', 1).text
|
||||
return unicode(artist).strip(), unicode(title).strip()
|
||||
|
|
|
|||
|
|
@ -19,11 +19,11 @@
|
|||
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
|
||||
class WikiEditPage(BasePage):
|
||||
def get_source(self):
|
||||
return select(self.document.getroot(), 'textarea#content_text', 1).text
|
||||
return self.parser.select(self.document.getroot(), 'textarea#content_text', 1).text
|
||||
|
||||
def set_source(self, data, message):
|
||||
self.browser.select_form(nr=1)
|
||||
|
|
@ -33,7 +33,7 @@ class WikiEditPage(BasePage):
|
|||
self.browser.submit()
|
||||
|
||||
def get_authenticity_token(self):
|
||||
wiki_form = select(self.document.getroot(), 'form#wiki_form', 1)
|
||||
wiki_form = self.parser.select(self.document.getroot(), 'form#wiki_form', 1)
|
||||
return wiki_form.xpath('div/input')[0].get('value')
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ import re
|
|||
|
||||
from weboob.capabilities.messages import CantSendMessage
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
|
||||
|
||||
__all__ = ['ClosePage', 'ComposePage', 'ConfirmPage', 'SentPage']
|
||||
|
|
@ -37,7 +37,7 @@ class ComposePage(BasePage):
|
|||
|
||||
def get_nb_remaining_free_sms(self):
|
||||
remaining_regex = re.compile(u'Il vous reste (?P<nb>.+) Texto gratuits vers les numéros SFR à envoyer aujourd\'hui')
|
||||
text = select(self.document.getroot(), '#smsReminder', 1).text.strip()
|
||||
text = self.parser.select(self.document.getroot(), '#smsReminder', 1).text.strip()
|
||||
return remaining_regex.match(text).groupdict().get('nb')
|
||||
|
||||
def post_message(self, message):
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@ import datetime
|
|||
import re
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select, SelectElementException
|
||||
from weboob.tools.browser import BrokenPageError
|
||||
|
||||
from ..video import YoujizzVideo
|
||||
|
||||
|
|
@ -32,25 +32,25 @@ __all__ = ['IndexPage']
|
|||
|
||||
class IndexPage(BasePage):
|
||||
def iter_videos(self):
|
||||
span_list = select(self.document.getroot(), 'span#miniatura')
|
||||
span_list = self.parser.select(self.document.getroot(), 'span#miniatura')
|
||||
for span in span_list:
|
||||
a = select(span, 'a', 1)
|
||||
a = self.parser.select(span, 'a', 1)
|
||||
url = a.attrib['href']
|
||||
_id = re.sub(r'/videos/(.+)\.html', r'\1', url)
|
||||
|
||||
thumbnail_url = span.find('.//img').attrib['src']
|
||||
|
||||
title_el = select(span, 'span#title1', 1)
|
||||
title_el = self.parser.select(span, 'span#title1', 1)
|
||||
title = title_el.text.strip()
|
||||
|
||||
time_span = select(span, 'span.thumbtime span', 1)
|
||||
time_span = self.parser.select(span, 'span.thumbtime span', 1)
|
||||
time_txt = time_span.text.strip().replace(';', ':')
|
||||
if time_txt == 'N/A':
|
||||
minutes, seconds = 0, 0
|
||||
elif ':' in time_txt:
|
||||
minutes, seconds = (int(v) for v in time_txt.split(':'))
|
||||
else:
|
||||
raise SelectElementException('Unable to parse the video duration: %s' % time_txt)
|
||||
raise BrokenPageError('Unable to parse the video duration: %s' % time_txt)
|
||||
|
||||
|
||||
yield YoujizzVideo(_id,
|
||||
|
|
|
|||
|
|
@ -23,9 +23,8 @@ import lxml.html
|
|||
import re
|
||||
|
||||
from weboob.capabilities.base import NotAvailable
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.browser import BasePage, BrokenPageError
|
||||
from weboob.tools.misc import to_unicode
|
||||
from weboob.tools.parsers.lxmlparser import select, SelectElementException
|
||||
|
||||
from ..video import YoujizzVideo
|
||||
|
||||
|
|
@ -39,7 +38,7 @@ class VideoPage(BasePage):
|
|||
_id = to_unicode(self.group_dict['id'])
|
||||
if video is None:
|
||||
video = YoujizzVideo(_id)
|
||||
title_el = select(self.document.getroot(), 'title', 1)
|
||||
title_el = self.parser.select(self.document.getroot(), 'title', 1)
|
||||
video.title = to_unicode(title_el.text.strip())
|
||||
|
||||
# youjizz HTML is crap, we must parse it with regexps
|
||||
|
|
@ -53,13 +52,13 @@ class VideoPage(BasePage):
|
|||
minutes, seconds = (int(v) for v in to_unicode(txt).split(':'))
|
||||
video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
|
||||
else:
|
||||
raise SelectElementException('Unable to retrieve video duration')
|
||||
raise BrokenPageError('Unable to retrieve video duration')
|
||||
|
||||
video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
|
||||
if len(video_file_urls) == 0:
|
||||
raise SelectElementException('Video URL not found')
|
||||
raise BrokenPageError('Video URL not found')
|
||||
elif len(video_file_urls) > 1:
|
||||
raise SelectElementException('Many video file URL found')
|
||||
raise BrokenPageError('Many video file URL found')
|
||||
else:
|
||||
video.url = video_file_urls[0]
|
||||
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@
|
|||
import re
|
||||
import datetime
|
||||
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
|
||||
from .base import PornPage
|
||||
from ..video import YoupornVideo
|
||||
|
|
@ -39,19 +39,19 @@ class VideoPage(PornPage):
|
|||
return video
|
||||
|
||||
def get_url(self):
|
||||
download_div = select(self.document.getroot(), '#download', 1)
|
||||
a = select(download_div, 'a', 1)
|
||||
download_div = self.parser.select(self.document.getroot(), '#download', 1)
|
||||
a = self.parser.select(download_div, 'a', 1)
|
||||
return a.attrib['href']
|
||||
|
||||
def get_title(self):
|
||||
element = select(self.document.getroot(), '#videoArea h1', 1)
|
||||
element = self.parser.select(self.document.getroot(), '#videoArea h1', 1)
|
||||
return unicode(element.getchildren()[0].tail).strip()
|
||||
|
||||
DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)")
|
||||
MONTH2I = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||
|
||||
def set_details(self, v):
|
||||
details_div = select(self.document.getroot(), '#details', 1)
|
||||
details_div = self.parser.select(self.document.getroot(), '#details', 1)
|
||||
for li in details_div.getiterator('li'):
|
||||
span = li.find('span')
|
||||
name = span.text.strip()
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@
|
|||
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select
|
||||
|
||||
|
||||
|
||||
__all__ = ['ForbiddenVideo', 'ForbiddenVideoPage', 'VerifyAgePage', 'VideoPage']
|
||||
|
|
@ -31,7 +31,7 @@ class ForbiddenVideo(Exception):
|
|||
|
||||
class ForbiddenVideoPage(BasePage):
|
||||
def get_video(self, video=None):
|
||||
element = select(self.document.getroot(), '.yt-alert-content', 1)
|
||||
element = self.parser.select(self.document.getroot(), '.yt-alert-content', 1)
|
||||
raise ForbiddenVideo(element.text.strip())
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ import sys
|
|||
import subprocess
|
||||
if sys.platform == 'win32':
|
||||
import WConio
|
||||
|
||||
|
||||
try:
|
||||
import tty, termios
|
||||
except ImportError:
|
||||
|
|
|
|||
|
|
@ -21,8 +21,8 @@
|
|||
from weboob.tools.browser.browser import BrowserIncorrectPassword, BrowserBanned, \
|
||||
BrowserUnavailable, BrowserRetry, \
|
||||
BrowserHTTPNotFound, BrowserHTTPError, \
|
||||
BasePage, BaseBrowser
|
||||
BasePage, BaseBrowser, BrokenPageError
|
||||
|
||||
|
||||
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
|
||||
'BrowserHTTPNotFound', 'BrowserHTTPError', 'BasePage', 'BaseBrowser']
|
||||
'BrowserHTTPNotFound', 'BrowserHTTPError', 'BasePage', 'BaseBrowser', 'BrokenPageError']
|
||||
|
|
|
|||
|
|
@ -93,6 +93,8 @@ class NoHistory(object):
|
|||
def close(self):
|
||||
pass
|
||||
|
||||
class BrokenPageError(Exception):
    """
    Error raised when expected content cannot be found in a page,
    e.g. when a selector matches too few elements.
    """
|
||||
|
||||
class BasePage(object):
|
||||
"""
|
||||
|
|
@ -100,6 +102,7 @@ class BasePage(object):
|
|||
"""
|
||||
def __init__(self, browser, document, url='', groups=None, group_dict=None, logger=None):
|
||||
self.browser = browser
|
||||
self.parser = browser.parser
|
||||
self.document = document
|
||||
self.url = url
|
||||
self.groups = groups
|
||||
|
|
|
|||
|
|
@ -16,32 +16,33 @@
|
|||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.parsers.lxmlparser import select, SelectElementException
|
||||
from weboob.tools.browser import BrokenPageError
|
||||
from lxml.etree import Comment
|
||||
|
||||
|
||||
def try_remove(base_element, selector):
|
||||
def try_remove(parser, base_element, selector):
|
||||
try :
|
||||
base_element.remove(select(base_element, selector, 1 ))
|
||||
except (SelectElementException, ValueError):
|
||||
base_element.remove(parser.select(base_element, selector, 1 ))
|
||||
except (BrokenPageError, ValueError):
|
||||
pass
|
||||
|
||||
|
||||
def try_drop_tree(base_element, selector):
|
||||
def try_drop_tree(parser, base_element, selector):
|
||||
try:
|
||||
select(base_element, selector, 1).drop_tree()
|
||||
except SelectElementException:
|
||||
parser.select(base_element, selector, 1).drop_tree()
|
||||
except BrokenPageError:
|
||||
pass
|
||||
|
||||
def remove_from_selector_list(base_element, selector_list):
|
||||
def remove_from_selector_list(parser, base_element, selector_list):
|
||||
for selector in selector_list:
|
||||
base_element.remove(select(base_element, selector, 1))
|
||||
base_element.remove(parser.select(base_element, selector, 1))
|
||||
|
||||
|
||||
def try_remove_from_selector_list(base_element, selector_list):
|
||||
def try_remove_from_selector_list(parser, base_element, selector_list):
|
||||
for selector in selector_list:
|
||||
try_remove(base_element, selector)
|
||||
try_remove(parser, base_element, selector)
|
||||
|
||||
def drop_comments(base_element):
|
||||
for comment in base_element.getiterator(Comment):
|
||||
|
|
@ -49,13 +50,13 @@ def drop_comments(base_element):
|
|||
|
||||
|
||||
|
||||
class NoAuthorElement(SelectElementException):
|
||||
class NoAuthorElement(BrokenPageError):
|
||||
pass
|
||||
|
||||
class NoBodyElement(SelectElementException):
|
||||
class NoBodyElement(BrokenPageError):
|
||||
pass
|
||||
|
||||
class NoTitleException(SelectElementException):
|
||||
class NoTitleException(BrokenPageError):
|
||||
pass
|
||||
|
||||
class NoneMainDiv(AttributeError):
|
||||
|
|
@ -75,13 +76,13 @@ class Article(object):
|
|||
class GenericNewsPage(BasePage):
|
||||
__element_body = NotImplementedError
|
||||
__article = Article
|
||||
element_title_selector = NotImplementedError
|
||||
element_title_selector = NotImplementedError
|
||||
main_div = NotImplementedError
|
||||
element_body_selector = NotImplementedError
|
||||
element_author_selector = NotImplementedError
|
||||
|
||||
def get_body(self):
|
||||
return self.browser.parser.tostring(self.get_element_body())
|
||||
return self.parser.tostring(self.get_element_body())
|
||||
|
||||
def get_author(self):
|
||||
try:
|
||||
|
|
@ -92,7 +93,7 @@ class GenericNewsPage(BasePage):
|
|||
|
||||
def get_title(self):
|
||||
try :
|
||||
return select(
|
||||
return self.parser.select(
|
||||
self.main_div,
|
||||
self.element_title_selector,
|
||||
1).text_content().strip()
|
||||
|
|
@ -102,17 +103,17 @@ class GenericNewsPage(BasePage):
|
|||
return self.__article.title
|
||||
else:
|
||||
raise
|
||||
except SelectElementException:
|
||||
except BrokenPageError:
|
||||
try :
|
||||
self.element_title_selector = "h1"
|
||||
return self.get_title()
|
||||
except SelectElementException:
|
||||
except BrokenPageError:
|
||||
raise NoTitleException("no title on %s" % (self.browser))
|
||||
|
||||
def get_element_body(self):
|
||||
try :
|
||||
return select(self.main_div, self.element_body_selector, 1)
|
||||
except SelectElementException:
|
||||
return self.parser.select(self.main_div, self.element_body_selector, 1)
|
||||
except BrokenPageError:
|
||||
raise NoBodyElement("no body on %s" % (self.browser))
|
||||
except AttributeError:
|
||||
if self.main_div == None:
|
||||
|
|
@ -122,8 +123,8 @@ class GenericNewsPage(BasePage):
|
|||
|
||||
def get_element_author(self):
|
||||
try:
|
||||
return select(self.main_div, self.element_author_selector, 1)
|
||||
except SelectElementException:
|
||||
return self.parser.select(self.main_div, self.element_author_selector, 1)
|
||||
except BrokenPageError:
|
||||
raise NoAuthorElement()
|
||||
except AttributeError:
|
||||
if self.main_div == None:
|
||||
|
|
|
|||
|
|
@ -21,50 +21,10 @@
|
|||
import lxml.html
|
||||
|
||||
from .iparser import IParser
|
||||
from ..browser import BrokenPageError
|
||||
|
||||
|
||||
__all__ = ['LxmlHtmlParser', 'select', 'SelectElementException']
|
||||
|
||||
|
||||
class SelectElementException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def select(element, selector, nb=None, method='cssselect'):
|
||||
"""
|
||||
Select one or many elements from an element, using lxml cssselect by default.
|
||||
|
||||
Raises SelectElementException if not found.
|
||||
|
||||
@param element [obj] element on which to apply selector
|
||||
@param selector [str] CSS or XPath expression
|
||||
@param method [str] (cssselect|xpath)
|
||||
@param nb [int] number of elements expected to be found.
|
||||
Use None for undefined number, and 'many' for 1 to infinite.
|
||||
@return one or many Element
|
||||
"""
|
||||
if method == 'cssselect':
|
||||
results = element.cssselect(selector)
|
||||
if nb is None:
|
||||
return results
|
||||
elif isinstance(nb, basestring) and nb == 'many':
|
||||
if results is None or len(results) == 0:
|
||||
raise SelectElementException('Element not found with selector "%s"' % selector)
|
||||
elif len(results) == 1:
|
||||
raise SelectElementException('Only one element found with selector "%s"' % selector)
|
||||
else:
|
||||
return results
|
||||
elif isinstance(nb, int) and nb > 0:
|
||||
if results is None:
|
||||
raise SelectElementException('Element not found with selector "%s"' % selector)
|
||||
elif len(results) < nb:
|
||||
raise SelectElementException('Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
|
||||
else:
|
||||
return results[0] if nb == 1 else results
|
||||
else:
|
||||
raise Exception('Unhandled value for kwarg "nb": %s' % nb)
|
||||
else:
|
||||
raise NotImplementedError('Only cssselect method is implemented for the moment')
|
||||
__all__ = ['LxmlHtmlParser']
|
||||
|
||||
|
||||
class LxmlHtmlParser(IParser):
|
||||
|
|
@ -83,3 +43,40 @@ class LxmlHtmlParser(IParser):
|
|||
|
||||
def tostring(self, element):
|
||||
return lxml.html.tostring(element, encoding=unicode)
|
||||
|
||||
@classmethod
def select(cls, element, selector, nb=None, method='cssselect'):
    """
    Select one or many elements from an element, using lxml cssselect.

    @param element [obj]  element on which to apply selector; it must
                          provide a cssselect() method
    @param selector [str]  CSS expression
    @param nb [int|str]  number of elements expected to be found:
                         None means no check (every match is returned,
                         possibly an empty list), 'many' requires at least
                         two matches, a positive integer requires at least
                         that many matches.
    @param method [str]  selection method; only 'cssselect' is implemented
    @return the first matching element when nb == 1, else the list of matches

    Raises BrokenPageError if too few elements are found,
    NotImplementedError if method is not 'cssselect', and Exception if
    nb has an unsupported value.
    """
    if method != 'cssselect':
        raise NotImplementedError('Only cssselect method is implemented for the moment')

    results = element.cssselect(selector)

    # No expectation on the number of matches: hand everything back as-is.
    if nb is None:
        return results

    # A plain comparison is enough here: only a string can equal 'many',
    # so no separate string type check is needed.
    if nb == 'many':
        if results is None or len(results) == 0:
            raise BrokenPageError('Element not found with selector "%s"' % selector)
        # NOTE: a single match is deliberately rejected, 'many' means two or more.
        if len(results) == 1:
            raise BrokenPageError('Only one element found with selector "%s"' % selector)
        return results

    if isinstance(nb, int) and nb > 0:
        if results is None:
            raise BrokenPageError('Element not found with selector "%s"' % selector)
        if len(results) < nb:
            raise BrokenPageError('Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
        # Extra matches are tolerated; with nb == 1 only the first one is returned.
        return results[0] if nb == 1 else results

    raise Exception('Unhandled value for kwarg "nb": %s' % nb)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue