move select() in parser

This commit is contained in:
Romain Bignon 2011-04-08 19:39:57 +02:00
commit 9afb301ebe
30 changed files with 197 additions and 197 deletions

View file

@ -22,7 +22,7 @@ import re
import urllib import urllib
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
from .video import ArteVideo from .video import ArteVideo
@ -40,13 +40,13 @@ class IndexPage(BasePage):
if m: if m:
_id = m.group(1) _id = m.group(1)
rating = rating_max = 0 rating = rating_max = 0
rates = select(div, 'div[class=rateContainer]', 1) rates = self.parser.select(div, 'div[class=rateContainer]', 1)
for r in rates.findall('div'): for r in rates.findall('div'):
if 'star-rating-on' in r.attrib['class']: if 'star-rating-on' in r.attrib['class']:
rating += 1 rating += 1
rating_max += 1 rating_max += 1
thumb = select(div, 'img[class=thumbnail]', 1) thumb = self.parser.select(div, 'img[class=thumbnail]', 1)
thumbnail_url = 'http://videos.arte.tv' + thumb.attrib['src'] thumbnail_url = 'http://videos.arte.tv' + thumb.attrib['src']
yield ArteVideo(_id, yield ArteVideo(_id,
@ -67,12 +67,12 @@ class VideoPage(BasePage):
return self.document.getroot().cssselect('h2')[0].text return self.document.getroot().cssselect('h2')[0].text
def get_url(self, lang, quality): def get_url(self, lang, quality):
obj = select(self.document.getroot(), 'object', 1) obj = self.parser.select(self.document.getroot(), 'object', 1)
movie_url = select(obj, 'param[name=movie]', 1) movie_url = self.parser.select(obj, 'param[name=movie]', 1)
xml_url = urllib.unquote(movie_url.attrib['value'].split('videorefFileUrl=')[-1]) xml_url = urllib.unquote(movie_url.attrib['value'].split('videorefFileUrl=')[-1])
doc = self.browser.get_document(self.browser.openurl(xml_url)) doc = self.browser.get_document(self.browser.openurl(xml_url))
videos_list = select(doc.getroot(), 'video') videos_list = self.parser.select(doc.getroot(), 'video')
videos = {} videos = {}
for v in videos_list: for v in videos_list:
videos[v.attrib['lang']] = v.attrib['ref'] videos[v.attrib['lang']] = v.attrib['ref']
@ -84,8 +84,8 @@ class VideoPage(BasePage):
doc = self.browser.get_document(self.browser.openurl(xml_url)) doc = self.browser.get_document(self.browser.openurl(xml_url))
obj = select(doc.getroot(), 'urls', 1) obj = self.parser.select(doc.getroot(), 'urls', 1)
videos_list = select(obj, 'url') videos_list = self.parser.select(obj, 'url')
urls = {} urls = {}
for v in videos_list: for v in videos_list:
urls[v.attrib['quality']] = v.text urls[v.attrib['quality']] = v.text

View file

@ -50,7 +50,7 @@ class AccountHistory(BasePage):
operation = Operation(len(operations)) operation = Operation(len(operations))
operation.date = mvt.xpath("./td/span")[0].text operation.date = mvt.xpath("./td/span")[0].text
tmp = mvt.xpath("./td/span")[1] tmp = mvt.xpath("./td/span")[1]
operation.label = remove_extra_spaces(remove_html_tags(self.browser.parser.tostring(tmp))) operation.label = remove_extra_spaces(remove_html_tags(self.parser.tostring(tmp)))
r = re.compile(r'\d+') r = re.compile(r'\d+')

View file

@ -22,7 +22,7 @@ from datetime import datetime, date, time
from weboob.tools.browser import BaseBrowser from weboob.tools.browser import BaseBrowser
from weboob.tools.misc import to_unicode from weboob.tools.misc import to_unicode
from weboob.tools.parsers.lxmlparser import SelectElementException from weboob.tools.browser import BrokenPageError
__all__ = ['CanalTP'] __all__ = ['CanalTP']
@ -52,7 +52,7 @@ class CanalTP(BaseBrowser):
departure = '' departure = ''
for line in result.split('&'): for line in result.split('&'):
if not '=' in line: if not '=' in line:
raise SelectElementException('Unable to parse result: %s' % line) raise BrokenPageError('Unable to parse result: %s' % line)
key, value = line.split('=', 1) key, value = line.split('=', 1)
if key == 'nomgare': if key == 'nomgare':
departure = value departure = value

View file

@ -24,7 +24,7 @@ import re
from weboob.capabilities.video import VideoThumbnail from weboob.capabilities.video import VideoThumbnail
from weboob.tools.misc import html2text from weboob.tools.misc import html2text
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
from .video import DailymotionVideo from .video import DailymotionVideo
@ -34,7 +34,7 @@ __all__ = ['IndexPage', 'VideoPage']
class IndexPage(BasePage): class IndexPage(BasePage):
def iter_videos(self): def iter_videos(self):
for div in select(self.document.getroot(), 'div.dmpi_video_item'): for div in self.parser.select(self.document.getroot(), 'div.dmpi_video_item'):
_id = 0 _id = 0
for cls in div.attrib['class'].split(): for cls in div.attrib['class'].split():
if cls.startswith('id_'): if cls.startswith('id_'):
@ -46,15 +46,15 @@ class IndexPage(BasePage):
continue continue
video = DailymotionVideo(int(_id)) video = DailymotionVideo(int(_id))
video.title = select(div, 'h3 a', 1).text video.title = self.parser.select(div, 'h3 a', 1).text
video.author = select(div, 'div.dmpi_user_login', 1).find('a').text video.author = self.parser.select(div, 'div.dmpi_user_login', 1).find('a').text
video.description = html2text(self.browser.parser.tostring(select(div, 'div.dmpi_video_description', 1))).strip() video.description = html2text(self.parser.tostring(self.parser.select(div, 'div.dmpi_video_description', 1))).strip()
minutes, seconds = select(div, 'div.duration', 1).text.split(':') minutes, seconds = self.parser.select(div, 'div.duration', 1).text.split(':')
video.duration = datetime.timedelta(minutes=int(minutes), seconds=int(seconds)) video.duration = datetime.timedelta(minutes=int(minutes), seconds=int(seconds))
url = select(div, 'img.dmco_image', 1).attrib['src'] url = self.parser.select(div, 'img.dmco_image', 1).attrib['src']
video.thumbnail = VideoThumbnail(url) video.thumbnail = VideoThumbnail(url)
rating_div = select(div, 'div.small_stars', 1) rating_div = self.parser.select(div, 'div.small_stars', 1)
video.rating_max = self.get_rate(rating_div) video.rating_max = self.get_rate(rating_div)
video.rating = self.get_rate(rating_div.find('div')) video.rating = self.get_rate(rating_div.find('div'))
# XXX missing date # XXX missing date
@ -73,12 +73,12 @@ class VideoPage(BasePage):
if video is None: if video is None:
video = DailymotionVideo(self.group_dict['id']) video = DailymotionVideo(self.group_dict['id'])
div = select(self.document.getroot(), 'div#content', 1) div = self.parser.select(self.document.getroot(), 'div#content', 1)
video.title = select(div, 'span.title', 1).text video.title = self.parser.select(div, 'span.title', 1).text
video.author = select(div, 'a.name', 1).text video.author = self.parser.select(div, 'a.name', 1).text
video.description = select(div, 'div#video_description', 1).text video.description = self.parser.select(div, 'div#video_description', 1).text
for script in select(self.document.getroot(), 'div.dmco_html'): for script in self.parser.select(self.document.getroot(), 'div.dmco_html'):
if 'id' in script.attrib and script.attrib['id'].startswith('container_player_'): if 'id' in script.attrib and script.attrib['id'].startswith('container_player_'):
text = script.find('script').text text = script.find('script').text
mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', text) mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', text)

View file

@ -17,7 +17,7 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.parsers.lxmlparser import select
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
class Message(object): class Message(object):
@ -34,7 +34,7 @@ class BoardIndexPage(BasePage):
def get_messages(self, last=None): def get_messages(self, last=None):
msgs = [] msgs = []
for post in select(self.document.getroot(), 'post'): for post in self.parser.select(self.document.getroot(), 'post'):
m = Message(int(post.attrib['id']), m = Message(int(post.attrib['id']),
post.attrib['time'], post.attrib['time'],
post.find('login').text, post.find('login').text,

View file

@ -20,7 +20,7 @@
from datetime import datetime from datetime import datetime
from weboob.tools.parsers.lxmlparser import select, SelectElementException from weboob.tools.browser import BrokenPageError
from weboob.tools.misc import local2utc from weboob.tools.misc import local2utc
from weboob.backends.dlfp.tools import url2id from weboob.backends.dlfp.tools import url2id
@ -54,23 +54,23 @@ class Comment(Content):
self.id = div.attrib['id'].split('-')[1] self.id = div.attrib['id'].split('-')[1]
self.url = '%s#%s' % (article.url, div.attrib['id']) self.url = '%s#%s' % (article.url, div.attrib['id'])
self.title = unicode(select(div.find('h2'), 'a.title', 1).text) self.title = unicode(self.browser.parser.select(div.find('h2'), 'a.title', 1).text)
try: try:
a = select(div.find('p'), 'a[rel=author]', 1) a = self.browser.parser.select(div.find('p'), 'a[rel=author]', 1)
except SelectElementException: except BrokenPageError:
self.author = 'Anonyme' self.author = 'Anonyme'
self.username = None self.username = None
else: else:
self.author = unicode(a.text) self.author = unicode(a.text)
self.username = unicode(a.attrib['href'].split('/')[2]) self.username = unicode(a.attrib['href'].split('/')[2])
self.date = datetime.strptime(select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0], self.date = datetime.strptime(self.browser.parser.select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0],
'%Y-%m-%dT%H:%M:%S') '%Y-%m-%dT%H:%M:%S')
self.date = local2utc(self.date) self.date = local2utc(self.date)
content = div.find('div') content = div.find('div')
try: try:
signature = select(content, 'p.signature', 1) signature = self.browser.parser.select(content, 'p.signature', 1)
except SelectElementException: except BrokenPageError:
# No signature. # No signature.
pass pass
else: else:
@ -78,11 +78,11 @@ class Comment(Content):
self.signature = self.browser.parser.tostring(signature) self.signature = self.browser.parser.tostring(signature)
self.body = self.browser.parser.tostring(content) self.body = self.browser.parser.tostring(content)
self.score = int(select(div.find('p'), 'span.score', 1).text) self.score = int(self.browser.parser.select(div.find('p'), 'span.score', 1).text)
forms = select(div.find('footer'), 'form.button_to') forms = self.browser.parser.select(div.find('footer'), 'form.button_to')
if len(forms) > 0: if len(forms) > 0:
self.relevance_url = forms[0].attrib['action'].rstrip('for').rstrip('against') self.relevance_url = forms[0].attrib['action'].rstrip('for').rstrip('against')
self.relevance_token = select(forms[0], 'input[name=authenticity_token]', 1).attrib['value'] self.relevance_token = self.browser.parser.select(forms[0], 'input[name=authenticity_token]', 1).attrib['value']
subs = div.find('ul') subs = div.find('ul')
if subs is not None: if subs is not None:
@ -113,26 +113,26 @@ class Article(Content):
header = tree.find('header') header = tree.find('header')
self.title = u''.join([a.text for a in header.find('h1').findall('a')]) self.title = u''.join([a.text for a in header.find('h1').findall('a')])
try: try:
a = select(header, 'a[rel=author]', 1) a = self.browser.parser.select(header, 'a[rel=author]', 1)
except SelectElementException: except BrokenPageError:
self.author = 'Anonyme' self.author = 'Anonyme'
self.username = None self.username = None
else: else:
self.author = unicode(a.text) self.author = unicode(a.text)
self.username = unicode(a.attrib['href'].split('/')[2]) self.username = unicode(a.attrib['href'].split('/')[2])
self.body = self.browser.parser.tostring(select(tree, 'div.content', 1)) self.body = self.browser.parser.tostring(self.browser.parser.select(tree, 'div.content', 1))
try: try:
self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0], self.date = datetime.strptime(self.browser.parser.select(header, 'time', 1).attrib['datetime'].split('+')[0],
'%Y-%m-%dT%H:%M:%S') '%Y-%m-%dT%H:%M:%S')
self.date = local2utc(self.date) self.date = local2utc(self.date)
except SelectElementException: except BrokenPageError:
pass pass
forms = select(tree.find('footer'), 'form.button_to') forms = self.browser.parser.select(tree.find('footer'), 'form.button_to')
if len(forms) > 0: if len(forms) > 0:
self.relevance_url = forms[0].attrib['action'].rstrip('for').rstrip('against') self.relevance_url = forms[0].attrib['action'].rstrip('for').rstrip('against')
self.relevance_token = select(forms[0], 'input[name=authenticity_token]', 1).attrib['value'] self.relevance_token = self.browser.parser.select(forms[0], 'input[name=authenticity_token]', 1).attrib['value']
self.score = int(select(tree, 'div.figures figure.score', 1).text) self.score = int(self.browser.parser.select(tree, 'div.figures figure.score', 1).text)
def append_comment(self, comment): def append_comment(self, comment):
self.comments.append(comment) self.comments.append(comment)
@ -146,7 +146,7 @@ class Article(Content):
class CommentPage(DLFPPage): class CommentPage(DLFPPage):
def get_comment(self): def get_comment(self):
article = Article(self.browser, self.url, None) article = Article(self.browser, self.url, None)
return Comment(article, select(self.document.getroot(), 'li.comment', 1), 0) return Comment(article, self.parser.select(self.document.getroot(), 'li.comment', 1), 0)
class ContentPage(DLFPPage): class ContentPage(DLFPPage):
def on_loaded(self): def on_loaded(self):
@ -158,8 +158,8 @@ class ContentPage(DLFPPage):
def get_comment(self, id): def get_comment(self, id):
article = Article(self.browser, self.url, None) article = Article(self.browser, self.url, None)
try: try:
li = select(self.document.getroot(), 'li#comment-%s' % id, 1) li = self.parser.select(self.document.getroot(), 'li#comment-%s' % id, 1)
except SelectElementException: except BrokenPageError:
return None return None
else: else:
return Comment(article, li, 0) return Comment(article, li, 0)
@ -168,11 +168,11 @@ class ContentPage(DLFPPage):
if not self.article: if not self.article:
self.article = Article(self.browser, self.article = Article(self.browser,
self.url, self.url,
select(self.document.getroot(), 'div#contents article', 1)) self.parser.select(self.document.getroot(), 'div#contents article', 1))
try: try:
threads = select(self.document.getroot(), 'ul.threads', 1) threads = self.parser.select(self.document.getroot(), 'ul.threads', 1)
except SelectElementException: except BrokenPageError:
pass # no comments pass # no comments
else: else:
for comment in threads.findall('li'): for comment in threads.findall('li'):
@ -181,10 +181,10 @@ class ContentPage(DLFPPage):
return self.article return self.article
def get_post_comment_url(self): def get_post_comment_url(self):
return select(self.document.getroot(), 'p#send-comment', 1).find('a').attrib['href'] return self.parser.select(self.document.getroot(), 'p#send-comment', 1).find('a').attrib['href']
def get_tag_url(self): def get_tag_url(self):
return select(self.document.getroot(), 'div.tag_in_place', 1).find('a').attrib['href'] return self.parser.select(self.document.getroot(), 'div.tag_in_place', 1).find('a').attrib['href']
class NewCommentPage(DLFPPage): class NewCommentPage(DLFPPage):
pass pass
@ -201,8 +201,8 @@ class NewTagPage(DLFPPage):
class NodePage(DLFPPage): class NodePage(DLFPPage):
def get_errors(self): def get_errors(self):
try: try:
div = select(self.document.getroot(), 'div.errors', 1) div = self.parser.select(self.document.getroot(), 'div.errors', 1)
except SelectElementException: except BrokenPageError:
return [] return []
l = [] l = []

View file

@ -17,15 +17,15 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.parsers.lxmlparser import select, SelectElementException from weboob.tools.browser import BrokenPageError
from .index import DLFPPage from .index import DLFPPage
class WikiEditPage(DLFPPage): class WikiEditPage(DLFPPage):
def get_body(self): def get_body(self):
try: try:
return select(self.document.getroot(), 'textarea#wiki_page_wiki_body', 1).text return self.parser.select(self.document.getroot(), 'textarea#wiki_page_wiki_body', 1).text
except SelectElementException: except BrokenPageError:
return '' return ''
def _is_wiki_form(self, form): def _is_wiki_form(self, form):
@ -52,5 +52,5 @@ class WikiEditPage(DLFPPage):
self.browser.submit() self.browser.submit()
def get_preview_html(self): def get_preview_html(self):
body = select(self.document.getroot(), 'article.wikipage div.content', 1) body = self.parser.select(self.document.getroot(), 'article.wikipage div.content', 1)
return self.browser.parser.tostring(body) return self.parser.tostring(body)

View file

@ -29,9 +29,9 @@ class ArticlePage(GenericNewsPage):
def get_body(self): def get_body(self):
element_body = self.get_element_body() element_body = self.get_element_body()
remove_from_selector_list(element_body, ["p.auteur", "h4" ]) remove_from_selector_list(self.parser, element_body, ["p.auteur", "h4" ])
try_remove_from_selector_list(element_body, ["p.tag", "div.alire", self.element_title_selector, "h4"]) try_remove_from_selector_list(self.parser, element_body, ["p.tag", "div.alire", self.element_title_selector, "h4"])
try_drop_tree(element_body, "script") try_drop_tree(self.parser, element_body, "script")
return self.browser.parser.tostring(element_body) return self.parser.tostring(element_body)

View file

@ -87,7 +87,7 @@ class BoardPage(BasePage):
if div.tag == 'input' and div.attrib.get('type', 'checkbox') and div.attrib.get('value', 'delete'): if div.tag == 'input' and div.attrib.get('type', 'checkbox') and div.attrib.get('value', 'delete'):
article.id = int(div.attrib.get('name', '0')) article.id = int(div.attrib.get('name', '0'))
if div.tag == 'blockquote': if div.tag == 'blockquote':
article.text = self.browser.parser.tostring(div) article.text = self.parser.tostring(div)
if div.tag == 'table': if div.tag == 'table':
tags = div.cssselect('td.reply') tags = div.cssselect('td.reply')
if tags: if tags:

View file

@ -168,7 +168,7 @@ class TorrentsPage(BasePage):
title = title_t[0].find('strong').text.strip() title = title_t[0].find('strong').text.strip()
body_t = box.cssselect('div.body') body_t = box.cssselect('div.body')
if body_t: if body_t:
body = html2text(self.browser.parser.tostring(body_t[0])).strip() body = html2text(self.parser.tostring(body_t[0])).strip()
if title and body: if title and body:
if torrent.description is NotLoaded: if torrent.description is NotLoaded:

View file

@ -22,7 +22,7 @@ import datetime
import re import re
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select, SelectElementException from weboob.tools.browser import BrokenPageError
from ..video import InaVideo from ..video import InaVideo
@ -35,8 +35,8 @@ class SearchPage(BasePage):
def iter_videos(self): def iter_videos(self):
try: try:
ul = select(self.document.getroot(), 'div.container-videos ul', 1) ul = self.parser.select(self.document.getroot(), 'div.container-videos ul', 1)
except SelectElementException: except BrokenPageError:
# It means there are no results. # It means there are no results.
return return
for li in ul.findall('li'): for li in ul.findall('li'):
@ -44,18 +44,18 @@ class SearchPage(BasePage):
thumbnail = 'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src'] thumbnail = 'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src']
title = select(li, 'p.titre', 1).text title = self.parser.select(li, 'p.titre', 1).text
date = select(li, 'p.date', 1).text date = self.parser.select(li, 'p.date', 1).text
day, month, year = [int(s) for s in date.split('/')] day, month, year = [int(s) for s in date.split('/')]
date = datetime.datetime(year, month, day) date = datetime.datetime(year, month, day)
duration = select(li, 'p.duree', 1).text duration = self.parser.select(li, 'p.duree', 1).text
m = re.match(r'((\d+)h)?((\d+)min)?(\d+)s', duration) m = re.match(r'((\d+)h)?((\d+)min)?(\d+)s', duration)
if m: if m:
duration = datetime.timedelta(hours=int(m.group(2) or 0), minutes=int(m.group(4) or 0), seconds=int(m.group(5))) duration = datetime.timedelta(hours=int(m.group(2) or 0), minutes=int(m.group(4) or 0), seconds=int(m.group(5)))
else: else:
raise SelectElementException('Unable to match duration (%r)' % duration) raise BrokenPageError('Unable to match duration (%r)' % duration)
yield InaVideo(id, yield InaVideo(id,
title=title, title=title,

View file

@ -27,7 +27,7 @@ except ImportError:
from cgi import parse_qs from cgi import parse_qs
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import SelectElementException from weboob.tools.browser import BrokenPageError
from ..video import InaVideo from ..video import InaVideo
@ -75,9 +75,9 @@ class VideoPage(BasePage):
seconds=int(m.group(6))) seconds=int(m.group(6)))
return date, duration return date, duration
else: else:
raise SelectElementException('Unable to parse date and duration') raise BrokenPageError('Unable to parse date and duration')
else: else:
raise SelectElementException('Unable to find date and duration element') raise BrokenPageError('Unable to find date and duration element')
def get_title(self): def get_title(self):
el = self.document.getroot().cssselect('div.bloc-produit-haut h1')[0] el = self.document.getroot().cssselect('div.bloc-produit-haut h1')[0]

View file

@ -18,7 +18,7 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.parsers.lxmlparser import select, SelectElementException from weboob.tools.browser import BrokenPageError
from weboob.tools.genericArticle import GenericNewsPage, try_remove, \ from weboob.tools.genericArticle import GenericNewsPage, try_remove, \
try_remove_from_selector_list, \ try_remove_from_selector_list, \
drop_comments, NoneMainDiv drop_comments, NoneMainDiv
@ -37,24 +37,26 @@ class ArticlePage(GenericNewsPage):
except NoneMainDiv: except NoneMainDiv:
return None return None
else: else:
div_header_element = select(element_body, "div.header", 1) div_header_element = self.parser.select(element_body, "div.header", 1)
element_detail = select(element_body, "div.details", 1) element_detail = self.parser.select(element_body, "div.details", 1)
div_content_element = select(element_body, "div.content", 1) div_content_element = self.parser.select(element_body, "div.content", 1)
drop_comments(element_body) drop_comments(element_body)
try_remove(element_body, "div.sidebar") try_remove(self.parser, element_body, "div.sidebar")
try_remove(element_detail, "div.footer") try_remove(self.parser, element_detail, "div.footer")
try_remove_from_selector_list(div_header_element, try_remove_from_selector_list(self.parser,
div_header_element,
["h1", "div.picture", "div.date", ["h1", "div.picture", "div.date",
"div.news-single-img", "div.news-single-img",
"div.metas_img", "strong"]) "div.metas_img", "strong"])
try_remove_from_selector_list(div_content_element, try_remove_from_selector_list(self.parser,
div_content_element,
["div.tw_button", "div.wpfblike"]) ["div.tw_button", "div.wpfblike"])
try : try :
description_element = select(div_header_element, description_element = self.parser.select(div_header_element,
"div.description", 1) "div.description", 1)
except SelectElementException: except BrokenPageError:
pass pass
else: else:
text_content = description_element.text_content() text_content = description_element.text_content()
@ -75,6 +77,4 @@ class ArticlePage(GenericNewsPage):
div_content_element.drop_tag() div_content_element.drop_tag()
return self.browser.parser.tostring(element_body) return self.parser.tostring(element_body)

View file

@ -30,4 +30,4 @@ class InrocksTvPage(GenericNewsPage):
def get_body(self): def get_body(self):
element_body = self.get_element_body() element_body = self.get_element_body()
return self.browser.parser.tostring(element_body) return self.parser.tostring(element_body)

View file

@ -30,13 +30,13 @@ class ArticlePage(GenericNewsPage):
def get_body(self): def get_body(self):
element_body = self.get_element_body() element_body = self.get_element_body()
remove_from_selector_list(element_body, [self.element_title_selector]) remove_from_selector_list(self.parser, element_body, [self.element_title_selector])
drop_comments(element_body) drop_comments(element_body)
try_drop_tree(element_body, "script") try_drop_tree(self.parser, element_body, "script")
try_remove_from_selector_list(element_body, ["div.infos", "div.photo", "div.art_bandeau_bottom", "div.view", "span.auteur_long", "#toolsbar", 'link']) try_remove_from_selector_list(self.parser, element_body, ["div.infos", "div.photo", "div.art_bandeau_bottom", "div.view", "span.auteur_long", "#toolsbar", 'link'])
element_body.find_class("texte")[0].drop_tag() element_body.find_class("texte")[0].drop_tag()
element_body.tag = "div" element_body.tag = "div"
return self.browser.parser.tostring(element_body) return self.parser.tostring(element_body)

View file

@ -31,5 +31,5 @@ class FlashActuPage(GenericNewsPage):
def get_body(self): def get_body(self):
element_body = self.get_element_body() element_body = self.get_element_body()
element_body.tag = "div" element_body.tag = "div"
return self.browser.parser.tostring(element_body) return self.parser.tostring(element_body)

View file

@ -19,7 +19,7 @@
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
__all__ = ['XMLinfos'] __all__ = ['XMLinfos']
@ -28,7 +28,7 @@ __all__ = ['XMLinfos']
class XMLinfos(BasePage): class XMLinfos(BasePage):
def get_current(self): def get_current(self):
try: try:
for channel in select(self.document.getroot(), 'channel'): for channel in self.parser.select(self.document.getroot(), 'channel'):
title = channel.find('item/song_title').text title = channel.find('item/song_title').text
artist = channel.find('item/artist_name').text artist = channel.find('item/artist_name').text
except AttributeError: except AttributeError:

View file

@ -35,11 +35,11 @@ class ArticlePage(SimplePage):
except NoneMainDiv: except NoneMainDiv:
return None return None
else: else:
try_remove(element_body, "div.mna-tools") try_remove(self.parser, element_body, "div.mna-tools")
try_remove(element_body, "div.mna-comment-call") try_remove(self.parser, element_body, "div.mna-comment-call")
try : try :
element_body.remove(self.get_element_author()) element_body.remove(self.get_element_author())
except NoAuthorElement: except NoAuthorElement:
pass pass
return self.browser.parser.tostring(element_body) return self.parser.tostring(element_body)

View file

@ -19,7 +19,7 @@
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
__all__ = ['PlayerPage'] __all__ = ['PlayerPage']
@ -27,6 +27,6 @@ __all__ = ['PlayerPage']
class PlayerPage(BasePage): class PlayerPage(BasePage):
def get_current(self): def get_current(self):
title = select(self.document.getroot(), 'span.titre_en_cours', 1).text title = self.parser.select(self.document.getroot(), 'span.titre_en_cours', 1).text
artist = select(self.document.getroot(), 'span.artiste_en_cours', 1).text artist = self.parser.select(self.document.getroot(), 'span.artiste_en_cours', 1).text
return unicode(artist).strip(), unicode(title).strip() return unicode(artist).strip(), unicode(title).strip()

View file

@ -19,11 +19,11 @@
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
class WikiEditPage(BasePage): class WikiEditPage(BasePage):
def get_source(self): def get_source(self):
return select(self.document.getroot(), 'textarea#content_text', 1).text return self.parser.select(self.document.getroot(), 'textarea#content_text', 1).text
def set_source(self, data, message): def set_source(self, data, message):
self.browser.select_form(nr=1) self.browser.select_form(nr=1)
@ -33,7 +33,7 @@ class WikiEditPage(BasePage):
self.browser.submit() self.browser.submit()
def get_authenticity_token(self): def get_authenticity_token(self):
wiki_form = select(self.document.getroot(), 'form#wiki_form', 1) wiki_form = self.parser.select(self.document.getroot(), 'form#wiki_form', 1)
return wiki_form.xpath('div/input')[0].get('value') return wiki_form.xpath('div/input')[0].get('value')

View file

@ -22,7 +22,7 @@ import re
from weboob.capabilities.messages import CantSendMessage from weboob.capabilities.messages import CantSendMessage
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
__all__ = ['ClosePage', 'ComposePage', 'ConfirmPage', 'SentPage'] __all__ = ['ClosePage', 'ComposePage', 'ConfirmPage', 'SentPage']
@ -37,7 +37,7 @@ class ComposePage(BasePage):
def get_nb_remaining_free_sms(self): def get_nb_remaining_free_sms(self):
remaining_regex = re.compile(u'Il vous reste (?P<nb>.+) Texto gratuits vers les numéros SFR à envoyer aujourd\'hui') remaining_regex = re.compile(u'Il vous reste (?P<nb>.+) Texto gratuits vers les numéros SFR à envoyer aujourd\'hui')
text = select(self.document.getroot(), '#smsReminder', 1).text.strip() text = self.parser.select(self.document.getroot(), '#smsReminder', 1).text.strip()
return remaining_regex.match(text).groupdict().get('nb') return remaining_regex.match(text).groupdict().get('nb')
def post_message(self, message): def post_message(self, message):

View file

@ -22,7 +22,7 @@ import datetime
import re import re
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select, SelectElementException from weboob.tools.browser import BrokenPageError
from ..video import YoujizzVideo from ..video import YoujizzVideo
@ -32,25 +32,25 @@ __all__ = ['IndexPage']
class IndexPage(BasePage): class IndexPage(BasePage):
def iter_videos(self): def iter_videos(self):
span_list = select(self.document.getroot(), 'span#miniatura') span_list = self.parser.select(self.document.getroot(), 'span#miniatura')
for span in span_list: for span in span_list:
a = select(span, 'a', 1) a = self.parser.select(span, 'a', 1)
url = a.attrib['href'] url = a.attrib['href']
_id = re.sub(r'/videos/(.+)\.html', r'\1', url) _id = re.sub(r'/videos/(.+)\.html', r'\1', url)
thumbnail_url = span.find('.//img').attrib['src'] thumbnail_url = span.find('.//img').attrib['src']
title_el = select(span, 'span#title1', 1) title_el = self.parser.select(span, 'span#title1', 1)
title = title_el.text.strip() title = title_el.text.strip()
time_span = select(span, 'span.thumbtime span', 1) time_span = self.parser.select(span, 'span.thumbtime span', 1)
time_txt = time_span.text.strip().replace(';', ':') time_txt = time_span.text.strip().replace(';', ':')
if time_txt == 'N/A': if time_txt == 'N/A':
minutes, seconds = 0, 0 minutes, seconds = 0, 0
elif ':' in time_txt: elif ':' in time_txt:
minutes, seconds = (int(v) for v in time_txt.split(':')) minutes, seconds = (int(v) for v in time_txt.split(':'))
else: else:
raise SelectElementException('Unable to parse the video duration: %s' % time_txt) raise BrokenPageError('Unable to parse the video duration: %s' % time_txt)
yield YoujizzVideo(_id, yield YoujizzVideo(_id,

View file

@ -23,9 +23,8 @@ import lxml.html
import re import re
from weboob.capabilities.base import NotAvailable from weboob.capabilities.base import NotAvailable
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage, BrokenPageError
from weboob.tools.misc import to_unicode from weboob.tools.misc import to_unicode
from weboob.tools.parsers.lxmlparser import select, SelectElementException
from ..video import YoujizzVideo from ..video import YoujizzVideo
@ -39,7 +38,7 @@ class VideoPage(BasePage):
_id = to_unicode(self.group_dict['id']) _id = to_unicode(self.group_dict['id'])
if video is None: if video is None:
video = YoujizzVideo(_id) video = YoujizzVideo(_id)
title_el = select(self.document.getroot(), 'title', 1) title_el = self.parser.select(self.document.getroot(), 'title', 1)
video.title = to_unicode(title_el.text.strip()) video.title = to_unicode(title_el.text.strip())
# youjizz HTML is crap, we must parse it with regexps # youjizz HTML is crap, we must parse it with regexps
@ -53,13 +52,13 @@ class VideoPage(BasePage):
minutes, seconds = (int(v) for v in to_unicode(txt).split(':')) minutes, seconds = (int(v) for v in to_unicode(txt).split(':'))
video.duration = datetime.timedelta(minutes=minutes, seconds=seconds) video.duration = datetime.timedelta(minutes=minutes, seconds=seconds)
else: else:
raise SelectElementException('Unable to retrieve video duration') raise BrokenPageError('Unable to retrieve video duration')
video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data) video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data)
if len(video_file_urls) == 0: if len(video_file_urls) == 0:
raise SelectElementException('Video URL not found') raise BrokenPageError('Video URL not found')
elif len(video_file_urls) > 1: elif len(video_file_urls) > 1:
raise SelectElementException('Many video file URL found') raise BrokenPageError('Many video file URL found')
else: else:
video.url = video_file_urls[0] video.url = video_file_urls[0]

View file

@ -21,7 +21,7 @@
import re import re
import datetime import datetime
from weboob.tools.parsers.lxmlparser import select
from .base import PornPage from .base import PornPage
from ..video import YoupornVideo from ..video import YoupornVideo
@ -39,19 +39,19 @@ class VideoPage(PornPage):
return video return video
def get_url(self): def get_url(self):
download_div = select(self.document.getroot(), '#download', 1) download_div = self.parser.select(self.document.getroot(), '#download', 1)
a = select(download_div, 'a', 1) a = self.parser.select(download_div, 'a', 1)
return a.attrib['href'] return a.attrib['href']
def get_title(self): def get_title(self):
element = select(self.document.getroot(), '#videoArea h1', 1) element = self.parser.select(self.document.getroot(), '#videoArea h1', 1)
return unicode(element.getchildren()[0].tail).strip() return unicode(element.getchildren()[0].tail).strip()
DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)") DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)")
MONTH2I = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] MONTH2I = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
def set_details(self, v): def set_details(self, v):
details_div = select(self.document.getroot(), '#details', 1) details_div = self.parser.select(self.document.getroot(), '#details', 1)
for li in details_div.getiterator('li'): for li in details_div.getiterator('li'):
span = li.find('span') span = li.find('span')
name = span.text.strip() name = span.text.strip()

View file

@ -19,7 +19,7 @@
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select
__all__ = ['ForbiddenVideo', 'ForbiddenVideoPage', 'VerifyAgePage', 'VideoPage'] __all__ = ['ForbiddenVideo', 'ForbiddenVideoPage', 'VerifyAgePage', 'VideoPage']
@ -31,7 +31,7 @@ class ForbiddenVideo(Exception):
class ForbiddenVideoPage(BasePage): class ForbiddenVideoPage(BasePage):
def get_video(self, video=None): def get_video(self, video=None):
element = select(self.document.getroot(), '.yt-alert-content', 1) element = self.parser.select(self.document.getroot(), '.yt-alert-content', 1)
raise ForbiddenVideo(element.text.strip()) raise ForbiddenVideo(element.text.strip())

View file

@ -21,8 +21,8 @@
from weboob.tools.browser.browser import BrowserIncorrectPassword, BrowserBanned, \ from weboob.tools.browser.browser import BrowserIncorrectPassword, BrowserBanned, \
BrowserUnavailable, BrowserRetry, \ BrowserUnavailable, BrowserRetry, \
BrowserHTTPNotFound, BrowserHTTPError, \ BrowserHTTPNotFound, BrowserHTTPError, \
BasePage, BaseBrowser BasePage, BaseBrowser, BrokenPageError
__all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry', __all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry',
'BrowserHTTPNotFound', 'BrowserHTTPError', 'BasePage', 'BaseBrowser'] 'BrowserHTTPNotFound', 'BrowserHTTPError', 'BasePage', 'BaseBrowser', 'BrokenPageError']

View file

@ -93,6 +93,8 @@ class NoHistory(object):
def close(self): def close(self):
pass pass
class BrokenPageError(Exception):
    """
    Raised when the content of a page cannot be parsed as expected,
    for instance when a selector does not match the required elements.
    """
class BasePage(object): class BasePage(object):
""" """
@ -100,6 +102,7 @@ class BasePage(object):
""" """
def __init__(self, browser, document, url='', groups=None, group_dict=None, logger=None): def __init__(self, browser, document, url='', groups=None, group_dict=None, logger=None):
self.browser = browser self.browser = browser
self.parser = browser.parser
self.document = document self.document = document
self.url = url self.url = url
self.groups = groups self.groups = groups

View file

@ -16,32 +16,33 @@
# #
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BasePage from weboob.tools.browser import BasePage
from weboob.tools.parsers.lxmlparser import select, SelectElementException from weboob.tools.browser import BrokenPageError
from lxml.etree import Comment from lxml.etree import Comment
def try_remove(base_element, selector): def try_remove(parser, base_element, selector):
try : try :
base_element.remove(select(base_element, selector, 1 )) base_element.remove(parser.select(base_element, selector, 1 ))
except (SelectElementException, ValueError): except (BrokenPageError, ValueError):
pass pass
def try_drop_tree(base_element, selector): def try_drop_tree(parser, base_element, selector):
try: try:
select(base_element, selector, 1).drop_tree() parser.select(base_element, selector, 1).drop_tree()
except SelectElementException: except BrokenPageError:
pass pass
def remove_from_selector_list(base_element, selector_list): def remove_from_selector_list(parser, base_element, selector_list):
for selector in selector_list: for selector in selector_list:
base_element.remove(select(base_element, selector, 1)) base_element.remove(parser.select(base_element, selector, 1))
def try_remove_from_selector_list(base_element, selector_list): def try_remove_from_selector_list(parser, base_element, selector_list):
for selector in selector_list: for selector in selector_list:
try_remove(base_element, selector) try_remove(parser, base_element, selector)
def drop_comments(base_element): def drop_comments(base_element):
for comment in base_element.getiterator(Comment): for comment in base_element.getiterator(Comment):
@ -49,13 +50,13 @@ def drop_comments(base_element):
class NoAuthorElement(SelectElementException): class NoAuthorElement(BrokenPageError):
pass pass
class NoBodyElement(SelectElementException): class NoBodyElement(BrokenPageError):
pass pass
class NoTitleException(SelectElementException): class NoTitleException(BrokenPageError):
pass pass
class NoneMainDiv(AttributeError): class NoneMainDiv(AttributeError):
@ -81,7 +82,7 @@ class GenericNewsPage(BasePage):
element_author_selector = NotImplementedError element_author_selector = NotImplementedError
def get_body(self): def get_body(self):
return self.browser.parser.tostring(self.get_element_body()) return self.parser.tostring(self.get_element_body())
def get_author(self): def get_author(self):
try: try:
@ -92,7 +93,7 @@ class GenericNewsPage(BasePage):
def get_title(self): def get_title(self):
try : try :
return select( return self.parser.select(
self.main_div, self.main_div,
self.element_title_selector, self.element_title_selector,
1).text_content().strip() 1).text_content().strip()
@ -102,17 +103,17 @@ class GenericNewsPage(BasePage):
return self.__article.title return self.__article.title
else: else:
raise raise
except SelectElementException: except BrokenPageError:
try : try :
self.element_title_selector = "h1" self.element_title_selector = "h1"
return self.get_title() return self.get_title()
except SelectElementException: except BrokenPageError:
raise NoTitleException("no title on %s" % (self.browser)) raise NoTitleException("no title on %s" % (self.browser))
def get_element_body(self): def get_element_body(self):
try : try :
return select(self.main_div, self.element_body_selector, 1) return self.parser.select(self.main_div, self.element_body_selector, 1)
except SelectElementException: except BrokenPageError:
raise NoBodyElement("no body on %s" % (self.browser)) raise NoBodyElement("no body on %s" % (self.browser))
except AttributeError: except AttributeError:
if self.main_div == None: if self.main_div == None:
@ -122,8 +123,8 @@ class GenericNewsPage(BasePage):
def get_element_author(self): def get_element_author(self):
try: try:
return select(self.main_div, self.element_author_selector, 1) return self.parser.select(self.main_div, self.element_author_selector, 1)
except SelectElementException: except BrokenPageError:
raise NoAuthorElement() raise NoAuthorElement()
except AttributeError: except AttributeError:
if self.main_div == None: if self.main_div == None:

View file

@ -21,50 +21,10 @@
import lxml.html import lxml.html
from .iparser import IParser from .iparser import IParser
from ..browser import BrokenPageError
__all__ = ['LxmlHtmlParser', 'select', 'SelectElementException'] __all__ = ['LxmlHtmlParser']
class SelectElementException(Exception):
    """Raised by select() when a selector does not match enough elements."""


def select(element, selector, nb=None, method='cssselect'):
    """
    Select one or many elements from an element, using lxml cssselect by default.

    Raises SelectElementException if not found.

    @param element [obj]  element on which to apply selector
    @param selector [str]  CSS or XPath expression
    @param nb [int|str]  number of elements expected to be found:
                  None: no check, the list of matches is returned as is
                  (it may be empty);
                  'many': at least two matches are required, the list of
                  matches is returned;
                  N > 0: at least N matches are required; the first match
                  is returned when N is 1, else the whole list of matches.
    @param method [str]  (cssselect|xpath)
    @return  one or many Element
    """
    if method == 'cssselect':
        results = element.cssselect(selector)
    elif method == 'xpath':
        # The expression is expected to evaluate to a node-set (a list);
        # the count checks below rely on len().
        results = element.xpath(selector)
    else:
        raise NotImplementedError('Only the cssselect and xpath methods are implemented')

    if nb is None:
        return results

    if nb == 'many':
        if results is None or len(results) == 0:
            raise SelectElementException('Element not found with selector "%s"' % selector)
        if len(results) == 1:
            raise SelectElementException('Only one element found with selector "%s"' % selector)
        return results

    if isinstance(nb, int) and nb > 0:
        if results is None:
            raise SelectElementException('Element not found with selector "%s"' % selector)
        if len(results) < nb:
            raise SelectElementException('Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
        # A single expected element is unwrapped for the caller's convenience.
        return results[0] if nb == 1 else results

    raise Exception('Unhandled value for kwarg "nb": %s' % nb)
class LxmlHtmlParser(IParser): class LxmlHtmlParser(IParser):
@ -83,3 +43,40 @@ class LxmlHtmlParser(IParser):
def tostring(self, element): def tostring(self, element):
return lxml.html.tostring(element, encoding=unicode) return lxml.html.tostring(element, encoding=unicode)
@classmethod
def select(cls, element, selector, nb=None, method='cssselect'):
    """
    Select one or many elements from an element, using lxml cssselect by default.

    Raises BrokenPageError if not found.

    @param element [obj]  element on which to apply selector
    @param selector [str]  CSS or XPath expression
    @param nb [int|str]  number of elements expected to be found:
                  None: no check, the list of matches is returned as is
                  (it may be empty);
                  'many': at least two matches are required, the list of
                  matches is returned;
                  N > 0: at least N matches are required; the first match
                  is returned when N is 1, else the whole list of matches.
    @param method [str]  (cssselect|xpath)
    @return  one or many Element
    """
    if method == 'cssselect':
        results = element.cssselect(selector)
    elif method == 'xpath':
        # The expression is expected to evaluate to a node-set (a list);
        # the count checks below rely on len().
        results = element.xpath(selector)
    else:
        raise NotImplementedError('Only the cssselect and xpath methods are implemented')

    if nb is None:
        return results

    if nb == 'many':
        if results is None or len(results) == 0:
            raise BrokenPageError('Element not found with selector "%s"' % selector)
        if len(results) == 1:
            raise BrokenPageError('Only one element found with selector "%s"' % selector)
        return results

    if isinstance(nb, int) and nb > 0:
        if results is None:
            raise BrokenPageError('Element not found with selector "%s"' % selector)
        if len(results) < nb:
            raise BrokenPageError('Not enough elements found (%d expected) with selector "%s"' % (nb, selector))
        # A single expected element is unwrapped for the caller's convenience.
        return results[0] if nb == 1 else results

    raise Exception('Unhandled value for kwarg "nb": %s' % nb)