From 9afb301ebea2a6586c5d3ccdd74661f6e6eae5b3 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Fri, 8 Apr 2011 19:39:57 +0200 Subject: [PATCH] move select() in parser --- weboob/backends/arte/pages.py | 16 ++-- weboob/backends/bp/pages/accounthistory.py | 2 +- weboob/backends/canaltp/browser.py | 4 +- weboob/backends/dailymotion/pages.py | 26 +++--- weboob/backends/dlfp/pages/board.py | 4 +- weboob/backends/dlfp/pages/news.py | 56 ++++++------- weboob/backends/dlfp/pages/wiki.py | 10 +-- weboob/backends/ecrans/pages/article.py | 8 +- weboob/backends/fourchan/pages/board.py | 2 +- weboob/backends/gazelle/pages/torrents.py | 2 +- weboob/backends/ina/pages/search.py | 14 ++-- weboob/backends/ina/pages/video.py | 6 +- weboob/backends/inrocks/pages/article.py | 32 ++++---- weboob/backends/inrocks/pages/inrockstv.py | 2 +- weboob/backends/lefigaro/pages/article.py | 8 +- weboob/backends/lefigaro/pages/flashactu.py | 2 +- weboob/backends/lemouv/pages.py | 4 +- weboob/backends/minutes20/pages/article.py | 6 +- weboob/backends/ouifm/pages.py | 6 +- weboob/backends/redmine/pages/wiki.py | 6 +- weboob/backends/sfr/pages/compose.py | 4 +- weboob/backends/youjizz/pages/index.py | 12 +-- weboob/backends/youjizz/pages/video.py | 11 ++- weboob/backends/youporn/pages/video.py | 10 +-- weboob/backends/youtube/pages.py | 4 +- .../application/formatters/iformatter.py | 2 +- weboob/tools/browser/__init__.py | 4 +- weboob/tools/browser/browser.py | 3 + weboob/tools/genericArticle.py | 47 +++++------ weboob/tools/parsers/lxmlparser.py | 81 +++++++++---------- 30 files changed, 197 insertions(+), 197 deletions(-) diff --git a/weboob/backends/arte/pages.py b/weboob/backends/arte/pages.py index ed03b4f9..b2260b1c 100644 --- a/weboob/backends/arte/pages.py +++ b/weboob/backends/arte/pages.py @@ -22,7 +22,7 @@ import re import urllib from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import select + from .video import ArteVideo @@ -40,13 +40,13 @@ class 
IndexPage(BasePage): if m: _id = m.group(1) rating = rating_max = 0 - rates = select(div, 'div[class=rateContainer]', 1) + rates = self.parser.select(div, 'div[class=rateContainer]', 1) for r in rates.findall('div'): if 'star-rating-on' in r.attrib['class']: rating += 1 rating_max += 1 - thumb = select(div, 'img[class=thumbnail]', 1) + thumb = self.parser.select(div, 'img[class=thumbnail]', 1) thumbnail_url = 'http://videos.arte.tv' + thumb.attrib['src'] yield ArteVideo(_id, @@ -67,12 +67,12 @@ class VideoPage(BasePage): return self.document.getroot().cssselect('h2')[0].text def get_url(self, lang, quality): - obj = select(self.document.getroot(), 'object', 1) - movie_url = select(obj, 'param[name=movie]', 1) + obj = self.parser.select(self.document.getroot(), 'object', 1) + movie_url = self.parser.select(obj, 'param[name=movie]', 1) xml_url = urllib.unquote(movie_url.attrib['value'].split('videorefFileUrl=')[-1]) doc = self.browser.get_document(self.browser.openurl(xml_url)) - videos_list = select(doc.getroot(), 'video') + videos_list = self.parser.select(doc.getroot(), 'video') videos = {} for v in videos_list: videos[v.attrib['lang']] = v.attrib['ref'] @@ -84,8 +84,8 @@ class VideoPage(BasePage): doc = self.browser.get_document(self.browser.openurl(xml_url)) - obj = select(doc.getroot(), 'urls', 1) - videos_list = select(obj, 'url') + obj = self.parser.select(doc.getroot(), 'urls', 1) + videos_list = self.parser.select(obj, 'url') urls = {} for v in videos_list: urls[v.attrib['quality']] = v.text diff --git a/weboob/backends/bp/pages/accounthistory.py b/weboob/backends/bp/pages/accounthistory.py index c07f2513..1cb8341d 100644 --- a/weboob/backends/bp/pages/accounthistory.py +++ b/weboob/backends/bp/pages/accounthistory.py @@ -50,7 +50,7 @@ class AccountHistory(BasePage): operation = Operation(len(operations)) operation.date = mvt.xpath("./td/span")[0].text tmp = mvt.xpath("./td/span")[1] - operation.label = 
remove_extra_spaces(remove_html_tags(self.browser.parser.tostring(tmp))) + operation.label = remove_extra_spaces(remove_html_tags(self.parser.tostring(tmp))) r = re.compile(r'\d+') diff --git a/weboob/backends/canaltp/browser.py b/weboob/backends/canaltp/browser.py index 34177463..e0a58ccf 100644 --- a/weboob/backends/canaltp/browser.py +++ b/weboob/backends/canaltp/browser.py @@ -22,7 +22,7 @@ from datetime import datetime, date, time from weboob.tools.browser import BaseBrowser from weboob.tools.misc import to_unicode -from weboob.tools.parsers.lxmlparser import SelectElementException +from weboob.tools.browser import BrokenPageError __all__ = ['CanalTP'] @@ -52,7 +52,7 @@ class CanalTP(BaseBrowser): departure = '' for line in result.split('&'): if not '=' in line: - raise SelectElementException('Unable to parse result: %s' % line) + raise BrokenPageError('Unable to parse result: %s' % line) key, value = line.split('=', 1) if key == 'nomgare': departure = value diff --git a/weboob/backends/dailymotion/pages.py b/weboob/backends/dailymotion/pages.py index 7859d4b7..e886e429 100644 --- a/weboob/backends/dailymotion/pages.py +++ b/weboob/backends/dailymotion/pages.py @@ -24,7 +24,7 @@ import re from weboob.capabilities.video import VideoThumbnail from weboob.tools.misc import html2text from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import select + from .video import DailymotionVideo @@ -34,7 +34,7 @@ __all__ = ['IndexPage', 'VideoPage'] class IndexPage(BasePage): def iter_videos(self): - for div in select(self.document.getroot(), 'div.dmpi_video_item'): + for div in self.parser.select(self.document.getroot(), 'div.dmpi_video_item'): _id = 0 for cls in div.attrib['class'].split(): if cls.startswith('id_'): @@ -46,15 +46,15 @@ class IndexPage(BasePage): continue video = DailymotionVideo(int(_id)) - video.title = select(div, 'h3 a', 1).text - video.author = select(div, 'div.dmpi_user_login', 1).find('a').text - video.description = 
html2text(self.browser.parser.tostring(select(div, 'div.dmpi_video_description', 1))).strip() - minutes, seconds = select(div, 'div.duration', 1).text.split(':') + video.title = self.parser.select(div, 'h3 a', 1).text + video.author = self.parser.select(div, 'div.dmpi_user_login', 1).find('a').text + video.description = html2text(self.parser.tostring(self.parser.select(div, 'div.dmpi_video_description', 1))).strip() + minutes, seconds = self.parser.select(div, 'div.duration', 1).text.split(':') video.duration = datetime.timedelta(minutes=int(minutes), seconds=int(seconds)) - url = select(div, 'img.dmco_image', 1).attrib['src'] + url = self.parser.select(div, 'img.dmco_image', 1).attrib['src'] video.thumbnail = VideoThumbnail(url) - rating_div = select(div, 'div.small_stars', 1) + rating_div = self.parser.select(div, 'div.small_stars', 1) video.rating_max = self.get_rate(rating_div) video.rating = self.get_rate(rating_div.find('div')) # XXX missing date @@ -73,12 +73,12 @@ class VideoPage(BasePage): if video is None: video = DailymotionVideo(self.group_dict['id']) - div = select(self.document.getroot(), 'div#content', 1) + div = self.parser.select(self.document.getroot(), 'div#content', 1) - video.title = select(div, 'span.title', 1).text - video.author = select(div, 'a.name', 1).text - video.description = select(div, 'div#video_description', 1).text - for script in select(self.document.getroot(), 'div.dmco_html'): + video.title = self.parser.select(div, 'span.title', 1).text + video.author = self.parser.select(div, 'a.name', 1).text + video.description = self.parser.select(div, 'div#video_description', 1).text + for script in self.parser.select(self.document.getroot(), 'div.dmco_html'): if 'id' in script.attrib and script.attrib['id'].startswith('container_player_'): text = script.find('script').text mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', text) diff --git a/weboob/backends/dlfp/pages/board.py b/weboob/backends/dlfp/pages/board.py index 
ee9a525b..9cc3c48d 100644 --- a/weboob/backends/dlfp/pages/board.py +++ b/weboob/backends/dlfp/pages/board.py @@ -17,7 +17,7 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . -from weboob.tools.parsers.lxmlparser import select + from weboob.tools.browser import BasePage class Message(object): @@ -34,7 +34,7 @@ class BoardIndexPage(BasePage): def get_messages(self, last=None): msgs = [] - for post in select(self.document.getroot(), 'post'): + for post in self.parser.select(self.document.getroot(), 'post'): m = Message(int(post.attrib['id']), post.attrib['time'], post.find('login').text, diff --git a/weboob/backends/dlfp/pages/news.py b/weboob/backends/dlfp/pages/news.py index 658761c7..97f326b1 100644 --- a/weboob/backends/dlfp/pages/news.py +++ b/weboob/backends/dlfp/pages/news.py @@ -20,7 +20,7 @@ from datetime import datetime -from weboob.tools.parsers.lxmlparser import select, SelectElementException +from weboob.tools.browser import BrokenPageError from weboob.tools.misc import local2utc from weboob.backends.dlfp.tools import url2id @@ -54,23 +54,23 @@ class Comment(Content): self.id = div.attrib['id'].split('-')[1] self.url = '%s#%s' % (article.url, div.attrib['id']) - self.title = unicode(select(div.find('h2'), 'a.title', 1).text) + self.title = unicode(self.browser.parser.select(div.find('h2'), 'a.title', 1).text) try: - a = select(div.find('p'), 'a[rel=author]', 1) - except SelectElementException: + a = self.browser.parser.select(div.find('p'), 'a[rel=author]', 1) + except BrokenPageError: self.author = 'Anonyme' self.username = None else: self.author = unicode(a.text) self.username = unicode(a.attrib['href'].split('/')[2]) - self.date = datetime.strptime(select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0], + self.date = datetime.strptime(self.browser.parser.select(div.find('p'), 'time', 1).attrib['datetime'].split('+')[0], '%Y-%m-%dT%H:%M:%S') self.date = local2utc(self.date) 
content = div.find('div') try: - signature = select(content, 'p.signature', 1) - except SelectElementException: + signature = self.browser.parser.select(content, 'p.signature', 1) + except BrokenPageError: # No signature. pass else: @@ -78,11 +78,11 @@ class Comment(Content): self.signature = self.browser.parser.tostring(signature) self.body = self.browser.parser.tostring(content) - self.score = int(select(div.find('p'), 'span.score', 1).text) - forms = select(div.find('footer'), 'form.button_to') + self.score = int(self.browser.parser.select(div.find('p'), 'span.score', 1).text) + forms = self.browser.parser.select(div.find('footer'), 'form.button_to') if len(forms) > 0: self.relevance_url = forms[0].attrib['action'].rstrip('for').rstrip('against') - self.relevance_token = select(forms[0], 'input[name=authenticity_token]', 1).attrib['value'] + self.relevance_token = self.browser.parser.select(forms[0], 'input[name=authenticity_token]', 1).attrib['value'] subs = div.find('ul') if subs is not None: @@ -113,26 +113,26 @@ class Article(Content): header = tree.find('header') self.title = u' — '.join([a.text for a in header.find('h1').findall('a')]) try: - a = select(header, 'a[rel=author]', 1) - except SelectElementException: + a = self.browser.parser.select(header, 'a[rel=author]', 1) + except BrokenPageError: self.author = 'Anonyme' self.username = None else: self.author = unicode(a.text) self.username = unicode(a.attrib['href'].split('/')[2]) - self.body = self.browser.parser.tostring(select(tree, 'div.content', 1)) + self.body = self.browser.parser.tostring(self.browser.parser.select(tree, 'div.content', 1)) try: - self.date = datetime.strptime(select(header, 'time', 1).attrib['datetime'].split('+')[0], + self.date = datetime.strptime(self.browser.parser.select(header, 'time', 1).attrib['datetime'].split('+')[0], '%Y-%m-%dT%H:%M:%S') self.date = local2utc(self.date) - except SelectElementException: + except BrokenPageError: pass - forms = 
select(tree.find('footer'), 'form.button_to') + forms = self.browser.parser.select(tree.find('footer'), 'form.button_to') if len(forms) > 0: self.relevance_url = forms[0].attrib['action'].rstrip('for').rstrip('against') - self.relevance_token = select(forms[0], 'input[name=authenticity_token]', 1).attrib['value'] + self.relevance_token = self.browser.parser.select(forms[0], 'input[name=authenticity_token]', 1).attrib['value'] - self.score = int(select(tree, 'div.figures figure.score', 1).text) + self.score = int(self.browser.parser.select(tree, 'div.figures figure.score', 1).text) def append_comment(self, comment): self.comments.append(comment) @@ -146,7 +146,7 @@ class Article(Content): class CommentPage(DLFPPage): def get_comment(self): article = Article(self.browser, self.url, None) - return Comment(article, select(self.document.getroot(), 'li.comment', 1), 0) + return Comment(article, self.parser.select(self.document.getroot(), 'li.comment', 1), 0) class ContentPage(DLFPPage): def on_loaded(self): @@ -158,8 +158,8 @@ class ContentPage(DLFPPage): def get_comment(self, id): article = Article(self.browser, self.url, None) try: - li = select(self.document.getroot(), 'li#comment-%s' % id, 1) - except SelectElementException: + li = self.parser.select(self.document.getroot(), 'li#comment-%s' % id, 1) + except BrokenPageError: return None else: return Comment(article, li, 0) @@ -168,11 +168,11 @@ class ContentPage(DLFPPage): if not self.article: self.article = Article(self.browser, self.url, - select(self.document.getroot(), 'div#contents article', 1)) + self.parser.select(self.document.getroot(), 'div#contents article', 1)) try: - threads = select(self.document.getroot(), 'ul.threads', 1) - except SelectElementException: + threads = self.parser.select(self.document.getroot(), 'ul.threads', 1) + except BrokenPageError: pass # no comments else: for comment in threads.findall('li'): @@ -181,10 +181,10 @@ class ContentPage(DLFPPage): return self.article def 
get_post_comment_url(self): - return select(self.document.getroot(), 'p#send-comment', 1).find('a').attrib['href'] + return self.parser.select(self.document.getroot(), 'p#send-comment', 1).find('a').attrib['href'] def get_tag_url(self): - return select(self.document.getroot(), 'div.tag_in_place', 1).find('a').attrib['href'] + return self.parser.select(self.document.getroot(), 'div.tag_in_place', 1).find('a').attrib['href'] class NewCommentPage(DLFPPage): pass @@ -201,8 +201,8 @@ class NewTagPage(DLFPPage): class NodePage(DLFPPage): def get_errors(self): try: - div = select(self.document.getroot(), 'div.errors', 1) - except SelectElementException: + div = self.parser.select(self.document.getroot(), 'div.errors', 1) + except BrokenPageError: return [] l = [] diff --git a/weboob/backends/dlfp/pages/wiki.py b/weboob/backends/dlfp/pages/wiki.py index 1fe6a58c..f8eed006 100644 --- a/weboob/backends/dlfp/pages/wiki.py +++ b/weboob/backends/dlfp/pages/wiki.py @@ -17,15 +17,15 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . 
-from weboob.tools.parsers.lxmlparser import select, SelectElementException +from weboob.tools.browser import BrokenPageError from .index import DLFPPage class WikiEditPage(DLFPPage): def get_body(self): try: - return select(self.document.getroot(), 'textarea#wiki_page_wiki_body', 1).text - except SelectElementException: + return self.parser.select(self.document.getroot(), 'textarea#wiki_page_wiki_body', 1).text + except BrokenPageError: return '' def _is_wiki_form(self, form): @@ -52,5 +52,5 @@ class WikiEditPage(DLFPPage): self.browser.submit() def get_preview_html(self): - body = select(self.document.getroot(), 'article.wikipage div.content', 1) - return self.browser.parser.tostring(body) + body = self.parser.select(self.document.getroot(), 'article.wikipage div.content', 1) + return self.parser.tostring(body) diff --git a/weboob/backends/ecrans/pages/article.py b/weboob/backends/ecrans/pages/article.py index bedbe0b3..b11ba6c3 100644 --- a/weboob/backends/ecrans/pages/article.py +++ b/weboob/backends/ecrans/pages/article.py @@ -29,9 +29,9 @@ class ArticlePage(GenericNewsPage): def get_body(self): element_body = self.get_element_body() - remove_from_selector_list(element_body, ["p.auteur", "h4" ]) - try_remove_from_selector_list(element_body, ["p.tag", "div.alire", self.element_title_selector, "h4"]) - try_drop_tree(element_body, "script") + remove_from_selector_list(self.parser, element_body, ["p.auteur", "h4" ]) + try_remove_from_selector_list(self.parser, element_body, ["p.tag", "div.alire", self.element_title_selector, "h4"]) + try_drop_tree(self.parser, element_body, "script") - return self.browser.parser.tostring(element_body) + return self.parser.tostring(element_body) diff --git a/weboob/backends/fourchan/pages/board.py b/weboob/backends/fourchan/pages/board.py index 2a1f5db5..5f3a0c6c 100644 --- a/weboob/backends/fourchan/pages/board.py +++ b/weboob/backends/fourchan/pages/board.py @@ -87,7 +87,7 @@ class BoardPage(BasePage): if div.tag == 'input' and 
div.attrib.get('type', 'checkbox') and div.attrib.get('value', 'delete'): article.id = int(div.attrib.get('name', '0')) if div.tag == 'blockquote': - article.text = self.browser.parser.tostring(div) + article.text = self.parser.tostring(div) if div.tag == 'table': tags = div.cssselect('td.reply') if tags: diff --git a/weboob/backends/gazelle/pages/torrents.py b/weboob/backends/gazelle/pages/torrents.py index 8f4a0aea..575efad2 100644 --- a/weboob/backends/gazelle/pages/torrents.py +++ b/weboob/backends/gazelle/pages/torrents.py @@ -168,7 +168,7 @@ class TorrentsPage(BasePage): title = title_t[0].find('strong').text.strip() body_t = box.cssselect('div.body') if body_t: - body = html2text(self.browser.parser.tostring(body_t[0])).strip() + body = html2text(self.parser.tostring(body_t[0])).strip() if title and body: if torrent.description is NotLoaded: diff --git a/weboob/backends/ina/pages/search.py b/weboob/backends/ina/pages/search.py index 0d0e73db..940dc533 100644 --- a/weboob/backends/ina/pages/search.py +++ b/weboob/backends/ina/pages/search.py @@ -22,7 +22,7 @@ import datetime import re from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import select, SelectElementException +from weboob.tools.browser import BrokenPageError from ..video import InaVideo @@ -35,8 +35,8 @@ class SearchPage(BasePage): def iter_videos(self): try: - ul = select(self.document.getroot(), 'div.container-videos ul', 1) - except SelectElementException: + ul = self.parser.select(self.document.getroot(), 'div.container-videos ul', 1) + except BrokenPageError: # It means there are no results. 
return for li in ul.findall('li'): @@ -44,18 +44,18 @@ class SearchPage(BasePage): thumbnail = 'http://boutique.ina.fr%s' % li.find('a').find('img').attrib['src'] - title = select(li, 'p.titre', 1).text + title = self.parser.select(li, 'p.titre', 1).text - date = select(li, 'p.date', 1).text + date = self.parser.select(li, 'p.date', 1).text day, month, year = [int(s) for s in date.split('/')] date = datetime.datetime(year, month, day) - duration = select(li, 'p.duree', 1).text + duration = self.parser.select(li, 'p.duree', 1).text m = re.match(r'((\d+)h)?((\d+)min)?(\d+)s', duration) if m: duration = datetime.timedelta(hours=int(m.group(2) or 0), minutes=int(m.group(4) or 0), seconds=int(m.group(5))) else: - raise SelectElementException('Unable to match duration (%r)' % duration) + raise BrokenPageError('Unable to match duration (%r)' % duration) yield InaVideo(id, title=title, diff --git a/weboob/backends/ina/pages/video.py b/weboob/backends/ina/pages/video.py index 155408b1..069894e4 100644 --- a/weboob/backends/ina/pages/video.py +++ b/weboob/backends/ina/pages/video.py @@ -27,7 +27,7 @@ except ImportError: from cgi import parse_qs from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import SelectElementException +from weboob.tools.browser import BrokenPageError from ..video import InaVideo @@ -75,9 +75,9 @@ class VideoPage(BasePage): seconds=int(m.group(6))) return date, duration else: - raise SelectElementException('Unable to parse date and duration') + raise BrokenPageError('Unable to parse date and duration') else: - raise SelectElementException('Unable to find date and duration element') + raise BrokenPageError('Unable to find date and duration element') def get_title(self): el = self.document.getroot().cssselect('div.bloc-produit-haut h1')[0] diff --git a/weboob/backends/inrocks/pages/article.py b/weboob/backends/inrocks/pages/article.py index 9580553d..6f0d980a 100644 --- a/weboob/backends/inrocks/pages/article.py +++ 
b/weboob/backends/inrocks/pages/article.py @@ -18,7 +18,7 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . -from weboob.tools.parsers.lxmlparser import select, SelectElementException +from weboob.tools.browser import BrokenPageError from weboob.tools.genericArticle import GenericNewsPage, try_remove, \ try_remove_from_selector_list, \ drop_comments, NoneMainDiv @@ -35,26 +35,28 @@ class ArticlePage(GenericNewsPage): try : element_body = self.get_element_body() except NoneMainDiv: - return None + return None else: - div_header_element = select(element_body, "div.header", 1) - element_detail = select(element_body, "div.details", 1) - div_content_element = select(element_body, "div.content", 1) + div_header_element = self.parser.select(element_body, "div.header", 1) + element_detail = self.parser.select(element_body, "div.details", 1) + div_content_element = self.parser.select(element_body, "div.content", 1) drop_comments(element_body) - try_remove(element_body, "div.sidebar") - try_remove(element_detail, "div.footer") - try_remove_from_selector_list(div_header_element, - ["h1", "div.picture", "div.date", - "div.news-single-img", + try_remove(self.parser, element_body, "div.sidebar") + try_remove(self.parser, element_detail, "div.footer") + try_remove_from_selector_list(self.parser, + div_header_element, + ["h1", "div.picture", "div.date", + "div.news-single-img", "div.metas_img", "strong"]) - try_remove_from_selector_list(div_content_element, + try_remove_from_selector_list(self.parser, + div_content_element, ["div.tw_button", "div.wpfblike"]) try : - description_element = select(div_header_element, + description_element = self.parser.select(div_header_element, "div.description", 1) - except SelectElementException: + except BrokenPageError: pass else: text_content = description_element.text_content() @@ -75,6 +77,4 @@ class ArticlePage(GenericNewsPage): div_content_element.drop_tag() - return 
self.browser.parser.tostring(element_body) - - + return self.parser.tostring(element_body) diff --git a/weboob/backends/inrocks/pages/inrockstv.py b/weboob/backends/inrocks/pages/inrockstv.py index 4a59f4ee..bbc89fd9 100644 --- a/weboob/backends/inrocks/pages/inrockstv.py +++ b/weboob/backends/inrocks/pages/inrockstv.py @@ -30,4 +30,4 @@ class InrocksTvPage(GenericNewsPage): def get_body(self): element_body = self.get_element_body() - return self.browser.parser.tostring(element_body) + return self.parser.tostring(element_body) diff --git a/weboob/backends/lefigaro/pages/article.py b/weboob/backends/lefigaro/pages/article.py index 47403118..e9b8ed0d 100644 --- a/weboob/backends/lefigaro/pages/article.py +++ b/weboob/backends/lefigaro/pages/article.py @@ -30,13 +30,13 @@ class ArticlePage(GenericNewsPage): def get_body(self): element_body = self.get_element_body() - remove_from_selector_list(element_body, [self.element_title_selector]) + remove_from_selector_list(self.parser, element_body, [self.element_title_selector]) drop_comments(element_body) - try_drop_tree(element_body, "script") + try_drop_tree(self.parser, element_body, "script") - try_remove_from_selector_list(element_body, ["div.infos", "div.photo", "div.art_bandeau_bottom", "div.view", "span.auteur_long", "#toolsbar", 'link']) + try_remove_from_selector_list(self.parser, element_body, ["div.infos", "div.photo", "div.art_bandeau_bottom", "div.view", "span.auteur_long", "#toolsbar", 'link']) element_body.find_class("texte")[0].drop_tag() element_body.tag = "div" - return self.browser.parser.tostring(element_body) + return self.parser.tostring(element_body) diff --git a/weboob/backends/lefigaro/pages/flashactu.py b/weboob/backends/lefigaro/pages/flashactu.py index 6d3737e2..2b2d61c7 100644 --- a/weboob/backends/lefigaro/pages/flashactu.py +++ b/weboob/backends/lefigaro/pages/flashactu.py @@ -31,5 +31,5 @@ class FlashActuPage(GenericNewsPage): def get_body(self): element_body = self.get_element_body() 
element_body.tag = "div" - return self.browser.parser.tostring(element_body) + return self.parser.tostring(element_body) diff --git a/weboob/backends/lemouv/pages.py b/weboob/backends/lemouv/pages.py index 877e3446..38d303d8 100644 --- a/weboob/backends/lemouv/pages.py +++ b/weboob/backends/lemouv/pages.py @@ -19,7 +19,7 @@ from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import select + __all__ = ['XMLinfos'] @@ -28,7 +28,7 @@ __all__ = ['XMLinfos'] class XMLinfos(BasePage): def get_current(self): try: - for channel in select(self.document.getroot(), 'channel'): + for channel in self.parser.select(self.document.getroot(), 'channel'): title = channel.find('item/song_title').text artist = channel.find('item/artist_name').text except AttributeError: diff --git a/weboob/backends/minutes20/pages/article.py b/weboob/backends/minutes20/pages/article.py index 9ee73e28..9cee4575 100644 --- a/weboob/backends/minutes20/pages/article.py +++ b/weboob/backends/minutes20/pages/article.py @@ -35,11 +35,11 @@ class ArticlePage(SimplePage): except NoneMainDiv: return None else: - try_remove(element_body, "div.mna-tools") - try_remove(element_body, "div.mna-comment-call") + try_remove(self.parser, element_body, "div.mna-tools") + try_remove(self.parser, element_body, "div.mna-comment-call") try : element_body.remove(self.get_element_author()) except NoAuthorElement: pass - return self.browser.parser.tostring(element_body) + return self.parser.tostring(element_body) diff --git a/weboob/backends/ouifm/pages.py b/weboob/backends/ouifm/pages.py index 66cfb0c5..67baebdc 100644 --- a/weboob/backends/ouifm/pages.py +++ b/weboob/backends/ouifm/pages.py @@ -19,7 +19,7 @@ from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import select + __all__ = ['PlayerPage'] @@ -27,6 +27,6 @@ __all__ = ['PlayerPage'] class PlayerPage(BasePage): def get_current(self): - title = select(self.document.getroot(), 'span.titre_en_cours', 1).text - 
artist = select(self.document.getroot(), 'span.artiste_en_cours', 1).text + title = self.parser.select(self.document.getroot(), 'span.titre_en_cours', 1).text + artist = self.parser.select(self.document.getroot(), 'span.artiste_en_cours', 1).text return unicode(artist).strip(), unicode(title).strip() diff --git a/weboob/backends/redmine/pages/wiki.py b/weboob/backends/redmine/pages/wiki.py index faf2cf88..48c1a265 100644 --- a/weboob/backends/redmine/pages/wiki.py +++ b/weboob/backends/redmine/pages/wiki.py @@ -19,11 +19,11 @@ from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import select + class WikiEditPage(BasePage): def get_source(self): - return select(self.document.getroot(), 'textarea#content_text', 1).text + return self.parser.select(self.document.getroot(), 'textarea#content_text', 1).text def set_source(self, data, message): self.browser.select_form(nr=1) @@ -33,7 +33,7 @@ class WikiEditPage(BasePage): self.browser.submit() def get_authenticity_token(self): - wiki_form = select(self.document.getroot(), 'form#wiki_form', 1) + wiki_form = self.parser.select(self.document.getroot(), 'form#wiki_form', 1) return wiki_form.xpath('div/input')[0].get('value') diff --git a/weboob/backends/sfr/pages/compose.py b/weboob/backends/sfr/pages/compose.py index 5d4347d3..a2d4d22f 100644 --- a/weboob/backends/sfr/pages/compose.py +++ b/weboob/backends/sfr/pages/compose.py @@ -22,7 +22,7 @@ import re from weboob.capabilities.messages import CantSendMessage from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import select + __all__ = ['ClosePage', 'ComposePage', 'ConfirmPage', 'SentPage'] @@ -37,7 +37,7 @@ class ComposePage(BasePage): def get_nb_remaining_free_sms(self): remaining_regex = re.compile(u'Il vous reste (?P.+) Texto gratuits vers les numéros SFR à envoyer aujourd\'hui') - text = select(self.document.getroot(), '#smsReminder', 1).text.strip() + text = self.parser.select(self.document.getroot(), 
'#smsReminder', 1).text.strip() return remaining_regex.match(text).groupdict().get('nb') def post_message(self, message): diff --git a/weboob/backends/youjizz/pages/index.py b/weboob/backends/youjizz/pages/index.py index 7c49c24d..3b9ac594 100644 --- a/weboob/backends/youjizz/pages/index.py +++ b/weboob/backends/youjizz/pages/index.py @@ -22,7 +22,7 @@ import datetime import re from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import select, SelectElementException +from weboob.tools.browser import BrokenPageError from ..video import YoujizzVideo @@ -32,25 +32,25 @@ __all__ = ['IndexPage'] class IndexPage(BasePage): def iter_videos(self): - span_list = select(self.document.getroot(), 'span#miniatura') + span_list = self.parser.select(self.document.getroot(), 'span#miniatura') for span in span_list: - a = select(span, 'a', 1) + a = self.parser.select(span, 'a', 1) url = a.attrib['href'] _id = re.sub(r'/videos/(.+)\.html', r'\1', url) thumbnail_url = span.find('.//img').attrib['src'] - title_el = select(span, 'span#title1', 1) + title_el = self.parser.select(span, 'span#title1', 1) title = title_el.text.strip() - time_span = select(span, 'span.thumbtime span', 1) + time_span = self.parser.select(span, 'span.thumbtime span', 1) time_txt = time_span.text.strip().replace(';', ':') if time_txt == 'N/A': minutes, seconds = 0, 0 elif ':' in time_txt: minutes, seconds = (int(v) for v in time_txt.split(':')) else: - raise SelectElementException('Unable to parse the video duration: %s' % time_txt) + raise BrokenPageError('Unable to parse the video duration: %s' % time_txt) yield YoujizzVideo(_id, diff --git a/weboob/backends/youjizz/pages/video.py b/weboob/backends/youjizz/pages/video.py index 0b5929d6..99bc8817 100644 --- a/weboob/backends/youjizz/pages/video.py +++ b/weboob/backends/youjizz/pages/video.py @@ -23,9 +23,8 @@ import lxml.html import re from weboob.capabilities.base import NotAvailable -from weboob.tools.browser import BasePage 
+from weboob.tools.browser import BasePage, BrokenPageError from weboob.tools.misc import to_unicode -from weboob.tools.parsers.lxmlparser import select, SelectElementException from ..video import YoujizzVideo @@ -39,7 +38,7 @@ class VideoPage(BasePage): _id = to_unicode(self.group_dict['id']) if video is None: video = YoujizzVideo(_id) - title_el = select(self.document.getroot(), 'title', 1) + title_el = self.parser.select(self.document.getroot(), 'title', 1) video.title = to_unicode(title_el.text.strip()) # youjizz HTML is crap, we must parse it with regexps @@ -53,13 +52,13 @@ class VideoPage(BasePage): minutes, seconds = (int(v) for v in to_unicode(txt).split(':')) video.duration = datetime.timedelta(minutes=minutes, seconds=seconds) else: - raise SelectElementException('Unable to retrieve video duration') + raise BrokenPageError('Unable to retrieve video duration') video_file_urls = re.findall(r'"(http://media[^ ,]+\.flv)"', data) if len(video_file_urls) == 0: - raise SelectElementException('Video URL not found') + raise BrokenPageError('Video URL not found') elif len(video_file_urls) > 1: - raise SelectElementException('Many video file URL found') + raise BrokenPageError('Many video file URL found') else: video.url = video_file_urls[0] diff --git a/weboob/backends/youporn/pages/video.py b/weboob/backends/youporn/pages/video.py index 51bd9632..9391f908 100644 --- a/weboob/backends/youporn/pages/video.py +++ b/weboob/backends/youporn/pages/video.py @@ -21,7 +21,7 @@ import re import datetime -from weboob.tools.parsers.lxmlparser import select + from .base import PornPage from ..video import YoupornVideo @@ -39,19 +39,19 @@ class VideoPage(PornPage): return video def get_url(self): - download_div = select(self.document.getroot(), '#download', 1) - a = select(download_div, 'a', 1) + download_div = self.parser.select(self.document.getroot(), '#download', 1) + a = self.parser.select(download_div, 'a', 1) return a.attrib['href'] def get_title(self): - element = 
select(self.document.getroot(), '#videoArea h1', 1) + element = self.parser.select(self.document.getroot(), '#videoArea h1', 1) return unicode(element.getchildren()[0].tail).strip() DATE_REGEXP = re.compile("\w+ (\w+) (\d+) (\d+):(\d+):(\d+) (\d+)") MONTH2I = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] def set_details(self, v): - details_div = select(self.document.getroot(), '#details', 1) + details_div = self.parser.select(self.document.getroot(), '#details', 1) for li in details_div.getiterator('li'): span = li.find('span') name = span.text.strip() diff --git a/weboob/backends/youtube/pages.py b/weboob/backends/youtube/pages.py index 98586b40..9b47f18b 100644 --- a/weboob/backends/youtube/pages.py +++ b/weboob/backends/youtube/pages.py @@ -19,7 +19,7 @@ from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import select + __all__ = ['ForbiddenVideo', 'ForbiddenVideoPage', 'VerifyAgePage', 'VideoPage'] @@ -31,7 +31,7 @@ class ForbiddenVideo(Exception): class ForbiddenVideoPage(BasePage): def get_video(self, video=None): - element = select(self.document.getroot(), '.yt-alert-content', 1) + element = self.parser.select(self.document.getroot(), '.yt-alert-content', 1) raise ForbiddenVideo(element.text.strip()) diff --git a/weboob/tools/application/formatters/iformatter.py b/weboob/tools/application/formatters/iformatter.py index 9775a7a7..3041ee7e 100644 --- a/weboob/tools/application/formatters/iformatter.py +++ b/weboob/tools/application/formatters/iformatter.py @@ -25,7 +25,7 @@ import sys import subprocess if sys.platform == 'win32': import WConio - + try: import tty, termios except ImportError: diff --git a/weboob/tools/browser/__init__.py b/weboob/tools/browser/__init__.py index 7593d01d..8e177569 100644 --- a/weboob/tools/browser/__init__.py +++ b/weboob/tools/browser/__init__.py @@ -21,8 +21,8 @@ from weboob.tools.browser.browser import BrowserIncorrectPassword, BrowserBanned, \ 
BrowserUnavailable, BrowserRetry, \ BrowserHTTPNotFound, BrowserHTTPError, \ - BasePage, BaseBrowser + BasePage, BaseBrowser, BrokenPageError __all__ = ['BrowserIncorrectPassword', 'BrowserBanned', 'BrowserUnavailable', 'BrowserRetry', - 'BrowserHTTPNotFound', 'BrowserHTTPError', 'BasePage', 'BaseBrowser'] + 'BrowserHTTPNotFound', 'BrowserHTTPError', 'BasePage', 'BaseBrowser', 'BrokenPageError'] diff --git a/weboob/tools/browser/browser.py b/weboob/tools/browser/browser.py index 4a0cb697..f701e2f4 100644 --- a/weboob/tools/browser/browser.py +++ b/weboob/tools/browser/browser.py @@ -93,6 +93,8 @@ class NoHistory(object): def close(self): pass +class BrokenPageError(Exception): + pass class BasePage(object): """ @@ -100,6 +102,7 @@ class BasePage(object): """ def __init__(self, browser, document, url='', groups=None, group_dict=None, logger=None): self.browser = browser + self.parser = browser.parser self.document = document self.url = url self.groups = groups diff --git a/weboob/tools/genericArticle.py b/weboob/tools/genericArticle.py index 5aca640c..0140d8da 100644 --- a/weboob/tools/genericArticle.py +++ b/weboob/tools/genericArticle.py @@ -16,32 +16,33 @@ # # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . 
+ from weboob.tools.browser import BasePage -from weboob.tools.parsers.lxmlparser import select, SelectElementException +from weboob.tools.browser import BrokenPageError from lxml.etree import Comment -def try_remove(base_element, selector): +def try_remove(parser, base_element, selector): try : - base_element.remove(select(base_element, selector, 1 )) - except (SelectElementException, ValueError): + base_element.remove(parser.select(base_element, selector, 1 )) + except (BrokenPageError, ValueError): pass -def try_drop_tree(base_element, selector): +def try_drop_tree(parser, base_element, selector): try: - select(base_element, selector, 1).drop_tree() - except SelectElementException: + parser.select(base_element, selector, 1).drop_tree() + except BrokenPageError: pass -def remove_from_selector_list(base_element, selector_list): +def remove_from_selector_list(parser, base_element, selector_list): for selector in selector_list: - base_element.remove(select(base_element, selector, 1)) + base_element.remove(parser.select(base_element, selector, 1)) -def try_remove_from_selector_list(base_element, selector_list): +def try_remove_from_selector_list(parser, base_element, selector_list): for selector in selector_list: - try_remove(base_element, selector) + try_remove(parser, base_element, selector) def drop_comments(base_element): for comment in base_element.getiterator(Comment): @@ -49,13 +50,13 @@ def drop_comments(base_element): -class NoAuthorElement(SelectElementException): +class NoAuthorElement(BrokenPageError): pass -class NoBodyElement(SelectElementException): +class NoBodyElement(BrokenPageError): pass -class NoTitleException(SelectElementException): +class NoTitleException(BrokenPageError): pass class NoneMainDiv(AttributeError): @@ -75,13 +76,13 @@ class Article(object): class GenericNewsPage(BasePage): __element_body = NotImplementedError __article = Article - element_title_selector = NotImplementedError + element_title_selector = NotImplementedError main_div 
= NotImplementedError element_body_selector = NotImplementedError element_author_selector = NotImplementedError def get_body(self): - return self.browser.parser.tostring(self.get_element_body()) + return self.parser.tostring(self.get_element_body()) def get_author(self): try: @@ -92,7 +93,7 @@ class GenericNewsPage(BasePage): def get_title(self): try : - return select( + return self.parser.select( self.main_div, self.element_title_selector, 1).text_content().strip() @@ -102,17 +103,17 @@ class GenericNewsPage(BasePage): return self.__article.title else: raise - except SelectElementException: + except BrokenPageError: try : self.element_title_selector = "h1" return self.get_title() - except SelectElementException: + except BrokenPageError: raise NoTitleException("no title on %s" % (self.browser)) def get_element_body(self): try : - return select(self.main_div, self.element_body_selector, 1) - except SelectElementException: + return self.parser.select(self.main_div, self.element_body_selector, 1) + except BrokenPageError: raise NoBodyElement("no body on %s" % (self.browser)) except AttributeError: if self.main_div == None: @@ -122,8 +123,8 @@ class GenericNewsPage(BasePage): def get_element_author(self): try: - return select(self.main_div, self.element_author_selector, 1) - except SelectElementException: + return self.parser.select(self.main_div, self.element_author_selector, 1) + except BrokenPageError: raise NoAuthorElement() except AttributeError: if self.main_div == None: diff --git a/weboob/tools/parsers/lxmlparser.py b/weboob/tools/parsers/lxmlparser.py index 0174f9aa..84dc6a11 100644 --- a/weboob/tools/parsers/lxmlparser.py +++ b/weboob/tools/parsers/lxmlparser.py @@ -21,50 +21,10 @@ import lxml.html from .iparser import IParser +from ..browser import BrokenPageError -__all__ = ['LxmlHtmlParser', 'select', 'SelectElementException'] - - -class SelectElementException(Exception): - pass - - -def select(element, selector, nb=None, method='cssselect'): - """ - 
Select one or many elements from an element, using lxml cssselect by default. - - Raises SelectElementException if not found. - - @param element [obj] element on which to apply selector - @param selector [str] CSS or XPath expression - @param method [str] (cssselect|xpath) - @param nb [int] number of elements expected to be found. - Use None for undefined number, and 'many' for 1 to infinite. - @return one or many Element - """ - if method == 'cssselect': - results = element.cssselect(selector) - if nb is None: - return results - elif isinstance(nb, basestring) and nb == 'many': - if results is None or len(results) == 0: - raise SelectElementException('Element not found with selector "%s"' % selector) - elif len(results) == 1: - raise SelectElementException('Only one element found with selector "%s"' % selector) - else: - return results - elif isinstance(nb, int) and nb > 0: - if results is None: - raise SelectElementException('Element not found with selector "%s"' % selector) - elif len(results) < nb: - raise SelectElementException('Not enough elements found (%d expected) with selector "%s"' % (nb, selector)) - else: - return results[0] if nb == 1 else results - else: - raise Exception('Unhandled value for kwarg "nb": %s' % nb) - else: - raise NotImplementedError('Only cssselect method is implemented for the moment') +__all__ = ['LxmlHtmlParser'] class LxmlHtmlParser(IParser): @@ -83,3 +43,40 @@ class LxmlHtmlParser(IParser): def tostring(self, element): return lxml.html.tostring(element, encoding=unicode) + + @classmethod + def select(cls, element, selector, nb=None, method='cssselect'): + """ + Select one or many elements from an element, using lxml cssselect by default. + + Raises BrokenPageError if not found. + + @param element [obj] element on which to apply selector + @param selector [str] CSS or XPath expression + @param method [str] (cssselect|xpath) + @param nb [int] number of elements expected to be found. 
+ Use None for undefined number, and 'many' for 2 to infinite. + @return one or many Element + """ + if method == 'cssselect': + results = element.cssselect(selector) + if nb is None: + return results + elif isinstance(nb, basestring) and nb == 'many': + if results is None or len(results) == 0: + raise BrokenPageError('Element not found with selector "%s"' % selector) + elif len(results) == 1: + raise BrokenPageError('Only one element found with selector "%s"' % selector) + else: + return results + elif isinstance(nb, int) and nb > 0: + if results is None: + raise BrokenPageError('Element not found with selector "%s"' % selector) + elif len(results) < nb: + raise BrokenPageError('Not enough elements found (%d expected) with selector "%s"' % (nb, selector)) + else: + return results[0] if nb == 1 else results + else: + raise Exception('Unhandled value for kwarg "nb": %s' % nb) + else: + raise NotImplementedError('Only cssselect method is implemented for the moment')