From 3cb78d1729a8ed01062d48e01dc92d3d3d3ca46d Mon Sep 17 00:00:00 2001 From: Florent Date: Wed, 7 Nov 2012 18:13:03 +0100 Subject: [PATCH] Add clean_relativ_urls in genericArticle.py and use it for ecrans.fr --- modules/ecrans/pages/article.py | 10 ++-------- weboob/tools/capabilities/messages/genericArticle.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/modules/ecrans/pages/article.py b/modules/ecrans/pages/article.py index 65df62a5..a29635cf 100644 --- a/modules/ecrans/pages/article.py +++ b/modules/ecrans/pages/article.py @@ -18,7 +18,7 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . -from weboob.tools.capabilities.messages.genericArticle import GenericNewsPage, remove_from_selector_list, try_remove_from_selector_list, try_drop_tree +from weboob.tools.capabilities.messages.genericArticle import GenericNewsPage, remove_from_selector_list, try_remove_from_selector_list, try_drop_tree, clean_relativ_urls class ArticlePage(GenericNewsPage): @@ -34,12 +34,6 @@ class ArticlePage(GenericNewsPage): remove_from_selector_list(self.parser, element_body, ["p.auteur", "h4"]) try_remove_from_selector_list(self.parser, element_body, ["p.tag", "div.alire", self.element_title_selector, "h4"]) try_drop_tree(self.parser, element_body, "script") - for a in element_body.findall('.//a'): - if a.attrib["href"][0:7] != "http://": - a.attrib["href"] = "http://ecrans.fr/" + a.attrib["href"] - for img in element_body.xpath('.//img'): - if img.attrib["src"][0:7] != "http://": - img.attrib["src"] = "http://ecrans.fr/" + img.attrib["src"] - + clean_relativ_urls(element_body, "http://ecrans.fr") return self.parser.tostring(element_body) diff --git a/weboob/tools/capabilities/messages/genericArticle.py b/weboob/tools/capabilities/messages/genericArticle.py index 2a29e3c7..aab15141 100644 --- a/weboob/tools/capabilities/messages/genericArticle.py +++ 
b/weboob/tools/capabilities/messages/genericArticle.py
@@ -48,6 +48,23 @@ def drop_comments(base_element):
     for comment in base_element.getiterator(Comment):
         comment.drop_tree()
 
+# Replace relative urls in links and images with complete urls.
+# Arguments: the html element to clean (modified in place), and the domain
+# name with its scheme prefix, e.g. "http://ecrans.fr" (trailing slash optional).
+# Urls already starting with http:// or https:// are kept as is; elements
+# lacking the href/src attribute are skipped. Other schemes (mailto:, ...) are
+# not recognised and get prefixed like any relative url.
+def clean_relativ_urls(base_element, domain):
+    absolute_prefixes = ("http://", "https://")
+    # normalise so that domain and url are always joined by exactly one slash
+    base = domain.rstrip("/") + "/"
+    for tag, attr in (("a", "href"), ("img", "src")):
+        for element in base_element.findall(".//" + tag):
+            url = element.attrib.get(attr)
+            if url is None or url.startswith(absolute_prefixes):
+                continue
+            element.attrib[attr] = base + url.lstrip("/")
+
 
 class NoAuthorElement(BrokenPageError):
     pass