Use clean_relative_urls

This commit is contained in:
Florent 2013-01-24 22:21:12 +01:00
commit fcd8432045

View file

@ -19,7 +19,7 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.capabilities.messages.genericArticle import GenericNewsPage,\
try_drop_tree
try_drop_tree, clean_relativ_urls
class ArticlePage(GenericNewsPage):
@ -33,15 +33,7 @@ class ArticlePage(GenericNewsPage):
def get_body(self):
# Serialise the article body: the <div class="sectbody"> element of the
# parsed document, returned through self.parser.tostring().
#
# NOTE(review): this span comes from a diff view whose +/- markers and
# indentation were lost. The hunk header "-33,15 +33,7" suggests that the
# two manual loops below are the pre-change lines and that the single
# clean_relativ_urls(...) call is their replacement -- confirm against the
# repository before treating both as live code.
div = self.document.getroot().find('.//div[@class="sectbody"]')
# Presumably removes the "div.anchor" nodes from the body; try_drop_tree is
# imported from genericArticle and its exact behaviour is not visible here.
try_drop_tree(self.parser, div, "div.anchor")
# Manual link rewriting: any href that does not start with "http://" gets
# "http://taz.de/" prepended. Note that "https://" URLs fail this 7-character
# test too and would therefore be prefixed as well.
for a in div.findall('.//a'):
try:
if a.attrib["href"][0:7] != "http://":
a.attrib["href"] = "http://taz.de/" + a.attrib["href"]
# Bare except: an <a> without an href (KeyError) is skipped, but so is any
# other error raised inside the try body.
except:
continue
# Same prefixing for image sources. Unlike the loop above there is no
# try/except, so an <img> without a "src" attribute raises KeyError.
for img in div.findall('.//img'):
if img.attrib["src"][0:7] != "http://":
img.attrib["src"] = "http://taz.de/" + img.attrib["src"]
# Helper-based URL rewriting against the site root (no trailing slash is
# passed here); presumably covers what the loops above did by hand -- the
# helper lives in genericArticle, verify its handling of <a> and <img> there.
clean_relativ_urls(div, "http://taz.de")
return self.parser.tostring(div)