Use clean_relative_urls
This commit is contained in:
parent
b100f77f95
commit
fcd8432045
1 changed file with 2 additions and 10 deletions
|
|
@@ -19,7 +19,7 @@
|
||||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
from weboob.tools.capabilities.messages.genericArticle import GenericNewsPage,\
|
from weboob.tools.capabilities.messages.genericArticle import GenericNewsPage,\
|
||||||
try_drop_tree
|
try_drop_tree, clean_relativ_urls
|
||||||
|
|
||||||
|
|
||||||
class ArticlePage(GenericNewsPage):
|
class ArticlePage(GenericNewsPage):
|
||||||
|
|
@@ -33,15 +33,7 @@ class ArticlePage(GenericNewsPage):
|
||||||
def get_body(self):
|
def get_body(self):
|
||||||
div = self.document.getroot().find('.//div[@class="sectbody"]')
|
div = self.document.getroot().find('.//div[@class="sectbody"]')
|
||||||
try_drop_tree(self.parser, div, "div.anchor")
|
try_drop_tree(self.parser, div, "div.anchor")
|
||||||
for a in div.findall('.//a'):
|
clean_relativ_urls(div, "http://taz.de")
|
||||||
try:
|
|
||||||
if a.attrib["href"][0:7] != "http://":
|
|
||||||
a.attrib["href"] = "http://taz.de/" + a.attrib["href"]
|
|
||||||
except:
|
|
||||||
continue
|
|
||||||
for img in div.findall('.//img'):
|
|
||||||
if img.attrib["src"][0:7] != "http://":
|
|
||||||
img.attrib["src"] = "http://taz.de/" + img.attrib["src"]
|
|
||||||
|
|
||||||
return self.parser.tostring(div)
|
return self.parser.tostring(div)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue