fix gazelle to work with broken websites

This commit is contained in:
Romain Bignon 2011-05-08 11:38:27 +02:00
commit c8025bfe5b
2 changed files with 61 additions and 51 deletions

View file

@@ -63,8 +63,9 @@ class GazelleBrowser(BaseBrowser):
assert self.is_on_page(TorrentsPage)
return self.page.iter_torrents()
def get_torrent(self, id):
self.location('/torrents.php?torrentid=%s' % id)
def get_torrent(self, fullid):
id, torrentid = fullid.split('.', 1)
self.location(self.buildurl('/torrents.php', id=id, torrentid=torrentid))
assert self.is_on_page(TorrentsPage)
return self.page.get_torrent(id)
return self.page.get_torrent(fullid)

View file

@@ -19,9 +19,10 @@
import re
import urlparse
from logging import warning, debug
from weboob.tools.misc import html2text
from weboob.tools.misc import html2text, get_bytes_size
from weboob.tools.browser import BasePage
from weboob.capabilities.torrent import Torrent
from weboob.capabilities.base import NotLoaded
@@ -32,14 +33,6 @@ __all__ = ['TorrentsPage']
class TorrentsPage(BasePage):
TORRENTID_REGEXP = re.compile('torrents\.php\?action=download&id=(\d+)')
def unit(self, n, u):
    """Convert a size string and unit label to a byte count.

    `n` is the numeric part (may contain thousands separators, e.g.
    '1,234.5'), `u` the unit label ('KB', 'MB', 'GB', 'TB').  An
    unrecognised unit falls back to a multiplier of 1 (plain bytes).
    Returns a float.
    """
    multipliers = {
        'KB': 1 << 10,
        'MB': 1 << 20,
        'GB': 1 << 30,
        'TB': 1 << 40,
    }
    value = float(n.replace(',', ''))
    return value * multipliers.get(u, 1)
def format_url(self, url):
return '%s://%s/%s' % (self.browser.PROTOCOL,
self.browser.DOMAIN,
@@ -89,11 +82,9 @@ class TorrentsPage(BasePage):
title += u' (%s)' % tds[i].find('a').text
else:
title = tds[i].find('a').text
url = tds[i].find('span').find('a').attrib['href']
id = self.TORRENTID_REGEXP.match(url)
if not id:
continue
id = id.group(1)
url = urlparse.urlparse(tds[i].find('a').attrib['href'])
params = urlparse.parse_qs(url.query)
id = '%s.%s' % (params['id'][0], params['torrentid'][0])
size = self.unit(*tds[i+3].text.split())
seeders = int(tds[-2].text)
leechers = int(tds[-1].text)
@@ -109,73 +100,91 @@ class TorrentsPage(BasePage):
debug('unknown attrib: %s' % tr.attrib)
def get_torrent(self, id):
table = self.document.getroot().cssselect('div.thin')
if not table:
warning('No div.thin found')
return None
table = self.browser.parser.select(self.document.getroot(), 'div.thin', 1)
h2 = table[0].find('h2')
title = h2.text or ''
if h2.find('a') != None:
title += h2.find('a').text + h2.find('a').tail
h2 = table.find('h2')
if h2 is not None:
title = h2.text or ''
if h2.find('a') != None:
title += h2.find('a').text + h2.find('a').tail
else:
title = self.browser.parser.select(table, 'div.title_text', 1).text
torrent = Torrent(id, title)
table = self.document.getroot().cssselect('table.torrent_table')
if not table:
warning('No table found')
return None
torrentid = id.split('.', 1)[1]
table = self.browser.parser.select(self.document.getroot(), 'table.torrent_table')
if len(table) == 0:
table = self.browser.parser.select(self.document.getroot(), 'div.main_column', 1)
is_table = False
else:
table = table[0]
is_table = True
for tr in table[0].findall('tr'):
if tr.attrib.get('class', '').startswith('group_torrent'):
for tr in table.findall('tr' if is_table else 'div'):
if is_table and tr.attrib.get('class', '').startswith('group_torrent'):
tds = tr.findall('td')
if not len(tds) == 5:
continue
url = tds[0].find('span').find('a').attrib['href']
id = self.TORRENTID_REGEXP.match(url)
if not id:
m = self.TORRENTID_REGEXP.match(url)
if not m:
warning('ID not found')
continue
id = id.group(1)
if id != torrent.id:
if m.group(1) != torrentid:
continue
torrent.url = self.format_url(url)
torrent.size = self.unit(*tds[1].text.split())
size, unit = tds[1].text.split()
torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
torrent.seeders = int(tds[3].text)
torrent.leechers = int(tds[4].text)
break
elif not is_table and tr.attrib.get('class', '').startswith('torrent_widget') and \
tr.attrib.get('class', '').endswith('pad'):
url = tr.cssselect('a[title=Download]')[0].attrib['href']
m = self.TORRENTID_REGEXP.match(url)
if not m:
warning('ID not found')
continue
if m.group(1) != torrentid:
print
continue
torrent.url = self.format_url(url)
size, unit = tr.cssselect('div.details_title strong')[0].text.strip('()').split()
torrent.size = get_bytes_size(float(size), unit)
torrent.seeders = int(tr.cssselect('img[title=Seeders]')[0].tail)
torrent.leechers = int(tr.cssselect('img[title=Leechers]')[0].tail)
break
if not torrent.url:
warning('Torrent %d not found in list' % torrent.id)
warning('Torrent %d not found in list' % torrentid)
return None
div = self.document.getroot().cssselect('div.main_column')
if not div:
warning('WTF')
return None
for box in div[0].cssselect('div.box'):
div = self.parser.select(self.document.getroot(), 'div.main_column', 1)
for box in div.cssselect('div.box'):
title = None
body = None
title_t = box.cssselect('div.head')
if title_t:
title = title_t[0].find('strong').text.strip()
body_t = box.cssselect('div.body')
if len(title_t) > 0:
title_t = title_t[0]
if title_t.find('strong') is not None:
title_t = title_t.find('strong')
title = title_t.text.strip()
body_t = box.cssselect('div.body,div.desc')
if body_t:
body = html2text(self.parser.tostring(body_t[0])).strip()
body = html2text(self.parser.tostring(body_t[-1])).strip()
if title and body:
if torrent.description is NotLoaded:
torrent.description = u''
torrent.description += u'%s\n\n%s\n' % (title, body)
div = self.document.getroot().cssselect('div#files_%s' % torrent.id)
div = self.document.getroot().cssselect('div#files_%s,div#filelist_%s' % (torrentid, torrentid))
if div:
torrent.files = []
for tr in div[0].find('table'):