gazelle: identify torrents as "ID.TORRENTID" and parse div-based pages

- GazelleBrowser.get_torrent() takes the composite id and requests
  /torrents.php with both "id" and "torrentid" parameters.
- TorrentsPage drops its private unit() helper and relies on
  weboob.tools.misc.get_bytes_size() instead.
- TorrentsPage.get_torrent() falls back to the "div.main_column" /
  "torrent_widget" markup when the page has no "table.torrent_table".

diff --git a/weboob/backends/gazelle/browser.py b/weboob/backends/gazelle/browser.py
index a199e663..5c525530 100644
--- a/weboob/backends/gazelle/browser.py
+++ b/weboob/backends/gazelle/browser.py
@@ -63,8 +63,12 @@ class GazelleBrowser(BaseBrowser):
         assert self.is_on_page(TorrentsPage)
         return self.page.iter_torrents()
 
-    def get_torrent(self, id):
-        self.location('/torrents.php?torrentid=%s' % id)
+    def get_torrent(self, fullid):
+        """Return the torrent named by fullid ('ID.TORRENTID'), or None."""
+        if '.' not in fullid:
+            return None
+        id, torrentid = fullid.split('.', 1)
+        self.location(self.buildurl('/torrents.php', id=id, torrentid=torrentid))
 
         assert self.is_on_page(TorrentsPage)
-        return self.page.get_torrent(id)
+        return self.page.get_torrent(fullid)
diff --git a/weboob/backends/gazelle/pages/torrents.py b/weboob/backends/gazelle/pages/torrents.py
index 575efad2..d84b608b 100644
--- a/weboob/backends/gazelle/pages/torrents.py
+++ b/weboob/backends/gazelle/pages/torrents.py
@@ -19,9 +19,10 @@
 
 
 import re
+import urlparse
 from logging import warning, debug
 
-from weboob.tools.misc import html2text
+from weboob.tools.misc import html2text, get_bytes_size
 from weboob.tools.browser import BasePage
 from weboob.capabilities.torrent import Torrent
 from weboob.capabilities.base import NotLoaded
@@ -32,14 +33,6 @@ __all__ = ['TorrentsPage']
 class TorrentsPage(BasePage):
     TORRENTID_REGEXP = re.compile('torrents\.php\?action=download&id=(\d+)')
 
-    def unit(self, n, u):
-        m = {'KB': 1024,
-             'MB': 1024*1024,
-             'GB': 1024*1024*1024,
-             'TB': 1024*1024*1024*1024,
-            }
-        return float(n.replace(',', '')) * m.get(u, 1)
-
     def format_url(self, url):
         return '%s://%s/%s' % (self.browser.PROTOCOL,
                                self.browser.DOMAIN,
@@ -89,11 +82,10 @@ class TorrentsPage(BasePage):
                         title += u' (%s)' % tds[i].find('a').text
                     else:
                         title = tds[i].find('a').text
-                    url = tds[i].find('span').find('a').attrib['href']
-                    id = self.TORRENTID_REGEXP.match(url)
-                    if not id:
-                        continue
-                    id = id.group(1)
-                    size = self.unit(*tds[i+3].text.split())
+                    url = urlparse.urlparse(tds[i].find('a').attrib['href'])
+                    params = urlparse.parse_qs(url.query)
+                    id = '%s.%s' % (params['id'][0], params['torrentid'][0])
+                    size, unit = tds[i+3].text.split()
+                    size = get_bytes_size(float(size.replace(',', '')), unit)
                     seeders = int(tds[-2].text)
                     leechers = int(tds[-1].text)
@@ -109,73 +101,90 @@ class TorrentsPage(BasePage):
                     debug('unknown attrib: %s' % tr.attrib)
 
     def get_torrent(self, id):
-        table = self.document.getroot().cssselect('div.thin')
-        if not table:
-            warning('No div.thin found')
-            return None
+        table = self.browser.parser.select(self.document.getroot(), 'div.thin', 1)
 
-        h2 = table[0].find('h2')
-        title = h2.text or ''
-        if h2.find('a') != None:
-            title += h2.find('a').text + h2.find('a').tail
+        h2 = table.find('h2')
+        if h2 is not None:
+            title = h2.text or ''
+            if h2.find('a') is not None:
+                title += h2.find('a').text + h2.find('a').tail
+        else:
+            title = self.browser.parser.select(table, 'div.title_text', 1).text
 
         torrent = Torrent(id, title)
-        table = self.document.getroot().cssselect('table.torrent_table')
-        if not table:
-            warning('No table found')
-            return None
+        torrentid = id.split('.', 1)[1]
+        table = self.browser.parser.select(self.document.getroot(), 'table.torrent_table')
+        if len(table) == 0:
+            table = self.browser.parser.select(self.document.getroot(), 'div.main_column', 1)
+            is_table = False
+        else:
+            table = table[0]
+            is_table = True
 
-        for tr in table[0].findall('tr'):
-            if tr.attrib.get('class', '').startswith('group_torrent'):
+        for tr in table.findall('tr' if is_table else 'div'):
+            if is_table and tr.attrib.get('class', '').startswith('group_torrent'):
                 tds = tr.findall('td')
 
                 if not len(tds) == 5:
                     continue
 
                 url = tds[0].find('span').find('a').attrib['href']
-                id = self.TORRENTID_REGEXP.match(url)
-
-                if not id:
+                m = self.TORRENTID_REGEXP.match(url)
+                if not m:
                     warning('ID not found')
                     continue
-
-                id = id.group(1)
-
-                if id != torrent.id:
+                if m.group(1) != torrentid:
                     continue
 
                 torrent.url = self.format_url(url)
-                torrent.size = self.unit(*tds[1].text.split())
+                size, unit = tds[1].text.split()
+                torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
                 torrent.seeders = int(tds[3].text)
                 torrent.leechers = int(tds[4].text)
                 break
+            elif not is_table and tr.attrib.get('class', '').startswith('torrent_widget') and \
+                 tr.attrib.get('class', '').endswith('pad'):
+                url = tr.cssselect('a[title=Download]')[0].attrib['href']
+                m = self.TORRENTID_REGEXP.match(url)
+                if not m:
+                    warning('ID not found')
+                    continue
+                if m.group(1) != torrentid:
+                    continue
+
+                torrent.url = self.format_url(url)
+                size, unit = tr.cssselect('div.details_title strong')[0].text.strip('()').split()
+                torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
+                torrent.seeders = int(tr.cssselect('img[title=Seeders]')[0].tail)
+                torrent.leechers = int(tr.cssselect('img[title=Leechers]')[0].tail)
+                break
 
         if not torrent.url:
-            warning('Torrent %d not found in list' % torrent.id)
+            warning('Torrent %s not found in list' % torrentid)
             return None
 
-        div = self.document.getroot().cssselect('div.main_column')
-        if not div:
-            warning('WTF')
-            return None
-
-        for box in div[0].cssselect('div.box'):
+        div = self.browser.parser.select(self.document.getroot(), 'div.main_column', 1)
+        for box in div.cssselect('div.box'):
             title = None
             body = None
 
             title_t = box.cssselect('div.head')
-            if title_t:
-                title = title_t[0].find('strong').text.strip()
-            body_t = box.cssselect('div.body')
+            if len(title_t) > 0:
+                title_t = title_t[0]
+                if title_t.find('strong') is not None:
+                    title_t = title_t.find('strong')
+                title = title_t.text.strip()
+
+            body_t = box.cssselect('div.body,div.desc')
             if body_t:
-                body = html2text(self.parser.tostring(body_t[0])).strip()
+                body = html2text(self.parser.tostring(body_t[-1])).strip()
 
             if title and body:
                 if torrent.description is NotLoaded:
                     torrent.description = u''
                 torrent.description += u'%s\n\n%s\n' % (title, body)
 
-        div = self.document.getroot().cssselect('div#files_%s' % torrent.id)
+        div = self.document.getroot().cssselect('div#files_%s,div#filelist_%s' % (torrentid, torrentid))
         if div:
             torrent.files = []
             for tr in div[0].find('table'):