fix gazelle to work with broken websites

This commit is contained in:
Romain Bignon 2011-05-08 11:38:27 +02:00
commit c8025bfe5b
2 changed files with 61 additions and 51 deletions

View file

@@ -63,8 +63,9 @@ class GazelleBrowser(BaseBrowser):
assert self.is_on_page(TorrentsPage)
return self.page.iter_torrents()
def get_torrent(self, id):
self.location('/torrents.php?torrentid=%s' % id)
def get_torrent(self, fullid):
id, torrentid = fullid.split('.', 1)
self.location(self.buildurl('/torrents.php', id=id, torrentid=torrentid))
assert self.is_on_page(TorrentsPage)
return self.page.get_torrent(id)
return self.page.get_torrent(fullid)

View file

@@ -19,9 +19,10 @@
import re
import urlparse
from logging import warning, debug
from weboob.tools.misc import html2text
from weboob.tools.misc import html2text, get_bytes_size
from weboob.tools.browser import BasePage
from weboob.capabilities.torrent import Torrent
from weboob.capabilities.base import NotLoaded
@@ -32,14 +33,6 @@ __all__ = ['TorrentsPage']
class TorrentsPage(BasePage):
TORRENTID_REGEXP = re.compile('torrents\.php\?action=download&id=(\d+)')
def unit(self, n, u):
    """Convert a size string and unit label to a byte count.

    `n` is the numeric part (may contain thousands separators, e.g.
    '1,234.5'), `u` the unit label ('KB', 'MB', 'GB', 'TB').  An
    unrecognised unit falls back to a multiplier of 1 (plain bytes).
    Returns a float.
    """
    multipliers = {
        'KB': 1 << 10,
        'MB': 1 << 20,
        'GB': 1 << 30,
        'TB': 1 << 40,
    }
    value = float(n.replace(',', ''))
    return value * multipliers.get(u, 1)
def format_url(self, url):
return '%s://%s/%s' % (self.browser.PROTOCOL,
self.browser.DOMAIN,
@@ -89,11 +82,9 @@ class TorrentsPage(BasePage):
title += u' (%s)' % tds[i].find('a').text
else:
title = tds[i].find('a').text
url = tds[i].find('span').find('a').attrib['href']
id = self.TORRENTID_REGEXP.match(url)
if not id:
continue
id = id.group(1)
url = urlparse.urlparse(tds[i].find('a').attrib['href'])
params = urlparse.parse_qs(url.query)
id = '%s.%s' % (params['id'][0], params['torrentid'][0])
size = self.unit(*tds[i+3].text.split())
seeders = int(tds[-2].text)
leechers = int(tds[-1].text)
@@ -109,73 +100,91 @@ class TorrentsPage(BasePage):
debug('unknown attrib: %s' % tr.attrib)
def get_torrent(self, id):
table = self.document.getroot().cssselect('div.thin')
if not table:
warning('No div.thin found')
return None
table = self.browser.parser.select(self.document.getroot(), 'div.thin', 1)
h2 = table[0].find('h2')
title = h2.text or ''
if h2.find('a') != None:
title += h2.find('a').text + h2.find('a').tail
h2 = table.find('h2')
if h2 is not None:
title = h2.text or ''
if h2.find('a') != None:
title += h2.find('a').text + h2.find('a').tail
else:
title = self.browser.parser.select(table, 'div.title_text', 1).text
torrent = Torrent(id, title)
table = self.document.getroot().cssselect('table.torrent_table')
if not table:
warning('No table found')
return None
torrentid = id.split('.', 1)[1]
table = self.browser.parser.select(self.document.getroot(), 'table.torrent_table')
if len(table) == 0:
table = self.browser.parser.select(self.document.getroot(), 'div.main_column', 1)
is_table = False
else:
table = table[0]
is_table = True
for tr in table[0].findall('tr'):
if tr.attrib.get('class', '').startswith('group_torrent'):
for tr in table.findall('tr' if is_table else 'div'):
if is_table and tr.attrib.get('class', '').startswith('group_torrent'):
tds = tr.findall('td')
if not len(tds) == 5:
continue
url = tds[0].find('span').find('a').attrib['href']
id = self.TORRENTID_REGEXP.match(url)
if not id:
m = self.TORRENTID_REGEXP.match(url)
if not m:
warning('ID not found')
continue
id = id.group(1)
if id != torrent.id:
if m.group(1) != torrentid:
continue
torrent.url = self.format_url(url)
torrent.size = self.unit(*tds[1].text.split())
size, unit = tds[1].text.split()
torrent.size = get_bytes_size(float(size.replace(',', '')), unit)
torrent.seeders = int(tds[3].text)
torrent.leechers = int(tds[4].text)
break
elif not is_table and tr.attrib.get('class', '').startswith('torrent_widget') and \
tr.attrib.get('class', '').endswith('pad'):
url = tr.cssselect('a[title=Download]')[0].attrib['href']
m = self.TORRENTID_REGEXP.match(url)
if not m:
warning('ID not found')
continue
if m.group(1) != torrentid:
print
continue
torrent.url = self.format_url(url)
size, unit = tr.cssselect('div.details_title strong')[0].text.strip('()').split()
torrent.size = get_bytes_size(float(size), unit)
torrent.seeders = int(tr.cssselect('img[title=Seeders]')[0].tail)
torrent.leechers = int(tr.cssselect('img[title=Leechers]')[0].tail)
break
if not torrent.url:
warning('Torrent %d not found in list' % torrent.id)
warning('Torrent %d not found in list' % torrentid)
return None
div = self.document.getroot().cssselect('div.main_column')
if not div:
warning('WTF')
return None
for box in div[0].cssselect('div.box'):
div = self.parser.select(self.document.getroot(), 'div.main_column', 1)
for box in div.cssselect('div.box'):
title = None
body = None
title_t = box.cssselect('div.head')
if title_t:
title = title_t[0].find('strong').text.strip()
body_t = box.cssselect('div.body')
if len(title_t) > 0:
title_t = title_t[0]
if title_t.find('strong') is not None:
title_t = title_t.find('strong')
title = title_t.text.strip()
body_t = box.cssselect('div.body,div.desc')
if body_t:
body = html2text(self.parser.tostring(body_t[0])).strip()
body = html2text(self.parser.tostring(body_t[-1])).strip()
if title and body:
if torrent.description is NotLoaded:
torrent.description = u''
torrent.description += u'%s\n\n%s\n' % (title, body)
div = self.document.getroot().cssselect('div#files_%s' % torrent.id)
div = self.document.getroot().cssselect('div#files_%s,div#filelist_%s' % (torrentid, torrentid))
if div:
torrent.files = []
for tr in div[0].find('table'):