Fix piratebay

Move to the .se domain, use plain http instead of https for download links, and pick the correct download link on the torrent page. Also includes some cosmetic (whitespace/PEP 8) enhancements.
2012-02-05 16:40:43 +01:00 · 2012-02-05 16:40:43 +01:00 · 988542463d
commit 988542463d
parent 664e4cc094
4 changed files with 30 additions and 28 deletions
--- a/modules/piratebay/backend.py
+++ b/modules/piratebay/backend.py
@ -49,4 +49,4 @@ class PiratebayBackend(BaseBackend, ICapTorrent):
        return self.browser.openurl(torrent.url.encode('utf-8')).read()

    def iter_torrents(self, pattern):
-        return self.browser.iter_torrents(pattern.replace(' ','+'))
+        return self.browser.iter_torrents(pattern.replace(' ', '+'))
--- a/modules/piratebay/browser.py
+++ b/modules/piratebay/browser.py
@ -30,26 +30,26 @@ __all__ = ['PiratebayBrowser']


 class PiratebayBrowser(BaseBrowser):
-    DOMAIN = 'thepiratebay.org'
+    DOMAIN = 'thepiratebay.se'
    PROTOCOL = 'https'
    ENCODING = 'utf-8'
    USER_AGENT = BaseBrowser.USER_AGENTS['wget']
-    PAGES = {'https://thepiratebay.org' : IndexPage,
-             'https://thepiratebay.org/search/.*/0/7/0' : TorrentsPage,
-             'https://thepiratebay.org/torrent/.*' : TorrentPage
+    PAGES = {'https://thepiratebay.se': IndexPage,
+             'https://thepiratebay.se/search/.*/0/7/0': TorrentsPage,
+             'https://thepiratebay.se/torrent/.*': TorrentPage
             }

    def home(self):
-        return self.location('https://thepiratebay.org')
+        return self.location('https://thepiratebay.se')

    def iter_torrents(self, pattern):
-        self.location('https://thepiratebay.org/search/%s/0/7/0' % urllib.quote_plus(pattern.encode('utf-8')))
+        self.location('https://thepiratebay.se/search/%s/0/7/0' % urllib.quote_plus(pattern.encode('utf-8')))

        assert self.is_on_page(TorrentsPage)
        return self.page.iter_torrents()

    def get_torrent(self, id):
-        self.location('https://thepiratebay.org/torrent/%s/' % id)
+        self.location('https://thepiratebay.se/torrent/%s/' % id)

        assert self.is_on_page(TorrentPage)
        return self.page.get_torrent(id)
--- a/modules/piratebay/pages/index.py
+++ b/modules/piratebay/pages/index.py
@ -27,4 +27,3 @@ __all__ = ['IndexPage']
 class IndexPage(BasePage):
    def is_logged(self):
        return 'id' in self.document.find('body').attrib
-
--- a/modules/piratebay/pages/torrents.py
+++ b/modules/piratebay/pages/torrents.py
@ -18,7 +18,6 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.


-
 from weboob.tools.browser import BasePage
 from weboob.capabilities.torrent import Torrent

@ -30,17 +29,16 @@ class TorrentsPage(BasePage):
    def unit(self, n, u):
        m = {'B': 1,
                'KB': 1024,
-                'MB': 1024*1024,
-                'GB': 1024*1024*1024,
-                'TB': 1024*1024*1024*1024,
+                'MB': 1024 * 1024,
+                'GB': 1024 * 1024 * 1024,
+                'TB': 1024 * 1024 * 1024 * 1024,
                }
-        #return float(n.replace(',', '')) * m.get(u, 1)
-        return float(n*m[u])
+        return float(n * m[u])

    def iter_torrents(self):
        table = self.parser.select(self.document.getroot(), 'table#searchResult', 1)
        for tr in table.getiterator('tr'):
-            if tr.get('class','') != "header":
+            if tr.get('class', '') != "header":
                td = tr.getchildren()[1]
                div = td.getchildren()[0]
                link = div.find('a').attrib['href']
@ -51,7 +49,7 @@ class TorrentsPage(BasePage):
                url = a.attrib['href']

                size = td.find('font').text.split(',')[1].strip()
-                u = size.split(' ')[1].split(u'\xa0')[1].replace('i','')
+                u = size.split(' ')[1].split(u'\xa0')[1].replace('i', '')
                size = size.split(' ')[1].split(u'\xa0')[0]

                seed = tr.getchildren()[2].text
@ -60,42 +58,47 @@ class TorrentsPage(BasePage):
                torrent = Torrent(idt,
                                  title,
                                  url=url,
-                                  size=self.unit(float(size),u),
+                                  size=self.unit(float(size), u),
                                  seeders=int(seed),
                                  leechers=int(leech))
                yield torrent

+
 class TorrentPage(BasePage):
    def get_torrent(self, id):
        for div in self.document.getiterator('div'):
-            if div.attrib.get('id','') == 'title':
+            if div.attrib.get('id', '') == 'title':
                title = div.text.strip()
-            elif div.attrib.get('class','') == 'download':
-                url = div.getchildren()[0].attrib.get('href','')
-            elif div.attrib.get('id','') == 'details':
+            elif div.attrib.get('class', '') == 'download':
+                # the last link is now the one with http
+                url = self.parser.select(div, 'a')[-1].attrib.get('href', '')
+                # https fails on the download server, so downgrade the link to plain http
+                if url.startswith('https://'):
+                    url = url.replace('https://', 'http://', 1)
+            elif div.attrib.get('id', '') == 'details':
                size = float(div.getchildren()[0].getchildren()[5].text.split('(')[1].split('Bytes')[0])
                if len(div.getchildren()) > 1 \
-                        and div.getchildren()[1].attrib.get('class','') == 'col2' :
+                and div.getchildren()[1].attrib.get('class', '') == 'col2':
                    child_to_explore = div.getchildren()[1]
                else:
                    child_to_explore = div.getchildren()[0]
                prev_child_txt = "none"
-                seed="-1"
-                leech="-1"
+                seed = "-1"
+                leech = "-1"
                for ch in child_to_explore.getchildren():
                    if prev_child_txt == "Seeders:":
                        seed = ch.text
                    if prev_child_txt == "Leechers:":
                        leech = ch.text
                    prev_child_txt = ch.text
-            elif div.attrib.get('class','') == 'nfo':
+            elif div.attrib.get('class', '') == 'nfo':
                description = div.getchildren()[0].text
        torrent = Torrent(id, title)
-        torrent.url = url
+        torrent.url = url or None
        torrent.size = size
        torrent.seeders = int(seed)
        torrent.leechers = int(leech)
-        torrent.description = description
+        torrent.description = description.strip()
        torrent.files = ['NYI']

        return torrent