new calendar module: sueurdemetal
This commit is contained in:
parent
299c0369e6
commit
47d4105d8e
6 changed files with 341 additions and 0 deletions
124
modules/sueurdemetal/pages.py
Normal file
124
modules/sueurdemetal/pages.py
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright(C) 2013 Vincent A
|
||||
#
|
||||
# This file is part of weboob.
|
||||
#
|
||||
# weboob is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# weboob is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU Affero General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
from weboob.tools.browser import BasePage
|
||||
from weboob.tools.date import parse_french_date
|
||||
import re
|
||||
|
||||
|
||||
__all__ = ['PageCity', 'PageConcert', 'PageCityList']
|
||||
|
||||
|
||||
class PageWithConcerts(BasePage):
    """Shared scraping helpers for sueurdemetal.com pages that contain
    concert tables."""

    def extract_concert(self, concert_table):
        """Extract a concert dict from a concert <table> element.

        Returns a dict with keys: 'date', 'active', 'performers',
        'summary' and 'description'.
        """
        d = {}
        # The first <h3> of the table holds the concert date, in French.
        date_h3 = next(concert_table.iter('h3'))
        d['date'] = parse_french_date(date_h3.text)

        # A sibling <h2> whose text starts with "ANNUL" (ANNULE /
        # ANNULATION) flags a cancelled concert.
        cancel_h2 = next(date_h3.itersiblings('h2'), None)
        if cancel_h2 is not None and cancel_h2.text.startswith('ANNUL'):
            d['active'] = False
        else:
            d['active'] = True

        # The first nested <table> lists the bands playing.
        performers_table = next(concert_table.iterdescendants('table'))
        d['performers'] = list(self.extract_performers(performers_table))
        d['summary'] = ' + '.join(p['name'] for p in d['performers'])
        d['description'] = d['summary']

        return d

    def extract_performers(self, performers_table):
        """Yield one {'name': ..., 'genre': ...} dict per <tr> of the
        performers table ('genre' is present only when the page gives one)."""
        for performer_tr in performers_table.findall('tr'):
            performer_td = performer_tr.find('td')
            d = {'name': performer_td.find('strong').text.strip(' \t\r\n+')}  # handle '+ GUESTS'
            rest = performer_td.tail
            if rest:
                d['genre'] = rest
            yield d

    def extract_id_from_url(self, url):
        """Return the numeric concert id from a detail-concert URL
        (the value of its "c=" query parameter)."""
        return re.search(r'c=(\d+)', url).group(1)

    def extract_city_from_url(self, url):
        """Return the city slug from a "...metal-<slug>.htm" URL.

        BUG FIX: the match result was computed but never returned, so this
        method always returned None.
        """
        return re.search('metal-(.+).htm$', url).group(1)
|
||||
|
||||
|
||||
class PageCity(PageWithConcerts):
    """Page listing every concert scheduled in one city."""

    def get_concerts(self):
        """Yield one concert dict per concert table found on the page."""
        tables = self.document.xpath('//div[@id="centre-page"]//div/table')
        for table in tables:
            yield self.extract_concert(table)

    def extract_concert(self, concert_table):
        """Extend the base extraction with the concert id, its detail-page
        URL and the id of the city this page covers."""
        concert = PageWithConcerts.extract_concert(self, concert_table)
        detail_links = concert_table.xpath('.//a[starts-with(@href, "detail-concert-metal.php")]')
        concert['id'] = self.extract_id_from_url(detail_links[0].get('href'))
        concert['url'] = 'http://www.sueurdemetal.com/detail-concert-metal.php?c=%s' % concert['id']
        concert['city_id'] = self.extract_city_from_url(self.url)
        return concert
|
||||
|
||||
|
||||
class PageConcert(PageWithConcerts):
    """Detail page for a single concert."""

    def get_concert(self):
        """Return a fully populated concert dict for this page.

        Adds 'id', 'url', 'room', 'address', 'city_id', 'city' and,
        when the page gives one, 'price' to the base extraction.
        """
        concert_table = self.document.xpath('//div[@id="centre-page"]//div/table')[0]
        d = self.extract_concert(concert_table)
        d['id'] = self.extract_id_from_url(self.url)
        d['url'] = self.url

        # First nested table holds the performers (already handled by
        # extract_concert); the second holds venue information.
        # Use the next() builtin instead of it.next() for consistency with
        # the rest of the module and Python 3 compatibility.
        it = concert_table.iterdescendants('table')
        next(it)  # ignore performers table
        infos_table = next(it)
        self.infos_table = infos_table
        info_trs = infos_table.findall('tr')
        d['room'] = (info_trs[3].findall('td')[1].text or '').strip()
        d['address'] = (info_trs[4].findall('td')[1].text or '').strip()

        price = self.parse_price(info_trs[5].findall('td')[1].text)
        if price is not None:  # "None" is different from "0€"
            d['price'] = price

        city_a = self.document.xpath('//a[starts-with(@href, "ville-metal-")]')[0]
        d['city_id'] = self.extract_city_from_url(city_a.get('href'))
        d['city'] = city_a.text
        return d

    def parse_price(self, s):
        """Parse the last number of a price string as a float.

        Returns None when no number is found — callers rely on None being
        distinct from 0.0 ("free").
        """
        if not s:
            return None
        # List comprehension instead of filter(None, ...): filter() returns
        # a non-subscriptable iterator under Python 3, so parts[-1] below
        # would break there.
        parts = [p for p in re.split(r'[^\d.]+', s.strip()) if p]
        if not parts:
            return None
        return float(parts[-1])
|
||||
|
||||
|
||||
class PageCityList(BasePage):
    """Page containing the <select> of all cities that have concerts."""

    def get_cities(self):
        """Return a dict mapping each city name to its info dict
        (keys: 'code', 'dept', 'id', 'name')."""
        cities = {}
        options = self.document.xpath('//select[@name="ville"]/option')
        for option in options:
            value = option.get('value')
            if not value:
                # Placeholder <option> without a target URL.
                continue
            code, dept = re.search(r'ville-metal-(.*)-(\d+).htm$', value).groups()
            city = {
                'code': code,
                'dept': dept,
                'id': '%s-%s' % (code, dept),
                'name': option.text.split('(')[0].strip(),
            }
            cities[city['name']] = city
        return cities
|
||||
Loading…
Add table
Add a link
Reference in a new issue