# -*- coding: utf-8 -*-

# Copyright(C) 2013      Vincent A
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

import re

try:
    from urllib.parse import urljoin
except ImportError:  # Python 2
    from urlparse import urljoin

from weboob.tools.browser import BasePage
from weboob.tools.date import parse_french_date


__all__ = ['PageCity', 'PageConcert', 'PageCityList', 'PageDate', 'PageDates']


class PageWithConcerts(BasePage):
    """Base class for pages that embed one or more concert tables."""

    # XPath selecting the concert tables; each match is handed to
    # extract_concert() by the subclasses.
    CONCERT_TABLES_XPATH = '//div[@id="centre-page"]//div/table'

    def extract_concert(self, concert_table):
        """Build the common concert dict from a concert table element.

        The returned dict has the keys 'date', 'active', 'performers',
        'summary' and 'description'.

        Raises StopIteration if the table has no <h3> or no nested <table>.
        """
        d = {}

        date_h3 = next(concert_table.iter('h3'))
        d['date'] = parse_french_date(date_h3.text)

        # A sibling <h2> whose text starts with 'ANNUL' marks the concert as
        # inactive; an <h2> without text is treated as "not cancelled".
        cancel_h2 = next(date_h3.itersiblings('h2'), None)
        cancelled = (cancel_h2 is not None and
                     (cancel_h2.text or '').startswith('ANNUL'))
        d['active'] = not cancelled

        performers_table = next(concert_table.iterdescendants('table'))
        d['performers'] = list(self.extract_performers(performers_table))
        d['summary'] = ' + '.join(p['name'] for p in d['performers'])
        d['description'] = d['summary']
        return d

    def extract_performers(self, performers_table):
        """Yield one dict per <tr> of the performers table.

        Each dict has a 'name' key, plus a 'genre' key when the cell has
        non-empty tail text.
        """
        for performer_tr in performers_table.findall('tr'):
            performer_td = performer_tr.find('td')
            # '+' is stripped along with whitespace to handle '+ GUESTS'
            name = performer_td.find('strong').text.strip(' \t\r\n+')
            d = {'name': name}
            rest = performer_td.tail
            if rest:
                d['genre'] = rest
            yield d

    def extract_id_from_url(self, url):
        """Return the digits of the 'c=' parameter found in *url*.

        Raises AttributeError if *url* contains no such parameter.
        """
        return re.search(r'c=(\d+)', url).group(1)

    def extract_city_from_url(self, url):
        """Return the part of *url* between 'metal-' and the final '.htm'.

        Raises AttributeError if *url* does not match.
        """
        return re.search(r'metal-(.+)\.htm$', url).group(1)

    def extract_concert_link(self, concert_table, d):
        """Fill d['id'] and d['url'] from the table's detail link.

        *d* is modified in place. Raises IndexError if the table holds no
        link whose href starts with 'detail-concert-metal.php'.
        """
        infos_a = concert_table.xpath(
            './/a[starts-with(@href, "detail-concert-metal.php")]')[0]
        d['id'] = self.extract_id_from_url(infos_a.get('href'))
        d['url'] = 'http://www.sueurdemetal.com/detail-concert-metal.php?c=%s' % d['id']


class PageCity(PageWithConcerts):
    """Page listing concerts; the city id is read from the page URL."""

    def get_concerts(self):
        """Yield a concert dict for every concert table of the page."""
        for concert_table in self.document.xpath(self.CONCERT_TABLES_XPATH):
            yield self.extract_concert(concert_table)

    def extract_concert(self, concert_table):
        """Extend the base concert dict with 'id', 'url' and 'city_id'."""
        d = PageWithConcerts.extract_concert(self, concert_table)
        self.extract_concert_link(concert_table, d)
        d['city_id'] = self.extract_city_from_url(self.url)
        return d


class PageDate(PageWithConcerts):
    """Page listing concerts; the city id is read from a link in each table."""

    def get_concerts(self):
        """Yield a concert dict for every concert table of the page."""
        for concert_table in self.document.xpath(self.CONCERT_TABLES_XPATH):
            yield self.extract_concert(concert_table)

    def extract_concert(self, concert_table):
        """Extend the base concert dict with 'id', 'url' and 'city_id'.

        Raises IndexError if the table has no 'ville-metal-' link.
        """
        d = PageWithConcerts.extract_concert(self, concert_table)
        self.extract_concert_link(concert_table, d)
        city_a = concert_table.xpath('.//a[starts-with(@href, "ville-metal-")]')[0]
        d['city_id'] = self.extract_city_from_url(city_a.get('href'))
        return d


class PageConcert(PageWithConcerts):
    """Detail page of a single concert."""

    def get_concert(self):
        """Return the concert dict of the page.

        On top of the base keys, the dict carries 'id', 'url', 'room',
        'address', 'city_id', 'city' and, when a price could be parsed,
        'price'. The information table is also stored on
        ``self.infos_table``.
        """
        concert_table = self.document.xpath(self.CONCERT_TABLES_XPATH)[0]
        d = self.extract_concert(concert_table)
        d['id'] = self.extract_id_from_url(self.url)
        d['url'] = self.url

        nested_tables = concert_table.iterdescendants('table')
        next(nested_tables)  # ignore performers table
        infos_table = next(nested_tables)
        self.infos_table = infos_table

        # Rows are addressed by position: 3 = room, 4 = address, 5 = price;
        # the value sits in the second cell of each row.
        info_trs = infos_table.findall('tr')
        d['room'] = (info_trs[3].findall('td')[1].text or '').strip()
        d['address'] = (info_trs[4].findall('td')[1].text or '').strip()

        price = self.parse_price(info_trs[5].findall('td')[1].text)
        if price is not None:  # "None" is different from "0€"
            d['price'] = price

        city_a = self.document.xpath('//a[starts-with(@href, "ville-metal-")]')[0]
        d['city_id'] = self.extract_city_from_url(city_a.get('href'))
        d['city'] = city_a.text
        return d

    def parse_price(self, s):
        """Return the last number found in *s* as a float.

        Returns None when *s* is empty or contains no parsable number.
        """
        if not s:
            return None

        # Keep only runs of digits and dots; a list (not filter()) so the
        # emptiness check and reverse iteration work on Python 2 and 3.
        tokens = [tok for tok in re.split(r'[^\d.]+', s.strip()) if tok]

        # Walk backwards so a stray '.' (e.g. a trailing full stop) does not
        # hide the number that precedes it.
        for token in reversed(tokens):
            try:
                return float(token)
            except ValueError:
                continue
        return None


class PageCityList(BasePage):
    """Page holding the <select name="ville"> list of cities."""

    def get_cities(self):
        """Return a dict mapping city name to its description dict.

        Each description has the keys 'code', 'dept', 'id' (built as
        '<code>-<dept>') and 'name' (option text before the first '(').
        Options with an empty value are skipped.
        """
        cities = {}
        for option in self.document.xpath('//select[@name="ville"]/option'):
            v = option.get('value')
            if not v:
                continue

            d = {}
            d['code'], d['dept'] = re.search(r'ville-metal-(.*)-(\d+)\.htm$', v).groups()
            d['id'] = '%s-%s' % (d['code'], d['dept'])
            d['name'] = option.text.split('(')[0].strip()
            cities[d['name']] = d
        return cities


class PageDates(BasePage):
    """Page listing the dates for which concerts exist."""

    def get_dates(self):
        """Yield a dict with 'date' and absolute 'url' for every date link."""
        for a in self.document.xpath('//div[@id="dateconcerts"]//a'):
            d = {}
            d['date'] = parse_french_date(a.text.strip())
            d['url'] = urljoin(self.url, a.get('href'))
            yield d

    def get_dates_filtered(self, date_from=None, date_end=None):
        """Yield the get_dates() entries lying within the given bounds.

        Both bounds are inclusive; a falsy bound means "no limit" on that
        side.
        """
        for d in self.get_dates():
            date = d['date']
            if date_from and date < date_from:
                continue
            if date_end and date_end < date:
                continue
            yield d