diff --git a/modules/senscritique/backend.py b/modules/senscritique/backend.py index 0f42856e..498594c3 100644 --- a/modules/senscritique/backend.py +++ b/modules/senscritique/backend.py @@ -73,7 +73,6 @@ class SenscritiqueBackend(BaseBackend, ICapCalendarEvent): 16: [49, 46, 2, 36, 59, 54, 32, 24, 34, 37, 53, 47], } - """ dict that represents ids list of cinema channels included in a tv package {'tv package id': ['cinema channels ids list']} @@ -108,29 +107,25 @@ class SenscritiqueBackend(BaseBackend, ICapCalendarEvent): def search_events(self, query): if self.has_matching_categories(query): - with self.browser: - package, channels = self.get_package_and_channels() - return self.browser.list_events(query.start_date, - query.end_date, - package, - channels) + package, channels = self.get_package_and_channels() + return self.browser.list_events(query.start_date, + query.end_date, + package, + channels) def list_events(self, date_from, date_to=None): - with self.browser: - items = [] - package, channels = self.get_package_and_channels() - for item in self.browser.list_events(date_from, date_to, package, channels): - items.append(item) + items = [] + package, channels = self.get_package_and_channels() + for item in self.browser.list_events(date_from, date_to, package, channels): + items.append(item) - items.sort(cmp=cmp_start_date) - return items + items.sort(cmp=cmp_start_date) + return items def get_event(self, _id): - with self.browser: - return self.browser.get_event(_id) + return self.browser.get_event(_id) def fill_obj(self, event, fields): - with self.browser: - return self.browser.get_event(event.id, event) + return self.browser.get_event(event.id, event) OBJECTS = {SensCritiquenCalendarEvent: fill_obj} diff --git a/modules/senscritique/browser.py b/modules/senscritique/browser.py index ed771033..c96e3b42 100644 --- a/modules/senscritique/browser.py +++ b/modules/senscritique/browser.py @@ -17,46 +17,65 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see . -from weboob.tools.browser import BaseBrowser +from weboob.tools.browser2 import PagesBrowser, URL, Profile, Firefox from weboob.tools.json import json as simplejson from .calendar import SensCritiquenCalendarEvent -from .pages import ProgramPage, EventPage +from .pages import AjaxPage, EventPage, JsonResumePage import urllib import urllib2 +import re __all__ = ['SenscritiqueBrowser'] -class SenscritiqueBrowser(BaseBrowser): - PROTOCOL = 'http' - DOMAIN = 'www.senscritique.com' +class SensCritiqueAjaxProfile(Profile): + def setup_session(self, session): + session.headers.update({"User-Agent": "Mozilla/5.0 (Windows; U; Windows " + "NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8" + " GTB7.1 (.NET CLR 3.5.30729)", + "Accept": "text/html, */*; q=0.01", + "X-Requested-With": "XMLHttpRequest", + "Referer": "http://www.senscritique.com/sc/tv_guides", + "Origin": "http://www.senscritique.com", + "Accept-Language": "fr-fr;q=0.667", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + }) + + +class SensCritiqueJsonProfile(Profile): + def setup_session(self, session): + session.headers.update({"User-Agent": "Mozilla/5.0 (Windows; U; Windows " + "NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8" + " GTB7.1 (.NET CLR 3.5.30729)", + "Accept": "application/json, text/javascript, */*; q=0.01", + "X-Requested-With": "XMLHttpRequest", + }) + + +class SenscritiqueBrowser(PagesBrowser): ENCODING = 'utf-8' - PAGES = { - '%s://%s/sc/tv_guides' % (PROTOCOL, DOMAIN): ProgramPage, - '%s://%s/film/(.*?)' % (PROTOCOL, DOMAIN): EventPage, - } + BASEURL = 'http://www.senscritique.com' + + program_page = URL('/sc/tv_guides') + ajax_page = URL('/sc/tv_guides/gridContent.ajax', AjaxPage) + event_page = URL('/film/(?P<_id>.*)', EventPage) + json_page = URL('/sc/products/storyline/(?P<_id>.*).json', JsonResumePage) LIMIT = 25 # number of results returned for each ajax call (defined in the website). - LIMIT_NB_PAGES = 10 # arbitrary limit to avoid infinitive loop that can occurs if total number of films is a multiple of LIMIT (in website it causes an infinite scroll) - - HEADER_AJAX = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows " - "NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8" - " GTB7.1 (.NET CLR 3.5.30729)", - "Accept": "gzip, deflate", - "X-Requested-With": "XMLHttpRequest", - "Referer": "http://www.senscritique.com/sc/tv_guides", - } + LIMIT_NB_PAGES = 10 # arbitrary limit to avoid infinitive loop that can occurs if total number of films is a multiple of LIMIT (in website it causes an infinite scroll) + """ HEADER_RESUME = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows " "NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8" " GTB7.1 (.NET CLR 3.5.30729)", "Accept": "application/json, text/javascript, */*; q=0.01", "X-Requested-With": "XMLHttpRequest", } + """ DATA = {'order': 'chrono', 'without_product_done': '0', @@ -64,66 +83,70 @@ class SenscritiqueBrowser(BaseBrowser): 'limit': '%d' % LIMIT, } - URL = "http://www.senscritique.com/sc/tv_guides/gridContent.ajax" - - def home(self): - self.location("http://www.senscritique.com/sc/tv_guides") - assert self.is_on_page(ProgramPage) - - def list_events(self, date_from, date_to=None, package=None, channels=None): - self.home() - page = 1 - - if package and channels: - self.set_package_settings(package, channels) - - while True: - self.DATA['page'] = '%d' % page - self.page.document = self.get_ajax_content() - nb_events = self.page.count_events() - events = self.page.list_events(date_from, date_to) - - for event in events: - yield event - - if nb_events < self.LIMIT or page >= self.LIMIT_NB_PAGES: - break - - page += 1 - def set_package_settings(self, package, channels): url = 'http://www.senscritique.com/sc/tv_guides/saveSettings.json' params = "network=%s" % package params += ''.join(["&channels%%5B%%5D=%d" % (channel) for channel in channels]) - self.openurl(url, params) + self.open(url, data=params) - def get_ajax_content(self): - req = urllib2.Request(self.URL, urllib.urlencode(self.DATA), headers=self.HEADER_AJAX) - response = self.open(req) - return self.get_document(response) + def list_events(self, date_from, date_to=None, package=None, channels=None): + self.program_page.stay_or_go() + page_nb = 1 + + if package and channels: + self.set_package_settings(package, channels) + + self._setup_session(SensCritiqueAjaxProfile()) + while True: + self.DATA['page'] = '%d' % page_nb + page = self.ajax_page.open(data=urllib.urlencode(self.DATA)) + nb_events = page.count_events() + events = page.list_events(date_from=date_from, date_to=date_to) + + for event in events: + yield event + + if nb_events < self.LIMIT or page_nb >= self.LIMIT_NB_PAGES: + break + + page_nb += 1 def get_event(self, _id, event=None): if not event: - self.home() - page = 1 + self.program_page.stay_or_go() + page_nb = 1 + self._setup_session(SensCritiqueAjaxProfile()) while True: - self.DATA['page'] = '%d' % page - self.page.document = self.get_ajax_content() - event = self.page.find_event(_id) - nb_events = self.page.count_events() + self.DATA['page'] = '%d' % page_nb + page = self.ajax_page.open(data=urllib.urlencode(self.DATA)) + event = page.list_events(_id=_id) + nb_events = page.count_events() if event or nb_events < self.LIMIT or page >= self.LIMIT_NB_PAGES: break page += 1 if event: - url = SensCritiquenCalendarEvent.id2url(_id) - self.location(url) - assert self.is_on_page(EventPage) - return self.page.get_event(url, event) + if not isinstance(event, SensCritiquenCalendarEvent): + event = event.next() - def get_resume(self, url, _id): + event.resume = self.get_resume(_id) + + self._setup_session(Firefox()) + event = self.event_page.go(_id=_id).get_event(obj=event) + + return event + + def get_resume(self, _id): + self._setup_session(SensCritiqueJsonProfile()) + re_id = re.compile('/(.*)/(.*?).json', re.DOTALL) + a_id = re_id.search(_id).group(1) + print a_id + + return self.json_page.go(_id=a_id).get_resume() + # return "get resume" + """ self.HEADER_RESUME['Referer'] = url req = urllib2.Request('http://www.senscritique.com/sc/products/storyline/%s.json' % _id, headers=self.HEADER_RESUME) @@ -131,3 +154,4 @@ class SenscritiqueBrowser(BaseBrowser): result = simplejson.loads(response.read(), self.ENCODING) if result['json']['success']: return result['json']['data'] + """ diff --git a/modules/senscritique/calendar.py b/modules/senscritique/calendar.py index de0e8269..923fb0a9 100644 --- a/modules/senscritique/calendar.py +++ b/modules/senscritique/calendar.py @@ -22,13 +22,10 @@ from weboob.capabilities.calendar import BaseCalendarEvent, TRANSP, STATUS, CATE class SensCritiquenCalendarEvent(BaseCalendarEvent): - def __init__(self, _id): - BaseCalendarEvent.__init__(self, _id) + def __init__(self): + BaseCalendarEvent.__init__(self) self.sequence = 1 self.transp = TRANSP.TRANSPARENT self.status = STATUS.CONFIRMED self.category = CATEGORIES.TELE - - @classmethod - def id2url(cls, _id): - return 'http://www.senscritique.com%s' % _id + self.resume = None diff --git a/modules/senscritique/pages.py b/modules/senscritique/pages.py index 9f0ae74d..187ff62c 100644 --- a/modules/senscritique/pages.py +++ b/modules/senscritique/pages.py @@ -23,11 +23,14 @@ from .calendar import SensCritiquenCalendarEvent from datetime import date, datetime, time - -__all__ = ['ProgramPage'] +from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement, JsonPage +from weboob.tools.browser2.filters import Filter, Link, CleanText, Env, Attr, Regexp -class ProgramPage(BasePage): +__all__ = ['AjaxPage', 'EventPage', 'JsonResumePage'] + + +class AjaxPage(HTMLPage): CHANNELS_PARAM = { 'einst-3 elgr-data-logo': u'Action', @@ -57,106 +60,124 @@ class ProgramPage(BasePage): 'einst-4055 elgr-data-logo': u'Paramount Channel', } - def find_event(self, _id): - a = self.document.getroot().xpath("//a[@href='%s']" % _id, method='xpath') - if a: - event_date = self.get_event_date(a[0]) - return self.create_event(a[0], event_date) - def count_events(self): - return len(self.document.getroot().xpath("//a")) + return len(self.doc.xpath("//a")) - def list_events(self, date_from, date_to=None): - for a in self.document.getroot().xpath("//a"): - event_date = self.get_event_date(a) - if self.is_valid_event(date_from, date_to, event_date): - yield self.create_event(a, event_date) + @method + class list_events(ListElement): + item_xpath = '//a' - def create_event(self, a, event_date): - event = SensCritiquenCalendarEvent(a.attrib['href']) - title = self.parser.select(a, "div/img", 1, method='xpath').attrib['alt'].replace('Affiche ', '') - channel_info = self.parser.select(a, "div/div[@class='elgr-data-channel']", method='xpath') - if channel_info: - channel = channel_info[0].text.strip() - else: - channel_info = self.parser.select(a, - 'div[@class="elgr-product-data"]/span', - 1, - method='xpath').attrib['class'] - channel = self.CHANNELS_PARAM.get(channel_info) - event.summary = u'%s - %s' % (title, channel) + class item(ItemElement): + klass = SensCritiquenCalendarEvent - event.start_date = event_date - event.end_date = datetime.combine(event_date.date(), time.max) - return event - - def is_valid_event(self, date_from, date_to, event_date): - if event_date >= date_from: - if not date_to: + def condition(self): + if '_id' in self.env and self.env['_id']: + return Regexp(Link('.'), '/film/(.*)')(self) == self.env['_id'] return True - else: - if event_date < date_to: + + def validate(self, obj): + if 'date_from' in self.env and self.env['date_from'] and obj.start_date > self.env['date_from']: + if not self.env['date_to']: + return True + else: + if obj.end_date < self.env['date_to']: + return True + + if '_id' in self.env: return True - return False - def get_event_date(self, a): - div_date = self.parser.select(a, "div/div[@class='elgr-data-diffusion']", 1, method='xpath') - _date = self.parse_start_date(div_date) + return False - str_time = self.parser.select(div_date, "time", 1, method='xpath').attrib['datetime'][:-6] - _time = datetime.strptime(str_time, '%H:%M:%S') + class Date(Filter): + def filter(self, el): + spans_date = el[0].xpath("span[@class='d-date']") + _date = date.today() + if len(spans_date) == 2: + day_number = int(spans_date[1].text) - return datetime.combine(_date, _time.time()) + month = _date.month + year = _date.year + if day_number < _date.day: + month = _date.month + 1 + if _date.month == 12: + year = _date.year + 1 - def parse_start_date(self, div_date): - spans_date = self.parser.select(div_date, "span[@class='d-date']", method='xpath') + _date = date(day=day_number, month=month, year=year) - _date = date.today() - if len(spans_date) == 2: - day_number = int(spans_date[1].text) + str_time = el[0].xpath("time")[0].attrib['datetime'][:-6] + _time = datetime.strptime(str_time, '%H:%M:%S') - month = _date.month - year = _date.year - if day_number < _date.day: - month = _date.month + 1 - if _date.month == 12: - year = _date.year + 1 + return datetime.combine(_date, _time.time()) - _date = date(day=day_number, month=month, year=year) + class CombineDate(Filter): + def filter(self, _date): + return datetime.combine(_date, time.max) - return _date + class Summary(Filter): + def filter(self, el): + title = el[0].xpath("div/img")[0].attrib['alt'].replace('Affiche ', '') + channel_info = el[0].xpath("div/div[@class='elgr-data-channel']") + if channel_info: + channel = channel_info[0].text.strip() + else: + channel_info = el[0].xpath('div[@class="elgr-product-data"]/span')[0].attrib['class'] + channel = self.page.CHANNELS_PARAM.get(channel_info) + return u'%s - %s' % (title, channel) + + obj_id = Regexp(Link('.'), '/film/(.*)') + obj_start_date = Date('div/div[@class="elgr-data-diffusion"]') + obj_end_date = CombineDate(obj_start_date) + obj_summary = CleanText(Summary('.')) -class EventPage(BasePage): - def get_event(self, url, event): +class Description(Filter): + def filter(self, el): + header = el[0].xpath("//div[@class='pvi-hero-product']")[0] - event.url = url + title = header.xpath("div[@class='d-rubric-inner']/h1")[0].text.strip() + year = header.xpath("div[@class='d-rubric-inner']/small")[0].text.strip() - header = self.document.getroot().xpath("//div[@class='pvi-hero-product']")[0] - - title = self.parser.select(header, "div[@class='d-rubric-inner']/h1", 1, method='xpath').text.strip() - year = self.parser.select(header, "div[@class='d-rubric-inner']/small", 1, method='xpath').text.strip() - - _infos = self.parser.select(header, "ul[@class='pvi-product-specs']/li", method='xpath') + _infos = header.xpath("ul[@class='pvi-product-specs']/li") infos = '' for li in _infos: - infos += u'- %s\n' % self.parser.tocleanstring(li) + infos += u'- %s\n' % CleanText(li)(self) - section = self.document.getroot().xpath("//section[@class='pvi-productDetails']")[0] - _infos = self.parser.select(section, "ul/li", method='xpath') + section = "//section[@class='pvi-productDetails']" + _infos = el[0].xpath("%s/ul/li" % section) for li in _infos: - infos += u'- %s\n' % self.parser.tocleanstring(li) + infos += u'- %s\n' % CleanText(li)(self) - _resume = self.parser.select(section, "p[@data-rel='full-resume']", method='xpath') + return u'%s %s\n\n%s\n\n' % (title, year, infos) + + +class Resume(Filter): + def filter(self, el): + _resume = el[0].xpath("p[@data-rel='full-resume']") if not _resume: - _resume = self.parser.select(section, "p[@data-rel='small-resume']", method='xpath') + _resume = el[0].xpath("p[@data-rel='small-resume']") if _resume: - resume = html2text(self.parser.tostring(_resume[0])) - else: - resume = "" - else: - _id = self.parser.select(_resume[0], 'button', 1, method='xpath').attrib['data-sc-product-id'] - resume = self.browser.get_resume(url, _id) + resume = html2text(CleanText(_resume[0])(self))[6:] + return resume - event.description = u'%s %s\n\n%s\n\n%s' % (title, year, infos, resume) - return event + +class EventPage(HTMLPage): + @method + class get_event(ItemElement): + klass = SensCritiquenCalendarEvent + + def parse(self, el): + event = self.obj + event.url = self.page.url + resume = Resume('//section[@class="pvi-productDetails"]')(self) + if not resume: + resume = self.obj.resume + description = Description('.')(self) + event.description = u'%s%s' % (description, resume) + return event + + +class JsonResumePage(JsonPage): + def get_resume(self): + print self.doc + if self.doc['json']['success']: + return self.doc['json']['data'] diff --git a/modules/senscritique/test.py b/modules/senscritique/test.py index 5dfabe59..ea3fedfa 100644 --- a/modules/senscritique/test.py +++ b/modules/senscritique/test.py @@ -21,6 +21,7 @@ from weboob.tools.test import BackendTest from datetime import datetime + class SenscritiqueTest(BackendTest): BACKEND = 'senscritique'