[senscritique] adapt to browser2

This commit is contained in:
Bezleputh 2014-03-20 00:38:16 +01:00
commit 640504d79c
5 changed files with 197 additions and 159 deletions

View file

@ -73,7 +73,6 @@ class SenscritiqueBackend(BaseBackend, ICapCalendarEvent):
16: [49, 46, 2, 36, 59, 54, 32, 24, 34, 37, 53, 47], 16: [49, 46, 2, 36, 59, 54, 32, 24, 34, 37, 53, 47],
} }
""" """
dict that represents ids list of cinema channels included in a tv package dict that represents ids list of cinema channels included in a tv package
{'tv package id': ['cinema channels ids list']} {'tv package id': ['cinema channels ids list']}
@ -108,29 +107,25 @@ class SenscritiqueBackend(BaseBackend, ICapCalendarEvent):
def search_events(self, query): def search_events(self, query):
if self.has_matching_categories(query): if self.has_matching_categories(query):
with self.browser: package, channels = self.get_package_and_channels()
package, channels = self.get_package_and_channels() return self.browser.list_events(query.start_date,
return self.browser.list_events(query.start_date, query.end_date,
query.end_date, package,
package, channels)
channels)
def list_events(self, date_from, date_to=None): def list_events(self, date_from, date_to=None):
with self.browser: items = []
items = [] package, channels = self.get_package_and_channels()
package, channels = self.get_package_and_channels() for item in self.browser.list_events(date_from, date_to, package, channels):
for item in self.browser.list_events(date_from, date_to, package, channels): items.append(item)
items.append(item)
items.sort(cmp=cmp_start_date) items.sort(cmp=cmp_start_date)
return items return items
def get_event(self, _id): def get_event(self, _id):
with self.browser: return self.browser.get_event(_id)
return self.browser.get_event(_id)
def fill_obj(self, event, fields): def fill_obj(self, event, fields):
with self.browser: return self.browser.get_event(event.id, event)
return self.browser.get_event(event.id, event)
OBJECTS = {SensCritiquenCalendarEvent: fill_obj} OBJECTS = {SensCritiquenCalendarEvent: fill_obj}

View file

@ -17,46 +17,65 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BaseBrowser from weboob.tools.browser2 import PagesBrowser, URL, Profile, Firefox
from weboob.tools.json import json as simplejson from weboob.tools.json import json as simplejson
from .calendar import SensCritiquenCalendarEvent from .calendar import SensCritiquenCalendarEvent
from .pages import ProgramPage, EventPage from .pages import AjaxPage, EventPage, JsonResumePage
import urllib import urllib
import urllib2 import urllib2
import re
__all__ = ['SenscritiqueBrowser'] __all__ = ['SenscritiqueBrowser']
class SenscritiqueBrowser(BaseBrowser): class SensCritiqueAjaxProfile(Profile):
PROTOCOL = 'http' def setup_session(self, session):
DOMAIN = 'www.senscritique.com' session.headers.update({"User-Agent": "Mozilla/5.0 (Windows; U; Windows "
"NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8"
" GTB7.1 (.NET CLR 3.5.30729)",
"Accept": "text/html, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
"Referer": "http://www.senscritique.com/sc/tv_guides",
"Origin": "http://www.senscritique.com",
"Accept-Language": "fr-fr;q=0.667",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
})
class SensCritiqueJsonProfile(Profile):
def setup_session(self, session):
session.headers.update({"User-Agent": "Mozilla/5.0 (Windows; U; Windows "
"NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8"
" GTB7.1 (.NET CLR 3.5.30729)",
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
})
class SenscritiqueBrowser(PagesBrowser):
ENCODING = 'utf-8' ENCODING = 'utf-8'
PAGES = { BASEURL = 'http://www.senscritique.com'
'%s://%s/sc/tv_guides' % (PROTOCOL, DOMAIN): ProgramPage,
'%s://%s/film/(.*?)' % (PROTOCOL, DOMAIN): EventPage, program_page = URL('/sc/tv_guides')
} ajax_page = URL('/sc/tv_guides/gridContent.ajax', AjaxPage)
event_page = URL('/film/(?P<_id>.*)', EventPage)
json_page = URL('/sc/products/storyline/(?P<_id>.*).json', JsonResumePage)
LIMIT = 25 # number of results returned for each ajax call (defined in the website). LIMIT = 25 # number of results returned for each ajax call (defined in the website).
LIMIT_NB_PAGES = 10 # arbitrary limit to avoid an infinite loop that can occur if the total number of films is a multiple of LIMIT (on the website it causes an infinite scroll) LIMIT_NB_PAGES = 10 # arbitrary limit to avoid an infinite loop that can occur if the total number of films is a multiple of LIMIT (on the website it causes an infinite scroll)
HEADER_AJAX = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows "
"NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8"
" GTB7.1 (.NET CLR 3.5.30729)",
"Accept": "gzip, deflate",
"X-Requested-With": "XMLHttpRequest",
"Referer": "http://www.senscritique.com/sc/tv_guides",
}
"""
HEADER_RESUME = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows " HEADER_RESUME = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows "
"NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8" "NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8"
" GTB7.1 (.NET CLR 3.5.30729)", " GTB7.1 (.NET CLR 3.5.30729)",
"Accept": "application/json, text/javascript, */*; q=0.01", "Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest", "X-Requested-With": "XMLHttpRequest",
} }
"""
DATA = {'order': 'chrono', DATA = {'order': 'chrono',
'without_product_done': '0', 'without_product_done': '0',
@ -64,66 +83,70 @@ class SenscritiqueBrowser(BaseBrowser):
'limit': '%d' % LIMIT, 'limit': '%d' % LIMIT,
} }
URL = "http://www.senscritique.com/sc/tv_guides/gridContent.ajax"
def home(self):
self.location("http://www.senscritique.com/sc/tv_guides")
assert self.is_on_page(ProgramPage)
def list_events(self, date_from, date_to=None, package=None, channels=None):
self.home()
page = 1
if package and channels:
self.set_package_settings(package, channels)
while True:
self.DATA['page'] = '%d' % page
self.page.document = self.get_ajax_content()
nb_events = self.page.count_events()
events = self.page.list_events(date_from, date_to)
for event in events:
yield event
if nb_events < self.LIMIT or page >= self.LIMIT_NB_PAGES:
break
page += 1
def set_package_settings(self, package, channels): def set_package_settings(self, package, channels):
url = 'http://www.senscritique.com/sc/tv_guides/saveSettings.json' url = 'http://www.senscritique.com/sc/tv_guides/saveSettings.json'
params = "network=%s" % package params = "network=%s" % package
params += ''.join(["&channels%%5B%%5D=%d" % (channel) for channel in channels]) params += ''.join(["&channels%%5B%%5D=%d" % (channel) for channel in channels])
self.openurl(url, params) self.open(url, data=params)
def get_ajax_content(self): def list_events(self, date_from, date_to=None, package=None, channels=None):
req = urllib2.Request(self.URL, urllib.urlencode(self.DATA), headers=self.HEADER_AJAX) self.program_page.stay_or_go()
response = self.open(req) page_nb = 1
return self.get_document(response)
if package and channels:
self.set_package_settings(package, channels)
self._setup_session(SensCritiqueAjaxProfile())
while True:
self.DATA['page'] = '%d' % page_nb
page = self.ajax_page.open(data=urllib.urlencode(self.DATA))
nb_events = page.count_events()
events = page.list_events(date_from=date_from, date_to=date_to)
for event in events:
yield event
if nb_events < self.LIMIT or page_nb >= self.LIMIT_NB_PAGES:
break
page_nb += 1
def get_event(self, _id, event=None): def get_event(self, _id, event=None):
if not event: if not event:
self.home() self.program_page.stay_or_go()
page = 1 page_nb = 1
self._setup_session(SensCritiqueAjaxProfile())
while True: while True:
self.DATA['page'] = '%d' % page self.DATA['page'] = '%d' % page_nb
self.page.document = self.get_ajax_content() page = self.ajax_page.open(data=urllib.urlencode(self.DATA))
event = self.page.find_event(_id) event = page.list_events(_id=_id)
nb_events = self.page.count_events() nb_events = page.count_events()
if event or nb_events < self.LIMIT or page >= self.LIMIT_NB_PAGES: if event or nb_events < self.LIMIT or page >= self.LIMIT_NB_PAGES:
break break
page += 1 page += 1
if event: if event:
url = SensCritiquenCalendarEvent.id2url(_id) if not isinstance(event, SensCritiquenCalendarEvent):
self.location(url) event = event.next()
assert self.is_on_page(EventPage)
return self.page.get_event(url, event)
def get_resume(self, url, _id): event.resume = self.get_resume(_id)
self._setup_session(Firefox())
event = self.event_page.go(_id=_id).get_event(obj=event)
return event
def get_resume(self, _id):
self._setup_session(SensCritiqueJsonProfile())
re_id = re.compile('/(.*)/(.*?).json', re.DOTALL)
a_id = re_id.search(_id).group(1)
print a_id
return self.json_page.go(_id=a_id).get_resume()
# return "get resume"
"""
self.HEADER_RESUME['Referer'] = url self.HEADER_RESUME['Referer'] = url
req = urllib2.Request('http://www.senscritique.com/sc/products/storyline/%s.json' % _id, req = urllib2.Request('http://www.senscritique.com/sc/products/storyline/%s.json' % _id,
headers=self.HEADER_RESUME) headers=self.HEADER_RESUME)
@ -131,3 +154,4 @@ class SenscritiqueBrowser(BaseBrowser):
result = simplejson.loads(response.read(), self.ENCODING) result = simplejson.loads(response.read(), self.ENCODING)
if result['json']['success']: if result['json']['success']:
return result['json']['data'] return result['json']['data']
"""

View file

@ -22,13 +22,10 @@ from weboob.capabilities.calendar import BaseCalendarEvent, TRANSP, STATUS, CATE
class SensCritiquenCalendarEvent(BaseCalendarEvent): class SensCritiquenCalendarEvent(BaseCalendarEvent):
def __init__(self, _id): def __init__(self):
BaseCalendarEvent.__init__(self, _id) BaseCalendarEvent.__init__(self)
self.sequence = 1 self.sequence = 1
self.transp = TRANSP.TRANSPARENT self.transp = TRANSP.TRANSPARENT
self.status = STATUS.CONFIRMED self.status = STATUS.CONFIRMED
self.category = CATEGORIES.TELE self.category = CATEGORIES.TELE
self.resume = None
@classmethod
def id2url(cls, _id):
return 'http://www.senscritique.com%s' % _id

View file

@ -23,11 +23,14 @@ from .calendar import SensCritiquenCalendarEvent
from datetime import date, datetime, time from datetime import date, datetime, time
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement, JsonPage
__all__ = ['ProgramPage'] from weboob.tools.browser2.filters import Filter, Link, CleanText, Env, Attr, Regexp
class ProgramPage(BasePage): __all__ = ['AjaxPage', 'EventPage', 'JsonResumePage']
class AjaxPage(HTMLPage):
CHANNELS_PARAM = { CHANNELS_PARAM = {
'einst-3 elgr-data-logo': u'Action', 'einst-3 elgr-data-logo': u'Action',
@ -57,106 +60,124 @@ class ProgramPage(BasePage):
'einst-4055 elgr-data-logo': u'Paramount Channel', 'einst-4055 elgr-data-logo': u'Paramount Channel',
} }
def find_event(self, _id):
a = self.document.getroot().xpath("//a[@href='%s']" % _id, method='xpath')
if a:
event_date = self.get_event_date(a[0])
return self.create_event(a[0], event_date)
def count_events(self): def count_events(self):
return len(self.document.getroot().xpath("//a")) return len(self.doc.xpath("//a"))
def list_events(self, date_from, date_to=None): @method
for a in self.document.getroot().xpath("//a"): class list_events(ListElement):
event_date = self.get_event_date(a) item_xpath = '//a'
if self.is_valid_event(date_from, date_to, event_date):
yield self.create_event(a, event_date)
def create_event(self, a, event_date): class item(ItemElement):
event = SensCritiquenCalendarEvent(a.attrib['href']) klass = SensCritiquenCalendarEvent
title = self.parser.select(a, "div/img", 1, method='xpath').attrib['alt'].replace('Affiche ', '')
channel_info = self.parser.select(a, "div/div[@class='elgr-data-channel']", method='xpath')
if channel_info:
channel = channel_info[0].text.strip()
else:
channel_info = self.parser.select(a,
'div[@class="elgr-product-data"]/span',
1,
method='xpath').attrib['class']
channel = self.CHANNELS_PARAM.get(channel_info)
event.summary = u'%s - %s' % (title, channel)
event.start_date = event_date def condition(self):
event.end_date = datetime.combine(event_date.date(), time.max) if '_id' in self.env and self.env['_id']:
return event return Regexp(Link('.'), '/film/(.*)')(self) == self.env['_id']
def is_valid_event(self, date_from, date_to, event_date):
if event_date >= date_from:
if not date_to:
return True return True
else:
if event_date < date_to: def validate(self, obj):
if 'date_from' in self.env and self.env['date_from'] and obj.start_date > self.env['date_from']:
if not self.env['date_to']:
return True
else:
if obj.end_date < self.env['date_to']:
return True
if '_id' in self.env:
return True return True
return False
def get_event_date(self, a): return False
div_date = self.parser.select(a, "div/div[@class='elgr-data-diffusion']", 1, method='xpath')
_date = self.parse_start_date(div_date)
str_time = self.parser.select(div_date, "time", 1, method='xpath').attrib['datetime'][:-6] class Date(Filter):
_time = datetime.strptime(str_time, '%H:%M:%S') def filter(self, el):
spans_date = el[0].xpath("span[@class='d-date']")
_date = date.today()
if len(spans_date) == 2:
day_number = int(spans_date[1].text)
return datetime.combine(_date, _time.time()) month = _date.month
year = _date.year
if day_number < _date.day:
month = _date.month + 1
if _date.month == 12:
year = _date.year + 1
def parse_start_date(self, div_date): _date = date(day=day_number, month=month, year=year)
spans_date = self.parser.select(div_date, "span[@class='d-date']", method='xpath')
_date = date.today() str_time = el[0].xpath("time")[0].attrib['datetime'][:-6]
if len(spans_date) == 2: _time = datetime.strptime(str_time, '%H:%M:%S')
day_number = int(spans_date[1].text)
month = _date.month return datetime.combine(_date, _time.time())
year = _date.year
if day_number < _date.day:
month = _date.month + 1
if _date.month == 12:
year = _date.year + 1
_date = date(day=day_number, month=month, year=year) class CombineDate(Filter):
def filter(self, _date):
return datetime.combine(_date, time.max)
return _date class Summary(Filter):
def filter(self, el):
title = el[0].xpath("div/img")[0].attrib['alt'].replace('Affiche ', '')
channel_info = el[0].xpath("div/div[@class='elgr-data-channel']")
if channel_info:
channel = channel_info[0].text.strip()
else:
channel_info = el[0].xpath('div[@class="elgr-product-data"]/span')[0].attrib['class']
channel = self.page.CHANNELS_PARAM.get(channel_info)
return u'%s - %s' % (title, channel)
obj_id = Regexp(Link('.'), '/film/(.*)')
obj_start_date = Date('div/div[@class="elgr-data-diffusion"]')
obj_end_date = CombineDate(obj_start_date)
obj_summary = CleanText(Summary('.'))
class EventPage(BasePage): class Description(Filter):
def get_event(self, url, event): def filter(self, el):
header = el[0].xpath("//div[@class='pvi-hero-product']")[0]
event.url = url title = header.xpath("div[@class='d-rubric-inner']/h1")[0].text.strip()
year = header.xpath("div[@class='d-rubric-inner']/small")[0].text.strip()
header = self.document.getroot().xpath("//div[@class='pvi-hero-product']")[0] _infos = header.xpath("ul[@class='pvi-product-specs']/li")
title = self.parser.select(header, "div[@class='d-rubric-inner']/h1", 1, method='xpath').text.strip()
year = self.parser.select(header, "div[@class='d-rubric-inner']/small", 1, method='xpath').text.strip()
_infos = self.parser.select(header, "ul[@class='pvi-product-specs']/li", method='xpath')
infos = '' infos = ''
for li in _infos: for li in _infos:
infos += u'- %s\n' % self.parser.tocleanstring(li) infos += u'- %s\n' % CleanText(li)(self)
section = self.document.getroot().xpath("//section[@class='pvi-productDetails']")[0] section = "//section[@class='pvi-productDetails']"
_infos = self.parser.select(section, "ul/li", method='xpath') _infos = el[0].xpath("%s/ul/li" % section)
for li in _infos: for li in _infos:
infos += u'- %s\n' % self.parser.tocleanstring(li) infos += u'- %s\n' % CleanText(li)(self)
_resume = self.parser.select(section, "p[@data-rel='full-resume']", method='xpath') return u'%s %s\n\n%s\n\n' % (title, year, infos)
class Resume(Filter):
def filter(self, el):
_resume = el[0].xpath("p[@data-rel='full-resume']")
if not _resume: if not _resume:
_resume = self.parser.select(section, "p[@data-rel='small-resume']", method='xpath') _resume = el[0].xpath("p[@data-rel='small-resume']")
if _resume: if _resume:
resume = html2text(self.parser.tostring(_resume[0])) resume = html2text(CleanText(_resume[0])(self))[6:]
else: return resume
resume = ""
else:
_id = self.parser.select(_resume[0], 'button', 1, method='xpath').attrib['data-sc-product-id']
resume = self.browser.get_resume(url, _id)
event.description = u'%s %s\n\n%s\n\n%s' % (title, year, infos, resume)
return event class EventPage(HTMLPage):
@method
class get_event(ItemElement):
klass = SensCritiquenCalendarEvent
def parse(self, el):
event = self.obj
event.url = self.page.url
resume = Resume('//section[@class="pvi-productDetails"]')(self)
if not resume:
resume = self.obj.resume
description = Description('.')(self)
event.description = u'%s%s' % (description, resume)
return event
class JsonResumePage(JsonPage):
def get_resume(self):
print self.doc
if self.doc['json']['success']:
return self.doc['json']['data']

View file

@ -21,6 +21,7 @@
from weboob.tools.test import BackendTest from weboob.tools.test import BackendTest
from datetime import datetime from datetime import datetime
class SenscritiqueTest(BackendTest): class SenscritiqueTest(BackendTest):
BACKEND = 'senscritique' BACKEND = 'senscritique'