[senscritique] adapt to browser2

This commit is contained in:
Bezleputh 2014-03-20 00:38:16 +01:00
commit 640504d79c
5 changed files with 197 additions and 159 deletions

View file

@ -73,7 +73,6 @@ class SenscritiqueBackend(BaseBackend, ICapCalendarEvent):
16: [49, 46, 2, 36, 59, 54, 32, 24, 34, 37, 53, 47],
}
"""
dict that represents ids list of cinema channels included in a tv package
{'tv package id': ['cinema channels ids list']}
@ -108,29 +107,25 @@ class SenscritiqueBackend(BaseBackend, ICapCalendarEvent):
def search_events(self, query):
if self.has_matching_categories(query):
with self.browser:
package, channels = self.get_package_and_channels()
return self.browser.list_events(query.start_date,
query.end_date,
package,
channels)
package, channels = self.get_package_and_channels()
return self.browser.list_events(query.start_date,
query.end_date,
package,
channels)
def list_events(self, date_from, date_to=None):
with self.browser:
items = []
package, channels = self.get_package_and_channels()
for item in self.browser.list_events(date_from, date_to, package, channels):
items.append(item)
items = []
package, channels = self.get_package_and_channels()
for item in self.browser.list_events(date_from, date_to, package, channels):
items.append(item)
items.sort(cmp=cmp_start_date)
return items
items.sort(cmp=cmp_start_date)
return items
def get_event(self, _id):
with self.browser:
return self.browser.get_event(_id)
return self.browser.get_event(_id)
def fill_obj(self, event, fields):
with self.browser:
return self.browser.get_event(event.id, event)
return self.browser.get_event(event.id, event)
OBJECTS = {SensCritiquenCalendarEvent: fill_obj}

View file

@ -17,46 +17,65 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.browser import BaseBrowser
from weboob.tools.browser2 import PagesBrowser, URL, Profile, Firefox
from weboob.tools.json import json as simplejson
from .calendar import SensCritiquenCalendarEvent
from .pages import ProgramPage, EventPage
from .pages import AjaxPage, EventPage, JsonResumePage
import urllib
import urllib2
import re
__all__ = ['SenscritiqueBrowser']
class SenscritiqueBrowser(BaseBrowser):
PROTOCOL = 'http'
DOMAIN = 'www.senscritique.com'
class SensCritiqueAjaxProfile(Profile):
    """Session profile for the site's HTML-fragment AJAX endpoints.

    Sets the fixed request headers (Firefox 3.6 User-Agent,
    ``X-Requested-With: XMLHttpRequest``, Referer/Origin pointing at the
    tv_guides page) that are sent with every request of the session.
    """

    def setup_session(self, session):
        # Headers mirror a real browser XHR from the tv_guides page;
        # presumably the site rejects requests without them — TODO confirm.
        session.headers.update({"User-Agent": "Mozilla/5.0 (Windows; U; Windows "
                                "NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8"
                                " GTB7.1 (.NET CLR 3.5.30729)",
                                "Accept": "text/html, */*; q=0.01",
                                "X-Requested-With": "XMLHttpRequest",
                                "Referer": "http://www.senscritique.com/sc/tv_guides",
                                "Origin": "http://www.senscritique.com",
                                "Accept-Language": "fr-fr;q=0.667",
                                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                                })
class SensCritiqueJsonProfile(Profile):
    """Session profile for the site's JSON AJAX endpoints.

    Same Firefox 3.6 User-Agent as the HTML profile, but advertises a
    JSON ``Accept`` header so the server answers with JSON payloads.
    """

    def setup_session(self, session):
        session.headers.update({"User-Agent": "Mozilla/5.0 (Windows; U; Windows "
                                "NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8"
                                " GTB7.1 (.NET CLR 3.5.30729)",
                                "Accept": "application/json, text/javascript, */*; q=0.01",
                                "X-Requested-With": "XMLHttpRequest",
                                })
class SenscritiqueBrowser(PagesBrowser):
ENCODING = 'utf-8'
PAGES = {
'%s://%s/sc/tv_guides' % (PROTOCOL, DOMAIN): ProgramPage,
'%s://%s/film/(.*?)' % (PROTOCOL, DOMAIN): EventPage,
}
BASEURL = 'http://www.senscritique.com'
program_page = URL('/sc/tv_guides')
ajax_page = URL('/sc/tv_guides/gridContent.ajax', AjaxPage)
event_page = URL('/film/(?P<_id>.*)', EventPage)
json_page = URL('/sc/products/storyline/(?P<_id>.*).json', JsonResumePage)
LIMIT = 25 # number of results returned for each ajax call (defined in the website).
LIMIT_NB_PAGES = 10  # arbitrary limit to avoid an infinite loop that can occur if the total number of films is a multiple of LIMIT (on the website it causes an infinite scroll)
HEADER_AJAX = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows "
"NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8"
" GTB7.1 (.NET CLR 3.5.30729)",
"Accept": "gzip, deflate",
"X-Requested-With": "XMLHttpRequest",
"Referer": "http://www.senscritique.com/sc/tv_guides",
}
LIMIT_NB_PAGES = 10  # arbitrary limit to avoid an infinite loop that can occur if the total number of films is a multiple of LIMIT (on the website it causes an infinite scroll)
"""
HEADER_RESUME = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows "
"NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8"
" GTB7.1 (.NET CLR 3.5.30729)",
"Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest",
}
"""
DATA = {'order': 'chrono',
'without_product_done': '0',
@ -64,66 +83,70 @@ class SenscritiqueBrowser(BaseBrowser):
'limit': '%d' % LIMIT,
}
URL = "http://www.senscritique.com/sc/tv_guides/gridContent.ajax"
def home(self):
self.location("http://www.senscritique.com/sc/tv_guides")
assert self.is_on_page(ProgramPage)
def list_events(self, date_from, date_to=None, package=None, channels=None):
self.home()
page = 1
if package and channels:
self.set_package_settings(package, channels)
while True:
self.DATA['page'] = '%d' % page
self.page.document = self.get_ajax_content()
nb_events = self.page.count_events()
events = self.page.list_events(date_from, date_to)
for event in events:
yield event
if nb_events < self.LIMIT or page >= self.LIMIT_NB_PAGES:
break
page += 1
def set_package_settings(self, package, channels):
url = 'http://www.senscritique.com/sc/tv_guides/saveSettings.json'
params = "network=%s" % package
params += ''.join(["&channels%%5B%%5D=%d" % (channel) for channel in channels])
self.openurl(url, params)
self.open(url, data=params)
def get_ajax_content(self):
req = urllib2.Request(self.URL, urllib.urlencode(self.DATA), headers=self.HEADER_AJAX)
response = self.open(req)
return self.get_document(response)
def list_events(self, date_from, date_to=None, package=None, channels=None):
self.program_page.stay_or_go()
page_nb = 1
if package and channels:
self.set_package_settings(package, channels)
self._setup_session(SensCritiqueAjaxProfile())
while True:
self.DATA['page'] = '%d' % page_nb
page = self.ajax_page.open(data=urllib.urlencode(self.DATA))
nb_events = page.count_events()
events = page.list_events(date_from=date_from, date_to=date_to)
for event in events:
yield event
if nb_events < self.LIMIT or page_nb >= self.LIMIT_NB_PAGES:
break
page_nb += 1
def get_event(self, _id, event=None):
if not event:
self.home()
page = 1
self.program_page.stay_or_go()
page_nb = 1
self._setup_session(SensCritiqueAjaxProfile())
while True:
self.DATA['page'] = '%d' % page
self.page.document = self.get_ajax_content()
event = self.page.find_event(_id)
nb_events = self.page.count_events()
self.DATA['page'] = '%d' % page_nb
page = self.ajax_page.open(data=urllib.urlencode(self.DATA))
event = page.list_events(_id=_id)
nb_events = page.count_events()
if event or nb_events < self.LIMIT or page >= self.LIMIT_NB_PAGES:
break
page += 1
if event:
url = SensCritiquenCalendarEvent.id2url(_id)
self.location(url)
assert self.is_on_page(EventPage)
return self.page.get_event(url, event)
if not isinstance(event, SensCritiquenCalendarEvent):
event = event.next()
def get_resume(self, url, _id):
event.resume = self.get_resume(_id)
self._setup_session(Firefox())
event = self.event_page.go(_id=_id).get_event(obj=event)
return event
def get_resume(self, _id):
    """Fetch a film's storyline text from the JSON endpoint.

    :param _id: url-like identifier; the product id used by the JSON
        endpoint is extracted as the part before the trailing
        ``/<slug>.json`` segment — assumes ids follow that shape,
        TODO confirm against the links produced by AjaxPage.
    :return: whatever ``JsonResumePage.get_resume`` yields (the
        storyline data on success).
    """
    self._setup_session(SensCritiqueJsonProfile())
    # Extract the product id from the identifier; DOTALL so ids
    # containing newlines (scraped hrefs) still match.
    re_id = re.compile(r'/(.*)/(.*?).json', re.DOTALL)
    a_id = re_id.search(_id).group(1)
    # Removed leftover debug output (`print a_id`) — it was a
    # Python-2-only statement and polluted stdout.
    return self.json_page.go(_id=a_id).get_resume()
# return "get resume"
"""
self.HEADER_RESUME['Referer'] = url
req = urllib2.Request('http://www.senscritique.com/sc/products/storyline/%s.json' % _id,
headers=self.HEADER_RESUME)
@ -131,3 +154,4 @@ class SenscritiqueBrowser(BaseBrowser):
result = simplejson.loads(response.read(), self.ENCODING)
if result['json']['success']:
return result['json']['data']
"""

View file

@ -22,13 +22,10 @@ from weboob.capabilities.calendar import BaseCalendarEvent, TRANSP, STATUS, CATE
class SensCritiquenCalendarEvent(BaseCalendarEvent):
def __init__(self, _id):
BaseCalendarEvent.__init__(self, _id)
def __init__(self):
BaseCalendarEvent.__init__(self)
self.sequence = 1
self.transp = TRANSP.TRANSPARENT
self.status = STATUS.CONFIRMED
self.category = CATEGORIES.TELE
@classmethod
def id2url(cls, _id):
return 'http://www.senscritique.com%s' % _id
self.resume = None

View file

@ -23,11 +23,14 @@ from .calendar import SensCritiquenCalendarEvent
from datetime import date, datetime, time
__all__ = ['ProgramPage']
from weboob.tools.browser2.page import HTMLPage, method, ItemElement, SkipItem, ListElement, JsonPage
from weboob.tools.browser2.filters import Filter, Link, CleanText, Env, Attr, Regexp
class ProgramPage(BasePage):
__all__ = ['AjaxPage', 'EventPage', 'JsonResumePage']
class AjaxPage(HTMLPage):
CHANNELS_PARAM = {
'einst-3 elgr-data-logo': u'Action',
@ -57,106 +60,124 @@ class ProgramPage(BasePage):
'einst-4055 elgr-data-logo': u'Paramount Channel',
}
def find_event(self, _id):
a = self.document.getroot().xpath("//a[@href='%s']" % _id, method='xpath')
if a:
event_date = self.get_event_date(a[0])
return self.create_event(a[0], event_date)
def count_events(self):
return len(self.document.getroot().xpath("//a"))
return len(self.doc.xpath("//a"))
def list_events(self, date_from, date_to=None):
for a in self.document.getroot().xpath("//a"):
event_date = self.get_event_date(a)
if self.is_valid_event(date_from, date_to, event_date):
yield self.create_event(a, event_date)
@method
class list_events(ListElement):
item_xpath = '//a'
def create_event(self, a, event_date):
event = SensCritiquenCalendarEvent(a.attrib['href'])
title = self.parser.select(a, "div/img", 1, method='xpath').attrib['alt'].replace('Affiche ', '')
channel_info = self.parser.select(a, "div/div[@class='elgr-data-channel']", method='xpath')
if channel_info:
channel = channel_info[0].text.strip()
else:
channel_info = self.parser.select(a,
'div[@class="elgr-product-data"]/span',
1,
method='xpath').attrib['class']
channel = self.CHANNELS_PARAM.get(channel_info)
event.summary = u'%s - %s' % (title, channel)
class item(ItemElement):
klass = SensCritiquenCalendarEvent
event.start_date = event_date
event.end_date = datetime.combine(event_date.date(), time.max)
return event
def is_valid_event(self, date_from, date_to, event_date):
if event_date >= date_from:
if not date_to:
def condition(self):
if '_id' in self.env and self.env['_id']:
return Regexp(Link('.'), '/film/(.*)')(self) == self.env['_id']
return True
else:
if event_date < date_to:
def validate(self, obj):
if 'date_from' in self.env and self.env['date_from'] and obj.start_date > self.env['date_from']:
if not self.env['date_to']:
return True
else:
if obj.end_date < self.env['date_to']:
return True
if '_id' in self.env:
return True
return False
def get_event_date(self, a):
div_date = self.parser.select(a, "div/div[@class='elgr-data-diffusion']", 1, method='xpath')
_date = self.parse_start_date(div_date)
return False
str_time = self.parser.select(div_date, "time", 1, method='xpath').attrib['datetime'][:-6]
_time = datetime.strptime(str_time, '%H:%M:%S')
class Date(Filter):
def filter(self, el):
spans_date = el[0].xpath("span[@class='d-date']")
_date = date.today()
if len(spans_date) == 2:
day_number = int(spans_date[1].text)
return datetime.combine(_date, _time.time())
month = _date.month
year = _date.year
if day_number < _date.day:
month = _date.month + 1
if _date.month == 12:
year = _date.year + 1
def parse_start_date(self, div_date):
spans_date = self.parser.select(div_date, "span[@class='d-date']", method='xpath')
_date = date(day=day_number, month=month, year=year)
_date = date.today()
if len(spans_date) == 2:
day_number = int(spans_date[1].text)
str_time = el[0].xpath("time")[0].attrib['datetime'][:-6]
_time = datetime.strptime(str_time, '%H:%M:%S')
month = _date.month
year = _date.year
if day_number < _date.day:
month = _date.month + 1
if _date.month == 12:
year = _date.year + 1
return datetime.combine(_date, _time.time())
_date = date(day=day_number, month=month, year=year)
class CombineDate(Filter):
def filter(self, _date):
return datetime.combine(_date, time.max)
return _date
class Summary(Filter):
def filter(self, el):
title = el[0].xpath("div/img")[0].attrib['alt'].replace('Affiche ', '')
channel_info = el[0].xpath("div/div[@class='elgr-data-channel']")
if channel_info:
channel = channel_info[0].text.strip()
else:
channel_info = el[0].xpath('div[@class="elgr-product-data"]/span')[0].attrib['class']
channel = self.page.CHANNELS_PARAM.get(channel_info)
return u'%s - %s' % (title, channel)
obj_id = Regexp(Link('.'), '/film/(.*)')
obj_start_date = Date('div/div[@class="elgr-data-diffusion"]')
obj_end_date = CombineDate(obj_start_date)
obj_summary = CleanText(Summary('.'))
class EventPage(BasePage):
def get_event(self, url, event):
class Description(Filter):
def filter(self, el):
header = el[0].xpath("//div[@class='pvi-hero-product']")[0]
event.url = url
title = header.xpath("div[@class='d-rubric-inner']/h1")[0].text.strip()
year = header.xpath("div[@class='d-rubric-inner']/small")[0].text.strip()
header = self.document.getroot().xpath("//div[@class='pvi-hero-product']")[0]
title = self.parser.select(header, "div[@class='d-rubric-inner']/h1", 1, method='xpath').text.strip()
year = self.parser.select(header, "div[@class='d-rubric-inner']/small", 1, method='xpath').text.strip()
_infos = self.parser.select(header, "ul[@class='pvi-product-specs']/li", method='xpath')
_infos = header.xpath("ul[@class='pvi-product-specs']/li")
infos = ''
for li in _infos:
infos += u'- %s\n' % self.parser.tocleanstring(li)
infos += u'- %s\n' % CleanText(li)(self)
section = self.document.getroot().xpath("//section[@class='pvi-productDetails']")[0]
_infos = self.parser.select(section, "ul/li", method='xpath')
section = "//section[@class='pvi-productDetails']"
_infos = el[0].xpath("%s/ul/li" % section)
for li in _infos:
infos += u'- %s\n' % self.parser.tocleanstring(li)
infos += u'- %s\n' % CleanText(li)(self)
_resume = self.parser.select(section, "p[@data-rel='full-resume']", method='xpath')
return u'%s %s\n\n%s\n\n' % (title, year, infos)
class Resume(Filter):
def filter(self, el):
_resume = el[0].xpath("p[@data-rel='full-resume']")
if not _resume:
_resume = self.parser.select(section, "p[@data-rel='small-resume']", method='xpath')
_resume = el[0].xpath("p[@data-rel='small-resume']")
if _resume:
resume = html2text(self.parser.tostring(_resume[0]))
else:
resume = ""
else:
_id = self.parser.select(_resume[0], 'button', 1, method='xpath').attrib['data-sc-product-id']
resume = self.browser.get_resume(url, _id)
resume = html2text(CleanText(_resume[0])(self))[6:]
return resume
event.description = u'%s %s\n\n%s\n\n%s' % (title, year, infos, resume)
return event
class EventPage(HTMLPage):
@method
class get_event(ItemElement):
klass = SensCritiquenCalendarEvent
def parse(self, el):
event = self.obj
event.url = self.page.url
resume = Resume('//section[@class="pvi-productDetails"]')(self)
if not resume:
resume = self.obj.resume
description = Description('.')(self)
event.description = u'%s%s' % (description, resume)
return event
class JsonResumePage(JsonPage):
    """JSON page wrapping a film's storyline (resume) payload."""

    def get_resume(self):
        """Return the storyline data, or None when the call failed.

        The endpoint wraps its payload as
        ``{'json': {'success': <bool>, 'data': <storyline>}}``;
        only the ``data`` field is returned, and only on success.
        """
        # Leftover debug output (`print self.doc`) removed: it was a
        # Python-2-only statement and wrote the whole payload to stdout.
        if self.doc['json']['success']:
            return self.doc['json']['data']

View file

@ -21,6 +21,7 @@
from weboob.tools.test import BackendTest
from datetime import datetime
class SenscritiqueTest(BackendTest):
BACKEND = 'senscritique'