[senscritique] site changed (we cannot choose channels packages anymore)

This commit is contained in:
Bezleputh 2015-09-02 12:00:58 +02:00
commit 7094d65d9c
3 changed files with 73 additions and 223 deletions

View file

@ -17,137 +17,52 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.capabilities.base import UserError
from weboob.browser import PagesBrowser, URL from weboob.browser import PagesBrowser, URL
from .pages import FilmsPage, EventPage, JsonResumePage
from weboob.browser.profiles import Firefox from weboob.browser.profiles import Firefox
from .pages import AjaxPage, EventPage, JsonResumePage, SettingsPage
import re
from lxml.etree import XMLSyntaxError
__all__ = ['SenscritiqueBrowser'] __all__ = ['SenscritiqueBrowser']
class SenscritiqueBrowser(PagesBrowser): class SenscritiqueBrowser(PagesBrowser):
def set_ajax_header(self): BASEURL = 'http://www.senscritique.com'
self.session.headers.update({"User-Agent": "Mozilla/5.0 (Windows; U; Windows "
"NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8" films_page = URL('/everymovie/programme-tv/chrono', FilmsPage)
" GTB7.1 (.NET CLR 3.5.30729)", event_page = URL('/film/(?P<_id>.*)', EventPage)
"Accept": "text/html, */*; q=0.01", json_page = URL('/sc/products/storyline/(?P<_id>.*).json', JsonResumePage)
"X-Requested-With": "XMLHttpRequest",
"Referer": "http://www.senscritique.com/sc/tv_guides",
"Origin": "http://www.senscritique.com",
"Accept-Language": "fr-fr;q=0.667",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
})
def set_json_header(self): def set_json_header(self):
self.session.headers.update({"User-Agent": "Mozilla/5.0 (Windows; U; Windows " self.session.headers.update({"User-Agent": "Mozilla/5.0 (Windows; U; Windows "
"NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8" "NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8"
" GTB7.1 (.NET CLR 3.5.30729)", " GTB7.1 (.NET CLR 3.5.30729)",
"Accept": "application/json, text/javascript, */*; q=0.01", "Accept": "application/json, text/javascript, */*; q=0.01",
"X-Requested-With": "XMLHttpRequest", "X-Requested-With": "XMLHttpRequest",
}) })
ENCODING = 'utf-8' def list_events(self, date_from, date_to=None):
CHANNELS = None return self.films_page.go().iter_films(date_from=date_from, date_to=date_to)
BASEURL = 'http://www.senscritique.com'
program_page = URL('/sc/tv_guides') def get_event(self, _id, event=None):
ajax_page = URL('/sc/tv_guides/gridContent.ajax', AjaxPage)
event_page = URL('/film/(?P<_id>.*)', EventPage)
json_page = URL('/sc/products/storyline/(?P<_id>.*).json', JsonResumePage)
setting_page = URL('/sc/tv_guides/settings.ajax', SettingsPage)
LIMIT = 25 # number of results returned for each ajax call (defined in the website).
LIMIT_NB_PAGES = 10 # arbitrary limit to avoid infinitive loop that can occurs if total number of films is a multiple of LIMIT (in website it causes an infinite scroll)
DATA = {'order': 'chrono',
'without_product_done': '0',
'period': 'cette-semaine',
'limit': '%d' % LIMIT,
}
def get_channels(self):
if not self.CHANNELS:
self.CHANNELS = list(self.setting_page.go().get_channels())
return self.CHANNELS
def get_selected_channels(self, package, general=False, cinema=False):
for channel in self.get_channels():
if (package == 0 or u'%s' % package in channel._networks) and\
((general and channel._thema in ('1', '2')) or (cinema and channel._thema == '3')):
yield channel.id
def set_package_settings(self, package, channels):
url = 'http://www.senscritique.com/sc/tv_guides/saveSettings.json'
# do not use a dict because there are several same keys
params = "network=%s" % package
params += ''.join(["&channels%%5B%%5D=%s" % (channel) for channel in channels])
self.open(url, data=params)
def list_events(self, date_from, date_to=None, package=None, channels=None):
self.set_profile(Firefox())
self.program_page.go()
page_nb = 1
self.set_ajax_header()
if package and channels:
self.set_package_settings(package, channels)
while True:
try:
self.DATA['page'] = '%d' % page_nb
page = self.ajax_page.open(data=self.DATA)
nb_events = page.count_events()
events = page.list_events(date_from=date_from, date_to=date_to)
for event in events:
yield event
except XMLSyntaxError:
break
if nb_events < self.LIMIT or page_nb >= self.LIMIT_NB_PAGES:
break
page_nb += 1
def get_event(self, _id, event=None, package=None, channels=None):
if not event: if not event:
self.set_profile(Firefox()) try:
self.program_page.go() event = self.films_page.go().iter_films(_id=_id).next()
page_nb = 1 except StopIteration:
raise UserError('This event (%s) does not exists' % _id)
self.set_ajax_header() film_id = _id.split('#')[0]
if package and channels: event = self.event_page.go(_id=film_id).get_event(obj=event)
self.set_package_settings(package, channels)
while True: resume = self.get_resume(film_id)
self.DATA['page'] = '%d' % page_nb if resume:
page = self.ajax_page.open(data=self.DATA) event.description += resume
try:
event = page.list_events(_id=_id).next()
except StopIteration:
event = None
nb_events = page.count_events() return event
if event or nb_events < self.LIMIT or page_nb >= self.LIMIT_NB_PAGES:
break
page_nb += 1 def get_resume(self, film_id):
if event:
_id = _id.split('#')[0]
self.set_profile(Firefox())
event = self.event_page.go(_id=_id).get_event(obj=event)
resume = self.get_resume(_id)
if resume:
event.description += self.get_resume(_id)
return event
def get_resume(self, _id):
self.set_json_header() self.set_json_header()
re_id = re.compile('^/?.*/(.*)', re.DOTALL) _id = film_id.split('/')[-1]
_id = re_id.search(_id).group(1) resume = self.json_page.go(_id=_id).get_resume()
return self.json_page.go(_id=_id).get_resume() self.set_profile(Firefox())
return resume

View file

@ -17,9 +17,7 @@
# You should have received a copy of the GNU Affero General Public License # You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>. # along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.tools.backend import Module, BackendConfig from weboob.tools.backend import Module
from weboob.tools.ordereddict import OrderedDict
from weboob.tools.value import Value, ValueBool
from weboob.capabilities.calendar import CapCalendarEvent, CATEGORIES from weboob.capabilities.calendar import CapCalendarEvent, CATEGORIES
from .browser import SenscritiqueBrowser from .browser import SenscritiqueBrowser
@ -38,49 +36,21 @@ class SenscritiqueModule(Module, CapCalendarEvent):
ASSOCIATED_CATEGORIES = [CATEGORIES.TELE] ASSOCIATED_CATEGORIES = [CATEGORIES.TELE]
BROWSER = SenscritiqueBrowser BROWSER = SenscritiqueBrowser
tv_settings_choices = OrderedDict([(k, u'%s' % (v)) for k, v in sorted({
'000000': u'-- Indifférent --',
'9': u'TNT',
'1': u'Canalsat',
'2': u'Numericable',
'10': u'Orange',
'11': u'Free',
'12': u'SFR',
'15': u'Darty box via ADSL',
'16': u'Bouygues',
}.iteritems())])
CONFIG = BackendConfig(Value('tv_settings', label=u'T.V. package', choices=tv_settings_choices),
ValueBool('general', label='General', default=True),
ValueBool('cinema', label='Cinema', default=False),
)
def get_package_and_channels(self):
package = int(self.config['tv_settings'].get())
channels = self.browser.get_selected_channels(package, self.config['general'].get(),
self.config['cinema'].get())
return package, channels
def search_events(self, query): def search_events(self, query):
if self.has_matching_categories(query): if self.has_matching_categories(query):
package, channels = self.get_package_and_channels() return self.list_events(query.start_date,
return self.browser.list_events(query.start_date, query.end_date)
query.end_date,
package,
channels)
def list_events(self, date_from, date_to=None): def list_events(self, date_from, date_to=None):
items = [] items = []
package, channels = self.get_package_and_channels() for item in self.browser.list_events(date_from, date_to):
for item in self.browser.list_events(date_from, date_to, package, channels):
items.append(item) items.append(item)
items.sort(key=lambda o: o.start_date) items.sort(key=lambda o: o.start_date)
return items return items
def get_event(self, _id, event=None): def get_event(self, _id, event=None):
package, channels = self.get_package_and_channels() return self.browser.get_event(_id, event)
return self.browser.get_event(_id, event, package=package, channels=channels)
def fill_obj(self, event, fields): def fill_obj(self, event, fields):
return self.get_event(event.id, event) return self.get_event(event.id, event)

View file

@ -20,29 +20,31 @@
from .calendar import SensCritiquenCalendarEvent from .calendar import SensCritiquenCalendarEvent
from datetime import date, datetime, timedelta from datetime import date, datetime, timedelta
from weboob.capabilities.base import empty, BaseObject from weboob.capabilities.base import empty
from weboob.browser.pages import HTMLPage, JsonPage from weboob.browser.pages import HTMLPage, JsonPage
from weboob.browser.elements import ItemElement, ListElement, method from weboob.browser.elements import ItemElement, ListElement, method
from weboob.browser.filters.standard import Filter, CleanText, Regexp, Join, Format, BrowserURL, Env from weboob.browser.filters.standard import Filter, CleanText, Regexp, Join, Format, BrowserURL, Env
from weboob.browser.filters.html import Link from weboob.browser.filters.html import Link
class Channel(Filter): class Description(Filter):
def filter(self, el):
header = "//div[@class='pvi-hero-product']"
section = "//section[@class='pvi-productDetails']"
return Format(u'%s %s\n\n%s%s\n\n',
CleanText("%s/div[@class='d-rubric-inner']/h1" % header),
CleanText("%s/div[@class='d-rubric-inner']/small" % header),
Join(u'- ', "%s/ul[@class='pvi-product-specs']/li" % header, newline=True),
Join(u'- ', "%s/ul/li" % section, newline=True, addBefore='- '))(el[0])
def __call__(self, item):
channels = item.page.browser.get_channels()
return self.filter(self.select(self.selector, item, key=self._key, obj=self._obj), channels)
def filter(self, el, channels): class FormatDate(Filter):
channel_info = el[0].xpath('div/div[@class="elgr-data-channel"]') def __init__(self, pattern, selector):
if channel_info: super(FormatDate, self).__init__(selector)
return CleanText('.', children=False)(channel_info[0]) self.pattern = pattern
else:
channel_id = Regexp(CleanText('div[@class="elgr-product-data"]/span/@class'), def filter(self, _date):
'einst-(.*) elgr-data-logo')(el[0]) return _date.strftime(self.pattern)
for channel in channels:
if channel_id == channel.id:
return channel._name
class Date(Filter): class Date(Filter):
@ -65,68 +67,10 @@ class Date(Filter):
return datetime.combine(_date, _time.time()) return datetime.combine(_date, _time.time())
class FormatDate(Filter): class JsonResumePage(JsonPage):
def __init__(self, pattern, selector): def get_resume(self):
super(FormatDate, self).__init__(selector) if self.doc['json']['success']:
self.pattern = pattern return self.doc['json']['data']
def filter(self, date):
return date.strftime(self.pattern)
class AjaxPage(HTMLPage):
def count_events(self):
return len(self.doc.xpath("//a"))
@method
class list_events(ListElement):
item_xpath = '//a'
ignore_duplicate = True
class item(ItemElement):
klass = SensCritiquenCalendarEvent
def condition(self):
if '_id' in self.env and self.env['_id']:
return Format(u'%s#%s#%s',
Regexp(Link('.'), '/film/(.*)'),
FormatDate("%Y%m%d%H%M", Date('div/div[@class="elgr-data-diffusion"]')),
CleanText(Channel('.'), replace=[(' ', '-')]))(self) == self.env['_id']
return True
def validate(self, obj):
if 'date_from' in self.env and self.env['date_from'] and obj.start_date > self.env['date_from']:
if not self.env['date_to']:
return True
else:
if empty(obj.end_date) or obj.end_date <= self.env['date_to']:
return True
if '_id' in self.env:
return True
return False
obj_id = Format(u'%s#%s#%s',
Regexp(Link('.'), '/film/(.*)'),
FormatDate("%Y%m%d%H%M", Date('div/div[@class="elgr-data-diffusion"]')),
CleanText(Channel('.'), replace=[(' ', '-')]))
obj_start_date = Date('div/div[@class="elgr-data-diffusion"]')
obj_summary = Format('%s - %s',
Regexp(CleanText('./div/img/@alt'), '^Affiche(.*)'),
Channel('.'))
class Description(Filter):
def filter(self, el):
header = "//div[@class='pvi-hero-product']"
section = "//section[@class='pvi-productDetails']"
return Format(u'%s %s\n\n%s%s\n\n',
CleanText("%s/div[@class='d-rubric-inner']/h1" % header),
CleanText("%s/div[@class='d-rubric-inner']/small" % header),
Join(u'- ', "%s/ul[@class='pvi-product-specs']/li" % header, newline=True),
Join(u'- ', "%s/ul/li" % section, newline=True, addBefore=' - '))(el[0])
class EventPage(HTMLPage): class EventPage(HTMLPage):
@ -138,24 +82,45 @@ class EventPage(HTMLPage):
obj_description = Description('.') obj_description = Description('.')
class JsonResumePage(JsonPage): class FilmsPage(HTMLPage):
def get_resume(self):
if self.doc['json']['success']:
return self.doc['json']['data']
class SettingsPage(HTMLPage):
@method @method
class get_channels(ListElement): class iter_films(ListElement):
item_xpath = '//li[@class="tse-channels-item hide"]' item_xpath = '//li[@class="elgr-mosaic "]/a'
class item(ItemElement): class item(ItemElement):
klass = BaseObject klass = SensCritiquenCalendarEvent
obj_id = CleanText('./@data-sc-channel-id') def condition(self):
if '_id' in self.env and self.env['_id']:
return Format(u'%s#%s#%s',
Regexp(Link('.'), '/film/(.*)'),
FormatDate("%Y%m%d%H%M",
Date('div/div[@class="elgr-data-diffusion"]')),
CleanText('./div/span[@class="d-offset"]',
replace=[(' ', '-')]))(self) == self.env['_id']
return True
def obj__networks(self): def validate(self, obj):
return CleanText('./@data-sc-networks')(self).split(',') if 'date_from' in self.env and self.env['date_from'] and obj.start_date > self.env['date_from']:
if not self.env['date_to']:
return True
else:
if empty(obj.end_date):
if obj.start_date < self.env['date_to']:
return True
elif obj.end_date <= self.env['date_to']:
return True
obj__thema = CleanText('./@data-sc-thema-id') if '_id' in self.env:
obj__name = CleanText('./label') return True
return False
obj_id = Format(u'%s#%s#%s',
Regexp(Link('.'), '/film/(.*)'),
FormatDate("%Y%m%d%H%M", Date('div/div[@class="elgr-data-diffusion"]')),
CleanText('./div/span[@class="d-offset"]', replace=[(' ', '-')]))
obj_start_date = Date('div/div[@class="elgr-data-diffusion"]')
obj_summary = Format('%s - %s',
Regexp(CleanText('./div/img/@alt'), '^Affiche(.*)'),
CleanText('./div/span[@class="d-offset"]'))