From 729b5e9c8fe7618488a80da93ce5de14cdf92663 Mon Sep 17 00:00:00 2001 From: Romain Bignon Date: Tue, 28 Jun 2011 14:07:14 +0200 Subject: [PATCH] rewrite transilien to use lxmlparser (closes #271) --- weboob/backends/transilien/browser.py | 125 +--------------------- weboob/backends/transilien/pages/route.py | 37 ++++++- 2 files changed, 38 insertions(+), 124 deletions(-) diff --git a/weboob/backends/transilien/browser.py b/weboob/backends/transilien/browser.py index b49f4a01..6ec1dfcd 100644 --- a/weboob/backends/transilien/browser.py +++ b/weboob/backends/transilien/browser.py @@ -18,115 +18,17 @@ # along with weboob. If not, see <http://www.gnu.org/licenses/>. -from datetime import datetime, date, time -import HTMLParser - from weboob.tools.browser import BaseBrowser -from weboob.tools.misc import to_unicode from .pages.route import RoutePage -class Route(object): - "une ligne code_mission | time" - def __init__(self, code_mission, time, destination, platform): - self.code_mission = code_mission - self.time = time - self.destination = destination - self.platform = platform - - def __repr__(self): - return "" % (self.code_mission, - self.time, self.destination, self.platform) - -class Parser(HTMLParser.HTMLParser): - "Parse les tableaux html contenant les horaires" - def __init__(self): - HTMLParser.HTMLParser.__init__(self) - self.__table_horaires3 = False - self.__code_de_mission = False - self.__a_code_de_mission = False - self.__time = False - self.__destination = False - self.__platform = False - self.__liste_train = [] - self.__liste_horaire = [] - self.__liste_destination = [] - self.__liste_platform = [] - - @classmethod - def parse(cls, data, encoding): - parser = cls() - parser.feed(data.read()) - return parser - - def handle_starttag(self, tag, attrs): - "execute a chaque balise ouvrante" - if (tag == 'table' and (dict(attrs)['class'] == 'horaires3')): - self.__table_horaires3 = True - - elif self.__table_horaires3 and tag == 'td': - try: - self.__code_de_mission = ( - 
dict(attrs)['headers'] == 'Code_de_mission') - self.__time = ( - dict(attrs)['headers'] == 'Heure_de_passage') - self.__destination = ( - dict(attrs)['headers'] == 'Destination') - self.__platform = ( - dict(attrs)['headers'] == 'Voie') - except KeyError: - if dict(attrs).has_key('headers'): - raise - else: - pass - else: - self.__a_code_de_mission = (tag == 'a' and self.__code_de_mission) - - def handle_data(self, data): - "execute pour chaque contenu de balise" - if self.__a_code_de_mission: - self.__liste_train.append(data.strip()) - if self.__time and data.strip() != '*': - self.__liste_horaire.append(data.strip()) - if self.__destination: - self.__liste_destination.append(data.strip()) - if self.__platform: - self.__liste_platform.append(data.strip()) - - def handle_endtag(self, tag): - "execute à chaque balise fermante" - self.__a_code_de_mission ^= (self.__a_code_de_mission and tag == 'a') - self.__time ^= (self.__time and tag == 'td') - self.__destination ^= (self.__destination and tag == 'td') - self.__platform ^= (self.__platform and tag == 'td') - - - @property - def list_route(self): - "getter" - __list_route = [] - __curseur_horaire = 0 - for __i in self.__liste_train: - __list_route.append(Route( - code_mission=__i, - time=self.__liste_horaire[__curseur_horaire], - destination=self.__liste_destination[__curseur_horaire], - platform=self.__liste_platform[__curseur_horaire] - )) - __curseur_horaire += 1 - return __list_route - class Transilien(BaseBrowser): DOMAIN = 'www.transilien.com' - PAGES = {'https://www\.transilien\.com/web/ITProchainsTrainsAvecDest\.do\?.*': RoutePage, - 'https://www\.transilien\.com/web/ITProchainsTrains\.do\?.*': RoutePage - } PROTOCOL = 'https' USER_AGENT = BaseBrowser.USER_AGENTS['microb'] - - def __init__(self, **kwargs): - kwargs['parser'] = Parser - BaseBrowser.__init__(self, '', **kwargs) + PAGES = {'https://www\.transilien\.com/web/ITProchainsTrainsAvecDest\.do\?.*': RoutePage, + 
'https://www\.transilien\.com/web/ITProchainsTrains\.do\?.*': RoutePage, + } def iter_station_search(self, pattern): pass @@ -136,27 +38,8 @@ class Transilien(BaseBrowser): self.location('https://www.transilien.com/web/ITProchainsTrainsAvecDest.do?codeTr3aDepart=%s&codeTr3aDest=%s&urlModule=/site/pid/184&gareAcc=true' % (station_id, arrival_id)) else: self.location('https://www.transilien.com/web/ITProchainsTrains.do?tr3a=%s&urlModule=/site/pid/184' % station_id) - for route in self.page.document.list_route: - _late_reason = None - try : - _time = datetime.combine(date.today(), time(*[int(x) for x in route.time.split(':')])) - except ValueError: - _time = None - _late_reason = route.time - else: - yield {'type': to_unicode(route.code_mission), - 'time': _time, - 'departure': to_unicode(station_id), - 'arrival': to_unicode(route.destination), - 'late': time(), - 'late_reason': _late_reason, - 'plateform': to_unicode(route.platform)} - def home(self): - pass - - def login(self): - pass + return self.page.iter_routes() def is_logged(self): """ Do not need to be logged """ diff --git a/weboob/backends/transilien/pages/route.py b/weboob/backends/transilien/pages/route.py index 075238fc..f069c688 100644 --- a/weboob/backends/transilien/pages/route.py +++ b/weboob/backends/transilien/pages/route.py @@ -17,9 +17,40 @@ # You should have received a copy of the GNU Affero General Public License # along with weboob. If not, see <http://www.gnu.org/licenses/>. 
+import datetime -from weboob.tools.browser import BasePage +from weboob.tools.misc import to_unicode +from weboob.tools.browser import BasePage, BrokenPageError + +class StationNotFound(Exception): + pass class RoutePage(BasePage): - def on_loaded(self): - return + def iter_routes(self): + try: + table = self.parser.select(self.document.getroot(), 'table.horaires3', 1) + except BrokenPageError: + raise StationNotFound('Station not found') + + departure = self.parser.select(table, 'td.caption strong', 1).text + for tr in table.findall('tr'): + if len(tr.findall('td')) != 4: + continue + + code_mission = self.parser.select(tr, 'td[headers=Code_de_mission] a', 1).text.strip() + time = self.parser.select(tr, 'td[headers=Heure_de_passage]', 1).text.strip() + destination = self.parser.select(tr, 'td[headers=Destination]', 1).text.strip() + plateform = self.parser.select(tr, 'td[headers=Voie]', 1).text.strip() + + try : + time = datetime.datetime.combine(datetime.date.today(), datetime.time(*[int(x) for x in time.split(':')])) + except ValueError: + self.logger.warning('Unable to parse datetime') + + yield {'type': to_unicode(code_mission), + 'time': time, + 'departure': to_unicode(departure), + 'arrival': to_unicode(destination), + 'late': datetime.time(), + 'late_reason': None, + 'plateform': to_unicode(plateform)}