rewrite transilien to use lxmlparser (closes #271)

This commit is contained in:
Romain Bignon 2011-06-28 14:07:14 +02:00
commit 729b5e9c8f
2 changed files with 38 additions and 124 deletions

View file

@ -18,115 +18,17 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from datetime import datetime, date, time
import HTMLParser
from weboob.tools.browser import BaseBrowser
from weboob.tools.misc import to_unicode
from .pages.route import RoutePage
class Route(object):
    """One timetable row: mission code, passing time, destination, platform."""

    def __init__(self, code_mission, time, destination, platform):
        # The four values are stored exactly as received; nothing is
        # converted or validated here.
        (self.code_mission,
         self.time,
         self.destination,
         self.platform) = (code_mission, time, destination, platform)

    def __repr__(self):
        """Return ``<Route code_mission time destination platform>``."""
        fields = (self.code_mission, self.time,
                  self.destination, self.platform)
        return "<Route %s %s %s %s>" % fields
class Parser(HTMLParser.HTMLParser):
    """Event-driven parser for the HTML tables holding the timetable.

    Once a ``<table class="horaires3">`` has been opened, the text found in
    ``<td>`` cells is collected according to their ``headers`` attribute:
    ``Code_de_mission`` (only the text inside an ``<a>``),
    ``Heure_de_passage``, ``Destination`` and ``Voie``.  The collected values
    are kept in four parallel lists and exposed as ``Route`` objects through
    the ``list_route`` property.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # State flags: where the parser currently is in the document.
        self.__table_horaires3 = False
        self.__code_de_mission = False
        self.__a_code_de_mission = False
        self.__time = False
        self.__destination = False
        self.__platform = False
        # Collected cell texts, one list per column.
        self.__liste_train = []
        self.__liste_horaire = []
        self.__liste_destination = []
        self.__liste_platform = []

    @classmethod
    def parse(cls, data, encoding):
        """Feed the whole content of *data* to a new parser and return it.

        *data* must provide a ``read()`` method.  *encoding* is accepted to
        keep the call signature but is not used.
        """
        parser = cls()
        parser.feed(data.read())
        return parser

    def handle_starttag(self, tag, attrs):
        """Update the state flags; called for every opening tag."""
        attributes = dict(attrs)
        # Use .get(): a <table> without a "class" attribute must simply not
        # match instead of aborting the whole parse with a KeyError.
        if tag == 'table' and attributes.get('class') == 'horaires3':
            self.__table_horaires3 = True
        elif self.__table_horaires3 and tag == 'td':
            # A cell without a "headers" attribute leaves every flag as is.
            if 'headers' in attributes:
                headers = attributes['headers']
                self.__code_de_mission = (headers == 'Code_de_mission')
                self.__time = (headers == 'Heure_de_passage')
                self.__destination = (headers == 'Destination')
                self.__platform = (headers == 'Voie')
        else:
            # Only the text of an <a> located in a mission-code cell is kept.
            self.__a_code_de_mission = (tag == 'a' and self.__code_de_mission)

    def handle_data(self, data):
        """Store the stripped text in the list(s) selected by the flags."""
        text = data.strip()
        if self.__a_code_de_mission:
            self.__liste_train.append(text)
        # A lone '*' in the time cell is not a time and is skipped.
        if self.__time and text != '*':
            self.__liste_horaire.append(text)
        if self.__destination:
            self.__liste_destination.append(text)
        if self.__platform:
            self.__liste_platform.append(text)

    def handle_endtag(self, tag):
        """Clear the flags tied to the element being closed."""
        if tag == 'a':
            self.__a_code_de_mission = False
        if tag == 'td':
            self.__time = False
            self.__destination = False
            self.__platform = False

    @property
    def list_route(self):
        """Build one ``Route`` per collected mission code.

        The n-th mission code is paired with the n-th time, destination and
        platform; an ``IndexError`` is raised if one of those lists is
        shorter than the list of mission codes.
        """
        routes = []
        for index, code in enumerate(self.__liste_train):
            routes.append(Route(
                code_mission=code,
                time=self.__liste_horaire[index],
                destination=self.__liste_destination[index],
                platform=self.__liste_platform[index],
            ))
        return routes
class Transilien(BaseBrowser):
DOMAIN = 'www.transilien.com'
PAGES = {'https://www\.transilien\.com/web/ITProchainsTrainsAvecDest\.do\?.*': RoutePage,
'https://www\.transilien\.com/web/ITProchainsTrains\.do\?.*': RoutePage
}
PROTOCOL = 'https'
USER_AGENT = BaseBrowser.USER_AGENTS['microb']
def __init__(self, **kwargs):
kwargs['parser'] = Parser
BaseBrowser.__init__(self, '', **kwargs)
PAGES = {'https://www\.transilien\.com/web/ITProchainsTrainsAvecDest\.do\?.*': RoutePage,
'https://www\.transilien\.com/web/ITProchainsTrains\.do\?.*': RoutePage,
}
def iter_station_search(self, pattern):
pass
@ -136,27 +38,8 @@ class Transilien(BaseBrowser):
self.location('https://www.transilien.com/web/ITProchainsTrainsAvecDest.do?codeTr3aDepart=%s&codeTr3aDest=%s&urlModule=/site/pid/184&gareAcc=true' % (station_id, arrival_id))
else:
self.location('https://www.transilien.com/web/ITProchainsTrains.do?tr3a=%s&urlModule=/site/pid/184' % station_id)
for route in self.page.document.list_route:
_late_reason = None
try :
_time = datetime.combine(date.today(), time(*[int(x) for x in route.time.split(':')]))
except ValueError:
_time = None
_late_reason = route.time
else:
yield {'type': to_unicode(route.code_mission),
'time': _time,
'departure': to_unicode(station_id),
'arrival': to_unicode(route.destination),
'late': time(),
'late_reason': _late_reason,
'plateform': to_unicode(route.platform)}
def home(self):
pass
def login(self):
pass
return self.page.iter_routes()
def is_logged(self):
""" Do not need to be logged """

View file

@ -17,9 +17,40 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import datetime
from weboob.tools.browser import BasePage
from weboob.tools.misc import to_unicode
from weboob.tools.browser import BasePage, BrokenPageError
class StationNotFound(Exception):
    """Raised when the timetable of the requested station cannot be found."""
class RoutePage(BasePage):
    """Page listing the next trains leaving a station."""

    def on_loaded(self):
        """Nothing has to be done once the page is loaded."""
        return

    def _cell_text(self, row, selector):
        """Return the stripped text of the element of *row* matching *selector*.

        ``.text`` is ``None`` for an element that has no leading text
        (lxml/ElementTree semantics); that case is mapped to an empty string
        so that an empty cell does not raise ``AttributeError``.
        """
        return (self.parser.select(row, selector, 1).text or '').strip()

    def iter_routes(self):
        """Yield one dict per train row of the ``table.horaires3`` table.

        Each dict has the keys ``type``, ``time``, ``departure``, ``arrival``,
        ``late``, ``late_reason`` and ``plateform``.  ``time`` is today's date
        combined with the time read from the row.  When the time cell cannot
        be parsed as colon-separated integers, ``time`` is ``None`` and the
        raw cell text is reported in ``late_reason`` instead.

        Raises ``StationNotFound`` when the timetable table is not found in
        the page.
        """
        try:
            table = self.parser.select(self.document.getroot(), 'table.horaires3', 1)
        except BrokenPageError:
            raise StationNotFound('Station not found')

        departure = self.parser.select(table, 'td.caption strong', 1).text
        for tr in table.findall('tr'):
            # Only rows made of exactly four cells describe a train.
            if len(tr.findall('td')) != 4:
                continue

            code_mission = self.parser.select(tr, 'td[headers=Code_de_mission] a', 1).text or ''
            code_mission = code_mission.strip()
            raw_time = self._cell_text(tr, 'td[headers=Heure_de_passage]')
            destination = self._cell_text(tr, 'td[headers=Destination]')
            plateform = self._cell_text(tr, 'td[headers=Voie]')

            time = None
            late_reason = None
            try:
                parts = [int(x) for x in raw_time.split(':')]
                time = datetime.datetime.combine(datetime.date.today(),
                                                 datetime.time(*parts))
            except (ValueError, TypeError):
                # Not a usable time: never leak the raw string through
                # 'time'; report it as the reason for the delay instead.
                # TypeError covers a cell with too many ':'-separated fields.
                late_reason = to_unicode(raw_time)
                self.logger.warning('Unable to parse datetime')

            yield {'type': to_unicode(code_mission),
                   'time': time,
                   'departure': to_unicode(departure),
                   'arrival': to_unicode(destination),
                   'late': datetime.time(),
                   'late_reason': late_reason,
                   'plateform': to_unicode(plateform)}