rewrite transilien to use lxmlparser (closes #271)

2011-06-28 14:07:14 +02:00 · 2011-06-28 14:07:14 +02:00 · 729b5e9c8f
commit 729b5e9c8f
parent 090fb38feb
2 changed files with 38 additions and 124 deletions
--- a/weboob/backends/transilien/browser.py
+++ b/weboob/backends/transilien/browser.py
@@ -18,115 +18,17 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.


-from datetime import datetime, date, time
-import HTMLParser
-
 from weboob.tools.browser import BaseBrowser
-from weboob.tools.misc import to_unicode

 from .pages.route import RoutePage

-class Route(object):
-    "une ligne code_mission | time"
-    def __init__(self, code_mission, time, destination, platform):
-        self.code_mission = code_mission
-        self.time = time
-        self.destination = destination
-        self.platform = platform
-
-    def __repr__(self):
-        return "<Route %s %s %s %s>" % (self.code_mission,
-            self.time, self.destination, self.platform)
-
-class Parser(HTMLParser.HTMLParser):
-    "Parse les tableaux html contenant les horaires"
-    def __init__(self):
-        HTMLParser.HTMLParser.__init__(self)
-        self.__table_horaires3 = False
-        self.__code_de_mission = False
-        self.__a_code_de_mission = False
-        self.__time = False
-        self.__destination = False
-        self.__platform = False
-        self.__liste_train = []
-        self.__liste_horaire = []
-        self.__liste_destination = []
-        self.__liste_platform = []
-
-    @classmethod
-    def parse(cls, data, encoding):
-        parser = cls()
-        parser.feed(data.read())
-        return parser
-
-    def handle_starttag(self, tag, attrs):
-        "execute a chaque balise ouvrante"
-        if (tag == 'table' and (dict(attrs)['class'] == 'horaires3')):
-            self.__table_horaires3 = True
-
-        elif self.__table_horaires3 and tag == 'td':
-            try:
-                self.__code_de_mission = (
-                    dict(attrs)['headers'] == 'Code_de_mission')
-                self.__time = (
-                    dict(attrs)['headers'] == 'Heure_de_passage')
-                self.__destination = (
-                    dict(attrs)['headers'] == 'Destination')
-                self.__platform = (
-                    dict(attrs)['headers'] == 'Voie')
-            except KeyError:
-                if dict(attrs).has_key('headers'):
-                    raise
-                else:
-                    pass
-        else:
-            self.__a_code_de_mission = (tag == 'a' and self.__code_de_mission)
-
-    def handle_data(self, data):
-        "execute pour chaque contenu de balise"
-        if self.__a_code_de_mission:
-            self.__liste_train.append(data.strip())
-        if self.__time and data.strip() != '*':
-            self.__liste_horaire.append(data.strip())
-        if self.__destination:
-            self.__liste_destination.append(data.strip())
-        if self.__platform:
-            self.__liste_platform.append(data.strip())
-
-    def handle_endtag(self, tag):
-        "execute à chaque balise fermante"
-        self.__a_code_de_mission ^= (self.__a_code_de_mission and tag == 'a')
-        self.__time ^= (self.__time and tag == 'td')
-        self.__destination ^= (self.__destination and tag == 'td')
-        self.__platform ^= (self.__platform and tag == 'td')
-
-
-    @property
-    def list_route(self):
-        "getter"
-        __list_route = []
-        __curseur_horaire = 0
-        for __i in self.__liste_train:
-            __list_route.append(Route(
-                code_mission=__i,
-                time=self.__liste_horaire[__curseur_horaire],
-                destination=self.__liste_destination[__curseur_horaire],
-                platform=self.__liste_platform[__curseur_horaire]
-                ))
-            __curseur_horaire += 1
-        return __list_route
-
 class Transilien(BaseBrowser):
    DOMAIN = 'www.transilien.com'
-    PAGES = {'https://www\.transilien\.com/web/ITProchainsTrainsAvecDest\.do\?.*': RoutePage,
-             'https://www\.transilien\.com/web/ITProchainsTrains\.do\?.*': RoutePage
-            }
    PROTOCOL = 'https'
    USER_AGENT = BaseBrowser.USER_AGENTS['microb']
-
-    def __init__(self, **kwargs):
-        kwargs['parser'] = Parser
-        BaseBrowser.__init__(self, '', **kwargs)
+    PAGES = {'https://www\.transilien\.com/web/ITProchainsTrainsAvecDest\.do\?.*': RoutePage,
+             'https://www\.transilien\.com/web/ITProchainsTrains\.do\?.*':         RoutePage,
+            }

    def iter_station_search(self, pattern):
        pass
@@ -136,27 +38,8 @@ class Transilien(BaseBrowser):
            self.location('https://www.transilien.com/web/ITProchainsTrainsAvecDest.do?codeTr3aDepart=%s&codeTr3aDest=%s&urlModule=/site/pid/184&gareAcc=true' % (station_id, arrival_id))
        else:
            self.location('https://www.transilien.com/web/ITProchainsTrains.do?tr3a=%s&urlModule=/site/pid/184' % station_id)
-        for route in self.page.document.list_route:
-            _late_reason = None
-            try :
-                _time = datetime.combine(date.today(), time(*[int(x) for x in route.time.split(':')]))
-            except ValueError:
-                _time = None
-                _late_reason = route.time
-            else:
-                yield {'type':        to_unicode(route.code_mission),
-                       'time':        _time,
-                       'departure':   to_unicode(station_id),
-                       'arrival':     to_unicode(route.destination),
-                       'late':        time(),
-                       'late_reason': _late_reason,
-                       'plateform':   to_unicode(route.platform)}

-    def home(self):
-        pass
-
-    def login(self):
-        pass
+        return self.page.iter_routes()

    def is_logged(self):
        """ Do not need to be logged """
--- a/weboob/backends/transilien/pages/route.py
+++ b/weboob/backends/transilien/pages/route.py
@@ -17,9 +17,40 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.

+import datetime

-from weboob.tools.browser import BasePage
+from weboob.tools.misc import to_unicode
+from weboob.tools.browser import BasePage, BrokenPageError
+
+class StationNotFound(Exception):
+    pass

 class RoutePage(BasePage):
-    def on_loaded(self):
-        return
+    def iter_routes(self):
+        try:
+            table = self.parser.select(self.document.getroot(), 'table.horaires3', 1)
+        except BrokenPageError:
+            raise StationNotFound('Station not found')
+
+        departure = self.parser.select(table, 'td.caption strong', 1).text
+        for tr in table.findall('tr'):
+            if len(tr.findall('td')) != 4:
+                continue
+
+            code_mission = self.parser.select(tr, 'td[headers=Code_de_mission] a', 1).text.strip()
+            time = self.parser.select(tr, 'td[headers=Heure_de_passage]', 1).text.strip()
+            destination = self.parser.select(tr, 'td[headers=Destination]', 1).text.strip()
+            plateform = self.parser.select(tr, 'td[headers=Voie]', 1).text.strip()
+
+            try :
+                time = datetime.datetime.combine(datetime.date.today(), datetime.time(*[int(x) for x in time.split(':')]))
+            except ValueError:
+                self.logger.warning('Unable to parse datetime')
+
+            yield {'type':        to_unicode(code_mission),
+                   'time':        time,
+                   'departure':   to_unicode(departure),
+                   'arrival':     to_unicode(destination),
+                   'late':        datetime.time(),
+                   'late_reason': None,
+                   'plateform':   to_unicode(plateform)}