From 79d1bcfea6cdf7dc5eac276f342a9e0abe70e292 Mon Sep 17 00:00:00 2001 From: Mathieu Jourdan Date: Wed, 8 May 2013 19:45:09 +0200 Subject: [PATCH] gdfsuez-dolcevita.fr website Signed-off-by: Mathieu Jourdan Signed-off-by: Romain Bignon --- modules/gdfsuez/__init__.py | 3 + modules/gdfsuez/backend.py | 95 +++++++++++++ modules/gdfsuez/browser.py | 99 ++++++++++++++ modules/gdfsuez/pages/__init__.py | 23 ++++ modules/gdfsuez/pages/history.py | 213 ++++++++++++++++++++++++++++++ modules/gdfsuez/pages/homepage.py | 68 ++++++++++ modules/gdfsuez/test.py | 35 +++++ 7 files changed, 536 insertions(+) create mode 100644 modules/gdfsuez/__init__.py create mode 100644 modules/gdfsuez/backend.py create mode 100644 modules/gdfsuez/browser.py create mode 100644 modules/gdfsuez/pages/__init__.py create mode 100644 modules/gdfsuez/pages/history.py create mode 100644 modules/gdfsuez/pages/homepage.py create mode 100644 modules/gdfsuez/test.py diff --git a/modules/gdfsuez/__init__.py b/modules/gdfsuez/__init__.py new file mode 100644 index 00000000..392e99f0 --- /dev/null +++ b/modules/gdfsuez/__init__.py @@ -0,0 +1,3 @@ +from .backend import GdfSuezBackend + +__all__ = ['GdfSuezBackend'] diff --git a/modules/gdfsuez/backend.py b/modules/gdfsuez/backend.py new file mode 100644 index 00000000..ab57a849 --- /dev/null +++ b/modules/gdfsuez/backend.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2013 Mathieu Jourdan +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

from weboob.capabilities.bill import ICapBill, SubscriptionNotFound,\
    BillNotFound, Subscription, Bill
from weboob.tools.backend import BaseBackend, BackendConfig
from weboob.tools.value import ValueBackendPassword
from .browser import GdfSuez

__all__ = ['GdfSuezBackend']


class GdfSuezBackend(BaseBackend, ICapBill):
    """Bill backend for the GdF-Suez customer website.

    Every operation is delegated to the GdfSuez browser, and every browser
    access is wrapped in ``with self.browser``.
    """

    NAME = 'gdfsuez'
    MAINTAINER = u'Mathieu Jourdan'
    EMAIL = 'mathieu.jourdan@gresille.org'
    VERSION = '0.g'
    LICENSE = 'AGPLv3+'
    DESCRIPTION = 'GdF-Suez website'
    CONFIG = BackendConfig(ValueBackendPassword('login',
                                                label='Account ID (e-mail)',
                                                masked=False),
                           ValueBackendPassword('password',
                                                label='Password',
                                                masked=True)
                           )
    BROWSER = GdfSuez

    def create_default_browser(self):
        """Build the browser from the configured login and password."""
        return self.create_browser(self.config['login'].get(),
                                   self.config['password'].get())

    def _resolve_subscription(self, subscription):
        """Return a Subscription for *subscription*.

        A Subscription instance is returned unchanged; anything else is
        treated as an id and looked up with get_subscription(), which raises
        SubscriptionNotFound when there is no match.
        """
        if isinstance(subscription, Subscription):
            return subscription
        return self.get_subscription(subscription)

    def iter_subscription(self):
        """Yield every subscription listed by the browser."""
        # Guarded like the other browser accesses of this class. The other
        # generators here already keep the browser context open across
        # yields, so this adds no new usage pattern.
        with self.browser:
            for subscription in self.browser.get_subscription_list():
                yield subscription

    def get_subscription(self, _id):
        """Return the subscription whose id is *_id*.

        Raises SubscriptionNotFound when *_id* is not made of digits only or
        when the browser finds no subscription with that id.
        """
        if not _id.isdigit():
            # The original spelt this name 'SubscriptionnotFound', which
            # raised NameError instead of the documented exception.
            raise SubscriptionNotFound()
        with self.browser:
            subscription = self.browser.get_subscription(_id)
        if not subscription:
            raise SubscriptionNotFound()
        return subscription

    def iter_bills_history(self, subscription):
        """Yield the history entries the browser returns for *subscription*
        (a Subscription or a subscription id)."""
        subscription = self._resolve_subscription(subscription)
        with self.browser:
            for history in self.browser.get_history(subscription):
                yield history

    def get_details(self, subscription):
        """Yield the details the browser returns for *subscription*
        (a Subscription or a subscription id)."""
        subscription = self._resolve_subscription(subscription)
        with self.browser:
            for detail in self.browser.get_details(subscription):
                yield detail

    def iter_bills(self, subscription):
        """Yield the bills listed by the browser.

        *subscription* is only validated (SubscriptionNotFound on an unknown
        id); the browser's iter_bills() takes no subscription argument.
        """
        self._resolve_subscription(subscription)
        with self.browser:
            for bill in self.browser.iter_bills():
                yield bill

    def get_bill(self, id):
        """Return the bill whose id is *id*; raise BillNotFound otherwise."""
        with self.browser:
            bill = self.browser.get_bill(id)
        if not bill:
            raise BillNotFound()
        return bill

    def download_bill(self, bill):
        """Return the content read from the bill's URL.

        *bill* is a Bill or a bill id (resolved with get_bill()).
        """
        if not isinstance(bill, Bill):
            bill = self.get_bill(bill)
        with self.browser:
            return self.browser.readurl(bill._url)


# Patch metadata of the next file in this change-set, kept verbatim:
# diff --git a/modules/gdfsuez/browser.py b/modules/gdfsuez/browser.py
# new file mode 100644
# index 00000000..d000267d
# --- /dev/null
# +++ b/modules/gdfsuez/browser.py
# @@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2013 Mathieu Jourdan
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

import StringIO
from weboob.tools.browser import BaseBrowser, BrowserIncorrectPassword
from .pages import LoginPage, HomePage, AccountPage, TimeoutPage, HistoryPage, PdfPage

__all__ = ['GdfSuez']


class GdfSuez(BaseBrowser):
    """Browser for the gdfsuez-dolcevita.fr customer area."""

    PROTOCOL = 'https'
    DOMAIN = 'www.gdfsuez-dolcevita.fr'
    # URL pattern -> page class
    PAGES = {'.*portail/clients.*?_nfpb=true&_pageLabel=page_identification': LoginPage,
             '.*portail/clients.*?_nfpb=true&_pageLabel=page_accueil_compte_en_ligne': HomePage,
             '.*p/visualiser_mes_contrats.*?_nfpb=true': AccountPage,
             '.*p/page_historique_de_mes_factures': HistoryPage,
             '.*clients.*?_nfpb=true&_nfls=false&_pageLabel=page_erreur_timeout_session': TimeoutPage
             }

    # Site-relative locations used by the navigation helpers below
    loginp = '/portailClients/appmanager/portail/clients'
    homep = '/portailClients/appmanager/portail/clients?_nfpb=true&_pageLabel=page_accueil_compte_en_ligne'
    accountp = '/portailClients/client/p/visualiser_mes_contrats?_nfpb=true'
    historyp = '/portailClients/client/p/page_historique_de_mes_factures'

    def __init__(self, *args, **kwargs):
        BaseBrowser.__init__(self, *args, **kwargs)

    def home(self):
        """Go to the account home page."""
        self.location(self.homep)

    def is_logged(self):
        """Return False while the current page is the login or the
        session-timeout page, True otherwise."""
        return not (self.is_on_page(LoginPage) or self.is_on_page(TimeoutPage))

    def login(self):
        """Submit the credentials on the login page.

        Raises BrowserIncorrectPassword when the browser is still on the
        login page after the form has been submitted.
        """
        assert isinstance(self.username, basestring)
        assert isinstance(self.password, basestring)
        #assert isemail(self.username)
        if not self.is_on_page(LoginPage):
            self.location(self.loginp)
        self.page.login(self.username, self.password)
        if self.is_on_page(LoginPage):
            raise BrowserIncorrectPassword()

    def get_subscription_list(self):
        """Return the subscriptions parsed from the contracts page."""
        if not self.is_on_page(AccountPage):
            self.location(self.accountp)
        return self.page.get_subscription_list()

    def get_subscription(self, id):
        """Return the subscription whose id equals *id*, or None."""
        assert isinstance(id, basestring)
        for sub in self.get_subscription_list():
            if sub.id == id:
                return sub
        return None

    def get_history(self, subscription):
        """Return what the history page reports as history.

        *subscription* is accepted for interface symmetry and not used.
        """
        if not self.is_on_page(HistoryPage):
            self.location(self.historyp)
        return self.page.get_history()

    def get_details(self, subscription):
        """Yield the details parsed from the first listed bill's PDF.

        The PDF is fetched, wrapped in a PdfPage and parsed for the section
        matching ``subscription.label``. Nothing is yielded when the account
        has no bill (the original raised IndexError on ``bills[0]``).
        """
        bills = self.iter_bills()
        if not bills:
            return
        # The original looked this bill up again by id with get_bill(); the
        # first bill carrying bills[0].id is bills[0] itself.
        latest = bills[0]
        url = '%s://%s/' % (self.PROTOCOL, self.DOMAIN) + latest._url
        response = self.openurl(url)
        pdf = PdfPage(StringIO.StringIO(response.read()))
        for detail in pdf.get_details(subscription.label):
            yield detail

    def iter_bills(self):
        """Return the bills collected by the history page."""
        if not self.is_on_page(HistoryPage):
            self.location(self.historyp)
        return self.page.get_bills()

    def get_bill(self, id):
        """Return the bill whose id equals *id*, or None."""
        assert isinstance(id, basestring)
        for b in self.iter_bills():
            if b.id == id:
                return b
        return None


# Patch metadata of the next file in this change-set, kept verbatim:
# diff --git a/modules/gdfsuez/pages/__init__.py b/modules/gdfsuez/pages/__init__.py
# new file mode 100644
# index 00000000..f77d2690
# --- /dev/null
# +++ b/modules/gdfsuez/pages/__init__.py
# @@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2013 Mathieu Jourdan
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+ +from .history import HistoryPage, PdfPage +from .homepage import LoginPage, HomePage, AccountPage, TimeoutPage + +__all__ = ['LoginPage', 'HomePage', 'AccountPage', 'HistoryPage', 'PdfPage', 'TimeoutPage'] diff --git a/modules/gdfsuez/pages/history.py b/modules/gdfsuez/pages/history.py new file mode 100644 index 00000000..270d0b6a --- /dev/null +++ b/modules/gdfsuez/pages/history.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- + +# Copyright(C) 2013 Mathieu Jourdan +# +# This file is part of weboob. +# +# weboob is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# weboob is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with weboob. If not, see <http://www.gnu.org/licenses/>. 

import re
import os
import subprocess
import tempfile
import shutil

from datetime import datetime, date
from decimal import Decimal

from weboob.tools.browser import BasePage
from weboob.capabilities.base import NotAvailable
from weboob.capabilities.bill import Detail, Bill

__all__ = ['HistoryPage', 'PdfPage']


class HistoryPage(BasePage):
    """Bill history page: the latest bill plus the table of previous bills."""

    def on_loaded(self):
        """Parse the page into ``self.bills``; ``self.details`` stays empty."""
        self.details = []
        self.bills = []

        # Latest bill
        div = self.document.xpath('//div[@class="consulter_dernierefacture"]')[0]
        bdate = div.xpath('p[@class="date"]/span[@class="textetertiaire"]')[0].text
        bprice = div.xpath('p[@class="montant"]/span[@class="textetertiaire"]')[0].text
        link = div.xpath('a[@id="display_popin"]')[0].attrib['href']
        mydate = self._parse_date(bdate)
        price = Decimal(bprice.strip(u' € TTC').replace(',', '.'))
        self.bills.append(self._create_bill(mydate, price, link))

        # Previous bills
        table = self.document.xpath('//table[@class="afficher_factures"]')[0]
        # NOTE(review): '//tbody/tr' is an absolute XPath, so it selects the
        # tbody rows of the whole document, not only of this table. Confirm
        # against the real page before narrowing it.
        for tr in table[0].xpath('//tbody/tr'):
            cells = tr.xpath('td')
            bdate = unicode(cells[0].text.strip())
            mydate = self._parse_date(bdate)
            bprice = unicode(cells[1].text)
            price = Decimal(bprice.strip(u' €').replace(',', '.'))
            link = cells[3].xpath('a')[0].attrib['href']
            self.bills.append(self._create_bill(mydate, price, link))

    @staticmethod
    def _parse_date(text):
        """Turn a 'day/month/year' string into a date."""
        return date(*reversed([int(x) for x in text.split("/")]))

    def _create_bill(self, date, price, link):
        """Build a Bill whose id is the date written as YYYYMMDD."""
        bill = Bill()
        bill.id = str(date).replace('-', '')
        bill.date = date
        bill._price = price
        bill._url = link
        bill.format = u'pdf'
        bill.label = unicode(price)
        return bill

    def get_details(self):
        """Return the details list (never filled by on_loaded)."""
        return self.details

    def get_history(self):
        """Return the history entries of this page.

        The browser's get_history() calls ``page.get_history()``, which this
        class did not define. on_loaded() parses nothing into history
        entries, so this is the same, empty, list as get_details().
        """
        return self.details

    def get_bills(self):
        """Return the bills collected by on_loaded()."""
        return self.bills


class PdfPage(object):
    """Extracts bill details from a PDF, via its ebook-convert text export."""

    # Subscription label -> (text opening the section, text closing it)
    _SECTIONS = {
        u'Gaz naturel': ('GAZ NATUREL', 'TOTAL GAZ NATUREL TTC'),
        u'Electricité': ('ELECTRICITE', 'TOTAL ELECTRICITE TTC'),
    }

    def __init__(self, file):
        # file: readable file-like object holding the PDF content
        self.pdf = file

    def _parse_pdf(self):
        """Convert the PDF to text with ``ebook-convert`` and return the text.

        The PDF is copied to a named temporary file, converted next to it,
        and both files are removed before returning, including on failure.
        Raises OSError when ebook-convert cannot be started, and IOError
        when the conversion produced no text file.
        """
        # Binary mode: the payload is PDF data, not text.
        pdffile = tempfile.NamedTemporaryFile(bufsize=100000, mode='wb', suffix='.pdf')
        # Only swap the extension; str.replace could also hit a '.pdf'
        # appearing elsewhere in the temporary path.
        temptxt = os.path.splitext(pdffile.name)[0] + '.txt'
        try:
            shutil.copyfileobj(self.pdf, pdffile)
            pdffile.flush()
            with open(os.devnull, 'w') as devnull:
                subprocess.call(['ebook-convert', pdffile.name, temptxt], stdout=devnull)
        finally:
            # Closing a NamedTemporaryFile deletes it.
            pdffile.close()
        try:
            with open(temptxt, 'r') as txtfile:
                return txtfile.read()
        finally:
            if os.path.exists(temptxt):
                os.remove(temptxt)

    def _parse_page(self, page):
        """Return the Detail objects found in one energy section of the bill.

        *page* is the text of that section. The three sub-sections
        'Abonnement', 'Consommation' and 'Contributions et taxes ...' are
        each scanned up to their 'Total ' line; a sub-section whose title is
        missing is skipped. The per-line heuristics are unchanged.
        """

        # Regexp
        footnote = re.compile(r'\([0-9]\) ')            # (f)
        ht = re.compile('HT par mois')
        base = re.compile('la base de')
        enddate = re.compile(r'\d\d\/\d\d\/\d\d')       # 00/00/00
        endwithdigit = re.compile(r'\d+$')              # blah blah 42
        textwithcoma = re.compile(r'([a-z]|\d{4})\,')   # blah 2012, blah blah

        # Parsing
        details = []
        for title in ['Abonnement',
                      'Consommation',
                      'Contributions et taxes liées à l\'énergie']:
            if title not in page:
                # Sub-section absent from this bill: nothing to extract
                # (the original raised IndexError here).
                continue
            section = page.split(title, 1)[1].split('Total ')[0]

            # When a line holds '(0)', a newline is missing.
            section = re.sub(footnote, '\n', section)

            lines = section.split('\n')
            lines = [x for x in lines if len(x) > 0]  # Remove empty lines
            detail = None

            for line in lines:
                if re.match('[A-Za-z]', line[0]):

                    # Things we want to merge with the one just before
                    if 'facturées' in line:
                        # Long lines are sometimes split, so we try to join them
                        # That is the case for:
                        # 'Déduction du montant des consommations
                        # estimées facturées du 00/00/00 au 00/00/00'
                        detail.label = detail.label + u' ' + unicode(line, encoding='utf-8')

                    # Things for which we want a new detail
                    else:
                        # Entering here, we will instantiate a new detail.
                        # We hadn't so before because of fragmented lines.
                        if detail is not None and detail.label is not NotAvailable:
                            # We have a new element, return the other one
                            details.append(detail)
                        detail = Detail()
                        detail.price = Decimal(0)

                        # If the comma is not a decimal separator, then
                        # this is probably a long sentence: keep only
                        # what comes before the comma.
                        line = re.split(textwithcoma, line)[0]

                        # Things we want for sure
                        if re.findall(enddate, line):
                            # When a line has been badly split after a date,
                            # We want the label to end after the date, and maybe
                            # the second part to be the info
                            mydate = re.search(enddate, line).group(0)
                            mylist = line.rpartition(mydate)
                            label = mylist[0] + mylist[1]
                            detail.label = unicode(label, encoding='utf-8')
                        elif re.findall(endwithdigit, line):
                            # A stray number ends the line: the line should
                            # have been split before the number.
                            detail.label = unicode(re.split(endwithdigit, line)[0], encoding='utf-8')
                        # Things we don't want for sure
                        elif ')' in line and '(' not in line:
                            # First part of the parenthesis should have been
                            # dropped before; avoid creating a new empty detail
                            detail.label = NotAvailable
                        elif re.match(base, line):
                            # This string should come always after a date,
                            # usually, it will match one of the cases above.
                            # Sometimes, it appears on a new line we don't need.
                            detail.label = NotAvailable
                        elif re.match(ht, line):
                            # '00,00 € HT par mois' may have been split after HT
                            # We don't need the second line
                            detail.label = NotAvailable
                        # Things we probably want to keep
                        else:
                            # Well, maybe our line is correct, after all.
                            # Not much to do.
                            detail.label = unicode(line, encoding='utf-8')
                        detail.infos = NotAvailable
                elif ' %' in line:
                    if isinstance(detail, Detail):
                        # Sometimes the vat is not on a new line:
                        # '00,00 00,0 %' instead of '00,0 %'
                        vat = line.split()[line.count(' ')-1].replace(',', '.')
                        detail.infos = unicode('TVA: ' + vat)
                elif ' €' in line:
                    price = line.replace(',', '.')
                    if isinstance(detail, Detail):
                        detail.price = Decimal(price.strip(' €'))
                elif re.match(enddate, line):
                    # Line holding dates may have been mixed up
                    label = detail.label.split(' au ')[0] + u' au ' + unicode(line, encoding='utf-8')
                    detail.label = label
            if detail is not None and detail.label is not NotAvailable:
                # Do not append empty details to the list.
                # The None check covers a sub-section in which no line
                # started a detail (the original raised AttributeError).
                details.append(detail)
        return details

    def get_details(self, label):
        """Return the details of the bill section matching *label*.

        *label* is the subscription label, u'Gaz naturel' or u'Electricité'.
        An unknown label, or a bill without the matching section, gives an
        empty list (the original crashed on both). The label is checked
        before the PDF conversion is run.
        """
        markers = self._SECTIONS.get(label)
        if markers is None:
            return []
        start, end = markers
        txt = self._parse_pdf()
        if start not in txt:
            return []
        page = txt.split(start)[1].split(end)[0]
        return self._parse_page(page)


# Patch metadata of the next file in this change-set, kept verbatim:
# diff --git a/modules/gdfsuez/pages/homepage.py b/modules/gdfsuez/pages/homepage.py
# new file mode 100644
# index 00000000..1f7fecab
# --- /dev/null
# +++ b/modules/gdfsuez/pages/homepage.py
# @@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2013 Mathieu Jourdan
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.

from datetime import datetime, date

from weboob.tools.browser import BasePage
from weboob.tools.mech import ClientForm
from weboob.capabilities.bill import Subscription

__all__ = ['LoginPage', 'HomePage', 'AccountPage', 'TimeoutPage']


class LoginPage(BasePage):
    """Identification page holding the login form."""

    def login(self, login, password):
        """Fill the 'symConnexionForm' form with the credentials and submit it."""
        self.browser.select_form('symConnexionForm')
        self.browser["portlet_login_plein_page_3{pageFlow.mForm.login}"] = str(login)
        self.browser["portlet_login_plein_page_3{pageFlow.mForm.password}"] = str(password)
        self.browser.submit()


class HomePage(BasePage):
    """Account home page; nothing is parsed from it."""

    def on_loaded(self):
        pass


class AccountPage(BasePage):
    """Contracts page listing the customer's subscriptions."""

    def get_subscription_list(self):
        """Yield one Subscription per row of the contracts table.

        Each subscription gets its contract number as id, the offer name as
        label, the text of the second cell as subscriber and the date of the
        fourth cell as renewal date.
        """
        table = self.document.xpath('//table[@id="ensemble_contrat_N0"]')[0]
        if len(table) > 0:
            # some clients may have subscriptions to gas and electricity,
            # but they receive a single bill
            # to avoid "boobill details" and "boobill bills" returning the same
            # table twice, we could return only one subscription for both.
            # We do not, and "boobill details" will take care of parsing only the
            # relevant section in the bill files.
            for line in table[0].xpath('//tbody/tr'):
                cells = line.xpath('td')
                snumber = cells[2].attrib['id'].replace('Contrat_', '')
                slabel = cells[0].xpath('a')[0].text.replace('offre', '').strip()
                d = unicode(cells[3].xpath('strong')[0].text.strip())
                sdate = date(*reversed([int(x) for x in d.split("/")]))
                sub = Subscription(snumber)
                sub._id = snumber
                sub.label = slabel
                # The original stored unicode(cells[1]), i.e. the string form
                # of the element object itself. Use the text the cell holds.
                sub.subscriber = unicode(u''.join(cells[1].itertext()).strip())
                sub.renewdate = sdate
                yield sub


class TimeoutPage(BasePage):
    """Session-timeout error page; nothing is parsed from it."""

    def on_loaded(self):
        pass


# Patch metadata of the next file in this change-set, kept verbatim:
# diff --git a/modules/gdfsuez/test.py b/modules/gdfsuez/test.py
# new file mode 100644
# index 00000000..31141858
# --- /dev/null
# +++ b/modules/gdfsuez/test.py
# @@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-

# Copyright(C) 2012 Romain Bignon
#
# This file is part of weboob.
#
# weboob is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# weboob is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.


# This is a clone of freemobile/test.py for the gdfsuez module
from weboob.tools.test import BackendTest


__all__ = ['GdfSuezTest']


class GdfSuezTest(BackendTest):
    """Live test: walks subscriptions, their history and their bills."""

    BACKEND = 'gdfsuez'

    def test_gdfsuez(self):
        for subscription in self.backend.iter_subscription():
            # The gdfsuez backend defines iter_bills_history(); the original
            # test called iter_history(), which that backend does not define.
            list(self.backend.iter_bills_history(subscription.id))
            for bill in self.backend.iter_bills(subscription.id):
                self.backend.download_bill(bill.id)