use cElementTree and elementtidy as default encoders, and added wrappers to ElementParser and HTMLParser if they are missing

This commit is contained in:
Romain Bignon 2010-04-03 17:14:59 +02:00
commit 51433d6549
4 changed files with 115 additions and 28 deletions

View file

@ -20,6 +20,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import time import time
from logging import warning from logging import warning
from html5lib import treebuilders, HTMLParser
from weboob.tools.browser import Browser from weboob.tools.browser import Browser
from weboob.backends.aum.exceptions import AdopteWait from weboob.backends.aum.exceptions import AdopteWait
@ -34,6 +35,13 @@ from weboob.backends.aum.pages.login import LoginPage, RedirectPage, BanPage, Er
from weboob.backends.aum.pages.edit import EditPhotoPage, EditPhotoCbPage, EditAnnouncePage, EditDescriptionPage, EditSexPage, EditPersonalityPage from weboob.backends.aum.pages.edit import EditPhotoPage, EditPhotoCbPage, EditAnnouncePage, EditDescriptionPage, EditSexPage, EditPersonalityPage
from weboob.backends.aum.pages.wait import WaitPage from weboob.backends.aum.pages.wait import WaitPage
class AdopteParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self, tree=treebuilders.getTreeBuilder("dom"))
def parse(self, data):
return HTMLParser.parse(self, data, encoding='iso-8859-1')
class AdopteUnMec(Browser): class AdopteUnMec(Browser):
DOMAIN = 'www.adopteunmec.com' DOMAIN = 'www.adopteunmec.com'
PROTOCOL = 'http' PROTOCOL = 'http'
@ -66,6 +74,7 @@ class AdopteUnMec(Browser):
} }
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
kwargs['parser'] = AdopteParser
Browser.__init__(self, *args, **kwargs) Browser.__init__(self, *args, **kwargs)
self.my_id = 0 self.my_id = 0

View file

@ -19,12 +19,12 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
""" """
from weboob.tools.browser import BrowserIncorrectPassword, BasePage from weboob.tools.browser import BrowserIncorrectPassword, BasePage
from weboob.capabilities.messages import Message
class DLFPPage(BasePage): class DLFPPage(BasePage):
def is_logged(self): def is_logged(self):
forms = self.document.getElementsByTagName('form') for form in self.document.getiterator('form'):
for form in forms: if form.attrib.get('id', None) == 'formulaire':
if form.getAttribute('id') == 'formulaire':
return False return False
return True return True
@ -38,9 +38,7 @@ class LoginPage(DLFPPage):
raise BrowserIncorrectPassword() raise BrowserIncorrectPassword()
def has_error(self): def has_error(self):
plist = self.document.getElementsByTagName('p') for p in self.document.getiterator('p'):
for p in plist: if p.text and p.text.startswith(u'Vous avez rentré un mauvais mot de passe'):
p = p.childNodes[0]
if hasattr(p, 'data') and p.data.startswith(u'Vous avez rentré un mauvais mot de passe'):
return True return True
return False return False

View file

@ -21,32 +21,23 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import mechanize import mechanize
import urllib2 import urllib2
import ClientForm import ClientForm
try:
from html5lib import treebuilders, HTMLParser
except ImportError:
# XXX change this to use another lib than html5lib
class StandardParser:
def parse(self, data):
return None
else:
class StandardParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self, tree=treebuilders.getTreeBuilder("dom"))
def parse(self, data):
return HTMLParser.parse(data, encoding='iso-8859-1')
import re import re
import time import time
from logging import warning, error, debug from logging import warning, error, debug
from copy import copy from copy import copy
from weboob.tools.parser import StandardParser
# Try to load cookies
try: try:
from weboob.tools.firefox_cookies import FirefoxCookieJar from weboob.tools.firefox_cookies import FirefoxCookieJar
HAVE_COOKIES = True
except ImportError, e: except ImportError, e:
warning("Unable to store cookies: %s" % e) warning("Unable to store cookies: %s" % e)
HAVE_COOKIES = False HAVE_COOKIES = False
else:
HAVE_COOKIES = True
# Exceptions
class BrowserIncorrectPassword(Exception): class BrowserIncorrectPassword(Exception):
pass pass
@ -57,6 +48,9 @@ class BrowserRetry(Exception):
pass pass
class NoHistory: class NoHistory:
"""
We don't want to fill memory with history
"""
def __init__(self): pass def __init__(self): pass
def add(self, request, response): pass def add(self, request, response): pass
def back(self, n, _response): pass def back(self, n, _response): pass
@ -64,15 +58,24 @@ class NoHistory:
def close(self): pass def close(self): pass
class BasePage: class BasePage:
"""
Base page
"""
def __init__(self, browser, document, url=''): def __init__(self, browser, document, url=''):
self.browser = browser self.browser = browser
self.document = document self.document = document
self.url = url self.url = url
def loaded(self): def loaded(self):
"""
Called when the page is loaded.
"""
pass pass
class Browser(mechanize.Browser): class Browser(mechanize.Browser):
"""
Base browser class to navigate on a website.
"""
# ------ Class attributes -------------------------------------- # ------ Class attributes --------------------------------------
@ -83,21 +86,27 @@ class Browser(mechanize.Browser):
# ------ Abstract methods -------------------------------------- # ------ Abstract methods --------------------------------------
# Go to home
def home(self): def home(self):
"""
Go to the home page.
"""
raise NotImplementedError() raise NotImplementedError()
# Login to the website
def login(self): def login(self):
"""
Login to the website.
"""
raise NotImplementedError() raise NotImplementedError()
# Return True if we are logged on website
def is_logged(self): def is_logged(self):
"""
Return True if we are loggen on website.
"""
raise NotImplementedError() raise NotImplementedError()
# ------ Browser methods --------------------------------------- # ------ Browser methods ---------------------------------------
def __init__(self, username, password=None, firefox_cookies=None, parser=StandardParser): def __init__(self, username=None, password=None, firefox_cookies=None, parser=StandardParser):
mechanize.Browser.__init__(self, history=NoHistory()) mechanize.Browser.__init__(self, history=NoHistory())
self.addheaders = [ self.addheaders = [
['User-agent', self.USER_AGENT] ['User-agent', self.USER_AGENT]
@ -127,7 +136,7 @@ class Browser(mechanize.Browser):
def pageaccess(func): def pageaccess(func):
def inner(self, *args, **kwargs): def inner(self, *args, **kwargs):
if not self.page or not self.page.is_logged() and self.password: if not self.page or self.password and not self.page.is_logged():
self.home() self.home()
return func(self, *args, **kwargs) return func(self, *args, **kwargs)
@ -227,7 +236,7 @@ class Browser(mechanize.Browser):
self.page = pageCls(self, document, result.geturl()) self.page = pageCls(self, document, result.geturl())
self.page.loaded() self.page.loaded()
if not self.is_logged() and self.password: if self.password and not self.is_logged():
print '!! Relogin !!' print '!! Relogin !!'
self.login() self.login()
return return

71
weboob/tools/parser.py Normal file
View file

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
"""
Copyright(C) 2010 Romain Bignon
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
try:
from xml.etree import cElementTree as ElementTree
except ImportError:
from xml.etree import ElementTree
try:
from elementtidy import TidyHTMLTreeBuilder
TidyHTMLTreeBuilder.ElementTree = ElementTree # force cElementTree if using it.
class HTMLTreeBuilder(TidyHTMLTreeBuilder.TidyHTMLTreeBuilder):
def __init__(self):
TidyHTMLTreeBuilder.TidyHTMLTreeBuilder.__init__(self, 'utf-8')
except ImportError:
from HTMLParser import HTMLParser
class HTMLTreeBuilder(HTMLParser):
def __init__(self, html=0, target=None):
HTMLParser.__init__(self)
if target is None:
target = ElementTree.TreeBuilder()
self._target = target
def doctype(self, name, pubid, system):
pass
def close(self):
tree = self._target.close()
return tree
def handle_starttag(self, tag, attrs):
self._target.start(tag, dict(attrs))
def handle_startendtag(self, tag, attrs):
self._target.start(tag, dict(attrs))
self._target.end(tag)
def handle_data(self, data):
self._target.data(data)
def handle_endtag(self, tag):
self._target.end(tag)
class StandardParser(object):
def parse(self, data):
parser = HTMLTreeBuilder()
tree = ElementTree.parse(data, parser)
for elem in tree.getiterator():
if elem.tag.startswith('{'):
elem.tag = elem.tag[elem.tag.find('}')+1:]
return tree