Use cElementTree and elementtidy as default encoders, and add wrappers to ElementParser and HTMLParser if they are missing

This commit is contained in:
Romain Bignon 2010-04-03 17:14:59 +02:00
commit 51433d6549
4 changed files with 115 additions and 28 deletions

View file

@ -20,6 +20,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import time
from logging import warning
from html5lib import treebuilders, HTMLParser
from weboob.tools.browser import Browser
from weboob.backends.aum.exceptions import AdopteWait
@ -34,6 +35,13 @@ from weboob.backends.aum.pages.login import LoginPage, RedirectPage, BanPage, Er
from weboob.backends.aum.pages.edit import EditPhotoPage, EditPhotoCbPage, EditAnnouncePage, EditDescriptionPage, EditSexPage, EditPersonalityPage
from weboob.backends.aum.pages.wait import WaitPage
class AdopteParser(HTMLParser):
    """
    html5lib parser preconfigured for this backend: it is built on the
    "dom" tree builder and always parses with the 'iso-8859-1' encoding.
    """

    def __init__(self):
        # Ask html5lib for its "dom" tree builder and hand it to the parser.
        dom_builder = treebuilders.getTreeBuilder("dom")
        HTMLParser.__init__(self, tree=dom_builder)

    def parse(self, data):
        """
        Parse data with the encoding forced to 'iso-8859-1' and return
        whatever the underlying html5lib parser returns.
        """
        return HTMLParser.parse(self, data, encoding='iso-8859-1')
class AdopteUnMec(Browser):
DOMAIN = 'www.adopteunmec.com'
PROTOCOL = 'http'
@ -66,6 +74,7 @@ class AdopteUnMec(Browser):
}
def __init__(self, *args, **kwargs):
kwargs['parser'] = AdopteParser
Browser.__init__(self, *args, **kwargs)
self.my_id = 0

View file

@ -19,12 +19,12 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
from weboob.tools.browser import BrowserIncorrectPassword, BasePage
from weboob.capabilities.messages import Message
class DLFPPage(BasePage):
def is_logged(self):
    """
    Tell whether the user is logged in on this page.

    The document is walked as an ElementTree. The page is reported as
    not logged in as soon as a <form> element whose id is 'formulaire'
    is found, and as logged in when no such form exists.
    """
    document = self.document
    # ElementTree 1.3 renamed getiterator() to iter(); accept both so the
    # method works with old and new versions of the library.
    iter_elements = getattr(document, 'iter', None) or document.getiterator
    for form in iter_elements('form'):
        if form.attrib.get('id') == 'formulaire':
            return False
    return True
@ -38,9 +38,7 @@ class LoginPage(DLFPPage):
raise BrowserIncorrectPassword()
def has_error(self):
    """
    Tell whether the page displays the wrong-password message.

    Returns True when the text of any <p> element starts with the
    expected message, False otherwise. Paragraphs without text are
    ignored.
    """
    document = self.document
    # ElementTree 1.3 renamed getiterator() to iter(); accept both so the
    # method works with old and new versions of the library.
    iter_elements = getattr(document, 'iter', None) or document.getiterator
    for p in iter_elements('p'):
        if p.text and p.text.startswith(u'Vous avez rentré un mauvais mot de passe'):
            return True
    return False

View file

@ -21,32 +21,23 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
import mechanize
import urllib2
import ClientForm
try:
from html5lib import treebuilders, HTMLParser
except ImportError:
# XXX change this to use another lib than html5lib
class StandardParser:
def parse(self, data):
return None
else:
class StandardParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self, tree=treebuilders.getTreeBuilder("dom"))
def parse(self, data):
return HTMLParser.parse(data, encoding='iso-8859-1')
import re
import time
from logging import warning, error, debug
from copy import copy
from weboob.tools.parser import StandardParser
# Try to load cookies
try:
from weboob.tools.firefox_cookies import FirefoxCookieJar
HAVE_COOKIES = True
except ImportError, e:
warning("Unable to store cookies: %s" % e)
HAVE_COOKIES = False
else:
HAVE_COOKIES = True
# Exceptions
class BrowserIncorrectPassword(Exception):
pass
@ -57,6 +48,9 @@ class BrowserRetry(Exception):
pass
class NoHistory:
"""
We don't want to fill memory with history
"""
def __init__(self): pass
def add(self, request, response): pass
def back(self, n, _response): pass
@ -64,15 +58,24 @@ class NoHistory:
def close(self): pass
class BasePage:
    """
    Base class of the page objects.

    An instance keeps the browser that created it, the parsed document
    and the URL it was given.
    """

    def __init__(self, browser, document, url=''):
        self.url = url
        self.document = document
        self.browser = browser

    def loaded(self):
        """
        Hook called once the page is loaded. The base implementation
        does nothing.
        """
class Browser(mechanize.Browser):
"""
Base browser class to navigate on a website.
"""
# ------ Class attributes --------------------------------------
@ -83,21 +86,27 @@ class Browser(mechanize.Browser):
# ------ Abstract methods --------------------------------------
# Go to home
def home(self):
    """
    Go to the home page.

    Abstract: this implementation always raises NotImplementedError.
    """
    raise NotImplementedError()
# Login to the website
def login(self):
    """
    Login to the website.

    Abstract: this implementation always raises NotImplementedError.
    """
    raise NotImplementedError()
# Return True if we are logged on website
def is_logged(self):
    """
    Return True if we are logged on the website.

    Abstract: this implementation always raises NotImplementedError.
    """
    raise NotImplementedError()
# ------ Browser methods ---------------------------------------
def __init__(self, username, password=None, firefox_cookies=None, parser=StandardParser):
def __init__(self, username=None, password=None, firefox_cookies=None, parser=StandardParser):
mechanize.Browser.__init__(self, history=NoHistory())
self.addheaders = [
['User-agent', self.USER_AGENT]
@ -127,7 +136,7 @@ class Browser(mechanize.Browser):
def pageaccess(func):
def inner(self, *args, **kwargs):
if not self.page or not self.page.is_logged() and self.password:
if not self.page or self.password and not self.page.is_logged():
self.home()
return func(self, *args, **kwargs)
@ -227,7 +236,7 @@ class Browser(mechanize.Browser):
self.page = pageCls(self, document, result.geturl())
self.page.loaded()
if not self.is_logged() and self.password:
if self.password and not self.is_logged():
print '!! Relogin !!'
self.login()
return

71
weboob/tools/parser.py Normal file
View file

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
"""
Copyright(C) 2010 Romain Bignon
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, version 3 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""
try:
from xml.etree import cElementTree as ElementTree
except ImportError:
from xml.etree import ElementTree
try:
    from elementtidy import TidyHTMLTreeBuilder
    TidyHTMLTreeBuilder.ElementTree = ElementTree # force cElementTree if using it.

    class HTMLTreeBuilder(TidyHTMLTreeBuilder.TidyHTMLTreeBuilder):
        """
        Tree builder backed by elementtidy, always created with the
        'utf-8' encoding argument.
        """
        def __init__(self):
            TidyHTMLTreeBuilder.TidyHTMLTreeBuilder.__init__(self, 'utf-8')
except ImportError:
    try:
        from HTMLParser import HTMLParser
    except ImportError:
        # The module was renamed to html.parser in Python 3.
        from html.parser import HTMLParser

    class HTMLTreeBuilder(HTMLParser):
        """
        Fallback tree builder used when elementtidy is not installed.

        It forwards the events of the standard HTMLParser (start tag,
        end tag, data) to an ElementTree-style target, so that feed()
        and close() can be driven by ElementTree.parse().
        """

        # Elements that never have an end tag in HTML. They are closed as
        # soon as they are opened, otherwise everything that follows would
        # be nested inside them.
        _VOID_ELEMENTS = frozenset([
            'area', 'base', 'basefont', 'br', 'col', 'embed', 'frame',
            'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta',
            'param', 'source', 'track', 'wbr',
        ])

        def __init__(self, html=0, target=None):
            """
            html is accepted but not used by this class. target is the
            object receiving start()/end()/data()/close() calls; a new
            ElementTree.TreeBuilder is created when it is None.
            """
            HTMLParser.__init__(self)
            if target is None:
                target = ElementTree.TreeBuilder()
            self._target = target

        def doctype(self, name, pubid, system):
            pass

        def close(self):
            """
            Finish parsing and return what the target's close() returns.
            """
            # Let the base parser process whatever is still buffered before
            # asking the target for its result, otherwise that input is lost.
            HTMLParser.close(self)
            return self._target.close()

        def handle_starttag(self, tag, attrs):
            self._target.start(tag, dict(attrs))
            if tag in self._VOID_ELEMENTS:
                self._target.end(tag)

        def handle_startendtag(self, tag, attrs):
            self._target.start(tag, dict(attrs))
            self._target.end(tag)

        def handle_data(self, data):
            self._target.data(data)

        def handle_endtag(self, tag):
            if tag in self._VOID_ELEMENTS:
                # Already closed when it was opened.
                return
            self._target.end(tag)
class StandardParser(object):
    """
    Default parser: builds an ElementTree with HTMLTreeBuilder and strips
    the namespace prefix from every tag.
    """

    def parse(self, data):
        """
        Parse data and return the resulting ElementTree.

        data is handed to ElementTree.parse() as its source. Tags of the
        form '{namespace}name' are rewritten in place to 'name'.
        """
        parser = HTMLTreeBuilder()
        tree = ElementTree.parse(data, parser)
        # ElementTree 1.3 renamed getiterator() to iter(); accept both so the
        # parser works with old and new versions of the library.
        iter_elements = getattr(tree, 'iter', None) or tree.getiterator
        for elem in iter_elements():
            tag = elem.tag
            if tag.startswith('{'):
                elem.tag = tag[tag.find('}')+1:]
        return tree