[wordreference] Adapt to browser2

This commit is contained in:
Bezleputh 2014-10-23 17:15:17 +02:00 committed by Romain Bignon
commit a7684982f8
3 changed files with 34 additions and 72 deletions

View file

@ -18,34 +18,22 @@
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
import urllib
from weboob.deprecated.browser import Browser
from weboob.browser import PagesBrowser, URL
from .pages import TranslatePage
__all__ = ['WordReferenceBrowser']
# NOTE(review): two `class WordReferenceBrowser` headers follow each other in
# this view, one deriving from Browser and one from PagesBrowser. This looks
# like both sides of a diff shown without +/- markers -- confirm against the
# commit before treating either variant as the current code.
#
# Browser-based variant: configured through DOMAIN / ENCODING / USER_AGENT and
# a PAGES mapping from a URL regex to the page class that handles it.
class WordReferenceBrowser(Browser):
DOMAIN = 'www.wordreference.com'
ENCODING = 'UTF-8'
USER_AGENT = Browser.USER_AGENTS['desktop_firefox']
PAGES = {
'https?://www\.wordreference\.com/.*/.*': TranslatePage
}
# The constructor only forwards its arguments to Browser.__init__.
def __init__(self, *args, **kwargs):
Browser.__init__(self, *args, **kwargs)
# PagesBrowser-based variant: a BASEURL plus one URL object whose pattern
# captures two 2-letter codes (groups `sl` and `tl`) followed by '/' and the
# remainder (group `pattern`), associated with TranslatePage.
class WordReferenceBrowser(PagesBrowser):
BASEURL = 'http://www.wordreference.com'
translation_page = URL('(?P<sl>[a-z]{2})(?P<tl>[a-z]{2})/(?P<pattern>.*)', TranslatePage)
def translate(self, source, to, text):
"""
translate 'text' from 'source' language to 'to' language
"""
# First body: UTF-8 encode the three values, navigate to
# 'http://' + DOMAIN + '/' + sl + tl + '/' + urllib.quote(text), then return
# what get_translation() yields for the loaded page.
sl = source.encode('utf-8')
tl = to.encode('utf-8')
text = text.encode('utf-8')
self.location('http://'+self.DOMAIN+'/'+sl+tl+'/'+urllib.quote(text))
translation = self.page.get_translation()
return translation
# Second body: pass the UTF-8 encoded values as sl / tl / pattern to
# translation_page.go() and return get_translation() of its result.
# NOTE(review): no urllib.quote is applied to `text` here, unlike the
# self.location() call above -- presumably URL.go() handles escaping; verify.
return self.translation_page.go(sl=source.encode('utf-8'),
tl=to.encode('utf-8'),
pattern=text.encode('utf-8')).get_translation()

View file

@ -19,7 +19,7 @@
"backend for http://www.wordreference.com"
from weboob.capabilities.translate import CapTranslate, Translation, TranslationFail, LanguageNotSupported
from weboob.capabilities.translate import CapTranslate, TranslationFail, LanguageNotSupported
from weboob.tools.backend import Module
from .browser import WordReferenceBrowser
@ -37,9 +37,9 @@ class WordReferenceModule(Module, CapTranslate):
DESCRIPTION = u'Free online translator'
BROWSER = WordReferenceBrowser
WRLANGUAGE = {
'Arabic':'ar', 'Chinese':'zh', 'Czech':'cz', 'English':'en', 'French':'fr', 'Greek':'gr',
'Italian':'it', 'Japanese':'ja', 'Korean':'ko', 'Polish':'pl', 'Portuguese':'pt',
'Romanian':'ro', 'Spanish':'es', 'Turkish':'tr',
'Arabic': 'ar', 'Chinese': 'zh', 'Czech': 'cz', 'English': 'en', 'French': 'fr', 'Greek': 'gr',
'Italian': 'it', 'Japanese': 'ja', 'Korean': 'ko', 'Polish': 'pl', 'Portuguese': 'pt',
'Romanian': 'ro', 'Spanish': 'es', 'Turkish': 'tr',
}
def translate(self, lan_from, lan_to, text):
@ -49,12 +49,12 @@ class WordReferenceModule(Module, CapTranslate):
if lan_to not in self.WRLANGUAGE.keys():
raise LanguageNotSupported()
translation = Translation(0)
translation.lang_src = unicode(self.WRLANGUAGE[lan_from])
translation.lang_dst = unicode(self.WRLANGUAGE[lan_to])
translation.text = self.browser.translate(self.WRLANGUAGE[lan_from], self.WRLANGUAGE[lan_to], text)
translations = self.browser.translate(self.WRLANGUAGE[lan_from], self.WRLANGUAGE[lan_to], text)
has_translation = False
if translation.text is None:
for translation in translations:
has_translation = True
yield translation
if not has_translation:
raise TranslationFail()
return translation

View file

@ -17,48 +17,22 @@
# You should have received a copy of the GNU Affero General Public License
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
from weboob.deprecated.browser import Page
import re
from weboob.browser.pages import HTMLPage
from weboob.browser.elements import ItemElement, ListElement, method
from weboob.capabilities.translate import Translation
from weboob.browser.filters.standard import CleanText, Regexp, Env
from weboob.browser.filters.html import CleanHTML
# Matches a parenthesised group at the very end of a string, e.g. the
# "(animal)" in "chat (animal)", so it can be stripped with .sub("", ...).
# Raw string keeps the backslashes literal for the regex engine; `[^)]*`
# accepts any number of characters between the parentheses (the previous
# `[^)]` only matched groups containing exactly one character).
LAST_THING_IN_PARENTHESIS = re.compile(r"\([^)]*\)$")
# NOTE(review): two `class TranslatePage` headers appear in this view, one
# deriving from HTMLPage and one from Page. This looks like both sides of a
# diff shown without +/- markers -- confirm against the commit before treating
# either variant as the current code.
#
# HTMLPage-based variant: get_translation is declared as a ListElement wrapped
# with @method. item_xpath selects <tr> rows that carry an id attribute inside
# tables of class "WRD" that have no id attribute themselves.
class TranslatePage(HTMLPage):
@method
class get_translation(ListElement):
item_xpath = '//table[@class="WRD" and not(@id)]/tr[@id]'
# Each selected row is mapped to a Translation object.
class item(ItemElement):
klass = Translation
# Page-based variant: get_translation is a plain method working on
# self.document.
class TranslatePage(Page):
def get_translation(self):
# Rows of class 'even' inside the table of class 'WRD'.
trs = self.document.getroot().xpath("//table[@class='WRD']/tr[@class='even']")
if trs and len(trs) > 0:
# take the first meaning when several were found
return self.parser.select(trs[0], "td[@class='ToWrd']", 1, method='xpath').text
# The triple-quoted string below holds parsing code kept as text; being a
# string literal, it is not executed.
"""
# taking the first signification in the case several were found
for tr in self.document.getiterator('tr'):
prev_was_nums1 = False
for td in tr.getiterator('td'):
if prev_was_nums1:
result = u''+td.text_content().split(';')[0].strip()
result = LAST_THING_IN_PARENTHESIS.sub("",result)
return result
if td.attrib.get('class','') == 'nums1':
prev_was_nums1 = True
# if only one signification is found
for div in self.document.getiterator('div'):
if div.attrib.get('class','') == "trans clickable":
if ']' in div.text_content():
tnames = div.text_content().split(']')[1].split()[1:]
else:
tnames = div.text_content().split()[1:]
names = u''+" ".join(tnames).split(';')[0]
names = LAST_THING_IN_PARENTHESIS.sub("",names)
return names.strip()
# another numerotation possibility...
for table in self.document.getiterator('table'):
if table.attrib.get('class','') == "trans clickable":
prev_was_roman1 = False
for td in table.getiterator('td'):
if prev_was_roman1:
return u''+td.text_content().split(';')[0].strip()
if td.attrib.get('class','') == 'roman1':
prev_was_roman1 = True
"""
# NOTE(review): the obj_* definitions below presumably belong to the `item`
# ItemElement class declared earlier (klass = Translation) -- confirm.
# obj_id: the regex '.*:(.*)' is applied to the row's @id; the greedy '.*:'
# means the group captures whatever follows the last ':'.
# obj_lang_src / obj_lang_dst: read from the 'sl' / 'tl' Env values.
# obj_text: the <td class="ToWrd"> cell passed through CleanHTML.
obj_id = Regexp(CleanText('./@id'), '.*:(.*)')
obj_lang_src = Env('sl')
obj_lang_dst = Env('tl')
obj_text = CleanHTML('./td[@class="ToWrd"]')