[wordreference] Adapt to browser2
This commit is contained in:
parent
8688e266b5
commit
a7684982f8
3 changed files with 34 additions and 72 deletions
|
|
@ -18,34 +18,22 @@
|
|||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
import urllib
|
||||
|
||||
from weboob.deprecated.browser import Browser
|
||||
|
||||
from weboob.browser import PagesBrowser, URL
|
||||
from .pages import TranslatePage
|
||||
|
||||
|
||||
__all__ = ['WordReferenceBrowser']
|
||||
|
||||
|
||||
class WordReferenceBrowser(Browser):
|
||||
DOMAIN = 'www.wordreference.com'
|
||||
ENCODING = 'UTF-8'
|
||||
USER_AGENT = Browser.USER_AGENTS['desktop_firefox']
|
||||
PAGES = {
|
||||
'https?://www\.wordreference\.com/.*/.*': TranslatePage
|
||||
}
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
Browser.__init__(self, *args, **kwargs)
|
||||
class WordReferenceBrowser(PagesBrowser):
|
||||
BASEURL = 'http://www.wordreference.com'
|
||||
translation_page = URL('(?P<sl>[a-z]{2})(?P<tl>[a-z]{2})/(?P<pattern>.*)', TranslatePage)
|
||||
|
||||
def translate(self, source, to, text):
|
||||
"""
|
||||
translate 'text' from 'source' language to 'to' language
|
||||
"""
|
||||
sl = source.encode('utf-8')
|
||||
tl = to.encode('utf-8')
|
||||
text = text.encode('utf-8')
|
||||
self.location('http://'+self.DOMAIN+'/'+sl+tl+'/'+urllib.quote(text))
|
||||
translation = self.page.get_translation()
|
||||
return translation
|
||||
|
||||
return self.translation_page.go(sl=source.encode('utf-8'),
|
||||
tl=to.encode('utf-8'),
|
||||
pattern=text.encode('utf-8')).get_translation()
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@
|
|||
"backend for http://www.wordreference.com"
|
||||
|
||||
|
||||
from weboob.capabilities.translate import CapTranslate, Translation, TranslationFail, LanguageNotSupported
|
||||
from weboob.capabilities.translate import CapTranslate, TranslationFail, LanguageNotSupported
|
||||
from weboob.tools.backend import Module
|
||||
|
||||
from .browser import WordReferenceBrowser
|
||||
|
|
@ -37,9 +37,9 @@ class WordReferenceModule(Module, CapTranslate):
|
|||
DESCRIPTION = u'Free online translator'
|
||||
BROWSER = WordReferenceBrowser
|
||||
WRLANGUAGE = {
|
||||
'Arabic':'ar', 'Chinese':'zh', 'Czech':'cz', 'English':'en', 'French':'fr', 'Greek':'gr',
|
||||
'Italian':'it', 'Japanese':'ja', 'Korean':'ko', 'Polish':'pl', 'Portuguese':'pt',
|
||||
'Romanian':'ro', 'Spanish':'es', 'Turkish':'tr',
|
||||
'Arabic': 'ar', 'Chinese': 'zh', 'Czech': 'cz', 'English': 'en', 'French': 'fr', 'Greek': 'gr',
|
||||
'Italian': 'it', 'Japanese': 'ja', 'Korean': 'ko', 'Polish': 'pl', 'Portuguese': 'pt',
|
||||
'Romanian': 'ro', 'Spanish': 'es', 'Turkish': 'tr',
|
||||
}
|
||||
|
||||
def translate(self, lan_from, lan_to, text):
|
||||
|
|
@ -49,12 +49,12 @@ class WordReferenceModule(Module, CapTranslate):
|
|||
if lan_to not in self.WRLANGUAGE.keys():
|
||||
raise LanguageNotSupported()
|
||||
|
||||
translation = Translation(0)
|
||||
translation.lang_src = unicode(self.WRLANGUAGE[lan_from])
|
||||
translation.lang_dst = unicode(self.WRLANGUAGE[lan_to])
|
||||
translation.text = self.browser.translate(self.WRLANGUAGE[lan_from], self.WRLANGUAGE[lan_to], text)
|
||||
translations = self.browser.translate(self.WRLANGUAGE[lan_from], self.WRLANGUAGE[lan_to], text)
|
||||
has_translation = False
|
||||
|
||||
if translation.text is None:
|
||||
for translation in translations:
|
||||
has_translation = True
|
||||
yield translation
|
||||
|
||||
if not has_translation:
|
||||
raise TranslationFail()
|
||||
|
||||
return translation
|
||||
|
|
|
|||
|
|
@ -17,48 +17,22 @@
|
|||
# You should have received a copy of the GNU Affero General Public License
|
||||
# along with weboob. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
|
||||
from weboob.deprecated.browser import Page
|
||||
import re
|
||||
from weboob.browser.pages import HTMLPage
|
||||
from weboob.browser.elements import ItemElement, ListElement, method
|
||||
from weboob.capabilities.translate import Translation
|
||||
from weboob.browser.filters.standard import CleanText, Regexp, Env
|
||||
from weboob.browser.filters.html import CleanHTML
|
||||
|
||||
|
||||
LAST_THING_IN_PARENTHESIS = re.compile("\([^)]\)$")
|
||||
class TranslatePage(HTMLPage):
|
||||
@method
|
||||
class get_translation(ListElement):
|
||||
item_xpath = '//table[@class="WRD" and not(@id)]/tr[@id]'
|
||||
|
||||
class item(ItemElement):
|
||||
klass = Translation
|
||||
|
||||
class TranslatePage(Page):
|
||||
def get_translation(self):
|
||||
trs = self.document.getroot().xpath("//table[@class='WRD']/tr[@class='even']")
|
||||
if trs and len(trs) > 0:
|
||||
# taking the first signification in the case several were found
|
||||
return self.parser.select(trs[0], "td[@class='ToWrd']", 1, method='xpath').text
|
||||
"""
|
||||
# taking the first signification in the case several were found
|
||||
for tr in self.document.getiterator('tr'):
|
||||
prev_was_nums1 = False
|
||||
for td in tr.getiterator('td'):
|
||||
if prev_was_nums1:
|
||||
result = u''+td.text_content().split(';')[0].strip()
|
||||
result = LAST_THING_IN_PARENTHESIS.sub("",result)
|
||||
return result
|
||||
if td.attrib.get('class','') == 'nums1':
|
||||
prev_was_nums1 = True
|
||||
# if only one signification is found
|
||||
for div in self.document.getiterator('div'):
|
||||
if div.attrib.get('class','') == "trans clickable":
|
||||
if ']' in div.text_content():
|
||||
tnames = div.text_content().split(']')[1].split()[1:]
|
||||
else:
|
||||
tnames = div.text_content().split()[1:]
|
||||
names = u''+" ".join(tnames).split(';')[0]
|
||||
names = LAST_THING_IN_PARENTHESIS.sub("",names)
|
||||
return names.strip()
|
||||
# another numerotation possibility...
|
||||
for table in self.document.getiterator('table'):
|
||||
if table.attrib.get('class','') == "trans clickable":
|
||||
prev_was_roman1 = False
|
||||
for td in table.getiterator('td'):
|
||||
if prev_was_roman1:
|
||||
return u''+td.text_content().split(';')[0].strip()
|
||||
if td.attrib.get('class','') == 'roman1':
|
||||
prev_was_roman1 = True
|
||||
"""
|
||||
obj_id = Regexp(CleanText('./@id'), '.*:(.*)')
|
||||
obj_lang_src = Env('sl')
|
||||
obj_lang_dst = Env('tl')
|
||||
obj_text = CleanHTML('./td[@class="ToWrd"]')
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue