[wordreference] Adapt to browser2

2014-10-23 17:15:17 +02:00 · 2014-10-23 17:15:17 +02:00 · a7684982f8
commit a7684982f8
parent 8688e266b5
3 changed files with 34 additions and 72 deletions
--- a/modules/wordreference/browser.py
+++ b/modules/wordreference/browser.py
@@ -18,34 +18,22 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.


-import urllib
-
-from weboob.deprecated.browser import Browser
-
+from weboob.browser import PagesBrowser, URL
 from .pages import TranslatePage


 __all__ = ['WordReferenceBrowser']


-class WordReferenceBrowser(Browser):
-    DOMAIN = 'www.wordreference.com'
-    ENCODING = 'UTF-8'
-    USER_AGENT = Browser.USER_AGENTS['desktop_firefox']
-    PAGES = {
-        'https?://www\.wordreference\.com/.*/.*': TranslatePage
-        }
-
-    def __init__(self, *args, **kwargs):
-        Browser.__init__(self, *args, **kwargs)
+class WordReferenceBrowser(PagesBrowser):
+    BASEURL = 'http://www.wordreference.com'
+    translation_page = URL('(?P<sl>[a-z]{2})(?P<tl>[a-z]{2})/(?P<pattern>.*)', TranslatePage)

    def translate(self, source, to, text):
        """
        translate 'text' from 'source' language to 'to' language
        """
-        sl   = source.encode('utf-8')
-        tl   = to.encode('utf-8')
-        text = text.encode('utf-8')
-        self.location('http://'+self.DOMAIN+'/'+sl+tl+'/'+urllib.quote(text))
-        translation = self.page.get_translation()
-        return translation
+
+        return self.translation_page.go(sl=source.encode('utf-8'),
+                                        tl=to.encode('utf-8'),
+                                        pattern=text.encode('utf-8')).get_translation()
--- a/modules/wordreference/module.py
+++ b/modules/wordreference/module.py
@@ -19,7 +19,7 @@
 "backend for http://www.wordreference.com"


-from weboob.capabilities.translate import CapTranslate, Translation, TranslationFail, LanguageNotSupported
+from weboob.capabilities.translate import CapTranslate, TranslationFail, LanguageNotSupported
 from weboob.tools.backend import Module

 from .browser import WordReferenceBrowser
@@ -37,9 +37,9 @@ class WordReferenceModule(Module, CapTranslate):
    DESCRIPTION = u'Free online translator'
    BROWSER = WordReferenceBrowser
    WRLANGUAGE = {
-        'Arabic':'ar', 'Chinese':'zh', 'Czech':'cz', 'English':'en', 'French':'fr', 'Greek':'gr',
-        'Italian':'it', 'Japanese':'ja', 'Korean':'ko', 'Polish':'pl', 'Portuguese':'pt',
-        'Romanian':'ro', 'Spanish':'es', 'Turkish':'tr',
+        'Arabic': 'ar', 'Chinese': 'zh', 'Czech': 'cz', 'English': 'en', 'French': 'fr', 'Greek': 'gr',
+        'Italian': 'it', 'Japanese': 'ja', 'Korean': 'ko', 'Polish': 'pl', 'Portuguese': 'pt',
+        'Romanian': 'ro', 'Spanish': 'es', 'Turkish': 'tr',
        }

    def translate(self, lan_from, lan_to, text):
@@ -49,12 +49,12 @@ class WordReferenceModule(Module, CapTranslate):
        if lan_to not in self.WRLANGUAGE.keys():
            raise LanguageNotSupported()

-        translation = Translation(0)
-        translation.lang_src = unicode(self.WRLANGUAGE[lan_from])
-        translation.lang_dst = unicode(self.WRLANGUAGE[lan_to])
-        translation.text = self.browser.translate(self.WRLANGUAGE[lan_from], self.WRLANGUAGE[lan_to], text)
+        translations = self.browser.translate(self.WRLANGUAGE[lan_from], self.WRLANGUAGE[lan_to], text)
+        has_translation = False

-        if translation.text is None:
+        for translation in translations:
+            has_translation = True
+            yield translation
+
+        if not has_translation:
            raise TranslationFail()
-
-        return translation
--- a/modules/wordreference/pages.py
+++ b/modules/wordreference/pages.py
@@ -17,48 +17,22 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.

-
-from weboob.deprecated.browser import Page
-import re
+from weboob.browser.pages import HTMLPage
+from weboob.browser.elements import ItemElement, ListElement, method
+from weboob.capabilities.translate import Translation
+from weboob.browser.filters.standard import CleanText, Regexp, Env
+from weboob.browser.filters.html import CleanHTML


-LAST_THING_IN_PARENTHESIS = re.compile("\([^)]\)$")
+class TranslatePage(HTMLPage):
+    @method
+    class get_translation(ListElement):
+        item_xpath = '//table[@class="WRD" and not(@id)]/tr[@id]'

+        class item(ItemElement):
+            klass = Translation

-class TranslatePage(Page):
-    def get_translation(self):
-        trs = self.document.getroot().xpath("//table[@class='WRD']/tr[@class='even']")
-        if trs and len(trs) > 0:
-            # taking the first signification in the case several were found
-            return self.parser.select(trs[0], "td[@class='ToWrd']", 1, method='xpath').text
-        """
-        # taking the first signification in the case several were found
-        for tr in self.document.getiterator('tr'):
-            prev_was_nums1 = False
-            for td in tr.getiterator('td'):
-                if prev_was_nums1:
-                    result = u''+td.text_content().split(';')[0].strip()
-                    result = LAST_THING_IN_PARENTHESIS.sub("",result)
-                    return result
-                if td.attrib.get('class','') == 'nums1':
-                    prev_was_nums1 = True
-        # if only one signification is found
-        for div in self.document.getiterator('div'):
-            if div.attrib.get('class','') == "trans clickable":
-                if ']' in div.text_content():
-                    tnames = div.text_content().split(']')[1].split()[1:]
-                else:
-                    tnames = div.text_content().split()[1:]
-                names = u''+" ".join(tnames).split(';')[0]
-                names = LAST_THING_IN_PARENTHESIS.sub("",names)
-                return names.strip()
-        # another numerotation possibility...
-        for table in self.document.getiterator('table'):
-            if table.attrib.get('class','') == "trans clickable":
-                prev_was_roman1 = False
-                for td in table.getiterator('td'):
-                    if prev_was_roman1:
-                        return u''+td.text_content().split(';')[0].strip()
-                    if td.attrib.get('class','') == 'roman1':
-                        prev_was_roman1 = True
-        """
+            obj_id = Regexp(CleanText('./@id'), '.*:(.*)')
+            obj_lang_src = Env('sl')
+            obj_lang_dst = Env('tl')
+            obj_text = CleanHTML('./td[@class="ToWrd"]')