[wordreference] works now, dirty fix for a dirty website

2013-02-10 18:39:51 +01:00 · 2013-02-10 18:39:51 +01:00 · d5c391696d
commit d5c391696d
parent c977a6a2fe
1 changed files with 23 additions and 4 deletions
--- a/modules/wordreference/pages.py
+++ b/modules/wordreference/pages.py
@@ -26,9 +26,28 @@ __all__ = ['TranslatePage']

 class TranslatePage(BasePage):
    def get_translation(self):
+      # take the first meaning when several were found: the cell right after a 'nums1' cell
      for tr in self.document.getiterator('tr'):
-        if tr.attrib.get('class','') == 'odd' or tr.attrib.get('class','') == 'even':
-            for td in tr.getiterator('td'):
-                if td.attrib.get('class','') == 'ToWrd':
-                    return u''+td.text
+        prev_was_nums1 = False
+        for td in tr.getiterator('td'):
+            if prev_was_nums1:
+                return u''+td.text_content().split(';')[0].strip()
+            if td.attrib.get('class','') == 'nums1':
+                prev_was_nums1 = True
+      # if only one meaning is found
+      for div in self.document.getiterator('div'):
+          if div.attrib.get('class','') == "trans clickable":
+              names = u''+" ".join(div.text_content().split(']')[1].split()[1:]).split(';')[0]
+              if ")" in names:
+                  names = names.split(")")[1]
+              return names.strip()
+      # another numbering possibility: the cell right after a 'roman1' cell
+      for table in self.document.getiterator('table'):
+          if table.attrib.get('class','') == "trans clickable":
+            prev_was_roman1 = False
+            for td in table.getiterator('td'):
+                if prev_was_roman1:
+                    return u''+td.text_content().split(';')[0].strip()
+                if td.attrib.get('class','') == 'roman1':
+                    prev_was_roman1 = True