[wordreference] works now, dirty fix for a dirty website

This commit is contained in:
Julien Veyssier 2013-02-10 18:39:51 +01:00 committed by Romain Bignon
commit d5c391696d

View file

@@ -26,9 +26,28 @@ __all__ = ['TranslatePage']
class TranslatePage(BasePage):
    """Page from which a single translation is extracted."""

    def get_translation(self):
        """Return the first translation found in ``self.document``.

        Three page layouts are tried in order:

        1. table rows: the ``ToWrd`` cell of an ``odd``/``even`` row, or the
           cell that follows a ``nums1`` cell;
        2. a ``<div class="trans clickable">`` holding a single meaning;
        3. a ``<table class="trans clickable">`` where the translation
           follows a ``roman1`` cell.

        Returns the translation as text, or ``None`` when no layout matches.
        """
        # Several meanings listed: take the first one.
        for tr in self.document.getiterator('tr'):
            if tr.attrib.get('class', '') in ('odd', 'even'):
                for td in tr.getiterator('td'):
                    # A cell without leading text has ``text is None``;
                    # skip it instead of failing on ``u'' + None``.
                    if (td.attrib.get('class', '') == 'ToWrd'
                            and td.text is not None):
                        return u'' + td.text
            found = self._text_after_marker(tr, 'nums1')
            if found is not None:
                return found

        # Only one meaning on the page.
        for div in self.document.getiterator('div'):
            if div.attrib.get('class', '') != "trans clickable":
                continue
            chunks = div.text_content().split(']')
            if len(chunks) < 2:
                # No ']' in the text: not the expected shape, so move on
                # rather than raising IndexError.
                continue
            # Keep what follows the first ']', drop its first word and cut
            # at the first ';'.
            names = u'' + " ".join(chunks[1].split()[1:]).split(';')[0]
            if ")" in names:
                names = names.split(")")[1]
            return names.strip()

        # Another numbering scheme: roman numerals inside a table.
        for table in self.document.getiterator('table'):
            if table.attrib.get('class', '') == "trans clickable":
                found = self._text_after_marker(table, 'roman1')
                if found is not None:
                    return found

        return None

    @staticmethod
    def _text_after_marker(parent, marker_class):
        """Return the text of the ``<td>`` following the marker cell.

        Scans the ``<td>`` descendants of *parent*; once a cell whose class
        equals *marker_class* has been seen, the next cell's text (cut at
        the first ';' and stripped) is returned.  Returns ``None`` when no
        cell follows a marker.

        The "marker seen" flag is local to each call, so it cannot leak
        between rows or tables.
        """
        marker_seen = False
        for td in parent.getiterator('td'):
            if marker_seen:
                return u'' + td.text_content().split(';')[0].strip()
            if td.attrib.get('class', '') == marker_class:
                marker_seen = True
        return None