boobot: Try to use HTML encoding
This commit is contained in:
parent
f7c169bcbd
commit
b6021ec90d
1 changed file with 8 additions and 0 deletions
|
|
@@ -116,6 +116,14 @@ class BoobotBrowser(StandardBrowser):
|
||||||
hsize = self.human_size(size)
|
hsize = self.human_size(size)
|
||||||
r.seek(0)
|
r.seek(0)
|
||||||
encoding = EncodingFinder('windows-1252').encoding(r).lower()
|
encoding = EncodingFinder('windows-1252').encoding(r).lower()
|
||||||
|
try:
|
||||||
|
h = self.get_document(r, parser='lxml', encoding=encoding)
|
||||||
|
for meta in h.xpath('//head/meta'):
|
||||||
|
encoding = meta.attrib.get('charset', encoding).lower()
|
||||||
|
except Exception as e:
|
||||||
|
print e
|
||||||
|
finally:
|
||||||
|
r.seek(0)
|
||||||
if encoding == 'iso-8859-1':
|
if encoding == 'iso-8859-1':
|
||||||
encoding = 'windows-1252'
|
encoding = 'windows-1252'
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue