Some black magic to fix pdf parsing
This commit is contained in:
parent
32ee771ee0
commit
6a4f338f05
1 changed files with 17 additions and 15 deletions
|
|
@@ -53,7 +53,7 @@ class PdfPage():
|
||||||
txtfile = open(temptxt, 'r')
|
txtfile = open(temptxt, 'r')
|
||||||
txt = txtfile.read()
|
txt = txtfile.read()
|
||||||
txtfile.close()
|
txtfile.close()
|
||||||
os.remove(temptxt)
|
#os.remove(temptxt)
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
def get_details(self):
|
def get_details(self):
|
||||||
|
|
@@ -119,30 +119,29 @@ class PdfPage():
|
||||||
lines = page.split('\n')
|
lines = page.split('\n')
|
||||||
lines = [x for x in lines if len(x) > 0] # Remove empty lines
|
lines = [x for x in lines if len(x) > 0] # Remove empty lines
|
||||||
numitems = (len(lines) + 1) / 4 # Each line has five columns
|
numitems = (len(lines) + 1) / 4 # Each line has five columns
|
||||||
lines.pop(0)
|
lines.pop(0) # remove the extra € symbol
|
||||||
modif = 0
|
modif = 0
|
||||||
i = 0
|
i = 0
|
||||||
while i < numitems:
|
while i < numitems:
|
||||||
if modif != 0:
|
if modif != 0:
|
||||||
numitems = ((len(lines) + 1 + modif) / 4)
|
numitems = ((len(lines) + 1 + modif) / 4)
|
||||||
nature = i * 4 - modif
|
base = i * 4 - modif
|
||||||
dateop = nature
|
dateop = base
|
||||||
corres = nature + 1
|
corres = base + 1
|
||||||
duree = corres + 1
|
duree = base + 2
|
||||||
price = duree + 1
|
price = base + 3
|
||||||
if "Changement vers le Forfait" in lines[nature]:
|
if "Changement vers le Forfait" in lines[base]:
|
||||||
modif += 1
|
modif += 1
|
||||||
i += 1
|
i += 1
|
||||||
continue
|
continue
|
||||||
|
# Special case with 5 columns: the operation date is not in the first one
|
||||||
if len(re.split("(\d+\/\d+\/\d+)", lines[dateop])) < 2:
|
if len(re.split("(\d+\/\d+\/\d+)", lines[dateop])) < 2:
|
||||||
lines[nature + 1] = lines[nature] + " " + lines[nature + 1]
|
lines[base + 1] = lines[base] + " " + lines[base + 1]
|
||||||
dateop = nature + 1
|
dateop = base + 1
|
||||||
corres = dateop + 1
|
corres = base + 2
|
||||||
duree = corres + 1
|
duree = base + 3
|
||||||
price = duree + 1
|
price = base + 4
|
||||||
modif -= 1
|
modif -= 1
|
||||||
if not lines[corres][0:3].isdigit() and not lines[corres][0:3] == "-":
|
|
||||||
modif += 1
|
|
||||||
detail = Detail()
|
detail = Detail()
|
||||||
splits = re.split("(\d+\/\d+\/\d+)", lines[dateop])
|
splits = re.split("(\d+\/\d+\/\d+)", lines[dateop])
|
||||||
mydate = date(*reversed([int(x) for x in splits[1].split("/")]))
|
mydate = date(*reversed([int(x) for x in splits[1].split("/")]))
|
||||||
|
|
@@ -159,6 +158,9 @@ class PdfPage():
|
||||||
try:
|
try:
|
||||||
detail.price = Decimal(lines[price].replace(',', '.'))
|
detail.price = Decimal(lines[price].replace(',', '.'))
|
||||||
except:
|
except:
|
||||||
|
# In some special cases, there is no price column. Try to detect it
|
||||||
|
if "Inclus" not in lines[price]:
|
||||||
|
modif += 1
|
||||||
detail.price = Decimal(0)
|
detail.price = Decimal(0)
|
||||||
|
|
||||||
details.append(detail)
|
details.append(detail)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue