Skip to content
This repository has been archived by the owner on Sep 5, 2022. It is now read-only.

Commit

Permalink
Add method 'parse' to class 'Invoices'
Browse files Browse the repository at this point in the history
* Add method 'parse' to class 'Invoices'
* Add tax rate detection when parsing PDF invoices
* Wire PDF parsing with class `Volksbank`
* Move helper method 'convert_tax_rate'

Also, this fixes #16
  • Loading branch information
S1SYPHOS authored May 27, 2021
1 parent ad89256 commit 97bd5d5
Show file tree
Hide file tree
Showing 4 changed files with 239 additions and 34 deletions.
8 changes: 2 additions & 6 deletions knv_cli/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ def convert_date(self, string: str) -> str:


def convert_number(self, string) -> str:
# Convert integers & floats
string = str(string)
# Convert to string & clear whitespaces
string = str(string).strip()

# Take care of thousands separator, as in '1.234,56'
if '.' in string and ',' in string:
Expand All @@ -63,7 +63,3 @@ def convert_number(self, string) -> str:
integer = f'{string:.2f}'

return str(integer)


def convert_tax_rate(self, string: str) -> str:
return str(string).replace(',00', '') + '%'
45 changes: 35 additions & 10 deletions knv_cli/gateways/volksbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,18 +157,37 @@ def match_payments(self, data: list, invoice_handler: Invoices = None) -> None:
payment['Vorgang'] = matching_invoices

# There are two ways extracting information about invoices ..
if matching_orders:
# .. via fetching order data
taxes = self.match_invoices(matching_invoices, matching_orders)
if invoice_handler:
# .. via parsing invoice files
matching_invoices = self.parse_invoices(matching_invoices, invoice_handler)

payment['Vorgang'] = [invoice['Rechnungsnummer'] for invoice in matching_invoices]
payment['Rechnungssumme'] = '0.00'
payment['Bestellung'] = []

taxes = {}

for invoice in matching_invoices:
payment['Rechnungssumme'] = self.convert_number(float(payment['Rechnungssumme']) + float(invoice['Gesamtbetrag']))
payment['Bestellung'].append(invoice['Steuern'])

for tax_rate, tax_amount in invoice['Steuern'].items():
if not tax_rate in taxes:
taxes[tax_rate] = tax_amount

else:
taxes[tax_rate] = self.convert_number(float(taxes[tax_rate]) + float(tax_amount))

if taxes:
payment['Bestellung'] = taxes
for tax_rate, tax_amount in taxes.items():
payment[tax_rate + ' MwSt'] = tax_amount

else:
# .. via parsing invoice files
pass
# if invoice_handler:
# taxes = self.parse_invoices(matching_invoices, invoice_handler)
if matching_orders:
# .. via fetching order data
taxes = self.match_invoices(matching_invoices, matching_orders)

if taxes:
payment['Bestellung'] = taxes

# Reverse-lookup orders if no matching order number(s) yet
# if not matching_orders:
Expand Down Expand Up @@ -218,4 +237,10 @@ def match_invoices(self, invoices: list, orders: list) -> dict:


def parse_invoices(self, invoices: list, handler: Invoices) -> list:
    """Parse every given invoice that the handler has available.

    :param invoices: Invoice identifiers to look up
    :param handler: Object whose `has()` decides whether an invoice is
        available and whose `parse()` produces the parsed result
    :return: Results of `handler.parse()` in input order; invoices for
        which `handler.has()` is falsy are left out
    """
    return [handler.parse(invoice) for invoice in invoices if handler.has(invoice)]
214 changes: 196 additions & 18 deletions knv_cli/knv/invoices.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@
from datetime import datetime
from os.path import basename

import PyPDF2

class Invoices:
from ..command import Command


class Invoices():
# PROPS

regex = '*_Invoices_TimeFrom*_TimeTo*.zip'
Expand Down Expand Up @@ -39,25 +43,203 @@ def remove(self, invoice_number: str) -> None:
del self.invoices[invoice_number]


# HELPER methods
# PARSING methods

@staticmethod
def invoice2date(string: str) -> str:
# Distinguish between delimiters ..
# (1) .. hyphen ('Shopkonfigurator')
delimiter = '-'
def parse(self, invoice_file) -> list:
# Make sure given invoice is available for parsing
if self.invoice2number(invoice_file) not in self.invoices:
raise Exception

# (2) .. underscore ('Barsortiment')
if delimiter not in string:
delimiter = '_'
# Normalize input
invoice_file = self.invoices[self.invoice2number(invoice_file)]

# Extract general information from file name
invoice_date = self.invoice2date(invoice_file)
invoice_number = self.invoice2number(invoice_file)

# Prepare data storage
invoice = {
'Rechnungsnummer': invoice_number,
'Datum': invoice_date,
'Versandkosten': '0.00',
'Gesamtbetrag': 'keine Angabe',
'Steuern': {},
'Gutscheine': 'keine Angabe',
}

content = []

# Fetch content from invoice file
with open(invoice_file, 'rb') as file:
pdf = PyPDF2.PdfFileReader(file)

for page in pdf.pages:
content += [text.strip() for text in page.extractText().splitlines() if text]

# Determine invoice kind, as those starting with 'R' are formatted quite differently
if invoice_number[:1] == 'R':
# Parse content, looking for information about ..
# (1) .. general information
for line in content:
if 'Rechnungsbetrag gesamt brutto' in line:
invoice['Gesamtbetrag'] = self.convert_number(content[content.index(line) + 1])

if 'Versandpauschale' in line or 'Versandkosten' in line:
invoice['Versandkosten'] = self.convert_number(content[content.index(line) + 2])

# Edge case with two lines preceding shipping cost
if 'versandt an' in line:
invoice['Versandkosten'] = self.convert_number(content[content.index(line) + 2])

# (2) .. taxes
for tax_rate in ['5', '7', '16', '19']:
tax_string = 'MwSt. ' + tax_rate + ',00 %'

if tax_string in content:
invoice['Steuern'][tax_rate + '%'] = self.convert_number(content[content.index(tax_string) + 2])

# (3) .. coupons
if 'Gutschein' in content:
coupons = []

for index in self.build_indices(content, 'Gutschein'):
coupons.append({
'Anzahl': int(content[index - 1]),
'Wert': self.convert_number(content[index + 2]),
})

invoice['Gutscheine'] = coupons

else:
# Gather general information
for index, line in enumerate(content):
# TODO: Get values via regexes
if 'Versandkosten:' in line:
invoice['Versandkosten'] = self.convert_number(line.replace('Versandkosten:', ''))

if 'Gesamtbetrag' in line:
invoice['Gesamtbetrag'] = self.convert_number(line.replace('Gesamtbetrag', ''))

# Fetch first occurrence of ..
# (1) .. 'Nettobetrag' (= starting point)
starting_point = self.get_index(content, 'Nettobetrag')

# (2) .. 'Gesamtbetrag' (= terminal point)
terminal_point = self.get_index(content, 'Gesamtbetrag')

# Try different setup, since some invoices are the other way around
reverse_order = starting_point > terminal_point

if reverse_order:
# In this case, fetch last occurrence of 'EUR' (= terminal point)
terminal_point = self.get_index(content, 'EUR', True)

costs = content[starting_point:terminal_point + 1]

# Determine tax rates:
# reduced = 5% or 7%
# full = 16% or 19%
tax_rates = [self.format_tax_rate(tax_rate) for tax_rate in costs[:2]]

reduced_tax = 0
full_tax = 0

if len(costs) < 8:
costs_list = costs[4].replace('MwSt. gesamt:', '').split()

reduced_tax = costs_list[0]
full_tax = costs_list[1]

if len(costs_list) < 3:
full_tax = costs[5]

elif len(costs) == 9:
reduced_tax = costs[4].split(':')[-1]
full_tax = costs[5]

date_string = basename(string).split(delimiter)[1].replace('.pdf', '')
if 'MwSt. gesamt' in costs[5]:
costs_list = costs[5].split(':')[-1].split()

reduced_tax = costs_list[0]
full_tax = costs_list[1]

if 'MwSt. gesamt' in costs[6]:
reduced_tax = costs[6].split(':')[-1]
full_tax = costs[7]


elif len(costs) in [10, 11]:
index = 6 if 'MwSt.' in costs[6] else 5

reduced_tax = costs[index].split(':')[-1].split()[0]
full_tax = costs[index + 1].split()[0]

else:
reduced_tax = costs[5].split()[0]
full_tax = costs[2].split()[2]

if reduced_tax == 'MwSt.':
reduced_tax = costs[5].split(':')[-1]
full_tax = costs[6]

invoice['Steuern'][tax_rates[0]] = self.convert_number(reduced_tax)
invoice['Steuern'][tax_rates[1]] = self.convert_number(full_tax)

return invoice


# PARSING HELPER methods

def format_tax_rate(self, string: str) -> str:
    """Reduce a cost line to its bare tax rate label.

    Drops the final character of `string`, removes every occurrence of
    'Nettobetrag' and trims surrounding whitespace.

    NOTE(review): presumably inputs look like 'Nettobetrag 7%:' so the
    dropped character is a trailing colon - confirm against real invoices.
    """
    without_last_char = string[:-1]
    label = without_last_char.replace('Nettobetrag', '')

    return label.strip()


def get_index(self, haystack: list, needle: str, last: bool = False) -> int:
    """Return the position of an entry of `haystack` that contains `needle`.

    :param haystack: Sequence of strings to search
    :param needle: Substring to look for inside each entry
    :param last: If truthy, report the last matching entry instead of
        the first one
    :return: Index of the selected matching entry
    :raises IndexError: If no entry contains `needle`
    """
    hits = [position for position, entry in enumerate(haystack) if needle in entry]

    # Indexing an empty list raises `IndexError`, which signals 'not found'
    return hits[-1] if last else hits[0]


def build_indices(self, haystack: list, needle: str) -> list:
    """Collect the positions of all entries that are exactly `needle`.

    Unlike a substring search, an entry only counts when it compares
    equal to `needle`. Returns an empty list if nothing matches.
    """
    positions = []

    for position, entry in enumerate(haystack):
        if entry == needle:
            positions.append(position)

    return positions


def convert_number(self, string) -> str:
    """Normalize a number to a string with two decimal places.

    Accepts anything `str()` can render (text, `int`, `float`). The text
    'EUR' and surrounding whitespace are removed, and a decimal comma is
    turned into a decimal point.

    :raises ValueError: If the remaining text is not a valid number
    """
    text = str(string).replace('EUR', '').strip()

    # With both separators present, as in '1.234,56', the dot only groups
    # thousands and has to go before the comma becomes the decimal point
    if '.' in text and ',' in text:
        text = text.replace('.', '')

    value = float(text.replace(',', '.'))

    return f'{value:.2f}'


# HELPER methods

def invoice2date(self, string: str) -> str:
    """Extract the invoice date from an invoice file name.

    The second part yielded by `self.split_string()` is taken as the date
    (with any '.pdf' removed) and converted from 'YYYYMMDD' to
    'YYYY-MM-DD'.

    :raises ValueError: If that part is not a valid 'YYYYMMDD' date
    """
    parts = self.split_string(string)
    raw_date = parts[1].replace('.pdf', '')
    parsed = datetime.strptime(raw_date, '%Y%m%d')

    return parsed.strftime('%Y-%m-%d')


@staticmethod
def invoice2number(string: str) -> str:
def invoice2number(self, string: str) -> str:
    """Extract the invoice number from an invoice file name.

    If `self.split_string()` yields a single part, `string` is returned
    unchanged (it is treated as being an invoice number already).
    Otherwise the last part, with any '.pdf' removed, is returned.
    """
    parts = self.split_string(string)

    if len(parts) != 1:
        return parts[-1].replace('.pdf', '')

    # Nothing to split off, so hand the input back as given
    return string


def split_string(self, string: str) -> str:
# Strip path information
string = basename(string)

Expand All @@ -69,8 +251,4 @@ def invoice2number(string: str) -> str:
if delimiter not in string:
delimiter = '_'

# (3) .. as well as invoice numbers
if delimiter not in string:
return string

return string.split(delimiter)[-1].replace('.pdf', '')
return string.split(delimiter)
6 changes: 6 additions & 0 deletions knv_cli/knv/orders.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,9 @@ def process_data(self, order_data: list) -> dict:
def orders(self):
    """Return all values of `self.data` as a list sorted by 'Datum', then 'ID'."""
    sort_key = itemgetter('Datum', 'ID')

    records = list(self.data.values())
    records.sort(key=sort_key)

    return records


# DATA HELPER methods

def convert_tax_rate(self, string: str) -> str:
    """Turn a tax rate such as '19,00' into a percentage label such as '19%'.

    The value is rendered with `str()`, every occurrence of ',00' is
    removed and a percent sign is appended.
    """
    rate = str(string).replace(',00', '')

    return f'{rate}%'

0 comments on commit 97bd5d5

Please sign in to comment.