Skip to content

Commit 895e4b9

Browse files
committed
adding files
0 parents  commit 895e4b9

File tree

213 files changed

+238
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

213 files changed

+238
-0
lines changed

Procfile.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
web: gunicorn app_inv_ocr:app
1.23 KB
Binary file not shown.

app_inv_ocr.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import requests
2+
'''
3+
I've used a local directory for the sake of testing the module
4+
Modify the url according to the other modules
5+
'''
6+
7+
# Local test image; point this at any receipt image when trying the module.
filename='D:/IP-App-Price-Tracker/src/invoice_recognition/reciept_image_dataset/1204-receipt.jpg'
url = 'http://127.0.0.1:5000/'  # local development server, used for testing only

# NOTE: "{token}" is sent literally; substitute a real token if the target
# service checks the Authorization header.
headers = {'authorization': "Bearer {token}"}

# The upload endpoint only processes POST requests and reads the image from
# the "file" form field, so the image is posted as multipart form data.
# The context manager closes the file handle once the request has completed.
with open(filename, "rb") as image_file:
    response = requests.post(url, files={"file": image_file}, headers=headers)

print(response.text)

invoice_rec_tesseract.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
'''
2+
ML part for IP-APP-Price-Tracker project
3+
OCR invoice recognition using pytesseract module to extract:
4+
1. Category of expense
5+
2. Total Amount spent
6+
3. Tax Amount
7+
4. Score(accuracy of OCR)
8+
'''
9+
10+
#importing necessary modules
11+
from flask import Flask, request
12+
import pytesseract
13+
import os
14+
import cv2
15+
import json
16+
import re
17+
from Levenshtein import distance
18+
from werkzeug.utils import secure_filename
19+
20+
#local folder to save the image of invoice
21+
# Defaults to the original local development path; set the UPLOAD_FOLDER
# environment variable to relocate it without editing the code.
UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', 'D:/IP-App-Price-Tracker/src/uploads/')
# File extensions accepted for uploaded images.
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg'}
23+
24+
#starting Flask server to get image
25+
app = Flask(__name__)
26+
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
27+
28+
'''
29+
OCR Processing Part
30+
'''
31+
32+
#basic dictionary to classify the text extracted from image after OCR processing
33+
dic = {'category': [],
34+
'amount': [],
35+
'score': [],
36+
'tax': []}
37+
38+
#assigning directory for pytesseract
39+
# Defaults to the original Windows install location; set the TESSERACT_CMD
# environment variable to use a Tesseract binary installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Users\USER\AppData\Local\Tesseract-OCR\tesseract.exe',
)
40+
41+
#OCR processing the Image into a string
42+
def ocr_process(img, resolution=450, page_seg_method='3'):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        img: Image to recognise; passed unchanged to
            ``pytesseract.image_to_string``.
        resolution: Not used by this function; kept so that existing callers
            which pass it keep working.
        page_seg_method: Value for Tesseract's ``--psm`` option. May be given
            as a string or as an integer.

    Returns:
        The text extracted from the image.
    """
    # str() lets an integer mode such as 3 work as well as the string '3'.
    config = '--psm ' + str(page_seg_method)
    return pytesseract.image_to_string(img, lang="eng", config=config)
46+
47+
#Categorising the Invoices according to some keywords from text extracted from the images
48+
def categories(result_string, dic=None):
    """Classify an invoice by keywords found in its OCR text.

    The categories are checked in a fixed priority order (Dining, Apparel,
    Medical, Accessories); the first one with a keyword hit is appended to
    ``dic['category']``. Matching is case-insensitive and substring-based.
    Nothing is appended when no keyword is found.

    Args:
        result_string: Text extracted from the invoice image.
        dic: Result dictionary with a ``'category'`` list. When omitted, the
            module-level ``dic`` is used.

    Returns:
        The same dictionary that was updated.
    """
    if dic is None:
        # Looked up at call time rather than bound when the function is
        # defined; the target is still the module-level result dictionary.
        dic = globals()['dic']

    keyword_patterns = (
        ('Dining', r'server|food|dining|order|table|restaurant'),
        ('Apparel', r'shirt|pant|jeans|clothing|sleeve|men|ladies'),
        ('Medical', r'medical|pharmacy|hospital|doctor'),
        # The original only matched the misspelling 'accesories'; it is kept
        # for compatibility and the correct spellings are accepted as well.
        ('Accessories', r'accesories|accessor(?:y|ies)|earring'),
    )

    for label, pattern in keyword_patterns:
        if re.search(pattern, result_string, re.IGNORECASE):
            dic['category'].append(label)
            break
    return dic
66+
67+
#Scoring - removing frequently used words with '%d'
68+
def scoring(regex_expression, item):
    """Score how cleanly a matched invoice line reduces to a bare amount.

    The amount in ``item`` is replaced by the placeholder ``'%d'``, the text
    is lowercased, and common filler words and currency markers are removed.
    The score is 100 minus the Levenshtein distance between ``'%d'`` and what
    is left, so a line made up only of filler words and an amount scores 100
    and every leftover character lowers the score.

    Args:
        regex_expression: Not used by this function; kept for interface
            compatibility with existing callers.
        item: The matched line of invoice text.

    Returns:
        The integer score.
    """
    # Collapse the numeric amount into the placeholder, then normalise case.
    item_revised = re.sub('[0-9]{1,15}.{1,15}[0-9]{2}', '%d', item).lower()

    # The text is lowercase at this point, so the currency codes must be
    # lowercase too; upper-case 'USD' / 'INR' could never match here.
    for filler in ('total', 'amount', 'balance', 'due', '$', '₹', 'usd', 'inr'):
        item_revised = item_revised.replace(filler, '')
    item_revised = item_revised.strip()

    return 100 - distance('%d', item_revised)
83+
84+
#Extracting Total amount and Tax amount from the bill
85+
def amount_parsser(invoice_string, regex_expression, dic=None):
    """Extract total and tax amounts from OCR'd invoice text.

    Args:
        invoice_string: Text extracted from the invoice image.
        regex_expression: Mapping with two patterns: ``'regex'`` for lines
            holding a total and ``'regex_tax'`` for lines holding a tax
            amount. Each pattern is expected to yield plain strings from
            ``re.findall`` (i.e. contain at most one capturing group).
        dic: Result dictionary with ``'amount'``, ``'score'`` and ``'tax'``
            lists. When omitted, the module-level ``dic`` is used.

    Returns:
        The same dictionary that was updated. If no total line matches,
        ``'Nothing matched'`` is printed and the dictionary is returned
        unchanged (tax lines are not processed in that case).
    """
    if dic is None:
        # Looked up at call time rather than bound when the function is
        # defined; the target is still the module-level result dictionary.
        dic = globals()['dic']

    amount_pattern = '[0-9]{1,15}.{1,15}[0-9]{2}'

    target_found = re.findall(regex_expression['regex'], invoice_string, re.IGNORECASE)
    tax_found = re.findall(regex_expression['regex_tax'], invoice_string, re.IGNORECASE)

    if not target_found:
        print('Nothing matched')
        return dic

    # Total amount processing. Unwanted matches are skipped rather than
    # deleted from the list being iterated, which used to make the loop jump
    # over the element following each deleted one.
    for item in target_found:
        lowered = item.lower()
        if 'tax' in lowered or 'last' in lowered:
            continue

        target_amount = re.search(amount_pattern, item)
        if target_amount is not None:
            # Extract the amount and score the line it came from.
            dic['amount'].append(target_amount.group(0).replace(',', ''))
            dic['score'].append(scoring(regex_expression, lowered))

    # Tax amount processing. Lines mentioning 'last' are skipped; the
    # original deleted from the totals list here instead, which could raise
    # IndexError and still recorded the unwanted line.
    for item in tax_found:
        if 'last' in item.lower():
            continue

        tax_amount = re.search(amount_pattern, item)
        if tax_amount is not None:
            dic['tax'].append(tax_amount.group(0).replace(',', ''))

    return dic
130+
131+
'''
132+
API PART
133+
'''
134+
#Checking allowed extensions of image
135+
def allowed_file(filename):
    """Tell whether ``filename`` carries one of the permitted image extensions.

    The extension is the text after the last dot, compared case-insensitively
    against ``ALLOWED_EXTENSIONS``. Names without a dot are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
138+
139+
#Request file, process it and return json file with information
140+
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Receive an invoice image, run OCR on it and return the parsed fields.

    The image must be sent with a POST request as multipart form data under
    the ``file`` field. The file is saved to the configured upload folder,
    converted to grayscale, passed through OCR, and the recognised text is
    classified and searched for total and tax amounts.

    Returns:
        A ``(body, status, headers)`` response tuple. On success the body is
        a JSON object with the keys ``amount``, ``category``, ``score`` and
        ``tax``. On failure the body is a JSON object with an ``error``
        message and the status is 400 (bad upload) or 405 (not a POST).
    """
    json_headers = {'Content-Type': 'application/json'}

    def _error_response(message, status):
        # Every early exit returns a real response; falling through and
        # returning None makes Flask fail with a server error.
        return json.dumps({'error': message}), status, json_headers

    if request.method != 'POST':
        return _error_response('Send the invoice image with a POST request', 405)

    # Check that the request actually carries a usable file part.
    file = request.files.get('file')
    if file is None:
        print('No file part')
        return _error_response('No file part', 400)
    if file.filename == '':
        print('No selected file')
        return _error_response('No selected file', 400)
    if not allowed_file(file.filename):
        return _error_response('File type not allowed', 400)

    filename = secure_filename(file.filename)
    if not filename:
        # Sanitising can leave nothing behind for unusual names.
        return _error_response('Invalid file name', 400)

    saved_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(saved_path)

    # Read the saved image; imread yields None when it cannot decode it.
    img = cv2.imread(saved_path)
    if img is None:
        return _error_response('Uploaded file could not be read as an image', 400)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Run OCR on the grayscale image.
    ocr_result = ocr_process(img)

    # Regular expressions for finding the total and tax amounts. Raw strings
    # keep the backslashes without relying on invalid escape sequences.
    totalAmountRegex = {'regex_tax': r'(gst[^0-9]{1,30}[0-9,]*\.\d\d)',
                        'regex': r'(?<!Tax )(?<!Sub)(?<!Sub )(Total[^0-9]{1,30}[0-9,]*\.\d\d)'}

    # A fresh result dictionary per request, so values parsed from earlier
    # uploads do not accumulate in the response.
    result_dic = {'category': [], 'amount': [], 'score': [], 'tax': []}
    result_dic = categories(ocr_result, result_dic)
    result_dic = amount_parsser(ocr_result, totalAmountRegex, result_dic)

    # Serialise the result and send it back as JSON.
    app_json = json.dumps(result_dic, sort_keys=True)
    return app_json, 200, json_headers
173+
174+
if __name__ == '__main__':
175+
app.run()

ipapp_bill_ocr.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import easyocr
2+
import PIL
3+
from PIL import ImageDraw
4+
import keras
5+
from keras.preprocessing.image import save_img
6+
from keras.preprocessing.image import img_to_array
7+
reader = easyocr.Reader(['en'], gpu = False)
8+
im = PIL.Image.open("1184-receipt.jpg")
9+
bounds = reader.readtext('1184-receipt.jpg')
10+
def draw_boxes(image, bounds, color='yellow', width=2):
    """Draw a closed outline for every entry of ``bounds`` onto ``image``.

    The first element of each entry must hold four points. A line is drawn
    from point to point and back to the first one, using the given ``color``
    and line ``width``. The drawing is done on ``image`` itself, which is
    then returned.
    """
    canvas = ImageDraw.Draw(image)
    for detection in bounds:
        first, second, third, fourth = detection[0]
        # Repeat the first point at the end so the outline is closed.
        outline = [*first, *second, *third, *fourth, *first]
        canvas.line(outline, fill=color, width=width)
    return image
16+
17+
print(draw_boxes(im, bounds))
18+
for i in bounds:
19+
print(i[1])
20+
img_array = img_to_array(im)
21+
save_img('1184-receipt-boxed.jpg', img_array)
58.6 KB
79.8 KB
128 KB
108 KB
84.5 KB

0 commit comments

Comments
 (0)