HarshiniR4
diff --git a/‎Procfile.txt‎
Lines changed: 1 addition & 0 deletions b/‎Procfile.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎__pycache__/extract_amount.cpython-37.pyc‎
1.23 KB b/‎__pycache__/extract_amount.cpython-37.pyc‎
1.23 KB
diff --git a/‎app_inv_ocr.py‎
Lines changed: 17 additions & 0 deletions b/‎app_inv_ocr.py‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎invoice_rec_tesseract.py‎
Lines changed: 175 additions & 0 deletions b/‎invoice_rec_tesseract.py‎
Lines changed: 175 additions & 0 deletions
diff --git a/‎ipapp_bill_ocr.py‎
Lines changed: 21 additions & 0 deletions b/‎ipapp_bill_ocr.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎reciept_image_dataset/1000-receipt.jpg‎
58.6 KB b/‎reciept_image_dataset/1000-receipt.jpg‎
58.6 KB
diff --git a/‎reciept_image_dataset/1001-receipt.jpg‎
79.8 KB b/‎reciept_image_dataset/1001-receipt.jpg‎
79.8 KB
diff --git a/‎reciept_image_dataset/1002-receipt.jpg‎
128 KB b/‎reciept_image_dataset/1002-receipt.jpg‎
128 KB
diff --git a/‎reciept_image_dataset/1003-receipt.jpg‎
108 KB b/‎reciept_image_dataset/1003-receipt.jpg‎
108 KB
diff --git a/‎reciept_image_dataset/1004-receipt.jpg‎
84.5 KB b/‎reciept_image_dataset/1004-receipt.jpg‎
84.5 KB
@@ -0,0 +1 @@
+web: gunicorn invoice_rec_tesseract:app
@@ -0,0 +1,17 @@
+import requests
+'''
+I've used locasl directory for the sake of testing the module
+Modify the url according to the other modules
+'''
+
+filename='D:/IP-App-Price-Tracker/src/invoice_recognition/reciept_image_dataset/1204-receipt.jpg'
+url = 'http://127.0.0.1:5000/'  #I've used this for testing purpose
+
+# files = {'image': (open(filename, 'rb'), "image/jpeg")}
+
+files = {"file": ( open(filename, "rb"))}
+headers = {'authorization': "Bearer {token}"}
+# print(requests.post(url, files=files))
+
+response = requests.request('GET', url, files=files, headers=headers)
+print(response.text)
@@ -0,0 +1,175 @@
+'''
+ML part for IP-APP-Price-Tracker project
+    OCR invoice recognition using pytesseract module to extract:
+        1. Category of expense
+        2. Total Amount spent
+        3. Tax Amount
+        4. Score(accuracy of OCR)
+'''
+
+#importing necessary modules
+from flask import Flask, request
+import pytesseract
+import os
+import cv2
+import json
+import re
+from Levenshtein import distance
+from werkzeug.utils import secure_filename
+
+#local folder to save the image of invoice
+UPLOAD_FOLDER = 'D:/IP-App-Price-Tracker/src/uploads/'   
+ALLOWED_EXTENSIONS = set([ 'png', 'jpg', 'jpeg'])  #the extensions allowed for images uplaoded
+
+#starting Flask server to get image 
+app = Flask(__name__)
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+
+'''
+OCR  Processing Part
+'''
+
+#basic dictionary to classify the text extracted from image after OCR processing
+dic = {'category': [],
+       'amount': [],
+       'score': [],
+       'tax': []}
+
+#assigning directory for pytesseract
+pytesseract.pytesseract.tesseract_cmd = r'C:\Users\USER\AppData\Local\Tesseract-OCR\tesseract.exe'
+
+#OCR processing the Image into a string 
+def ocr_process(img, resolution=450, page_seg_method='3'):
+    txt=""
+    txt = "".join([txt, pytesseract.image_to_string(img, lang="eng",config='--psm ' + page_seg_method)])
+    return txt
+
+#Categorising the Invoices according to some keywords from text extracted from the images
+def categories(result_string,dic=dic):
+    
+    #Categories
+    dining=re.findall('(server)|(Food)|(Dining)|(order)|(table)|(restaurant)',result_string, re.IGNORECASE)
+    apparel=re.findall('(shirt)|(pant)|(jeans)|(clothing)|(sleeve)|(men)|(ladies)',result_string,re.IGNORECASE)
+    medicine=re.findall('(medical)|(pharmacy)|(hospital)|(doctor)',result_string,re.IGNORECASE)
+    accessories=re.findall('(accesories)|(earring)',result_string,re.IGNORECASE)
+    
+    #Appending the Categories into the dictionary
+    if(len(dining)!=0):
+       dic['category'].append('Dining')
+    elif(len(apparel)!=0):
+        dic['category'].append('Apparel')
+    elif(len(medicine)!=0):
+        dic['category'].append('Medical')
+    elif(len(accessories)!=0):
+        dic['category'].append('Accessories')
+    return dic
+
+#Scoring - removing frequently used words with '%d'
+def scoring(regex_expression, item):
+    item_revised = re.sub('[0-9]{1,15}.{1,15}[0-9]{2}', '%d', item).lower()
+    #removing most frequently repeated words
+    item_revised = (item_revised.replace('total', '')
+                    .replace('amount', '')
+                    .replace('balance', '')
+                    .replace('due', '')
+                    .replace('$', '')
+                    .replace('₹','')
+                    .replace('USD', '')
+                    .replace('INR','')
+                    .strip())
+    #if all the strings are replaced with '%d' scpre=100, even if one extra character the score will become less
+    score = 100 - distance('%d', item_revised)   
+    return score                                
+
+#Extracting Total amount and Tax amount from the bill
+def amount_parsser(invoice_string, regex_expression, dic=dic):
+    #comparing using regular expression 
+    target_found = re.findall(regex_expression['regex'], invoice_string, re.IGNORECASE)
+    tax_found=re.findall(regex_expression['regex_tax'],invoice_string, re.IGNORECASE)
+    
+    if len(target_found) == 0 :
+         print('Nothing matched')
+         return dic
+    else:
+    #Total Amount Processing
+        for ind, item in enumerate(target_found):
+            if_tax_in_string = 'tax' in item.lower()
+            if_last_in_string = 'last' in item.lower()
+            
+            if any([if_tax_in_string, if_last_in_string]):
+                del(target_found[ind])  #delete the part with tax and last to find Total Amount
+
+            else:
+                target_amount = re.search('[0-9]{1,15}.{1,15}[0-9]{2}', item)
+                
+                if target_amount is not None:
+                    #extract the amount and score it at the same time
+                    score = scoring(regex_expression, item.lower())
+                    amount = target_amount.group(0).replace(',', '')
+                    
+                    #appending values into the dictionary
+                    dic['amount'].append(amount)
+                    dic['score'].append(score)
+    #Tax Amount Processing
+        for ind,item in enumerate(tax_found):
+        
+            if_last_in_string = 'last' in item.lower()
+            if (if_last_in_string):
+                del(target_found[ind])  
+                
+            tax_amount=re.search('[0-9]{1,15}.{1,15}[0-9]{2}', item)
+            
+            if tax_amount is not None:
+                 #extract the amount and score it at the same time
+                    score = scoring(regex_expression, item.lower())
+                    tax = tax_amount.group(0).replace(',', '')  
+                    
+                    #appending values into the dictionary
+                    dic['tax'].append(tax)
+        return dic                           #return updated Dictionary
+
+'''
+API PART
+'''
+#Checking allowed extensions of image
+def allowed_file(filename):
+    return '.' in filename and \
+           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+#Request file, process it and return json file with information
+@app.route('/', methods=['GET', 'POST'])
+def upload_file():
+    
+    if request.method == 'POST':
+       # check if the post request has the file part
+        if 'file' not in request.files:
+            print('No file part')
+            
+        file = request.files['file']
+        if file.filename == '':
+            print('No selected file')
+            
+    #Getting the Image from Server and Calling the Processing Functions
+        if file and allowed_file(file.filename):            
+            
+            filename = secure_filename(file.filename)
+            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
+            #reading the image
+            img= cv2.imread(os.path.join(app.config['UPLOAD_FOLDER'], filename))
+            img=cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+            #calling OCR function
+            ocr_result=ocr_process(img)
+            #Regular Expression for finding Total and Tax Amount
+            totalAmountRegex = {'regex_tax': '(gst[^0-9]{1,30}[0-9,]*\.\d\d)',
+                                 'regex':'(?<!Tax )(?<!Sub)(?<!Sub )(Total[^0-9]{1,30}[0-9,]*\.\d\d)'}
+           
+            #final dictionary that contains updated values
+            result_dic=categories(ocr_result,dic)
+            result_dic = amount_parsser(ocr_result, totalAmountRegex, dic)
+            #converting dictionary into json file
+            app_json = json.dumps(result_dic, sort_keys=True)
+            #returning json file back 
+            return app_json
+
+#start the server through app.run() only when this file is executed directly
+if __name__ == '__main__':
+    app.run()
@@ -0,0 +1,21 @@
+import easyocr
+import PIL
+from PIL import ImageDraw
+import keras
+from keras.preprocessing.image import save_img
+from keras.preprocessing.image import img_to_array
+reader = easyocr.Reader(['en'], gpu = False)
+im = PIL.Image.open("1184-receipt.jpg")
+bounds = reader.readtext('1184-receipt.jpg')
+def draw_boxes(image, bounds, color='yellow', width=2):
+    draw = ImageDraw.Draw(image)
+    for bound in bounds:
+        p0, p1, p2, p3 = bound[0]
+        draw.line([*p0, *p1, *p2, *p3, *p0], fill=color, width=width)
+    return image
+
+print(draw_boxes(im, bounds))
+for i in bounds:
+    print(i[1])
+img_array = img_to_array(im)
+save_img('1184-receipt-boxed.jpg', img_array)