1
'''
ML part of the IP-APP-Price-Tracker project.

OCR invoice recognition using the pytesseract module to extract:
1. Category of expense
2. Total amount spent
3. Tax amount
4. Score (accuracy of the OCR)
'''
9
+
10
+ #importing necessary modules
11
+ from flask import Flask , request
12
+ import pytesseract
13
+ import os
14
+ import cv2
15
+ import json
16
+ import re
17
+ from Levenshtein import distance
18
+ from werkzeug .utils import secure_filename
19
+
20
+ #local folder to save the image of invoice
21
UPLOAD_FOLDER = 'D:/IP-App-Price-Tracker/src/uploads/'
# File extensions accepted for uploaded invoice images.
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg'}
23
+
24
+ #starting Flask server to get image
25
# Flask application that receives the uploaded invoice image; the upload
# directory is stored in its configuration under 'UPLOAD_FOLDER'.
app = Flask(__name__)
app.config.update(UPLOAD_FOLDER=UPLOAD_FOLDER)
27
+
28
+ '''
29
+ OCR Processing Part
30
+ '''
31
+
32
+ #basic dictionary to classify the text extracted from image after OCR processing
33
# Result template: every key starts with its own empty list that the
# processing functions append extracted values to.
dic = {field: [] for field in ('category', 'amount', 'score', 'tax')}
37
+
38
+ #assigning directory for pytesseract
39
# Path of the Tesseract executable used by pytesseract. The TESSERACT_CMD
# environment variable, when set, overrides the original hard-coded Windows
# path so the script can run on machines with a different install location.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    r'C:\Users\USER\AppData\Local\Tesseract-OCR\tesseract.exe',
)
40
+
41
+ #OCR processing the Image into a string
42
def ocr_process(img, resolution=450, page_seg_method='3'):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        img: Image object handed unchanged to
            ``pytesseract.image_to_string``.
        resolution: Not used by this function; kept so existing callers
            that pass it keep working.
        page_seg_method: Value for Tesseract's ``--psm`` option. It is
            formatted into the config string, so both ``'3'`` and ``3``
            are accepted.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image
        (the recognised text).
    """
    config = '--psm {}'.format(page_seg_method)
    return pytesseract.image_to_string(img, lang="eng", config=config)
46
+
47
+ #Categorising the Invoices according to some keywords from text extracted from the images
48
def categories(result_string, dic=dic):
    """Classify an invoice by keywords found in its OCR text.

    The keyword groups are tried in a fixed priority order (dining,
    apparel, medical, accessories); only the first group with a match is
    recorded. Matching is case-insensitive and substring based.

    Args:
        result_string: Text extracted from the invoice image.
        dic: Result dictionary containing a ``'category'`` list. Defaults
            to the module-level ``dic``.

    Returns:
        The same ``dic`` object, with at most one label appended to
        ``dic['category']``.
    """
    keyword_patterns = (
        ('Dining', r'server|Food|Dining|order|table|restaurant'),
        ('Apparel', r'shirt|pant|jeans|clothing|sleeve|men|ladies'),
        ('Medical', r'medical|pharmacy|hospital|doctor'),
        # "access?ories" accepts the correct spelling "accessories" as well
        # as the misspelled "accesories" the pattern used to look for.
        ('Accessories', r'access?ories|earring'),
    )

    for label, pattern in keyword_patterns:
        if re.search(pattern, result_string, re.IGNORECASE):
            dic['category'].append(label)
            break
    return dic
66
+
67
+ #Scoring - removing frequently used words with '%d'
68
def scoring(regex_expression, item):
    """Score how cleanly a matched snippet reduces to a bare amount.

    The amount in ``item`` is replaced by the placeholder ``'%d'``, the
    text is lower-cased, and common label words and currency markers are
    removed. The score is 100 minus the Levenshtein distance between the
    remaining text and ``'%d'``, so a snippet made up only of those labels
    and an amount scores 100 and every leftover character lowers it.

    Args:
        regex_expression: Not used by this function; kept for
            compatibility with existing callers.
        item: Matched snippet of invoice text.

    Returns:
        ``100 - distance('%d', cleaned_item)``.
    """
    cleaned = re.sub(r'[0-9]{1,15}.{1,15}[0-9]{2}', '%d', item).lower()

    # The text is already lower-case here, so the currency codes have to be
    # lower-case as well; the former 'USD' / 'INR' could never match.
    for noise in ('total', 'amount', 'balance', 'due', '$', '₹', 'usd', 'inr'):
        cleaned = cleaned.replace(noise, '')

    return 100 - distance('%d', cleaned.strip())
83
+
84
+ #Extracting Total amount and Tax amount from the bill
85
def amount_parsser(invoice_string, regex_expression, dic=dic):
    """Extract total and tax amounts from invoice text into ``dic``.

    Args:
        invoice_string: OCR text of the invoice.
        regex_expression: Mapping with the keys ``'regex'`` (pattern for
            total lines) and ``'regex_tax'`` (pattern for tax lines). Each
            match is treated as a string, so the patterns should contain at
            most one capture group.
        dic: Result dictionary with ``'amount'``, ``'score'`` and ``'tax'``
            lists. Defaults to the module-level ``dic``.

    Returns:
        The same ``dic`` object, updated in place. When the total pattern
        matches nothing, ``'Nothing matched'`` is printed and ``dic`` is
        returned unchanged (tax matches are not processed in that case).
    """
    amount_pattern = re.compile(r'[0-9]{1,15}.{1,15}[0-9]{2}')

    total_matches = re.findall(regex_expression['regex'], invoice_string,
                               re.IGNORECASE)
    tax_matches = re.findall(regex_expression['regex_tax'], invoice_string,
                             re.IGNORECASE)

    if not total_matches:
        print('Nothing matched')
        return dic

    # Total amount processing
    for item in total_matches:
        lowered = item.lower()
        # Matches mentioning "tax" or "last" are not the invoice total.
        # They are skipped instead of being deleted from the list during
        # iteration, which used to make the loop jump over the next match.
        if 'tax' in lowered or 'last' in lowered:
            continue

        found = amount_pattern.search(item)
        if found is not None:
            # extract the amount and score it at the same time
            dic['amount'].append(found.group(0).replace(',', ''))
            dic['score'].append(scoring(regex_expression, lowered))

    # Tax amount processing
    for item in tax_matches:
        # Skip "last ..." lines. The previous code deleted an entry from the
        # list of total matches here (wrong list, possible IndexError) and
        # then still recorded the tax value.
        if 'last' in item.lower():
            continue

        found = amount_pattern.search(item)
        if found is not None:
            dic['tax'].append(found.group(0).replace(',', ''))

    return dic
130
+
131
+ '''
132
+ API PART
133
+ '''
134
+ #Checking allowed extensions of image
135
def allowed_file(filename):
    """Return True when *filename* has an extension in ALLOWED_EXTENSIONS.

    The extension is the text after the last dot and is compared
    case-insensitively; a name without any dot is rejected.
    """
    _stem, dot, extension = filename.rpartition('.')
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS
138
+
139
+ #Request file, process it and return json file with information
140
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Receive an invoice image, run OCR on it and return the extracted data.

    Expects a POST request carrying the image in the ``file`` form field.
    The image is saved to the configured upload folder, converted to
    grayscale, passed through OCR, and the recognised text is scanned for
    a category, total amounts (with scores) and tax amounts.

    Returns:
        On success, a JSON string with the keys ``amount``, ``category``,
        ``score`` and ``tax``. For any request that cannot be processed,
        a ``(json_error_body, 400)`` tuple instead of falling through and
        returning ``None``.
    """
    if request.method != 'POST':
        return json.dumps({'error': 'Send the invoice image with a POST request'}), 400

    # check if the post request has the file part
    uploaded = request.files.get('file')
    if uploaded is None:
        print('No file part')
        return json.dumps({'error': 'No file part'}), 400

    if uploaded.filename == '':
        print('No selected file')
        return json.dumps({'error': 'No selected file'}), 400

    if not allowed_file(uploaded.filename):
        return json.dumps({'error': 'File type not allowed'}), 400

    filename = secure_filename(uploaded.filename)
    if not filename:
        # secure_filename can strip a name down to nothing.
        return json.dumps({'error': 'Invalid file name'}), 400

    image_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    uploaded.save(image_path)

    # reading the image; cv2.imread returns None when it cannot decode it
    img = cv2.imread(image_path)
    if img is None:
        return json.dumps({'error': 'Could not read the uploaded image'}), 400
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # calling OCR function
    ocr_result = ocr_process(img)

    # Regular expressions for finding the total and tax amounts
    totalAmountRegex = {
        'regex_tax': r'(gst[^0-9]{1,30}[0-9,]*\.\d\d)',
        'regex': r'(?<!Tax )(?<!Sub)(?<!Sub )(Total[^0-9]{1,30}[0-9,]*\.\d\d)',
    }

    # A fresh result dictionary per request, so values extracted from one
    # invoice are not mixed into the response for the next one.
    result_dic = {'category': [], 'amount': [], 'score': [], 'tax': []}
    categories(ocr_result, result_dic)
    amount_parsser(ocr_result, totalAmountRegex, result_dic)

    # converting dictionary into a json string and returning it
    return json.dumps(result_dic, sort_keys=True)
173
+
174
# Start the Flask server only when this module is executed as a script.
if __name__ == "__main__":
    app.run()
0 commit comments