Asset Confirms

This program extracts floats from an Excel file and searches local PDFs for matching floats using OCR.

User provides a directory ('dir') to the console.
Search dir for an .xlsx file
Once found, search .xlsx file for floats (dollars)
Search all PDFs in dir for matching floats
- If not text-readable, use OCR to convert to text
Write matching floats and their locations to new .xlsx file. Also report non-matches.

Kickoff

Finds first .xlsx file in directory, extracts floats, then breaks loop. excelRead()

main() #Receives path by user in console then calls doExtract()

do_extract(path) #Path passed from main()
...
  for f_now in os.listdir(path):
    ext = os.path.splitext(f_now)[1]
    if ext.lower() in ['.xlsx']:
      float_dict = extract_excel(os.path.join(path, f_now))
      break
...

Loop through each PDF

In doExtract()

for f_now in os.listdir(path):
  ext = os.path.splitext(f_now)[1]
  if ext.lower() in ['.pdf']:
    print(path, f_now)
    doc_results = {}
    doc_results = convert_doc_pdf(os.path.join(path, f_now), float_dict) #Returns float matches in a PDF

Now in covertDocPDF() makeCV2(), doJPGPNG()

convert_doc_pdf(path, float_dict)
...
  doc_results = {}
    try:
      img = Image(filename=path, resolution=350)
    except:
      print('Corrupt File - {0}'.format(path))
      return

    img_seq = img.sequence

    for i in range(len(img_seq)):
      cv_ready = make_cv2(Image(image=img_seq[i]))
      text = do_jpg_png(cv_ready)
      results = find_text(text, float_dict, [path, i+1, len(img_seq)]) #Find matches between float_dict and given text
      for key, val in results.items():
        if key in doc_results:
          doc_results[key].append(val)
        else:
          doc_results[key] = val
      print('Scan {0} page {1}/{2} completed'.format(str(path).split('\\')[-1], i+1, len(img_seq)))
    return doc_results

Back in doExtract(), only do something if matches were found. Then return matches and original float list.

if doc_results != None: #If any float matches were
  for key, val in doc_results.items():
    if key in end_dict:
      end_dict[key].append(val)
    else:
      end_dict[key] = [val]

print(len(end_dict))
return end_dict, float_dict

Finish by passing dicts to write new .xlsx file writeExcel()

main()
...
  write_excel(var, end_dict, not_found)
  return

Supporting Functions

Import floats from XLSX file

def extract_excel(file):
  xl = pd.ExcelFile(file)
  df = xl.parse(xl.sheet_names[0])
  n_list = df.values.tolist()

  floats = {}
  for element in n_list:
    for i in element:
      if isinstance(i, float):
        floats[str(i)] = element
        break

  final_dict = {}
  for key, val in floats.items():
    temp_list = []
    for x in val:
      temp_list.append(str(x))
    final_dict[str(key)] = temp_list

  print('Distribution list imported')
  return final_dict

Convert PDF to PNG to CV2-usable for OCR

def make_cv2(img):
  img.convert("png")
  img.background_color = Color('white')
  img.format = 'tif'
  img.alpha_channel = 'remove'

  #Convert PNG to cv2-usable PNG
  img_buffer = np.asarray(bytearray(img.make_blob()), dtype=np.uint8)
  retval = cv2.imdecode(img_buffer, 0)
  return retval


  def do_jpg_png(cvF, ssn_re=''):
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files (x86)\\Tesseract-OCR\\tesseract'

    gray = cv2.bitwise_not(cvF)
    #thresh = cv2.adaptiveThreshold(cvF, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 3, 1)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    #kernel = np.ones((1,1),np.uint8)
    #thresh = cv2.erode(thresh,kernel,iterations = 1)
    #blur = cv2.GaussianBlur(thresh,(1,1),0)
    #thresh = cv2.addWeighted(blur,1.5,thresh,-0.5,0)
    #im2 = cv2.resize(thresh, (int(thresh.shape[0]*0.25), int(thresh.shape[1]*0.25)))
    #cv2.imshow('img',im2)
    #cv2.waitKey(0)
    #cv2.destroyAllWindows()

    #Tesseract find text from img
    #final = ssn_re.findall(pytesseract.image_to_string(thresh).replace(' ', ''))
    #final = formatSSN(final)
    final = pytesseract.image_to_string(thresh)
    return final

Text-match OCR'd image and floats

def find_text(text, float_dict, docLoc):
  found_dict = {}

  for key in float_dict.keys():
    if key in text:
      ind = text.index(key)
      b_text = text[:ind].split('\n')[-1]
      b_ind = 2
      while len(b_text) < 12:
        b_text = text[:ind].split('\n')[-b_ind]+' '+b_text
        b_ind += 1

      a_text = text[ind:].split('\n')[0]
      final_t = [b_text+a_text]
      doc_str = str(docLoc[0]).split('\\')[-1]
      page_str = 'page {0}/{1}'.format(docLoc[1], docLoc[2])
      final_t.append(doc_str+' - '+page_str)
      found_dict[key] = final_t
  return found_dict

Writes output to new XLSX file

def write_excel(path, f_dict, not_found):
  workbook = xlsxwriter.Workbook(path+'/Output.xlsx')
  worksheet = workbook.add_worksheet()

  format_h = workbook.add_format()
  format_h.set_bold()
  format_h.set_bg_color('#d9d9d9')
  format_h.set_align('vcenter')
  format_h.set_align('center')
  format_h.set_border(1)
  format_h.set_border_color('#000000')

  format_l = workbook.add_format()
  format_l.set_bold()
  format_l.set_font_color('blue')
  format_l.set_align('vcenter')

  format_a = workbook.add_format()
  format_a.set_align('vcenter')
  format_a.set_align('center')

  cols = ['Amount', 'Description', 'File']
  for ind, val in enumerate(cols):
    worksheet.write(1, ind+1, val, format_h)

  worksheet.set_column('A:A', 1.0)
  worksheet.set_column('B:B', 10)
  worksheet.set_column('C:C', 51)
  worksheet.set_column('D:D', 20)
  worksheet.set_row(0, 10)

  r_count = 2
  for key, val in f_dict.items():
    for inner_val in val:
      worksheet.write(r_count, 1, ('$'+key), format_a)
      worksheet.write(r_count, 2, inner_val[0])
      worksheet.write_url(r_count, 3, 'files\{0}'.format(inner_val[1].split(' ')[0]), string=inner_val[1], cell_format=format_l)
      r_count += 1

  r_count += 1
  worksheet.write(r_count, 1, "Not Found", format_h)
  r_count += 1
  for val in not_found:
    worksheet.write(r_count, 1, ('$'+val), format_a)
    r_count += 1

  workbook.close()
  return

Name		Name	Last commit message	Last commit date
Latest commit History 22 Commits
__pycache__		__pycache__
.gitignore		.gitignore
Filler text.docx		Filler text.docx
README.md		README.md
_config.yml		_config.yml
excelRead.py		excelRead.py
excelWrite.py		excelWrite.py
main.py		main.py
main.spec		main.spec
pdfClass.py		pdfClass.py
pdfstrip.py		pdfstrip.py

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Repository files navigation

Asset Confirms

Kickoff

Loop through each PDF

Supporting Functions

Import floats from XLSX file

Convert PDF to PNG to CV2-usable for OCR

Text-match OCR'd image and floats

Writes output to new XLSX file

About

Uh oh!

Releases

Packages

Languages

scasella/Asset_Confirms

Folders and files

Latest commit

History

Repository files navigation

Asset Confirms

Kickoff

Loop through each PDF

Supporting Functions

Import floats from XLSX file

Convert PDF to PNG to CV2-usable for OCR

Text-match OCR'd image and floats

Writes output to new XLSX file

About

Resources

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages