-
Notifications
You must be signed in to change notification settings - Fork 0
/
latex_generation.py
44 lines (32 loc) · 1.29 KB
/
latex_generation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import numpy as np
import pdf2image
import os
import cv2
import pytesseract
from tqdm import tqdm
from utils import escape_special_chars
def determine_content(block_img):
    """Classify an image block as OCR-readable text or as an opaque image.

    Runs ``pytesseract.image_to_data`` on the block and averages the absolute
    per-entry confidence values. A mean strictly above 40 is treated as
    trustworthy text; anything else falls back to the image itself.

    Args:
        block_img: Image to analyse; passed straight to pytesseract.

    Returns:
        ``("text", joined_text)`` when the OCR output is trusted, where
        ``joined_text`` is every recognised entry joined by single spaces;
        otherwise ``("image", block_img)`` with the input returned unchanged.
    """
    # TODO: This is a very naive way of determining content. We should use a better method.
    data_dict = pytesseract.image_to_data(block_img, output_type=pytesseract.Output.DICT)

    # Confidences may arrive as ints or as strings such as "96.53" depending on
    # the pytesseract/Tesseract versions in use; int("96.53") would raise
    # ValueError, so parse through float() first and then truncate.
    # NOTE(review): abs() turns negative confidences into positive ones, so
    # entries reported as -1 (presumably non-word rows) still count toward the
    # mean as 1 -- confirm this is intended before tuning the threshold.
    confidences = [abs(int(float(c))) for c in data_dict.get('conf', [])]

    # No OCR entries at all: there is nothing to trust, so treat it as an image
    # (avoids a KeyError / a NaN mean with a RuntimeWarning).
    if not confidences:
        return ("image", block_img)

    confidence_level = np.mean(confidences)

    # If the confidence level is high enough, we can trust the text
    if confidence_level > 40:
        return ("text", ' '.join(data_dict['text']))

    # If the confidence level is too low, we can't trust the text. So we will treat it as an image
    return ("image", block_img)
def generate_latex_block(block_type, content):
    """Convert one classified block into its LaTeX representation.

    Args:
        block_type: Kind of block; ``"text"`` and ``"image"`` are recognised.
        content: Payload of the block (the text for ``"text"`` blocks).

    Returns:
        For ``"text"`` blocks, the result of ``escape_special_chars(content)``.
        For ``"image"`` blocks (not implemented yet) and for any other block
        type, ``None``.
    """
    if block_type == "text":
        return escape_special_chars(content)

    if block_type == "image":
        # TODO - Check if its math or pure image
        # http://www.inftyproject.org/en/software.html
        # https://github.com/UW-COSMOS/latex-ocr
        # https://github.com/blaisewang/img2latex-mathpix
        # TODO
        return None

    # Unrecognised block types produce no LaTeX.
    return None
def generate_latex(block_img):
    """Classify an image block and render it to LaTeX in one step.

    Args:
        block_img: Image of the block; handed to ``determine_content``.

    Returns:
        A ``(block_type, content)`` tuple, where ``block_type`` comes from
        ``determine_content`` and ``content`` is whatever
        ``generate_latex_block`` returns for that type and payload.
    """
    kind, payload = determine_content(block_img)
    return kind, generate_latex_block(kind, payload)