-
Notifications
You must be signed in to change notification settings - Fork 2
/
pdf_to_txt_trivial.py
33 lines (27 loc) · 1.01 KB
/
pdf_to_txt_trivial.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Input PDF used by the module-level call at the bottom of the file.
# NOTE: the first path below is overwritten immediately by the second
# assignment, so only 'data/pdf/pdf2.pdf' is actually used.
filepath = "data/pdf/Echantillon Facture SNM .pdf"
filepath = 'data/pdf/pdf2.pdf'
#working
import PyPDF2 #pip install PyPDF2
#PyPDF2 cannot read scanned (i.e. image-based) files; we'll use OCR for those.
def methode1(filepath):
    """Extract the text of every page of a PDF with PyPDF2.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        A list with one entry per page, in page order; each entry is the
        value returned by ``extractText()`` for that page.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText look like
    # the legacy PyPDF2 API; newer releases are believed to rename them
    # (PdfReader, pages, extract_text) -- confirm against the installed version.
    #
    # All extraction happens inside the ``with`` block so the reader is never
    # used after the file is closed, and the handle is released even when
    # parsing raises.
    with open(filepath, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        return [
            pdf_reader.getPage(page_index).extractText()
            for page_index in range(pdf_reader.numPages)
        ]
#was working, but now there is a problem
import textract #pip install textract
#It likely contains a lot of spaces, possibly junk such as '\n,' etc.
def methode2(filepath):
    """Extract text from a file by OCR, using textract's tesseract method.

    Args:
        filepath: Path of the file to process.

    Returns:
        A one-element list holding the value returned by
        ``textract.process`` for the whole document. The result is not split
        per page and is neither decoded nor cleaned here.
    """
    # Tesseract identifies languages by three-letter codes: French is 'fra'.
    # The previous value 'fr' does not name a standard Tesseract language
    # pack.
    text = textract.process(filepath, method='tesseract', language='fra')
    return [text]
#better, but it fails on EDP-Etudiants and on the scanned invoice
# Runs at module level (including on import): extracts the pages of
# `filepath` with methode1 and prints the resulting list.
print(methode1(filepath))
'''
methode1,2
- https://betterprogramming.pub/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f
'''