-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_text_from_pdf.py
30 lines (21 loc) · 1008 Bytes
/
extract_text_from_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
"""
Extract the text from the conference program PDF. Each page of the original PDF is laid out in two columns, so the left column is extracted before the right one.
author: Dai Yao (dai@yaod.ai)
"""
import pdfplumber
# Dump the text of every page of the program PDF into a single text file.
#
# Each page has two columns. The top 10% and bottom 5% of the page are cropped
# away (presumably header/footer material - confirm against the PDF), and the
# remaining area is split at the horizontal midpoint so the left column is
# written before the right one.
#
# Both resources are managed by `with` so they are closed even if a page fails
# to parse. The output is written as UTF-8 so non-ASCII characters in the PDF
# do not depend on the platform's default encoding.
with open("../data/mksc_conf_2022_paper_text.txt", "w", encoding="utf-8") as paper_text_file, \
        pdfplumber.open("../data/298315070-2022_INFORMS_Marketing_Science_Program.pdf") as pdf:
    for i, page in enumerate(pdf.pages):
        print("page %s" % i)

        # Convert once: some pdfplumber versions report dimensions as Decimal.
        width = float(page.width)
        height = float(page.height)
        top = 0.1 * height
        bottom = 0.95 * height
        middle = 0.5 * width

        # Bounding boxes are (x0, top, x1, bottom): left column, then right.
        column_boxes = (
            (0, top, middle, bottom),
            (middle, top, width, bottom),
        )
        for column_box in column_boxes:
            # Depending on the pdfplumber version, extract_text() may return
            # None for a region without text; treat that as an empty column.
            text = page.crop(column_box).extract_text() or ""
            paper_text_file.write(text + "\r\n")