-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf2text.py
165 lines (146 loc) · 5.64 KB
/
pdf2text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
"""
美赛获奖证书信息OCR
"""
import os
import re
from multiprocessing import Process

import fitz
import PIL
import PIL.Image
import pytesseract
# Location of the Tesseract executable that pytesseract shells out to.
# The TESSERACT_CMD environment variable, when set and non-empty, overrides
# the original hard-coded Windows install path, which stays as the default.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get('TESSERACT_CMD') or 'E:/prog/TesseractOCR/tesseract.exe'
)
def _parse_certificate_lines(lines):
    """Pick student names, university and prize out of OCR'd certificate lines.

    Args:
        lines: Non-empty text lines of one certificate page, in reading order.

    Returns:
        A ``(students, university, prize)`` tuple where ``students`` is a list
        of lines and the other two are single lines.

    Raises:
        ValueError: None of the known anchor lines is present.
        IndexError: The positional fallback needs more lines than were read.
    """
    designated = 'Was Designated As'
    designated_index = lines.index(designated) if designated in lines else None

    # The student names end where the advisor line starts.
    for marker in ('With Student Advisor', 'With Faculty Advisor'):
        if marker in lines:
            advisor_index = lines.index(marker)
            break
    else:
        if designated_index is None:
            raise ValueError('no advisor or designation line found')
        # No advisor line recognised: assume it sits three lines above
        # the designation line.
        advisor_index = designated_index - 3

    if designated_index is not None:
        students = lines[0:advisor_index]
        # The university is the line directly above 'Was Designated As'.
        university = lines[designated_index - 1]
    else:
        # Designation line not recognised: fall back to fixed positions.
        students = lines[0:3]
        university = lines[5]
    # The prize is the last line of the cropped region.
    prize = lines[-1]
    return students, university, prize


def pdf2text(pdfPath, zoom_x=6, zoom_y=6, rotation_angle=0):
    """OCR a certificate PDF and extract students, university and prize.

    Each page is cropped to the region spanning 25%-80% of its width and
    27%-70% of its height, rendered with the given zoom/rotation, and passed
    to Tesseract. The values parsed from the last page win.

    Args:
        pdfPath: Path of the PDF file to read.
        zoom_x: Horizontal zoom factor used when rendering the page.
        zoom_y: Vertical zoom factor used when rendering the page.
        rotation_angle: Rotation applied to the render matrix.

    Returns:
        A ``(students, university, prize)`` tuple. On any failure the message
        ``<pdfPath> File Exception`` is printed and the values gathered so far
        are returned (``([''], '', '')`` if nothing was parsed). A page is
        parsed atomically: either all three values are updated or none.
    """
    students = ['']
    university = ''
    prize = ''
    try:
        # Open the PDF file.
        pdf = fitz.open(pdfPath)
        try:
            # Read the PDF page by page.
            for page in pdf:
                rect = page.rect
                clip = fitz.Rect(rect.width * 0.25, rect.height * 0.27,
                                 rect.width * 0.8, rect.height * 0.7)
                trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotation_angle)
                pix = page.get_pixmap(matrix=trans, alpha=False, clip=clip)
                img = PIL.Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                text = pytesseract.image_to_string(img)
                lines = [s for s in text.split('\n') if s]
                students, university, prize = _parse_certificate_lines(lines)
        finally:
            # Release the document even when a page fails to render or parse.
            pdf.close()
    except Exception:
        # Best effort: report the file and let the caller skip it.
        print(pdfPath, 'File Exception')
    return students, university, prize
def savetext(start, end, count):
    """OCR a range of certificates and write the results to text files.

    For every ``n`` in ``range(start, end)`` the file
    ``./paper/<2200000 + n>.pdf`` is processed with ``pdf2text`` if it exists.
    Each certificate with a non-empty prize becomes one line of the form
    ``control_number,students...,university,prize``.

    All lines are written to ``./all/all<count>.txt``; lines whose university
    is 'Huazhong University of Science and Technology' are also written to
    ``./huake/huake<count>.txt``. Both output directories are created when
    missing.

    Args:
        start: First control-number offset (inclusive).
        end: Last control-number offset (exclusive).
        count: Suffix used in the output file names.
    """
    all_rows = []
    huake_rows = []
    for offset in range(start, end):
        control_number = 2200000 + offset
        path = "./paper/" + str(control_number) + ".pdf"
        if not os.path.exists(path):
            continue
        students, university, prize = pdf2text(path)
        if not prize:
            # Nothing usable was recognised for this certificate.
            continue
        num_row = '%s,%s,%s,%s\n' % (
            control_number, ','.join(students), university, prize)
        # Replace characters GBK cannot represent with backslash escapes so
        # the row can be printed on a GBK console.
        num_row = num_row.encode('gbk', 'backslashreplace').decode('gbk', 'backslashreplace')
        try:
            print(num_row)
        except UnicodeError:
            print(control_number, ' -- gbk encoding error')
        all_rows.append(num_row)
        if university == 'Huazhong University of Science and Technology':
            huake_rows.append(num_row)

    os.makedirs('./all', exist_ok=True)
    with open('./all/all' + str(count) + '.txt', 'w', encoding='utf-8') as al:
        al.write(''.join(all_rows))
        print('./all/all' + str(count) + '.txt save sucessfully')

    os.makedirs('./huake', exist_ok=True)
    with open('./huake/huake' + str(count) + '.txt', 'w', encoding='utf-8') as huake:
        huake.write(''.join(huake_rows))
        print('./huake/huake' + str(count) + '.txt save sucessfully')
# def savetextlist(lists,count):
# all_data = ''
# huake_data = ''
# for control_number in lists:
# path = "./paper/" + str(control_number) + ".pdf"
# if os.path.exists(path):
# students, university, prize = pdf2text(path)
# students = ','.join(students)
# row = '%s,%s,%s\n' % (students, university, prize)
# if prize:
# num_row = '%s,%s' % (control_number, row)
# num_row = num_row.encode('gbk', 'backslashreplace').decode('gbk', 'backslashreplace')
# try:
# print(num_row)
# except:
# print(control_number, ' -- gbk encoding error')
#
# all_data += num_row
# if university == 'Huazhong University of Science and Technology':
# huake_data += num_row
# with open('./all/all_add' + str(count) + '.txt', 'w', encoding='utf-8') as al:
# # all_data = all_data.encode('utf-8')
# al.write(all_data)
# print('./all/all_add' + str(count) + '.txt save sucessfully')
# with open('./huake/huake_add' + str(count) + '.txt', 'w', encoding='utf-8') as huake:
# # huake_data = huake_data.decode('utf-8')
# huake.write(huake_data)
# print('./huake/huake_add' + str(count) + '.txt save sucessfully')
def txtjoint(dir):
    """Concatenate every text file in a directory into ``<dir>/all.txt``.

    Files are merged in natural order of their names (``all2.txt`` before
    ``all10.txt``). An ``all.txt`` left over from a previous run is skipped
    rather than merged into itself, so the function can be re-run safely.
    Sub-directories are ignored.

    Args:
        dir: Directory holding the files to merge; a trailing path separator
            is accepted but not required.
    """
    out_name = "all.txt"

    def natural_key(name):
        # re.split with a capturing group alternates text / digit runs, so
        # odd positions are always the numeric parts.
        return [int(tok) if pos % 2 else tok
                for pos, tok in enumerate(re.split(r'(\d+)', name))]

    parts = []
    for name in sorted(os.listdir(dir), key=natural_key):
        path = os.path.join(dir, name)
        if name == out_name or not os.path.isfile(path):
            continue
        with open(path, "r", encoding='utf-8') as f:
            parts.append(f.read())

    with open(os.path.join(dir, out_name), "w", encoding='utf-8') as outFile:
        outFile.write(''.join(parts))
if __name__ == '__main__':
    # Fan the control-number range out over one worker process per chunk,
    # wait for all of them, then merge the per-chunk result files.
    step = 1000
    limit = 30000  # control-number offsets 1 .. limit - 1 are scanned
    workers = []
    for count, start in enumerate(range(1, limit, step), start=1):
        # savetext treats `end` as exclusive, so the next chunk must begin
        # exactly at this chunk's end or one offset per chunk is skipped.
        end = min(start + step, limit)
        p = Process(target=savetext, args=(start, end, count))
        p.start()
        workers.append(p)

    # The merge below reads the files the workers write, so wait for them.
    for p in workers:
        p.join()

    # Merge the per-chunk files.
    all_dir = "./all/"
    huake_dir = './huake/'
    txtjoint(all_dir)
    txtjoint(huake_dir)
# with open('log.log','r') as f:
# alltxt = f.read()
# fail_lists = re.findall('./paper/(\d*?).pdf File Exception',alltxt,re.S)
#
# with open('fail_lists.txt','w') as f:
# f.write('\n'.join(fail_lists))
# step = 20
# count = 1
# for i in range(1, len(fail_lists), step):
# start = i
# end = i + step - 1
# lists = fail_lists[start:end]
# p = Process(target=savetextlist, args=(lists,count,))
# p.start()
# count += 1