Skip to content

Commit c5340ae

Browse files
author
Zang Zhiya
committed
init commit
0 parents  commit c5340ae

File tree

4 files changed

+94
-0
lines changed

4 files changed

+94
-0
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
**/__pycache__
2+
*.pyc
3+
*.egg-info
4+
5+
.DS_Store

config.cfg

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
[default]
2+
pdf_folder=/Users/Zzy/Documents/Code/pdf2word/pdf
3+
word_folder=/Users/Zzy/Documents/Code/pdf2word/word
4+
max_worker=5

main.py

Lines changed: 76 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,76 @@
1+
import os
2+
from configparser import ConfigParser
3+
from io import StringIO
4+
from io import open
5+
from concurrent.futures import ProcessPoolExecutor
6+
7+
from pdfminer.pdfinterp import PDFResourceManager
8+
from pdfminer.pdfinterp import process_pdf
9+
from pdfminer.converter import TextConverter
10+
from pdfminer.layout import LAParams
11+
from docx import Document
12+
13+
14+
def read_from_pdf(file_path):
    """Extract the text of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Everything the pdfminer ``TextConverter`` wrote to its output
        buffer while the document was processed, as a single string.

    Raises:
        Whatever ``open`` or pdfminer raise for an unreadable or malformed
        file; the file, the converter and the buffer are released first.
    """
    resource_manager = PDFResourceManager()
    # The ``with`` statement closes both the PDF file and the in-memory
    # buffer even when pdfminer fails half-way through a document.
    with open(file_path, 'rb') as pdf_file, StringIO() as text_buffer:
        device = TextConverter(
            resource_manager, text_buffer, laparams=LAParams())
        try:
            process_pdf(resource_manager, device, pdf_file)
        finally:
            # Close the converter before the buffer is read, success or not.
            device.close()
        return text_buffer.getvalue()
28+
29+
30+
def save_text_to_word(content, file_path):
    """Write text into a new Word document, one paragraph per line.

    Args:
        content: Text to store; it is split on newline characters and each
            piece is passed through ``remove_control_characters`` before it
            is added to the document.
        file_path: Destination handed to ``Document.save``.
    """
    document = Document()
    for raw_line in content.split('\n'):
        # One paragraph holding a single run per input line.
        document.add_paragraph().add_run(remove_control_characters(raw_line))
    document.save(file_path)
36+
37+
38+
# Deletion table for ``str.translate``: every code point from 0 to 31 maps
# to ``None``. Built once at import time rather than on every call, because
# the function runs once per line of every converted document.
_CONTROL_CHARACTER_TABLE = dict.fromkeys(range(32))


def remove_control_characters(content):
    """Return *content* without characters whose code point is below 32.

    Tabs, carriage returns and line feeds are in that range and are removed
    as well; every other character is kept unchanged.

    Args:
        content: The string to clean.

    Returns:
        A new string with the characters U+0000 through U+001F deleted.
    """
    return content.translate(_CONTROL_CHARACTER_TABLE)
41+
42+
43+
def pdf_to_word(pdf_file_path, word_file_path):
    """Convert one PDF file into a Word document holding its text.

    Args:
        pdf_file_path: Path of the PDF to read.
        word_file_path: Path of the document to write.
    """
    save_text_to_word(read_from_pdf(pdf_file_path), word_file_path)
46+
47+
48+
def main():
    """Convert every PDF in the configured folder to a Word document.

    Reads the ``default`` section of ``config.cfg`` (looked up relative to
    the current working directory) and uses three of its keys:

    * ``pdf_folder`` -- folder whose entries are scanned for PDF files,
    * ``word_folder`` -- folder that receives one ``.docx`` per PDF,
    * ``max_worker`` -- size of the worker process pool.

    Raises:
        KeyError: if the ``default`` section or one of its keys is missing.
        SystemExit: always, once all conversions have finished; the exit
            status is 0 when every conversion succeeded and 1 otherwise.
    """
    config_parser = ConfigParser()
    config_parser.read('config.cfg')
    config = config_parser['default']
    pdf_folder = config['pdf_folder']
    word_folder = config['word_folder']

    tasks = []
    with ProcessPoolExecutor(max_workers=int(config['max_worker'])) as executor:
        for pdf_name in os.listdir(pdf_folder):
            file_name, extension_name = os.path.splitext(pdf_name)
            # Compare case-insensitively so "REPORT.PDF" is converted too.
            if extension_name.lower() != '.pdf':
                continue
            pdf_file = os.path.join(pdf_folder, pdf_name)
            word_file = os.path.join(word_folder, file_name + '.docx')
            print('正在处理: ', pdf_name)
            future = executor.submit(pdf_to_word, pdf_file, word_file)
            tasks.append((pdf_name, future))
    # Leaving the ``with`` block shuts the pool down and waits for all
    # submitted work, so every future is finished here; no polling needed.

    failed = 0
    for pdf_name, future in tasks:
        # Read each outcome so a failed conversion is reported, not lost.
        error = future.exception()
        if error is not None:
            failed += 1
            print('处理失败: ', pdf_name, error)

    print('完成')
    # ``exit`` is only injected by the ``site`` module; SystemExit always
    # exists and lets the status reflect whether anything went wrong.
    raise SystemExit(1 if failed else 0)
73+
74+
75+
# Script entry point: start the batch conversion only when this file is
# executed directly, not when it is imported (for example by the worker
# processes of the pool).
if __name__ == '__main__':
    main()

requirements.txt

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
attrs==17.4.0
2+
lxml==4.1.1
3+
pdfminer3k==1.3.1
4+
pluggy==0.6.0
5+
ply==3.11
6+
py==1.5.2
7+
pytest==3.4.1
8+
python-docx==0.8.6
9+
six==1.11.0

0 commit comments

Comments
 (0)