|
| 1 | +import os |
| 2 | +from configparser import ConfigParser |
| 3 | +from io import StringIO |
| 4 | +from io import open |
| 5 | +from concurrent.futures import ProcessPoolExecutor |
| 6 | + |
| 7 | +from pdfminer.pdfinterp import PDFResourceManager |
| 8 | +from pdfminer.pdfinterp import process_pdf |
| 9 | +from pdfminer.converter import TextConverter |
| 10 | +from pdfminer.layout import LAParams |
| 11 | +from docx import Document |
| 12 | + |
| 13 | + |
def read_from_pdf(file_path):
    """Extract the text content of a PDF file.

    Args:
        file_path: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        Everything the pdfminer ``TextConverter`` wrote while the document
        was processed, as a single string.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Whatever ``process_pdf`` raises is propagated unchanged.
    """
    resource_manager = PDFResourceManager()
    # Both the input file and the in-memory buffer are context-managed so
    # they are released even when parsing fails part-way through.
    with open(file_path, 'rb') as pdf_file, StringIO() as text_buffer:
        device = TextConverter(
            resource_manager, text_buffer, laparams=LAParams())
        try:
            process_pdf(resource_manager, device, pdf_file)
        finally:
            # Close the converter before reading the buffer, and also on
            # error, so a failing PDF does not leak the device.
            device.close()
        return text_buffer.getvalue()
| 28 | + |
| 29 | + |
def save_text_to_word(content, file_path):
    """Write text into a new Word document, one paragraph per line.

    Args:
        content: Text to store; it is split on ``'\\n'``.
        file_path: Destination path handed to ``Document.save``.
    """
    document = Document()
    for text_line in content.split('\n'):
        # Each line becomes its own paragraph holding a single run of the
        # line with its control characters stripped.
        cleaned_line = remove_control_characters(text_line)
        document.add_paragraph().add_run(cleaned_line)
    document.save(file_path)
| 36 | + |
| 37 | + |
def remove_control_characters(content):
    """Return ``content`` with every character below code point 32 removed.

    Args:
        content: The string to clean.

    Returns:
        A copy of ``content`` without the characters U+0000 through U+001F
        (this includes tab, carriage return and newline).
    """
    # Mapping a code point to None makes str.translate delete it.
    deletion_table = {code_point: None for code_point in range(32)}
    return content.translate(deletion_table)
| 41 | + |
| 42 | + |
def pdf_to_word(pdf_file_path, word_file_path):
    """Convert one PDF file into a Word document.

    Args:
        pdf_file_path: Path of the PDF whose text is read.
        word_file_path: Path where the resulting document is saved.
    """
    # Read the PDF text, then hand it straight to the Word writer.
    save_text_to_word(read_from_pdf(pdf_file_path), word_file_path)
| 46 | + |
| 47 | + |
def main():
    """Convert every PDF in the configured folder to a Word document.

    Reads the ``default`` section of ``config.cfg`` (looked up relative to
    the current working directory) and uses these keys:

    * ``max_worker``  - number of worker processes,
    * ``pdf_folder``  - directory scanned for PDF files,
    * ``word_folder`` - directory that receives the ``.docx`` files.

    Each PDF is converted in a worker process. A conversion that raises is
    reported on stdout instead of being silently dropped, and the remaining
    files are still processed.

    Raises:
        KeyError: If the ``default`` section or one of the keys is missing.
        ValueError: If ``max_worker`` is not an integer.
        OSError: If ``pdf_folder`` cannot be listed.
    """
    config_parser = ConfigParser()
    config_parser.read('config.cfg')
    config = config_parser['default']
    pdf_folder = config['pdf_folder']
    word_folder = config['word_folder']

    tasks = []
    with ProcessPoolExecutor(max_workers=int(config['max_worker'])) as executor:
        for file in os.listdir(pdf_folder):
            file_name, extension_name = os.path.splitext(file)
            # Compare case-insensitively so files named '*.PDF' are not skipped.
            if extension_name.lower() != '.pdf':
                continue
            pdf_file = os.path.join(pdf_folder, file)
            word_file = os.path.join(word_folder, file_name + '.docx')
            print('正在处理: ', file)
            tasks.append(
                (file, executor.submit(pdf_to_word, pdf_file, word_file)))

        # Future.result() blocks until the task finishes, which replaces the
        # former busy-wait polling loop, and re-raises any worker exception
        # so failures become visible.
        for file, task in tasks:
            try:
                task.result()
            except Exception as error:
                # Top-level boundary: report the failure and keep going so
                # one bad PDF does not abort the rest of the batch.
                print('处理失败: ', file, error)

    # Returning normally ends the script with status 0, so the site-provided
    # exit() helper is not needed.
    print('完成')
| 73 | + |
| 74 | + |
| 75 | +if __name__ == '__main__': |
| 76 | + main() |
0 commit comments