-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf2txt.py
68 lines (56 loc) · 2.24 KB
/
pdf2txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import sys
import time
from PyPDF2 import PdfReader
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of the PDF at *pdf_path*.

    Returns a 4-tuple ``(text, pages, words, chars)``:

    * ``text``  -- the per-page texts joined with a single newline,
    * ``pages`` -- number of pages reported by the reader,
    * ``words`` -- number of whitespace-separated tokens in ``text``,
    * ``chars`` -- length of ``text`` (the joining newlines are counted).
    """
    document = PdfReader(pdf_path)
    total_pages = len(document.pages)

    # A falsy extraction result for a page is replaced by an empty string,
    # so every page still contributes one entry to the join.
    combined = "\n".join(
        (sheet.extract_text() or "") for sheet in document.pages
    )

    return combined, total_pages, len(combined.split()), len(combined)
def _select_output_path(output_txt, text_data, fallback="output2.txt"):
    """Pick the file the extracted text should be written to.

    Returns *output_txt* when it does not exist yet, or when its current
    content equals *text_data* (ignoring leading/trailing whitespace).
    Otherwise returns *fallback* so the existing file is left untouched.
    """
    if not os.path.exists(output_txt):
        return output_txt

    try:
        with open(output_txt, 'r', encoding='utf-8') as old_file:
            unchanged = old_file.read().strip() == text_data.strip()
    except UnicodeDecodeError:
        # An existing file that is not valid UTF-8 cannot be compared; treat
        # it as different instead of aborting the whole run.
        unchanged = False

    if unchanged:
        print(f"Content is identical. Overwriting '{output_txt}'.")
        return output_txt

    print(f"Content differs. Writing to '{fallback}'.")
    return fallback


def main():
    """Convert ``input.pdf`` in the current directory to a text file.

    The extracted text goes to ``output.txt``. If that file already exists
    with different content, the text is written to ``output2.txt`` instead
    (which is itself overwritten unconditionally). Progress, statistics and
    errors are reported on stdout; nothing is returned.

    When ``input.pdf`` is missing, the function reports it (listing any
    other ``input*`` files found) and returns early without a timing line.
    """
    print("Processing...")
    # perf_counter is monotonic, so the reported duration is not affected by
    # system clock adjustments.
    start_time = time.perf_counter()
    input_pdf = "input.pdf"
    output_txt = "output.txt"

    if not os.path.exists(input_pdf):
        alt_inputs = [
            f for f in os.listdir()
            if f.startswith("input") and f != "input.pdf"
        ]
        if alt_inputs:
            print(f"Found files: {alt_inputs} — cannot process them.")
        print("Error: Required file 'input.pdf' is missing.")
        print("Processing completed")
        return

    try:
        # Extract text data
        text_data, page_count, word_count, char_count = extract_text_from_pdf(input_pdf)

        if len(text_data.strip()) < 10:
            print("Warning: Extracted text is very short (<10 characters).")

        final_output = _select_output_path(output_txt, text_data)
        with open(final_output, 'w', encoding='utf-8') as out_file:
            out_file.write(text_data)

        print("\nText extracted successfully:")
        print(f"- Processed pages: {page_count}")
        print(f"- Total words: {word_count}")
        print(f"- Total characters: {char_count}")
    except Exception as e:
        # Top-level boundary of the script: report any failure to the user
        # rather than showing a traceback.
        print(f"\nError: {e}")
    finally:
        print("Processing completed")
        print(f"Time taken: {time.perf_counter() - start_time:.2f} seconds")
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()