-
Notifications
You must be signed in to change notification settings - Fork 2
/
app.py
76 lines (66 loc) · 2.93 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
from docx import Document
import streamlit as st
from utils import create_or_empty_dir, convert_pdf_to_images, create_docx_with_text
# Name of the folder that holds the intermediate page images. It is kept as a
# bare relative name; later code joins it onto current_dir where needed.
extracted_images_dir = "extracted_images"

# Directory containing this script; the working folders live next to it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# uploads/ stores the incoming PDFs, converted_docx/ the generated documents.
uploads_dir, converted_docx_dir = (
    os.path.join(current_dir, folder_name)
    for folder_name in ("uploads", "converted_docx")
)

# Make sure both folders exist before the UI touches them.
for folder in (uploads_dir, converted_docx_dir):
    os.makedirs(folder, exist_ok=True)
# File uploader restricted to PDFs; it yields None until a file is chosen.
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

if uploaded_file is not None:
    # The file name comes from the client, so it must not be trusted as a
    # path: normalise backslashes and keep only the final component so the
    # upload can never be written outside uploads_dir.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))

    # Persist the upload so that it shows up in the list of stored PDFs.
    with open(os.path.join(uploads_dir, safe_name), "wb") as destination:
        destination.write(uploaded_file.getbuffer())
    st.success("File uploaded successfully!")
else:
    st.info("Please upload a PDF file.")
# Collect the PDFs already stored in the uploads directory. The extension
# test ignores case so that a file saved as "REPORT.PDF" is listed too, and
# the names are sorted because os.listdir returns them in arbitrary order.
pdf_files = sorted(
    name for name in os.listdir(uploads_dir) if name.lower().endswith(".pdf")
)

# Two side-by-side columns: PDF selection and conversion on the left; col2 is
# used further down for the download buttons.
col1, col2 = st.columns(2)

with col1:
    # One checkbox per stored PDF; the ticked ones are queued for conversion.
    selected_files = []
    for file in pdf_files:
        if st.checkbox(file):
            selected_files.append(file)

    # The Convert button is only offered once at least one PDF is ticked.
    if selected_files and st.button("Convert"):
        # Use one absolute folder for emptying, writing and reading the page
        # images. Passing the bare relative name to some steps would resolve
        # against the process working directory, which need not be the
        # script's directory.
        image_folder = os.path.join(current_dir, extracted_images_dir)

        for file in selected_files:
            # Start every file from an empty image folder: the document below
            # is built from the whole folder, so leftovers from a previously
            # converted file would otherwise end up in this file's document.
            print(f"Creating or emptying the {extracted_images_dir} directory")
            create_or_empty_dir(image_folder)

            # Render the PDF into images inside the image folder.
            pdf_path = os.path.join(uploads_dir, file)
            print(f"Converting {file} to images in {extracted_images_dir}")
            convert_pdf_to_images(pdf_path, image_folder)

            # Strip only the trailing extension; str.replace would also remove
            # any ".pdf" that happens to occur in the middle of the name.
            stem = os.path.splitext(file)[0]
            output_docx = os.path.join(converted_docx_dir, f"{stem}.docx")

            # Build the Word document from the images produced above.
            print(
                f"Creating {stem}.docx with text extracted from images in the {extracted_images_dir}"
            )
            create_docx_with_text(image_folder, output_docx)

        st.success("Conversion completed successfully!")
# Offer every generated document in the converted_docx folder for download
# in the right-hand column.
with col2:
    docx_files = [
        file for file in os.listdir(converted_docx_dir) if file.endswith(".docx")
    ]
    for file in docx_files:
        # Read the bytes inside a context manager so the file handle is
        # closed right away instead of being left open on every rerun.
        with open(os.path.join(converted_docx_dir, file), "rb") as docx_handle:
            docx_bytes = docx_handle.read()

        st.download_button(
            f"Download {file}",
            docx_bytes,
            file_name=file,
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )