# app.py

import os
from docx import Document
import streamlit as st
from utils import create_or_empty_dir, convert_pdf_to_images, create_docx_with_text

# Anchor the working folders to the directory that contains this script.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Folder name (relative) used for the page images extracted from a PDF.
extracted_images_dir = "extracted_images"

# Uploaded PDFs and generated Word documents live next to the script.
uploads_dir, converted_docx_dir = (
    os.path.join(current_dir, folder_name)
    for folder_name in ("uploads", "converted_docx")
)

# Make sure both folders exist before anything tries to list or write them.
for _required_dir in (uploads_dir, converted_docx_dir):
    os.makedirs(_required_dir, exist_ok=True)

# Create a file uploader component restricted to PDF files
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# Check if a file was uploaded
if uploaded_file is not None:
    # The file name comes from the client, so keep only its final path
    # component (treating backslashes as separators too); otherwise a name
    # such as "../../x.pdf" could be written outside the uploads directory.
    safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))

    # Save the uploaded file to the uploads directory
    with open(os.path.join(uploads_dir, safe_name), "wb") as destination:
        destination.write(uploaded_file.getbuffer())
    st.success("File uploaded successfully!")
else:
    st.info("Please upload a PDF file.")

# Get a list of all PDF files in the uploads directory. The extension is
# matched case-insensitively so a file saved as "REPORT.PDF" is listed too,
# and the names are sorted because os.listdir returns them in arbitrary order.
pdf_files = sorted(
    file for file in os.listdir(uploads_dir) if file.lower().endswith(".pdf")
)

# Create a two-column layout: col1 for conversion, col2 for downloads
col1, col2 = st.columns(2)

# Show checkboxes for each PDF file in col1 and convert the ticked ones
with col1:
    # Collect the names of the PDFs whose checkbox is currently ticked
    selected_files = []
    for file in pdf_files:
        if st.checkbox(file):
            selected_files.append(file)

    # The Convert button is only rendered once at least one file is selected
    if selected_files and st.button("Convert"):
        # Use one absolute path for every step so that emptying, image
        # extraction and docx creation all address the same folder even when
        # the working directory is not the script directory.
        image_folder = os.path.join(current_dir, extracted_images_dir)

        for file in selected_files:
            # Strip only the trailing extension; str.replace would also remove
            # ".pdf" occurring in the middle of the name.
            base_name = os.path.splitext(file)[0]

            # Empty the images folder before each PDF so pages extracted from
            # a previous file cannot end up in this document.
            # NOTE(review): presumably create_docx_with_text reads every image
            # in the folder - confirm against utils.
            print(f"Creating or emptying the {extracted_images_dir} directory")
            create_or_empty_dir(image_folder)

            # Convert the selected PDF file to images
            pdf_path = os.path.join(uploads_dir, file)
            print(f"Converting {file} to images in {extracted_images_dir}")
            convert_pdf_to_images(pdf_path, image_folder)

            # Create a Word document with text extracted from images
            output_docx = os.path.join(converted_docx_dir, f"{base_name}.docx")
            print(
                f"Creating {base_name}.docx with text extracted from images in the {extracted_images_dir}"
            )
            create_docx_with_text(image_folder, output_docx)

        st.success("Conversion completed successfully!")

# Show documents from the converted_docx folder in col2, one download button
# per .docx file found there
with col2:
    docx_files = [
        file for file in os.listdir(converted_docx_dir) if file.endswith(".docx")
    ]
    for file in docx_files:
        # Read the document inside a context manager so the file handle is
        # closed immediately instead of being left open.
        with open(os.path.join(converted_docx_dir, file), "rb") as docx_file:
            docx_bytes = docx_file.read()

        st.download_button(
            f"Download {file}",
            docx_bytes,
            file_name=file,
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )