Skip to content

Fix LibreOffice support for Office document processing #200

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions core/parser/morphik_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ async def _parse_document(self, file: bytes, filename: str) -> Tuple[Dict[str, A
# slower. A simple extension check covers the majority of cases.
strategy = "hi_res"
file_content_type: Optional[str] = None # Default to None for auto-detection
if filename.lower().endswith((".pdf", ".doc", ".docx")):
if filename.lower().endswith((".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx")):
strategy = "fast"
elif filename.lower().endswith(".txt"):
strategy = "fast"
Expand All @@ -285,14 +285,36 @@ async def _parse_document(self, file: bytes, filename: str) -> Tuple[Dict[str, A
)

text = "\n\n".join(str(element) for element in elements if str(element).strip())
return {}, text

# Sanitize text to remove null characters and other problematic Unicode
sanitized_text = self._sanitize_text(text)
return {}, sanitized_text

async def parse_file_to_text(self, file: bytes, filename: str) -> Tuple[Dict[str, Any], str]:
    """Convert raw file bytes to text, choosing the parser by file type.

    Args:
        file: Raw content of the uploaded file.
        filename: Name of the file; forwarded to the video check and to the
            document parser.

    Returns:
        The ``(metadata, text)`` tuple produced by ``_parse_video`` when
        ``_is_video_file`` reports a video, otherwise the tuple produced by
        ``_parse_document``.
    """
    is_video = self._is_video_file(file, filename)
    if not is_video:
        # Everything that is not a video goes through the document parser.
        return await self._parse_document(file, filename)
    # The video parser only needs the bytes, not the filename.
    return await self._parse_video(file)

def _sanitize_text(self, text: str) -> str:
"""Remove null characters and other problematic Unicode that cause PostgreSQL JSON errors"""
if not text:
return text

# Remove null bytes and other control characters that cause JSON/PostgreSQL issues
# Keep common whitespace (space, tab, newline, carriage return)
sanitized = ""
for char in text:
# Allow printable characters and common whitespace
if char.isprintable() or char in ('\n', '\r', '\t', ' '):
sanitized += char
elif ord(char) == 0: # Null character
# Replace with space to maintain text flow
sanitized += ' '
# Skip other problematic control characters

return sanitized

async def split_text(self, text: str) -> List[Chunk]:
    """Break *text* into chunks by delegating to the configured chunker.

    Args:
        text: The text to split.

    Returns:
        Whatever ``self.chunker.split_text`` returns for *text*.
    """
    chunks = self.chunker.split_text(text)
    return chunks
271 changes: 177 additions & 94 deletions core/services/document_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,9 @@ def __init__(

# MultiVectorStore initialization is now handled in the FastAPI startup event
# so we don't need to initialize it here again

# Check soffice availability on initialization
self._check_soffice_availability()

# Cache-related data structures
# Maps cache name to active cache object
Expand Down Expand Up @@ -1157,6 +1160,92 @@ async def ingest_file_content(

return doc

def _check_soffice_availability(self):
    """Log whether the ``soffice`` (LibreOffice) binary can be executed.

    Runs ``soffice --version`` with a 5-second timeout and logs the detected
    version at INFO level, or a WARNING when the binary is missing, times
    out, or exits with a non-zero status.

    This is a best-effort diagnostic: it never raises and returns ``None``.
    """
    import subprocess

    fallback_note = "Office document conversion (DOCX, PPTX, XLSX) will fall back to text extraction."

    try:
        result = subprocess.run(
            ["soffice", "--version"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except Exception as e:
        # Deliberately broad: a missing binary (FileNotFoundError), a hang
        # (TimeoutExpired) or any other launch problem must only produce a
        # warning, never an exception out of this diagnostic check.
        logger.warning(f"LibreOffice (soffice) not available: {str(e)}. {fallback_note}")
        return

    if result.returncode == 0:
        logger.info(f"LibreOffice detected: {result.stdout.strip()}")
    else:
        logger.warning(f"LibreOffice (soffice) not found or not working properly. {fallback_note}")

def _convert_office_to_pdf(
    self,
    file_content: bytes,
    file_extension: str,
    *,
    timeout_seconds: float = 300.0,
) -> Optional[bytes]:
    """Convert an Office document to PDF using headless LibreOffice.

    The document is written into a private temporary directory, ``soffice``
    converts it there, and the resulting PDF is read back. The directory and
    everything in it is removed before returning, on success and on failure.

    Args:
        file_content: The office document content.
        file_extension: File extension including the dot (e.g., '.docx',
            '.pptx'); used as the suffix of the temporary input file.
        timeout_seconds: Maximum time to wait for ``soffice``. When it is
            exceeded the conversion is treated as failed.

    Returns:
        PDF content as bytes, or None if conversion fails for any reason
        (soffice missing, non-zero exit, timeout, empty/missing output).
    """
    import subprocess

    try:
        # One scratch directory holds both input and output, so a single
        # cleanup covers every file. ignore_cleanup_errors keeps a cleanup
        # hiccup from discarding a PDF that was already read successfully.
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as work_dir:
            input_path = os.path.join(work_dir, f"input{file_extension}")
            with open(input_path, "wb") as source_file:
                source_file.write(file_content)

            result = subprocess.run(
                [
                    "soffice",
                    "--headless",
                    "--convert-to",
                    "pdf",
                    "--outdir",
                    work_dir,
                    input_path,
                ],
                capture_output=True,
                text=True,
                # Without a timeout a wedged soffice would block forever.
                timeout=timeout_seconds,
            )

            if result.returncode != 0:
                logger.error(f"Failed to convert {file_extension} to PDF: {result.stderr}")
                return None

            # LibreOffice creates the PDF with the same base name in the output directory
            expected_pdf_path = os.path.join(work_dir, "input.pdf")
            if not os.path.exists(expected_pdf_path) or os.path.getsize(expected_pdf_path) == 0:
                logger.error(f"Generated PDF is empty or doesn't exist at expected path: {expected_pdf_path}")
                return None

            with open(expected_pdf_path, "rb") as pdf_file:
                return pdf_file.read()
    except Exception as e:
        # Callers fall back to text chunks on None, so every failure mode
        # (missing binary, timeout, I/O error) is logged and mapped to None.
        logger.error(f"Error converting {file_extension} document: {str(e)}")
        return None

def img_to_base64_str(self, img: Image):
buffered = BytesIO()
img.save(buffered, format="PNG")
Expand Down Expand Up @@ -1229,115 +1318,109 @@ def _create_chunks_multivector(self, file_type, file_content_base64: str, file_c
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

# Convert Word document to PDF first
with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as temp_docx:
temp_docx.write(file_content)
temp_docx_path = temp_docx.name

with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
temp_pdf_path = temp_pdf.name


# Convert to PDF using helper method
pdf_content = self._convert_office_to_pdf(file_content, ".docx")
if not pdf_content:
logger.warning("Failed to convert Word document to PDF, falling back to text")
return [
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

# Process the PDF
try:
# Convert Word to PDF
import subprocess

# Get the base filename without extension
base_filename = os.path.splitext(os.path.basename(temp_docx_path))[0]
output_dir = os.path.dirname(temp_pdf_path)
expected_pdf_path = os.path.join(output_dir, f"{base_filename}.pdf")

result = subprocess.run(
[
"soffice",
"--headless",
"--convert-to",
"pdf",
"--outdir",
output_dir,
temp_docx_path,
],
capture_output=True,
text=True,
)

if result.returncode != 0:
logger.error(f"Failed to convert Word to PDF: {result.stderr}")
images = pdf2image.convert_from_bytes(pdf_content)
if not images:
logger.warning("No images extracted from PDF")
return [
Chunk(
content=chunk.content,
metadata=(chunk.metadata | {"is_image": False}),
)
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

# LibreOffice creates the PDF with the same base name in the output directory
# Check if the expected PDF file exists
if not os.path.exists(expected_pdf_path) or os.path.getsize(expected_pdf_path) == 0:
logger.error(f"Generated PDF is empty or doesn't exist at expected path: {expected_pdf_path}")

images_b64 = [self.img_to_base64_str(image) for image in images]
return [Chunk(content=image_b64, metadata={"is_image": True}) for image_b64 in images_b64]
except Exception as e:
logger.error(f"Error converting PDF to images: {str(e)}")
return [
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

case "application/vnd.openxmlformats-officedocument.presentationml.presentation" | "application/vnd.ms-powerpoint":
logger.info("Working with PowerPoint presentation!")
if not file_content or len(file_content) == 0:
logger.error("PowerPoint document content is empty")
return [
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

# Convert to PDF using helper method
pdf_content = self._convert_office_to_pdf(file_content, ".pptx")
if not pdf_content:
logger.warning("Failed to convert PowerPoint to PDF, falling back to text")
return [
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

# Process the PDF
try:
images = pdf2image.convert_from_bytes(pdf_content)
if not images:
logger.warning("No images extracted from PDF")
return [
Chunk(
content=chunk.content,
metadata=(chunk.metadata | {"is_image": False}),
)
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

# Now process the PDF using the correct path
with open(expected_pdf_path, "rb") as pdf_file:
pdf_content = pdf_file.read()

try:
images = pdf2image.convert_from_bytes(pdf_content)
if not images:
logger.warning("No images extracted from PDF")
return [
Chunk(
content=chunk.content,
metadata=(chunk.metadata | {"is_image": False}),
)
for chunk in chunks
]

images_b64 = [self.img_to_base64_str(image) for image in images]
return [Chunk(content=image_b64, metadata={"is_image": True}) for image_b64 in images_b64]
except Exception as pdf_error:
logger.error(f"Error converting PDF to images: {str(pdf_error)}")

images_b64 = [self.img_to_base64_str(image) for image in images]
return [Chunk(content=image_b64, metadata={"is_image": True}) for image_b64 in images_b64]
except Exception as e:
logger.error(f"Error converting PDF to images: {str(e)}")
return [
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel":
logger.info("Working with Excel spreadsheet!")
if not file_content or len(file_content) == 0:
logger.error("Excel document content is empty")
return [
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

# Convert to PDF using helper method
pdf_content = self._convert_office_to_pdf(file_content, ".xlsx")
if not pdf_content:
logger.warning("Failed to convert Excel to PDF, falling back to text")
return [
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

# Process the PDF
try:
images = pdf2image.convert_from_bytes(pdf_content)
if not images:
logger.warning("No images extracted from PDF")
return [
Chunk(
content=chunk.content,
metadata=(chunk.metadata | {"is_image": False}),
)
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]

images_b64 = [self.img_to_base64_str(image) for image in images]
return [Chunk(content=image_b64, metadata={"is_image": True}) for image_b64 in images_b64]
except Exception as e:
logger.error(f"Error processing Word document: {str(e)}")
logger.error(f"Error converting PDF to images: {str(e)}")
return [
Chunk(content=chunk.content, metadata=(chunk.metadata | {"is_image": False}))
for chunk in chunks
]
finally:
# Clean up temporary files
if os.path.exists(temp_docx_path):
os.unlink(temp_docx_path)
if os.path.exists(temp_pdf_path):
os.unlink(temp_pdf_path)
# Also clean up the expected PDF path if it exists and is different from temp_pdf_path
if (
"expected_pdf_path" in locals()
and os.path.exists(expected_pdf_path)
and expected_pdf_path != temp_pdf_path
):
os.unlink(expected_pdf_path)

# case filetype.get_type(ext="txt"):
# logger.info(f"Found text input: chunks for multivector embedding")
# return chunks.copy()
# TODO: Add support for office documents
# case document.Xls | document.Xlsx | document.Ods |document.Odp:
# logger.warning(f"Colpali is not supported for file type {file_type.mime} - skipping")
# case file_type if file_type in DOCUMENT:
# pass

case _:
logger.warning(f"Colpali is not supported for file type {file_type.mime} - skipping")
return [
Expand Down
1 change: 1 addition & 0 deletions dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ RUN apt-get update && apt-get install -y \
cmake \
python3-dev \
git \
libreoffice-nogui \
&& rm -rf /var/lib/apt/lists/*

# Copy the virtual environment from the builder stage
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ dependencies = [
"transformers==4.51.3",
"twine>=6.1.0",
"ty>=0.0.1a6",
"unstructured[pdf]>=0.17.2",
"unstructured[pdf,docx,pptx,xlsx]>=0.17.2",
"uvicorn>=0.34.2",
]

Expand Down
Loading