Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def get_semantic_chunker_skill(

semantic_text_chunker_skill_inputs = [
InputFieldMappingEntry(
name="content", source="/document/layout/merged_content"
name="content", source="/document/layout_merged_content"
)
]

Expand Down Expand Up @@ -486,7 +486,6 @@ def get_layout_and_figure_merger_skill(self, chunk_by_page=False) -> WebApiSkill
batch_size = 1
degree_of_parallelism = 8

output = [OutputFieldMappingEntry(name="content", target_name="merged_content")]
if chunk_by_page:
merger_context = "/document/page_wise_layout/*"
inputs = [
Expand All @@ -498,15 +497,23 @@ def get_layout_and_figure_merger_skill(self, chunk_by_page=False) -> WebApiSkill
source="/document/page_wise_layout/*/figures/*/updated_figure",
),
]
output = [
OutputFieldMappingEntry(name="content", target_name="merged_content")
]
else:
merger_context = "/document/layout"
merger_context = "/document"

inputs = [
InputFieldMappingEntry(name="layout", source="/document/layout"),
InputFieldMappingEntry(
name="figures", source="/document/layout/figures/*/updated_figure"
),
]
output = [
OutputFieldMappingEntry(
name="content", target_name="layout_merged_content"
)
]

figure_analysis_skill = WebApiSkill(
name="Layout and Figure Merger Skill",
Expand Down
87 changes: 61 additions & 26 deletions image_processing/src/image_processing/layout_and_figure_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import re
from layout_holders import FigureHolder, LayoutHolder
from typing import List


class LayoutAndFigureMerger:
Expand All @@ -18,37 +19,48 @@ def insert_figure_description(
figure_holder (FigureHolder): The figure to be updated.

Returns:
str: The updated Markdown content with the new figure description.
int: The change in length of the Markdown content after updating the figure description.
"""

# Calculate the end index of the content to be replaced
end_index = figure_holder.offset + figure_holder.length

# Ensure that the end_index does not exceed the length of the Markdown content
# Ensure the offset is valid
if figure_holder.offset < 0 or figure_holder.offset > len(
layout_holder.content
):
logging.error("Figure offset is out of bounds.")
raise ValueError("Figure offset is out of bounds.")

# Ensure the end index does not exceed the length of the Markdown content
if end_index > len(layout_holder.content):
logging.info(
"End index exceeds the length of the content. Adjusting the end index to the length of the content."
"End index exceeds the length of the content. Adjusting to the length of the content."
)
end_index = len(layout_holder.content)

logging.info(f"Figure Markdown Content: {figure_holder.markdown}")

# Replace the old string with the new string
layout_holder.content = (
layout_holder.content[: figure_holder.offset]
+ figure_holder.markdown
+ layout_holder.content[end_index:]
)

return len(figure_holder.markdown) - figure_holder.length
inserted_length = len(figure_holder.markdown) - figure_holder.length
logging.info(f"Inserted Length: {inserted_length}")

return layout_holder, inserted_length

async def merge_figures_into_layout(
self, layout: LayoutHolder, figures: list[FigureHolder]
self, layout_holder: LayoutHolder, figures: List[FigureHolder]
) -> LayoutHolder:
"""
Merges the figures into the layout.

Args:
layout (LayoutHolder): The layout text.
figures (list): The list of figures.
layout_holder (LayoutHolder): The layout text.
figures (List[FigureHolder]): The list of figures.

Returns:
LayoutHolder: The updated layout text with the figures.
Expand All @@ -59,30 +71,51 @@ async def merge_figures_into_layout(
# Iterate over the figures
for figure in figures:
logging.info(f"Inserting Figure: {figure.figure_id}")
logging.info(f"Figure Description: {figure.description}")
# Update the figure description in the layout
figure.offset += running_offset
length = self.insert_figure_description(layout, figure)
layout_holder, inserted_length = self.insert_figure_description(
layout_holder, figure
)

# Update the offset
running_offset += length
running_offset += inserted_length

logging.info("Merged figures into layout.")
logging.info("Updated Layout with Figures: %s", layout_holder.content)
# Precompile regex patterns
irrelevant_figure_pattern = re.compile(
r"<figure[^>]*>\s*(Irrelevant Image|\'Irrelevant Image\')\s*</figure>",
re.DOTALL,
)
empty_or_whitespace_figure_pattern = re.compile(
r"<figure[^>]*>\s*</figure>", re.DOTALL
)
html_comments_pattern = re.compile(r"<!--.*?-->", re.DOTALL)

# Remove irrelevant figures
irrelevant_figure_pattern = r"<figure[^>]*>.*?Irrelevant Image.*?</figure>"
layout.content = re.sub(
irrelevant_figure_pattern, "", layout.content, flags=re.DOTALL
layout_holder.content = irrelevant_figure_pattern.sub("", layout_holder.content)
logging.info("Removed irrelevant figures from layout.")
logging.info(
"Updated Layout without Irrelevant Figures: %s", layout_holder.content
)

empty_or_whitespace_figure_pattern = r"<figure[^>]*>\s*</figure>"
layout.content = re.sub(
empty_or_whitespace_figure_pattern, "", layout.content, flags=re.DOTALL
# Remove empty or whitespace figures
layout_holder.content = empty_or_whitespace_figure_pattern.sub(
"", layout_holder.content
)

html_comments_pattern = r"<!--.*?-->"
layout.content = re.sub(
html_comments_pattern, "", layout.content, flags=re.DOTALL
logging.info("Removed empty or whitespace figures from layout.")
logging.info(
"Updated Layout without Empty or Whitespace Figures: %s",
layout_holder.content,
)

return layout
# Remove HTML comments
layout_holder.content = html_comments_pattern.sub("", layout_holder.content)
logging.info("Removed HTML comments from layout.")
logging.info("Updated Layout without HTML Comments: %s", layout_holder.content)

return layout_holder

async def merge(self, record: dict) -> dict:
"""
Expand All @@ -94,19 +127,21 @@ async def merge(self, record: dict) -> dict:
Returns:
- record (dict): The record containing the image, its caption, and the generated description.
"""
layout = LayoutHolder(**record["data"]["layout"])
layout_holder = LayoutHolder(**record["data"]["layout"])

figures = [FigureHolder(**figure) for figure in record["data"]["figures"]]

try:
logging.info(f"Input Data: {layout}")
updated_layout = await self.merge_figures_into_layout(layout, figures)
logging.info(f"Updated Data: {updated_layout}")
logging.info(f"Input Data: {layout_holder}")
updated_layout = await self.merge_figures_into_layout(
layout_holder, figures
)
logging.info(f"Updated Layout Data: {updated_layout}")
except Exception as e:
logging.error(f"Failed to merge figures into layout. Error: {e}")
return {
"recordId": record["recordId"],
"data": {},
"data": None,
"errors": [
{
"message": "Failed to merge figures into layout.",
Expand Down
12 changes: 6 additions & 6 deletions image_processing/src/image_processing/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This file was autogenerated by uv via the following command:
# uv export --frozen --no-hashes --no-editable --no-sources --no-group dev --directory image_processing -o src/image_processing/requirements.txt
aiohappyeyeballs==2.4.4
aiohttp==3.11.11
aiohttp==3.11.12
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.8.0
Expand All @@ -16,7 +16,7 @@ azure-identity==1.19.0
azure-search==1.0.0b2
azure-search-documents==11.6.0b8
azure-storage-blob==12.24.1
beautifulsoup4==4.12.3
beautifulsoup4==4.13.3
blis==0.7.11
bs4==0.0.2
catalogue==2.0.10
Expand All @@ -34,7 +34,7 @@ en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_
et-xmlfile==2.0.0
filelock==3.17.0
frozenlist==1.5.0
fsspec==2024.12.0
fsspec==2025.2.0
h11==0.14.0
httpcore==1.0.7
httpx==0.28.1
Expand All @@ -50,15 +50,15 @@ marisa-trie==1.2.1
markdown-it-py==3.0.0
markupsafe==3.0.2
mdurl==0.1.2
model2vec==0.3.8
model2vec==0.3.9
msal==1.31.1
msal-extensions==1.2.0
msrest==0.7.1
multidict==6.1.0
murmurhash==1.0.12
numpy==1.26.4
oauthlib==3.2.2
openai==1.60.2
openai==1.61.1
openpyxl==3.1.5
packaging==24.2
pandas==2.2.3
Expand All @@ -71,7 +71,7 @@ pydantic==2.10.6
pydantic-core==2.27.2
pygments==2.19.1
pyjwt==2.10.1
pymupdf==1.25.2
pymupdf==1.25.3
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
pytz==2025.1
Expand Down
63 changes: 48 additions & 15 deletions image_processing/src/image_processing/semantic_text_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,22 @@ def num_tokens_from_string(self, string: str) -> int:

return len(encoding.encode(string))

def clean_chunks_and_map(self, chunks, is_table_or_figure_map):
    """Strip whitespace from each chunk and drop empty ones, keeping the
    table/figure flag list aligned with the surviving chunks.

    Chunks that are Markdown headings (detected on the un-stripped text)
    are re-padded with blank lines on both sides so they stay visually
    separated from neighbouring content.

    Args:
        chunks: Candidate chunk strings.
        is_table_or_figure_map: Per-chunk flags, same length as ``chunks``.

    Returns:
        Tuple of (cleaned chunks, matching flags), both filtered in step.
    """
    kept_chunks = []
    kept_flags = []

    for raw_chunk, flag in zip(chunks, is_table_or_figure_map):
        stripped = raw_chunk.strip()
        if not stripped:
            # Whitespace-only chunks are discarded, along with their flag.
            continue

        # Heading check uses the original (un-stripped) text; surround the
        # cleaned heading with blank lines to keep it on its own paragraph.
        if self.is_markdown_heading(raw_chunk):
            stripped = "\n\n" + stripped + "\n\n"

        kept_chunks.append(stripped)
        kept_flags.append(flag)

    return kept_chunks, kept_flags

async def chunk(self, text: str) -> list[dict]:
"""Attempts to chunk the text by:
Splitting into sentences
Expand Down Expand Up @@ -86,6 +102,10 @@ async def chunk(self, text: str) -> list[dict]:
grouped_sentences, is_table_or_figure_map
)

forward_pass_chunks, new_is_table_or_figure_map = self.clean_chunks_and_map(
forward_pass_chunks, new_is_table_or_figure_map
)

logging.info(
f"""Number of Forward pass chunks: {
len(forward_pass_chunks)}"""
Expand Down Expand Up @@ -129,7 +149,7 @@ def filter_empty_figures(self, text):

def clean_new_lines(self, text):
# Remove single newlines surrounded by < and >
cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text)
cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text.strip())

# Replace all other single newlines with space
cleaned_text = re.sub(r"(?<!\n)\n(?!\n)", " ", cleaned_text)
Expand Down Expand Up @@ -190,7 +210,7 @@ def split_into_sentences(self, text: str) -> list[str]:
self.is_markdown_heading(part)
and part.endswith("\n\n") is False
):
part = part + "\n\n"
part = "\n\n" + part + "\n\n"

heading_split_sentences.append(part)

Expand Down Expand Up @@ -300,23 +320,36 @@ def retrive_current_chunk_at_n(n):
else:
return current_chunk[n]

current_chunk_tokens = self.num_tokens_from_string(" ".join(current_chunk))
def get_current_chunk_tokens(chunk_segments):
return self.num_tokens_from_string(" ".join(chunk_segments))

current_chunk_tokens = get_current_chunk_tokens(current_chunk)

if len(current_chunk) >= 2 and current_chunk_tokens >= self.min_chunk_tokens:
logging.info("Comparing chunks")
cosine_sim = self.sentence_similarity(
retrieve_current_chunks_from_n(-2), current_sentence
)
# Calculate the tokens if we were to split
if len(current_chunk) > 2:
would_be_new_chunk = retrieve_current_chunk_up_to_n(1)
would_be_current_chunk = [retrive_current_chunk_at_n(-1)]
else:
would_be_new_chunk = retrive_current_chunk_at_n(0)
would_be_current_chunk = [retrive_current_chunk_at_n(1)]

if (
cosine_sim < self.similarity_threshold
or current_chunk_tokens >= self.max_chunk_tokens
get_current_chunk_tokens(would_be_new_chunk) >= self.min_chunk_tokens
and get_current_chunk_tokens(would_be_current_chunk)
>= self.min_chunk_tokens
):
if len(current_chunk) > 2:
new_chunk = retrieve_current_chunk_up_to_n(1)
current_chunk = [retrive_current_chunk_at_n(-1)]
else:
new_chunk = retrive_current_chunk_at_n(0)
current_chunk = [retrive_current_chunk_at_n(1)]
logging.info("Comparing chunks")
if (
current_chunk_tokens >= self.max_chunk_tokens
or self.sentence_similarity(
retrieve_current_chunks_from_n(-2), current_sentence
)
< self.similarity_threshold
):
return would_be_new_chunk, would_be_current_chunk
else:
logging.info("Chunk too small to compare")
else:
logging.info("Chunk too small to compare")

Expand Down
Loading
Loading