Skip to content

Commit 9f95cc6

Browse files
authored
Merge pull request #1153 from shneeba/document_lookup-images
include images in document lookup
2 parents 3d307e6 + a05aba7 commit 9f95cc6

File tree

1 file changed

+21
-12
lines changed

1 file changed

+21
-12
lines changed

llmware/retrieval.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1320,37 +1320,46 @@ def aggregate_text(self, qr_list):
13201320

13211321
return text_agg, meta_agg
13221322

1323-
def document_lookup(self, doc_id="", file_source=""):
1324-
1325-
""" Takes as an input either a doc_id or file_source (e.g., filename) that is in a Library, and
1326-
returns all of the non-image text and table blocks in the document. """
1323+
def document_lookup(self, doc_id="", file_source="", include_images=False):
1324+
"""
1325+
Takes as an input either a doc_id or file_source (e.g., filename) that is in a Library, and
1326+
returns all of the text and table blocks in the document. Images can be optionally included.
1327+
1328+
Parameters:
1329+
doc_id (str): Document ID.
1330+
file_source (str): Source file name.
1331+
include_images (bool): Whether to include images in the result. Defaults to False.
1332+
1333+
Returns:
1334+
list: Document blocks sorted by block_ID, each with "matches" and "page_num" keys added; empty if nothing is found.
1335+
"""
13271336

13281337
if doc_id:
13291338
kv_dict = {"doc_ID": doc_id}
13301339
elif file_source:
13311340
kv_dict = {"file_source": file_source}
13321341
else:
1333-
raise RuntimeError("Query document_lookup method requires as input either a document ID or "
1334-
"the name of a file already parsed in the library ")
1342+
raise RuntimeError(
1343+
"Query document_lookup method requires as input either a document ID or "
1344+
"the name of a file already parsed in the library"
1345+
)
13351346

13361347
output = CollectionRetrieval(self.library_name, account_name=self.account_name).filter_by_key_dict(kv_dict)
13371348

13381349
if len(output) == 0:
13391350
logger.warning(f"update: Query - document_lookup - nothing found - {doc_id} - {file_source}")
1340-
result = []
1341-
1342-
return result
1351+
return []
13431352

13441353
output_final = []
13451354

1346-
# exclude images to avoid potential duplicate text
13471355
for entries in output:
1348-
if entries["content_type"] != "image":
1356+
# Filter out images if include_images is False
1357+
if include_images or entries["content_type"] != "image":
13491358
entries.update({"matches": []})
13501359
entries.update({"page_num": entries["master_index"]})
13511360
output_final.append(entries)
13521361

1353-
output_final = sorted(output_final, key=lambda x:x["block_ID"], reverse=False)
1362+
output_final = sorted(output_final, key=lambda x: x["block_ID"], reverse=False)
13541363

13551364
return output_final
13561365

0 commit comments

Comments
 (0)