@@ -1320,37 +1320,46 @@ def aggregate_text(self, qr_list):
1320
1320
1321
1321
return text_agg , meta_agg
1322
1322
1323
def document_lookup(self, doc_id="", file_source="", include_images=False):
    """
    Return the blocks of a single document in the Library, ordered by block ID.

    Takes as an input either a doc_id or file_source (e.g., filename) that is in a Library, and
    returns all of the text and table blocks in the document. Images can be optionally included.
    If both doc_id and file_source are supplied, doc_id takes precedence and file_source is
    only used in the "nothing found" log message.

    Parameters:
        doc_id (str): Document ID.
        file_source (str): Source file name.
        include_images (bool): Whether to include blocks whose "content_type" is "image"
            in the result. Defaults to False.

    Returns:
        list: Matching block entries sorted ascending by "block_ID". Each returned entry is
            updated in place with an empty "matches" list and a "page_num" key copied from
            its "master_index". An empty list is returned when the lookup finds nothing.

    Raises:
        RuntimeError: If neither doc_id nor file_source is provided.
    """

    # Build the lookup filter; doc_id wins when both identifiers are given.
    if doc_id:
        kv_dict = {"doc_ID": doc_id}
    elif file_source:
        kv_dict = {"file_source": file_source}
    else:
        raise RuntimeError(
            "Query document_lookup method requires as input either a document ID or "
            "the name of a file already parsed in the library"
        )

    output = CollectionRetrieval(
        self.library_name, account_name=self.account_name
    ).filter_by_key_dict(kv_dict)

    if len(output) == 0:
        logger.warning(f"update: Query - document_lookup - nothing found - {doc_id} - {file_source} ")
        return []

    output_final = []

    for entries in output:
        # Image blocks are skipped unless explicitly requested; the earlier version of this
        # method excluded them "to avoid potential duplicate text".
        if include_images or entries["content_type"] != "image":
            entries["matches"] = []
            entries["page_num"] = entries["master_index"]
            output_final.append(entries)

    # Ascending order by block ID.
    output_final.sort(key=lambda x: x["block_ID"])

    return output_final
1356
1365
0 commit comments