def sources_to_text(sources1):
    """Render parsed document sources as one text blob with metadata headers.

    For every item in ``sources1`` a header is built from whichever of the
    ``source``, ``parser``, ``title`` and ``page`` metadata keys are present
    (in that order, one ``Label: value`` line each), followed by the item's
    ``page_content``. The per-item sections are concatenated in input order.

    :param sources1: iterable of objects exposing ``metadata`` (a mapping, or
        ``None``/absent, which is treated as empty) and ``page_content``.
    :return: the concatenated text; an empty string when ``sources1`` is empty.
    """
    # (metadata key, header label) pairs, in the order they are emitted.
    labelled_keys = (
        ('source', 'Source'),
        ('parser', 'Parser'),
        ('title', 'Title'),
        ('page', 'Page'),
    )

    sections = []
    for source in sources1:
        # Tolerate a missing or None metadata mapping instead of raising.
        meta = getattr(source, 'metadata', None) or {}
        # Presence check (not truthiness): a key with a falsy value still
        # produces its header line.
        meta_str = ''.join(
            f"{label}: {meta[key]}\n"
            for key, label in labelled_keys
            if key in meta
        )
        sections.append(f"""\n\n{meta_str}\n\n{source.page_content}\n\n\n""")

    # One join instead of repeated `+=`, which re-copies the growing string.
    return ''.join(sections)