Better handling

h2oai · Oct 26, 2024 · 1e2cc02 · 1e2cc02
1 parent 342840e
commit 1e2cc02
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 17 deletions.
diff --git a/openai_server/agent_tools/convert_document_to_text.py b/openai_server/agent_tools/convert_document_to_text.py
@@ -41,6 +41,7 @@ def get_num_pages(file):
 
 def process_files(files, urls):
     text_context_list = []
+    succeeded = []
 
     textual_types = ('.txt', '.csv', '.toml', '.py', '.rst', '.rtf', '.md', '.html', '.htm', '.xml', '.json', '.yaml',
                      '.yml', '.ini', '.log', '.tex', '.sql', '.sh', '.bat', '.js', '.css', '.php', '.jsp', '.pl', '.r',
@@ -138,18 +139,23 @@ def process_files(files, urls):
                 sources1 = sources2
 
         if not sources1:
+            succeeded.append(False)
             print(f"Unable to handle file type for {filename}")
         else:
+            succeeded.append(True)
             text_context_list.extend([x.page_content for x in sources1])
 
-    return text_context_list
+    return text_context_list, any(succeeded)
 
 
 def get_text(files, urls):
-    text_context_list = process_files(files, urls)
+    text_context_list, any_succeeded = process_files(files, urls)
 
     # Join the text_context_list into a single string
-    output_text = "\n\n".join(text_context_list)
+    if any_succeeded:
+        output_text = "\n\n".join(text_context_list)
+    else:
+        output_text = None
 
     return output_text
 
@@ -170,20 +176,23 @@ def main():
     output_text = get_text(files, urls)
 
     # Write the output to the specified file
-    with open(args.output, "w") as f:
-        f.write(output_text)
-
-    print(f"{files + urls} have been converted to text and written to {args.output}")
-    print("The output may be complex for input of PDFs or URLs etc., so do not assume the structure of the output file and instead check it directly.")
-    print("Probably a verify any use of convert_document_to_text.py with ask_question_about_documents.py")
-
-    max_tokens = 1024
-    max_chars = max_tokens * 4
-    if len(output_text) > max_chars:
-        print("Head of the output:")
-        print(output_text[:max_chars])
+    if output_text is not None:
+        with open(args.output, "w") as f:
+            f.write(output_text)
+
+        print(f"{files + urls} have been converted to text and written to {args.output}")
+        print("The output may be complex for input of PDFs or URLs etc., so do not assume the structure of the output file and instead check it directly.")
+        print("Probably a verify any use of convert_document_to_text.py with ask_question_about_documents.py")
+
+        max_tokens = 1024
+        max_chars = max_tokens * 4
+        if len(output_text) > max_chars:
+            print("Head of the output:")
+            print(output_text[:max_chars])
+        else:
+            print(output_text)
     else:
-        print(output_text)
+        print("Failed to convert files or URLs to text")
 
     return output_text
 

diff --git a/src/version.py b/src/version.py
@@ -1 +1 @@
-__version__ = "bef9afd4cee37c93baaea3e6402d9e260ac59998"
+__version__ = "342840e6e5afdc114008ef3e85edd0a10ba99fdf"