Merge pull request #77 from eren23/strict-content-test

Strict content test
geeks-of-data · Apr 17, 2023 · acb0199
2 parents 0b32122 + 0f6e995
commit acb0199
Show file tree

Hide file tree

Showing 13 changed files with 335 additions and 25 deletions.
diff --git a/examples/pdf_extractor_strict_example.ipynb b/examples/pdf_extractor_strict_example.ipynb
diff --git a/knowledgegpt/extractors/audio_to_text_extractor.py b/knowledgegpt/extractors/audio_to_text_extractor.py
@@ -9,9 +9,9 @@ class AudioToTextExtractor(BaseExtractor):
     """
 
     def __init__(self, audio_path: str, embedding_extractor='hf', model_lang='en', is_turbo: bool = False,
-                 verbose: bool = False, index_path: str = None, index_type: str = "basic"):
+                 verbose: bool = False, index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
         super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
-                         verbose=verbose, index_path=index_path, index_type=index_type)
+                         verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
         self.audio_path = audio_path
 
     def prepare_df(self):

diff --git a/knowledgegpt/extractors/base_agent.py b/knowledgegpt/extractors/base_agent.py
@@ -9,10 +9,10 @@ class BaseAgent(BaseExtractor):
     """
 
     def __init__(self, dataframe=None, embedding_extractor="hf", model_lang="en", is_turbo=False, index_type="basic",
-                 verbose=False, index_path=None, is_gpt4=False, prompt_template=None, task_type="image_generation", hf_token=None):
+                 verbose=False, index_path=None, is_gpt4=False, prompt_template=None, task_type="image_generation", hf_token=None,  strict_context: bool = False):
         super().__init__(dataframe=dataframe, embedding_extractor=embedding_extractor, model_lang=model_lang,
                          is_turbo=is_turbo, index_type=index_type, verbose=verbose, index_path=index_path, is_gpt4=is_gpt4,
-                         prompt_template=prompt_template)
+                         prompt_template=prompt_template, strict_context=strict_context)
 
         self.hf_token = hf_token
         self.task_type = task_type

diff --git a/knowledgegpt/extractors/base_extractor.py b/knowledgegpt/extractors/base_extractor.py
@@ -5,7 +5,7 @@
 
 class BaseExtractor:
     def __init__(self, dataframe=None, embedding_extractor="hf", model_lang="en", is_turbo=False, index_type="basic",
-                 verbose=False, index_path=None, is_gpt4=False, prompt_template=None):
+                 verbose=False, index_path=None, is_gpt4=False, prompt_template=None, strict_context=False):
 
         """
         :param dataframe: if you have own df use it else choose correct extractor
@@ -32,7 +32,7 @@ def __init__(self, dataframe=None, embedding_extractor="hf", model_lang="en", is
         self.is_gpt4 = is_gpt4
         self.prompt_template = prompt_template
         self.messages = []
-
+        self.strict_context = strict_context
         self.embeddings = None
         self.answer = ""
         self.prompt = ""
@@ -86,7 +86,8 @@ def extract(self, query, max_tokens, load_index=False, context_restarter=False)
             max_tokens=max_tokens,
             index_type=self.index_type,
             prompt_template=self.prompt_template,
-            context_restarter=context_restarter
+            context_restarter=context_restarter,
+            strict_context=self.strict_context
         )
         if not self.verbose:
             print("all_done!")

diff --git a/knowledgegpt/extractors/docs_extractor.py b/knowledgegpt/extractors/docs_extractor.py
@@ -13,9 +13,9 @@ class DocsExtractor(BaseExtractor):
     """
 
     def __init__(self, file_path: str, embedding_extractor: str = "hf", model_lang: str = "en", is_turbo: bool = False,
-                 verbose: bool = False, index_path: str = None, index_type: str = "basic"):
+                 verbose: bool = False, index_path: str = None, index_type: str = "basic",  strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
         super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
-                         verbose=verbose, index_path=index_path, index_type=index_type)
+                         verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
         self.file_path = file_path
 
 

diff --git a/knowledgegpt/extractors/hybrid_extractor.py b/knowledgegpt/extractors/hybrid_extractor.py
@@ -8,13 +8,13 @@
 
 class HybridFileExtractpr(BaseExtractor):
     def __init__(self, directory_path: str, extraction_type: str = "page", embedding_extractor: str = "hf",
-                 model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None, index_type: str = "basic"):
+                 model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
         """
         Extracts paragraphs from a PDF file and computes embeddings for each paragraph,
         then answers a query using the embeddings.
         """
         super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
-                         verbose=verbose, index_path=index_path, index_type=index_type)
+                         verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
 
         self.directory_path = directory_path
         self.extraction_type = extraction_type

diff --git a/knowledgegpt/extractors/pdf_extractor.py b/knowledgegpt/extractors/pdf_extractor.py
@@ -6,13 +6,13 @@
 
 class PDFExtractor(BaseExtractor):
     def __init__(self, pdf_file_path: str, extraction_type: str = "page", embedding_extractor: str = "hf",
-                 model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None, index_type: str = "basic"):
+                 model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
         """
         Extracts paragraphs from a PDF file and computes embeddings for each paragraph,
         then answers a query using the embeddings.
         """
         super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
-                         verbose=verbose, index_path=index_path, index_type=index_type)
+                         verbose=verbose, index_path=index_path, index_type=index_type,is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
 
         self.pdf_file_path = pdf_file_path
         self.extraction_type = extraction_type

diff --git a/knowledgegpt/extractors/powerpoint_extractor.py b/knowledgegpt/extractors/powerpoint_extractor.py
@@ -11,10 +11,10 @@ class PowerpointExtractor(BaseExtractor):
     """
 
     def __init__(self, file_path, embedding_extractor: str = "hf", model_lang: str = "en", is_turbo: bool = False,
-                 verbose: bool = False, index_path: str = None, index_type: str = "basic"):
+                 verbose: bool = False, index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
 
         super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
-                         verbose=verbose, index_path=index_path, index_type=index_type)
+                         verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
         self.file_path = file_path
 
 

diff --git a/knowledgegpt/extractors/web_scrape_extractor.py b/knowledgegpt/extractors/web_scrape_extractor.py
@@ -8,9 +8,9 @@ class WebScrapeExtractor(BaseExtractor):
     """
 
     def __init__(self, url, embedding_extractor: str, model_lang: str, is_turbo: bool = False, verbose: bool = False,
-                 index_path: str = None, index_type: str = "basic"):
+                 index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
         super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
-                         verbose=verbose, index_path=index_path, index_type=index_type)
+                         verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
         self.url = url
 
     def prepare_df(self):

diff --git a/knowledgegpt/extractors/yt_audio_extractor.py b/knowledgegpt/extractors/yt_audio_extractor.py
@@ -10,9 +10,9 @@ class YoutubeAudioExtractor(BaseExtractor):
     """
 
     def __init__(self, video_id: str, embedding_extractor='hf', model_lang='en', is_turbo: bool = False,
-                 verbose: bool = False, index_path: str = None, index_type: str = "basic", is_playlist: bool = False):
+                 verbose: bool = False, index_path: str = None, index_type: str = "basic", is_playlist: bool = False, strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
         super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
-                         verbose=verbose, index_path=index_path, index_type=index_type)
+                         verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
         self.video_id = video_id
         self.is_playlist = is_playlist
 

diff --git a/knowledgegpt/extractors/yt_subs_extractor.py b/knowledgegpt/extractors/yt_subs_extractor.py
@@ -11,9 +11,9 @@ class YTSubsExtractor(BaseExtractor):
     """
 
     def __init__(self, video_id: str, model_lang="en", embedding_extractor="hf", is_turbo: bool = False,
-                 verbose: bool = False, index_path: str = None, index_type: str = "basic", is_playlist: bool = False):
+                 verbose: bool = False, index_path: str = None, index_type: str = "basic", is_playlist: bool = False, strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
         super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
-                         verbose=verbose, index_path=index_path, index_type=index_type)
+                         verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
         self.video_id = video_id
         self.is_playlist = is_playlist
 

diff --git a/knowledgegpt/utils/utils_completion.py b/knowledgegpt/utils/utils_completion.py
@@ -38,7 +38,8 @@ def answer_query_with_context(
         index_type: str = "basic",
         max_tokens=1000,
         prompt_template=None,
-        context_restarter: bool = False
+        context_restarter: bool = False,
+        strict_context: bool = False,
 ) -> str:
     """
     Answer a query using the provided context.
@@ -66,7 +67,8 @@ def answer_query_with_context(
             model_lang=model_lang,
             max_tokens=max_tokens,
             index_type=index_type,
-            prompt_template=prompt_template
+            prompt_template=prompt_template,
+            strict_context=strict_context,
         )
         if is_turbo:
             messages.append({"role": "user", "content": prompt})

diff --git a/knowledgegpt/utils/utils_prompt.py b/knowledgegpt/utils/utils_prompt.py
@@ -10,9 +10,17 @@
 encoding = tiktoken.get_encoding(ENCODING)
 separator_len = len(encoding.encode(SEPARATOR))
 
+relevancy_template = '''
+You duty is to check if the question given and the context part given are relevant to each other. If they are relevant, please write "yes" or "y" or "1" or "true" or "t". If they are not relevant, please write "no" or "n" or "0" or "false" or "f". If you are not sure, please write "unsure" or "u" or "2" or "maybe" or "m".
+You don't have to be super strict, a basic relevancy check is enough we are trying to hunt down stuff like references to other documents, or other stuff that is not relevant to the question.
+Question: {question}
+Context: {context}
+Answer:
+'''
+
 
 def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame, embedding_type: str = "hf",
-                     verbose=False, model_lang: str = "en", max_tokens=1000, index_type="basic", prompt_template=None) -> str:
+                     verbose=False, model_lang: str = "en", max_tokens=1000, index_type="basic", prompt_template=None, strict_context=False) -> str:
     """
     Construct the prompt to be used for completion.
     :param question: The question to answer.
@@ -37,9 +45,28 @@ def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame,
     chosen_sections = []
     chosen_sections_len = 0
     chosen_sections_indexes = []
-
+    if strict_context:
+        print("STRICT MODE IS ON, THIS IS GOING TO TAKE A WHILE AND IS AN EXPERIMENTAL FEATURE")
     for _, section_index in most_relevant_document_sections:
         document_section = df.loc[section_index]
+
+        if strict_context:
+            if len(document_section.content) < 10:
+                continue
+
+            import openai
+            from knowledgegpt.utils.utils_completion import model_types
+
+            prompt = relevancy_template.format(question=question, context=document_section.content)
+
+            response = openai.Completion.create(
+                prompt = prompt,
+                ** model_types["davinci"]
+            )
+
+            if response["choices"][0]["text"].strip(" \n").lower() in ["no", "n", "0", "false", "f"]:
+                continue
+
         document_tokens = len(encoding.encode(document_section.content))
         chosen_sections_len += document_tokens + separator_len
         if chosen_sections_len > MAX_SECTION_LEN: