Skip to content

Commit

Permalink
Merge pull request #77 from eren23/strict-content-test
Browse files Browse the repository at this point in the history
Strict content test
  • Loading branch information
kaanozbudak authored Apr 17, 2023
2 parents 0b32122 + 0f6e995 commit acb0199
Show file tree
Hide file tree
Showing 13 changed files with 335 additions and 25 deletions.
280 changes: 280 additions & 0 deletions examples/pdf_extractor_strict_example.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions knowledgegpt/extractors/audio_to_text_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ class AudioToTextExtractor(BaseExtractor):
"""

def __init__(self, audio_path: str, embedding_extractor='hf', model_lang='en', is_turbo: bool = False,
verbose: bool = False, index_path: str = None, index_type: str = "basic"):
verbose: bool = False, index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
verbose=verbose, index_path=index_path, index_type=index_type)
verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
self.audio_path = audio_path

def prepare_df(self):
Expand Down
4 changes: 2 additions & 2 deletions knowledgegpt/extractors/base_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ class BaseAgent(BaseExtractor):
"""

def __init__(self, dataframe=None, embedding_extractor="hf", model_lang="en", is_turbo=False, index_type="basic",
verbose=False, index_path=None, is_gpt4=False, prompt_template=None, task_type="image_generation", hf_token=None):
verbose=False, index_path=None, is_gpt4=False, prompt_template=None, task_type="image_generation", hf_token=None, strict_context: bool = False):
super().__init__(dataframe=dataframe, embedding_extractor=embedding_extractor, model_lang=model_lang,
is_turbo=is_turbo, index_type=index_type, verbose=verbose, index_path=index_path, is_gpt4=is_gpt4,
prompt_template=prompt_template)
prompt_template=prompt_template, strict_context=strict_context)

self.hf_token = hf_token
self.task_type = task_type
Expand Down
7 changes: 4 additions & 3 deletions knowledgegpt/extractors/base_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class BaseExtractor:
def __init__(self, dataframe=None, embedding_extractor="hf", model_lang="en", is_turbo=False, index_type="basic",
verbose=False, index_path=None, is_gpt4=False, prompt_template=None):
verbose=False, index_path=None, is_gpt4=False, prompt_template=None, strict_context=False):

"""
:param dataframe: if you have own df use it else choose correct extractor
Expand All @@ -32,7 +32,7 @@ def __init__(self, dataframe=None, embedding_extractor="hf", model_lang="en", is
self.is_gpt4 = is_gpt4
self.prompt_template = prompt_template
self.messages = []

self.strict_context = strict_context
self.embeddings = None
self.answer = ""
self.prompt = ""
Expand Down Expand Up @@ -86,7 +86,8 @@ def extract(self, query, max_tokens, load_index=False, context_restarter=False)
max_tokens=max_tokens,
index_type=self.index_type,
prompt_template=self.prompt_template,
context_restarter=context_restarter
context_restarter=context_restarter,
strict_context=self.strict_context
)
if not self.verbose:
print("all_done!")
Expand Down
4 changes: 2 additions & 2 deletions knowledgegpt/extractors/docs_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ class DocsExtractor(BaseExtractor):
"""

def __init__(self, file_path: str, embedding_extractor: str = "hf", model_lang: str = "en", is_turbo: bool = False,
verbose: bool = False, index_path: str = None, index_type: str = "basic"):
verbose: bool = False, index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
verbose=verbose, index_path=index_path, index_type=index_type)
verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
self.file_path = file_path


Expand Down
4 changes: 2 additions & 2 deletions knowledgegpt/extractors/hybrid_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@

class HybridFileExtractpr(BaseExtractor):
def __init__(self, directory_path: str, extraction_type: str = "page", embedding_extractor: str = "hf",
model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None, index_type: str = "basic"):
model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
"""
Extracts paragraphs from a PDF file and computes embeddings for each paragraph,
then answers a query using the embeddings.
"""
super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
verbose=verbose, index_path=index_path, index_type=index_type)
verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)

self.directory_path = directory_path
self.extraction_type = extraction_type
Expand Down
4 changes: 2 additions & 2 deletions knowledgegpt/extractors/pdf_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@

class PDFExtractor(BaseExtractor):
def __init__(self, pdf_file_path: str, extraction_type: str = "page", embedding_extractor: str = "hf",
model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None, index_type: str = "basic"):
model_lang: str = "en", is_turbo: bool = False, verbose: bool = False, index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
"""
Extracts paragraphs from a PDF file and computes embeddings for each paragraph,
then answers a query using the embeddings.
"""
super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
verbose=verbose, index_path=index_path, index_type=index_type)
verbose=verbose, index_path=index_path, index_type=index_type,is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)

self.pdf_file_path = pdf_file_path
self.extraction_type = extraction_type
Expand Down
4 changes: 2 additions & 2 deletions knowledgegpt/extractors/powerpoint_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ class PowerpointExtractor(BaseExtractor):
"""

def __init__(self, file_path, embedding_extractor: str = "hf", model_lang: str = "en", is_turbo: bool = False,
verbose: bool = False, index_path: str = None, index_type: str = "basic"):
verbose: bool = False, index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):

super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
verbose=verbose, index_path=index_path, index_type=index_type)
verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
self.file_path = file_path


Expand Down
4 changes: 2 additions & 2 deletions knowledgegpt/extractors/web_scrape_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ class WebScrapeExtractor(BaseExtractor):
"""

def __init__(self, url, embedding_extractor: str, model_lang: str, is_turbo: bool = False, verbose: bool = False,
index_path: str = None, index_type: str = "basic"):
index_path: str = None, index_type: str = "basic", strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
verbose=verbose, index_path=index_path, index_type=index_type)
verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
self.url = url

def prepare_df(self):
Expand Down
4 changes: 2 additions & 2 deletions knowledgegpt/extractors/yt_audio_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ class YoutubeAudioExtractor(BaseExtractor):
"""

def __init__(self, video_id: str, embedding_extractor='hf', model_lang='en', is_turbo: bool = False,
verbose: bool = False, index_path: str = None, index_type: str = "basic", is_playlist: bool = False):
verbose: bool = False, index_path: str = None, index_type: str = "basic", is_playlist: bool = False, strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
verbose=verbose, index_path=index_path, index_type=index_type)
verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
self.video_id = video_id
self.is_playlist = is_playlist

Expand Down
4 changes: 2 additions & 2 deletions knowledgegpt/extractors/yt_subs_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ class YTSubsExtractor(BaseExtractor):
"""

def __init__(self, video_id: str, model_lang="en", embedding_extractor="hf", is_turbo: bool = False,
verbose: bool = False, index_path: str = None, index_type: str = "basic", is_playlist: bool = False):
verbose: bool = False, index_path: str = None, index_type: str = "basic", is_playlist: bool = False, strict_context: bool = False, is_gpt4: bool = False, prompt_template: str = None):
super().__init__(embedding_extractor=embedding_extractor, model_lang=model_lang, is_turbo=is_turbo,
verbose=verbose, index_path=index_path, index_type=index_type)
verbose=verbose, index_path=index_path, index_type=index_type, is_gpt4=is_gpt4, prompt_template=prompt_template, strict_context=strict_context)
self.video_id = video_id
self.is_playlist = is_playlist

Expand Down
6 changes: 4 additions & 2 deletions knowledgegpt/utils/utils_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def answer_query_with_context(
index_type: str = "basic",
max_tokens=1000,
prompt_template=None,
context_restarter: bool = False
context_restarter: bool = False,
strict_context: bool = False,
) -> str:
"""
Answer a query using the provided context.
Expand Down Expand Up @@ -66,7 +67,8 @@ def answer_query_with_context(
model_lang=model_lang,
max_tokens=max_tokens,
index_type=index_type,
prompt_template=prompt_template
prompt_template=prompt_template,
strict_context=strict_context,
)
if is_turbo:
messages.append({"role": "user", "content": prompt})
Expand Down
31 changes: 29 additions & 2 deletions knowledgegpt/utils/utils_prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,17 @@
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

# Prompt template for a per-section relevancy check.
# It carries two `str.format` placeholders: `{question}` and `{context}`.
# The answer vocabulary listed in the prompt ("no"/"n"/"0"/"false"/"f", etc.)
# must stay in sync with whatever code parses the model's reply.
relevancy_template = '''
Your duty is to check if the question given and the context part given are relevant to each other. If they are relevant, please write "yes" or "y" or "1" or "true" or "t". If they are not relevant, please write "no" or "n" or "0" or "false" or "f". If you are not sure, please write "unsure" or "u" or "2" or "maybe" or "m".
You don't have to be super strict, a basic relevancy check is enough we are trying to hunt down stuff like references to other documents, or other stuff that is not relevant to the question.
Question: {question}
Context: {context}
Answer:
'''


def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame, embedding_type: str = "hf",
verbose=False, model_lang: str = "en", max_tokens=1000, index_type="basic", prompt_template=None) -> str:
verbose=False, model_lang: str = "en", max_tokens=1000, index_type="basic", prompt_template=None, strict_context=False) -> str:
"""
Construct the prompt to be used for completion.
:param question: The question to answer.
Expand All @@ -37,9 +45,28 @@ def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame,
chosen_sections = []
chosen_sections_len = 0
chosen_sections_indexes = []

if strict_context:
print("STRICT MODE IS ON, THIS IS GOING TO TAKE A WHILE AND IS AN EXPERIMENTAL FEATURE")
for _, section_index in most_relevant_document_sections:
document_section = df.loc[section_index]

if strict_context:
if len(document_section.content) < 10:
continue

import openai
from knowledgegpt.utils.utils_completion import model_types

prompt = relevancy_template.format(question=question, context=document_section.content)

response = openai.Completion.create(
prompt = prompt,
** model_types["davinci"]
)

if response["choices"][0]["text"].strip(" \n").lower() in ["no", "n", "0", "false", "f"]:
continue

document_tokens = len(encoding.encode(document_section.content))
chosen_sections_len += document_tokens + separator_len
if chosen_sections_len > MAX_SECTION_LEN:
Expand Down

0 comments on commit acb0199

Please sign in to comment.