From 614e751a0a1b6c80bafb2500db6b04b8514fbe92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=B5=E8=8A=B7=E9=8A=93?=
Date: Mon, 26 Feb 2024 14:36:28 +0800
Subject: [PATCH] deprecate topK; add JSON_formatter_dict; add use_rerank

---
 .gitignore          |   7 +-
 akasha/akasha.py    |  47 ++++++++++---
 akasha/eval/eval.py |  36 +++++-----
 akasha/helper.py    |  20 +++---
 akasha/prompts.py   |  60 +++++++++++------
 akasha/search.py    |  51 ++++++++++----
 cli/glue.py         | 160 ++++++++++++++++++++++++++------------------
 example.py          |  18 ++++-
 readme.md           |  26 +++---
 setup.py            |   2 +-
 10 files changed, 274 insertions(+), 153 deletions(-)

diff --git a/.gitignore b/.gitignore
index 495603f..c6f69c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -59,4 +59,9 @@ dataset.json
 expert.json
 question_set2.txt
 result.csv
-test_*.py
\ No newline at end of file
+test_*.py
+.vscode/settings.json
+SIMYOU.TTF
+SIMYOU.cw127.pkl
+SIMYOU.pkl
+default.pdf
diff --git a/akasha/akasha.py b/akasha/akasha.py
index 3da59d8..ba32a72 100644
--- a/akasha/akasha.py
+++ b/akasha/akasha.py
@@ -11,6 +11,7 @@
 import akasha.prompts as prompts
 import akasha.db
 import datetime, traceback
+import warnings
 from dotenv import load_dotenv
 
 load_dotenv(pathlib.Path().cwd() / ".env")
@@ -180,7 +181,7 @@ def __init__(
         chunk_size: int = 1000,
         model: str = "openai:gpt-3.5-turbo",
         verbose: bool = False,
-        topK: int = 2,
+        topK: int = -1,
         threshold: float = 0.2,
         language: str = "ch",
         search_type: Union[str, Callable] = "svm",
@@ -225,6 +226,10 @@ def __init__(
         self.temperature = temperature
         self.timestamp_list = []
 
+        if topK != -1:
+            warnings.warn(
+                "The 'topK' parameter is deprecated and will be removed in future versions",
+                DeprecationWarning)
 
     def _set_model(self, **kwargs):
         """change model, embeddings, search_type, temperature if user use **kwargs to change them."""
@@ -439,18 +444,42 @@ def __init__(
         chunk_size: int = 1000,
         model: str = "openai:gpt-3.5-turbo",
         verbose: bool = False,
-        topK: int = 2,
+        topK: int = -1,
         threshold: float = 0.2,
         language: str = "ch",
         search_type: Union[str, Callable] = "svm",
         record_exp: str = "",
         system_prompt: str = "",
+        prompt_format_type: str = "gpt",
         max_doc_len: int = 1500,
         temperature: float = 0.0,
         compression: bool = False,
         use_chroma: bool = False,
+        use_rerank: bool = False,
         ignore_check: bool = False,
     ):
+        """initials of Doc_QA class
+
+        Args:
+            embeddings (_type_, optional): embedding model, including two types(openai and huggingface). Defaults to "openai:text-embedding-ada-002".
+            chunk_size (int, optional): the max length of each text segment. Defaults to 1000.
+            model (_type_, optional): language model. Defaults to "openai:gpt-3.5-turbo".
+            verbose (bool, optional): print the processing text or not. Defaults to False.
+            topK (int, optional): deprecated and no longer used; passing any value other than -1 only raises a DeprecationWarning. Defaults to -1.
+            threshold (float, optional): threshold of similarity for searching relevant documents. Defaults to 0.2.
+            language (str, optional): "ch" chinese or "en" english. Defaults to "ch".
+            search_type (Union[str, Callable], optional): search type for finding relevant documents, currently supports "merge", "svm", "knn", "tfidf", "mmr", or a custom search function. Defaults to "svm".
+            record_exp (str, optional): experiment name of aiido. Defaults to "".
+            system_prompt (str, optional): the prompt you want llm to output in certain format. Defaults to "".
+            prompt_format_type (str, optional): the prompt and system prompt format for the language model, including two types(gpt and llama). Defaults to "gpt".
+            max_doc_len (int, optional): max total length of selected documents. Defaults to 1500.
+            temperature (float, optional): temperature for language model.
Defaults to 0.0. + compression (bool, optional): compress the selected documents or not. Defaults to False. + use_chroma (bool, optional): use chroma db name instead of documents path to load data or not. Defaults to False. + use_rerank (bool, optional): use rerank model to re-rank the selected documents or not. Defaults to False. + ignore_check (bool, optional): speed up loading data if the chroma db is already existed. Defaults to False. + """ + super().__init__( chunk_size, model, @@ -469,6 +498,8 @@ def __init__( self.compression = compression self.use_chroma = use_chroma self.ignore_check = ignore_check + self.use_rerank = use_rerank + self.prompt_format_type = prompt_format_type ### set variables ### self.logs = {} self.model_obj = helper.handle_model(model, self.verbose, @@ -530,7 +561,7 @@ def get_response(self, doc_path: Union[List[str], str], prompt: str, self.db, self.embeddings_obj, self.prompt, - self.topK, + self.use_rerank, self.threshold, self.language, self.search_type, @@ -549,7 +580,7 @@ def get_response(self, doc_path: Union[List[str], str], prompt: str, if self.system_prompt.replace(' ', '') == "": self.system_prompt = prompts.default_doc_ask_prompt() prod_sys_prompt, prod_prompt = prompts.format_sys_prompt( - self.system_prompt, self.prompt) + self.system_prompt, self.prompt, self.prompt_format_type) self.response = self._ask_model(prod_sys_prompt, prod_prompt) @@ -640,7 +671,7 @@ def recursive_get_response(prompt_list): self.db, self.embeddings_obj, prompt, - self.topK, + self.use_rerank, self.threshold, self.language, self.search_type, @@ -656,7 +687,7 @@ def recursive_get_response(prompt_list): self.docs = docs + pre_result ## format prompt ## prod_sys_prompt, prod_prompt = prompts.format_sys_prompt( - self.system_prompt, prompt) + self.system_prompt, prompt, self.prompt_format_type) response = self._ask_model(prod_sys_prompt, prod_prompt) @@ -737,7 +768,7 @@ def ask_whole_file(self, file_path: str, prompt: str, **kwargs) -> str: if self.system_prompt.replace(' ', '') == "": self.system_prompt = prompts.default_doc_ask_prompt() prod_sys_prompt, prod_prompt = prompts.format_sys_prompt( - self.system_prompt, self.prompt) + self.system_prompt, self.prompt, self.prompt_format_type) self.response = self._ask_model(prod_sys_prompt, prod_prompt) end_time = time.time() @@ -804,7 +835,7 @@ def ask_self(self, if self.system_prompt.replace(' ', '') == "": self.system_prompt = prompts.default_doc_ask_prompt() prod_sys_prompt, prod_prompt = prompts.format_sys_prompt( - self.system_prompt, self.prompt) + self.system_prompt, self.prompt, self.prompt_format_type) self.response = self._ask_model(prod_sys_prompt, prod_prompt) end_time = time.time() diff --git a/akasha/eval/eval.py b/akasha/eval/eval.py index 35eaaac..f67cc14 100644 --- a/akasha/eval/eval.py +++ b/akasha/eval/eval.py @@ -85,17 +85,19 @@ def __init__( chunk_size: int = 1000, model: str = "openai:gpt-3.5-turbo", verbose: bool = False, - topK: int = 2, + topK: int = -1, threshold: float = 0.2, language: str = "ch", search_type: Union[str, Callable] = "svm", record_exp: str = "", system_prompt: str = "", + prompt_format_type: str = "gpt", max_doc_len: int = 1500, temperature: float = 0.0, question_type: str = "fact", question_style: str = "essay", use_chroma: bool = False, + use_rerank: bool = False, ignore_check: bool = False, ): """initials of Model_Eval class @@ -121,6 +123,7 @@ def __init__( **temperature (float, optional)**: temperature of llm model from 0.0 to 1.0 . 
Defaults to 0.0.\n **question_style (str, optional)**: the style of question you want to generate, "essay" or "single_choice". Defaults to "essay".\n **question_type (str, optional)**: the type of question you want to generate, "fact", "summary", "irrelevant", "compared". Defaults to "fact".\n + **use_rerank (bool, optional)**: use rerank model to re-rank the selected documents or not. Defaults to False. """ super().__init__( @@ -141,7 +144,7 @@ def __init__( self.question_type = question_type self.question_style = question_style self.question_num = 0 - + self.prompt_format_type = prompt_format_type ### set variables ### self.logs = {} self.model_obj = akasha.helper.handle_model(model, self.verbose, @@ -161,6 +164,7 @@ def __init__( self.score = {} self.use_chroma = use_chroma self.ignore_check = ignore_check + self.use_rerank = use_rerank def _save_questionset(self, timestamp: str, output_file_path: str): """save questions and ref answers into txt file, and save the path of question set into logs @@ -486,7 +490,7 @@ def _eval_get_res_fact(self, question: Union[str, list], answer: str, prod_sys = self.system_prompt + akasha.prompts.default_doc_ask_prompt( ) prod_sys, query_with_prompt = akasha.prompts.format_sys_prompt( - prod_sys, question) + prod_sys, question, self.prompt_format_type) else: prod_sys = self.system_prompt query, ans = akasha.prompts.format_question_query(question, answer) @@ -497,7 +501,7 @@ def _eval_get_res_fact(self, question: Union[str, list], answer: str, self.db, self.embeddings_obj, query, - self.topK, + self.use_rerank, self.threshold, self.language, self.search_type, @@ -571,7 +575,7 @@ def _eval_get_res_summary(self, sum_doc: str, answer: str, prompt = "請對以上文件進行摘要。" prod_sys, query_with_prompt = akasha.prompts.format_sys_prompt( - self.system_prompt, prompt) + self.system_prompt, prompt, self.prompt_format_type) self.docs = [ Document(page_content=sum_doc, metadata={ @@ -645,7 +649,7 @@ def auto_create_questionset( choice_num: int = 4, output_file_path: str = "", **kwargs, - ) -> (list, list): + ) -> Tuple[list, list]: """auto create question set by llm model, each time it will randomly select a range of documents from the documents directory, then use llm model to generate a question and answer pair, and save it into a txt file. 1.The format of "single_choice" questionset should be one line one question, and the possibles answers and questions are separate by tab(\t), @@ -953,10 +957,9 @@ def optimum_combination( embeddings_list: list = ["openai:text-embedding-ada-002"], chunk_size_list: list = [500], model_list: list = ["openai:gpt-3.5-turbo"], - topK_list: list = [2], search_type_list: list = ["svm", "tfidf", "mmr"], **kwargs, - ) -> (list, list): + ) -> Tuple[list, list]: """test all combinations of giving lists, and run auto_evaluation to find parameters of the best result. Args: @@ -966,7 +969,6 @@ def optimum_combination( **embeddings_list (_type_, optional)**: list of embeddings models. Defaults to ["openai:text-embedding-ada-002"].\n **chunk_size_list (list, optional)**: list of chunk sizes. Defaults to [500].\n **model_list (_type_, optional)**: list of models. Defaults to ["openai:gpt-3.5-turbo"].\n - **topK_list (list, optional)**: list of topK. Defaults to [2].\n **threshold (float, optional)**: the similarity threshold of searching. Defaults to 0.2.\n **search_type_list (list, optional)**: list of search types, currently have "merge", "svm", "knn", "tfidf", "mmr". Defaults to ['svm','tfidf','mmr']. 
Returns: @@ -977,7 +979,7 @@ def optimum_combination( start_time = time.time() combinations = akasha.helper.get_all_combine(embeddings_list, chunk_size_list, - model_list, topK_list, + model_list, search_type_list) progress = tqdm(len(combinations), total=len(combinations), @@ -991,7 +993,7 @@ def optimum_combination( else: bcr = 0.0 - for embed, chk, mod, tK, st in combinations: + for embed, chk, mod, st in combinations: progress.update(1) if self.question_type.lower() == "essay": @@ -1001,7 +1003,6 @@ def optimum_combination( embeddings=embed, chunk_size=chk, model=mod, - topK=tK, search_type=st, ) @@ -1015,7 +1016,6 @@ def optimum_combination( embed, chk, mod, - tK, self.search_type_str, ) else: @@ -1025,7 +1025,6 @@ def optimum_combination( embeddings=embed, chunk_size=chk, model=mod, - topK=tK, search_type=st, ) bcr = max(bcr, cur_correct_rate) @@ -1035,7 +1034,6 @@ def optimum_combination( embed, chk, mod, - tK, self.search_type_str, ) result_list.append(cur_tup) @@ -1093,7 +1091,7 @@ def create_topic_questionset( choice_num: int = 4, output_file_path: str = "", **kwargs, - ) -> (list, list): + ) -> Tuple[list, list]: """similar to auto_create_questionset, but it will use the topic to find the related documents and create questionset. Args: **doc_path (str)**: documents directory path\n @@ -1157,19 +1155,19 @@ def create_topic_questionset( self.db, self.embeddings_obj, topic, - 99, + self.use_rerank, self.threshold, self.language, self.search_type, self.verbose, self.model_obj, - 999999, + 25000, self.logs[timestamp], ) texts = [doc.page_content for doc in self.docs] metadata = [doc.metadata for doc in self.docs] - print(texts) + doc_range = min(doc_range, len(texts)) progress = tqdm(total=question_num, diff --git a/akasha/helper.py b/akasha/helper.py index cb8a8cb..4e96c18 100644 --- a/akasha/helper.py +++ b/akasha/helper.py @@ -3,7 +3,7 @@ import json, re from pathlib import Path import opencc -from typing import Callable, Union +from typing import Callable, Union, Tuple from langchain_core.messages.ai import AIMessage from langchain_openai import OpenAIEmbeddings, ChatOpenAI, AzureChatOpenAI, AzureOpenAIEmbeddings from akasha.models.hf import chatGLM, get_hf_model, custom_model, custom_embed, remote_model @@ -64,7 +64,7 @@ def _separate_name(name: str): return res_type, res_name -def _handle_azure_env() -> (str, str, str): +def _handle_azure_env() -> Tuple[str, str, str]: """from environment variable get the api_base, api_key, api_version Returns: @@ -408,7 +408,6 @@ def get_all_combine( embeddings_list: list, chunk_size_list: list, model_list: list, - topK_list: list, search_type_list: list, ) -> list: """record all combinations of giving lists @@ -417,7 +416,6 @@ def get_all_combine( **embeddings_list (list)**: list of embeddings(str)\n **chunk_size_list (list)**: list of chunk sizes(int)\n **model_list (list)**: list of models(str)\n - **topK_list (list)**: list of topK(int)\n **search_type_list (list)**: list of search types(str)\n Returns: @@ -427,9 +425,8 @@ def get_all_combine( for embed in embeddings_list: for chk in chunk_size_list: for mod in model_list: - for tK in topK_list: - for st in search_type_list: - res.append((embed, chk, mod, tK, st)) + for st in search_type_list: + res.append((embed, chk, mod, st)) return res @@ -451,11 +448,10 @@ def get_best_combination(result_list: list, idx: int) -> list: for tup in sorted_res: if tup[idx] < max_score: break - res_str = ("embeddings: " + tup[-5] + ", chunk size: " + str(tup[-4]) + - ", model: " + tup[-3] + ", topK: " + 
str(tup[-2]) + - ", search type: " + tup[-1] + "\n") + res_str = ("embeddings: " + tup[-4] + ", chunk size: " + str(tup[-3]) + + ", model: " + tup[-2] + ", search type: " + tup[-1] + "\n") print(res_str) - res.append(tup[-5:]) + res.append(tup[-4:]) return res @@ -477,7 +473,7 @@ def _get_text(texts: list, previous_summary: str, i: int, max_doc_len: int, - language: str = "ch") -> (int, str, int): + language: str = "ch") -> Tuple[int, str, int]: """used in summary, combine chunks of texts into one chunk that can fit into llm model Args: diff --git a/akasha/prompts.py b/akasha/prompts.py index 67f7b32..6b6eb3c 100644 --- a/akasha/prompts.py +++ b/akasha/prompts.py @@ -1,16 +1,17 @@ -from typing import List, Union +from typing import List, Union, Tuple sys_s = "[INST] <> " sys_e = " <> [/INST]\n\n" -def format_llama_sys_prompt(system_prompt: str, prompt: str) -> (str, str): +def format_llama_sys_prompt(system_prompt: str, + prompt: str) -> Tuple[str, str]: if system_prompt == "": return "", "[INST] " + prompt + " [/INST]\n" return "[INST] <> " + system_prompt + " <> \n", prompt + " [/INST]\n" -def format_GPT_sys_prompt(system_prompt: str, prompt: str) -> (str, str): +def format_GPT_sys_prompt(system_prompt: str, prompt: str) -> Tuple[str, str]: if system_prompt == "": return "", "Human: " + prompt + "\n" return "System: " + system_prompt + "\n", "Human: " + prompt + "\n" @@ -18,8 +19,8 @@ def format_GPT_sys_prompt(system_prompt: str, prompt: str) -> (str, str): def format_sys_prompt(system_prompt: str, prompt: str, - model_type: str = "GPT"): - if model_type == "llama": + model_type: str = "gpt"): + if model_type.lower() == "llama": prod_sys_prompt, prod_prompt = format_llama_sys_prompt( system_prompt, prompt) else: @@ -28,7 +29,7 @@ def format_sys_prompt(system_prompt: str, return prod_sys_prompt, prod_prompt -def format_question_query(question: list, answer: str) -> (str, str): +def format_question_query(question: list, answer: str) -> Tuple[str, str]: """generate a certain format of question to input to llm. Last element means which selection is the correct answer. 
return the question query string and the answer string.\n example: ["what is 1+1 euqals to?", "2", "4", "8", "10", "1"] @@ -388,13 +389,13 @@ def JSON_formatter(schemas: Union[list, OutputSchema]): def JSON_formatter_list(names: list, descriptions: list, - types: list = ["str"]): + types: list = ["str"]) -> list: """generate prompt for generate JSON format, input list name and descriptions, which include every key and value you want to generate in JSON format""" - + ret = [] if len(names) != len(descriptions): print("error, names and descriptions should have the same length\n\n") - return "" - schema_str = "" + return ret + for i in range(len(names)): if i < len(types) and types[i] in [ "str", "int", "list", "dict", "tuple", "float", "double", @@ -403,15 +404,36 @@ def JSON_formatter_list(names: list, checked_type = types[i] else: checked_type = "str" - schema_str += f"\t{names[i]}: {checked_type} // {descriptions[i]}\n" + #schema_str += f"\t{names[i]}: {checked_type} // {descriptions[i]}\n" + schema = OutputSchema(names[i], descriptions[i], checked_type) + ret.append(schema) - format_instruct = f"""The output should be formatted as a JSON instance that conforms to the JSON schema below: - {{ - {schema_str} - }}\n - """ - return format_instruct + return ret + + +def JSON_formatter_dict(var_list: Union[list, dict]) -> list: + """generate prompt for generate JSON format, input list of dictionary, keys contain name,type and descriptions, which represent every variable you want to generate in JSON format""" + ret = [] + if isinstance(var_list, dict): + var_list = [var_list] + + if not isinstance(var_list, list): + print("error, var_list should be a list of dictionary\n\n") + return ret + + for var in var_list: + if "name" not in var or "description" not in var: + print("var should contain name and description, ignore.\n\n") + continue + if "type" in var and var["type"] in [ + "str", "int", "list", "dict", "tuple", "float", "double", + "long" + ]: + checked_type = var["type"] + else: + checked_type = "str" + schema = OutputSchema(var["name"], var["description"], checked_type) + ret.append(schema) -from langchain.output_parsers import ResponseSchema, StructuredOutputParser -from langchain.output_parsers import XMLOutputParser + return ret diff --git a/akasha/search.py b/akasha/search.py index 1535685..bb0e841 100644 --- a/akasha/search.py +++ b/akasha/search.py @@ -4,6 +4,7 @@ SVMRetriever, KNNRetriever, ) +from langchain_core.callbacks import CallbackManagerForRetrieverRun from langchain.retrievers import ContextualCompressionRetriever from langchain.schema.vectorstore import VectorStoreRetriever from langchain_community.vectorstores import chroma @@ -11,7 +12,7 @@ from langchain.retrievers.document_compressors import LLMChainExtractor from langchain.schema import BaseRetriever from langchain.embeddings.base import Embeddings -from typing import Any, List, Optional, Callable, Union +from typing import Any, List, Optional, Callable, Union, Tuple import numpy as np import akasha.helper as helper import akasha.prompts as prompts @@ -60,7 +61,7 @@ def _get_relevant_doc_custom( else: docs = customR.get_relevant_documents(query) - if k >= 100: + if k >= 200: docs = rerank_reduce(query, docs, k) return docs @@ -99,7 +100,7 @@ def __get_relevant_doc_knn( else: docs = knnR.get_relevant_documents(query) - if k >= 100: + if k >= 200: docs = rerank_reduce(query, docs, k) return docs @@ -124,7 +125,7 @@ def _get_relevant_doc_tfidf( list: list of Documents """ - retriever = 
TFIDFRetriever.from_documents(docs_list, k=k) + retriever = myTFIDFRetriever.from_documents(docs_list, k=k) if compression: compressor = LLMChainExtractor.from_llm( model, llm_chain_kwargs={"verbose": verbose}) @@ -134,7 +135,7 @@ def _get_relevant_doc_tfidf( else: docs = retriever.get_relevant_documents(query) - if k >= 100: + if k >= 200: docs = rerank_reduce(query, docs[:k], k) return docs[:k] @@ -173,7 +174,7 @@ def _get_relevant_doc_svm( else: docs = svmR.get_relevant_documents(query) - if k >= 100: + if k >= 200: docs = rerank_reduce(query, docs, k) return docs @@ -236,14 +237,14 @@ def _get_relevant_doc_mmr( del retriever - if k >= 100: + if k >= 200: docs = rerank_reduce(query, docs, k) return docs def _merge_docs(docs_list: list, topK: int, language: str, verbose: bool, - max_doc_len: int, model) -> (list, int): + max_doc_len: int, model) -> Tuple[list, int]: """merge different search types documents, if total len of documents too large, will not select all documents. use jieba to count length of chinese words, use split space otherwise. @@ -291,7 +292,7 @@ def get_docs( db: Union[dbs, list], embeddings, query: str, - topK: int, + use_rerank: bool, threshold: float, language: str, search_type: Union[str, Callable], @@ -300,7 +301,7 @@ def get_docs( max_token: int, log: dict, compression: bool = False, -) -> (list, int): +) -> Tuple[list, int]: """search docs based on given search_type, default is merge, which contain 'mmr', 'svm', 'tfidf' and merge them together. @@ -321,6 +322,12 @@ def get_docs( list: selected list of similar documents. """ + ### if use rerank to get more accurate similar documents, set topK to 200 ### + if use_rerank: + topK = 200 + else: + topK = 199 + if callable(search_type): times = _get_threshold_times(db) @@ -560,7 +567,8 @@ def _ks(self, query: str) -> List[Document]: denominator = np.max(similarities) - np.min(similarities) + 1e-6 normalized_similarities = (similarities - np.min(similarities)) / denominator - + # print([normalized_similarities[row] + # for row in sorted_ix[0:self.k]]) # stats top_k_results = [ Document(page_content=self.texts[row], metadata=self.metadata[row]) for row in sorted_ix[0:self.k] @@ -600,6 +608,23 @@ def _aget_relevant_documents(self, query: str) -> List[Document]: return top_k_results +class myTFIDFRetriever(TFIDFRetriever): + + def _get_relevant_documents( + self, query: str, *, + run_manager: CallbackManagerForRetrieverRun) -> List[Document]: + from sklearn.metrics.pairwise import cosine_similarity + + query_vec = self.vectorizer.transform( + [query]) # Ip -- (n_docs,x), Op -- (n_docs,n_Feats) + results = cosine_similarity(self.tfidf_array, query_vec).reshape( + (-1, )) # Op -- (n_docs,1) -- Cosine Sim with each doc + + # print(results) # stats + return_docs = [self.docs[i] for i in results.argsort()[-self.k:][::-1]] + return return_docs + + class mySVMRetriever(BaseRetriever): embeddings: Embeddings """Embeddings model to use.""" @@ -621,7 +646,6 @@ def from_db( relevancy_threshold: float = 0.2, **kwargs: Any, ) -> SVMRetriever: - # db_data = _get_all_docs(db) index = np.array(db.get_embeds()) texts = db.get_docs() @@ -694,6 +718,7 @@ def _gs(self, query: str) -> List[Document]: top_k_results = [] for row in sorted_ix[1:self.k + 1]: + # print(normalized_similarities[row]) # stats if (self.relevancy_threshold is None or normalized_similarities[row] >= self.relevancy_threshold): top_k_results.append( @@ -767,7 +792,7 @@ def rerank_reduce(query, docs, topK): model = 
AutoModelForSequenceClassification.from_pretrained(model_name).to( device) model.eval() - topK //= 5 + topK //= 2 k, score_list = 0, [] while k < len(docs): pairs = [[query, doc.page_content] for doc in docs[k:k + 10]] diff --git a/cli/glue.py b/cli/glue.py index 1a8822c..581173a 100644 --- a/cli/glue.py +++ b/cli/glue.py @@ -13,19 +13,24 @@ def akasha(): @click.option( "--doc_path", "-d", - help="document directory path, parse all .txt, .pdf, .docx files in the directory", + help= + "document directory path, parse all .txt, .pdf, .docx files in the directory", required=True, ) -@click.option("--prompt", "-p", help="prompt you want to ask to llm", required=True) +@click.option("--prompt", + "-p", + help="prompt you want to ask to llm", + required=True) @click.option( "--embeddings", "-e", default="openai:text-embedding-ada-002", help="embeddings for storing the documents", ) -@click.option( - "--chunk_size", "-c", default=1000, help="chunk size for storing the documents" -) +@click.option("--chunk_size", + "-c", + default=1000, + help="chunk size for storing the documents") @click.option( "--model", "-m", @@ -55,14 +60,17 @@ def akasha(): "--record_exp", "-r", default="", - help="input the experiment name if you want to record the experiment using aiido", -) -@click.option( - "--system_prompt", "-sys", default="", help="system prompt for the llm model" -) -@click.option( - "--max_doc_len", "-md", default=1500, help="max doc len for the llm model input" -) + help= + "input the experiment name if you want to record the experiment using aiido", +) +@click.option("--system_prompt", + "-sys", + default="", + help="system prompt for the llm model") +@click.option("--max_doc_len", + "-md", + default=1500, + help="max doc len for the llm model input") def get_response( doc_path: str, prompt: str, @@ -101,7 +109,8 @@ def get_response( @click.option( "--doc_path", "-d", - help="document directory path, parse all .txt, .pdf, .docx files in the directory", + help= + "document directory path, parse all .txt, .pdf, .docx files in the directory", required=True, ) @click.option( @@ -110,16 +119,20 @@ def get_response( default="openai:text-embedding-ada-002", help="embeddings for storing the documents", ) -@click.option( - "--chunk_size", "-c", default=1000, help="chunk size for storing the documents" -) +@click.option("--chunk_size", + "-c", + default=1000, + help="chunk size for storing the documents") @click.option( "--model", "-m", default="openai:gpt-3.5-turbo", help="llm model for generating the response", ) -@click.option("--topk", "-k", default=2, help="select topK relevant documents") +@click.option("--use_rerank", + "-ur", + default=False, + help="use rerank to sort the documents") @click.option( "--threshold", "-t", @@ -138,18 +151,20 @@ def get_response( default="merge", help="search type for the documents, include merge, svm, mmr, tfidf", ) -@click.option( - "--system_prompt", "-sys", default="", help="system prompt for the llm model" -) -@click.option( - "--max_doc_len", "-md", default=3000, help="max token for the llm model input" -) +@click.option("--system_prompt", + "-sys", + default="", + help="system prompt for the llm model") +@click.option("--max_doc_len", + "-md", + default=3000, + help="max token for the llm model input") def keep_responsing( doc_path: str, embeddings: str, chunk_size: int, model: str, - topk: int, + use_rerank: bool, threshold: float, language: str, search_type: str, @@ -164,22 +179,22 @@ def keep_responsing( embeddings = helper.handle_embeddings(embeddings, False) 
model = helper.handle_model(model, False) - db = helper.create_chromadb( - doc_path, False, embeddings, embeddings_name, chunk_size - ) + db = helper.create_chromadb(doc_path, False, embeddings, embeddings_name, + chunk_size) if db is None: info = "document path not exist\n" print(info) return "" - user_input = click.prompt('Please input your question(type "exit()" to quit) ') + user_input = click.prompt( + 'Please input your question(type "exit()" to quit) ') while user_input != "exit()": docs, docs_len, tokens = search.get_docs( db, embeddings, user_input, - topk, + use_rerank, threshold, language, search_type, @@ -193,12 +208,14 @@ def keep_responsing( chain = load_qa_chain(llm=model, chain_type="stuff", verbose=False) - res = chain.run(input_documents=docs, question=system_prompt + user_input) + res = chain.run(input_documents=docs, + question=system_prompt + user_input) res = helper.sim_to_trad(res) print("Response: ", res) print("\n\n") - user_input = click.prompt('Please input your question(type "exit()" to quit) ') + user_input = click.prompt( + 'Please input your question(type "exit()" to quit) ') del db, model, embeddings @@ -207,14 +224,16 @@ def keep_responsing( @click.option( "--doc_path", "-d", - help="document directory path, parse all .txt, .pdf, .docx files in the directory", + help= + "document directory path, parse all .txt, .pdf, .docx files in the directory", required=True, ) @click.option( "--prompt", "-p", multiple=True, - help="prompt you want to ask to llm, if you want to ask multiple questions, use -p multiple times", + help= + "prompt you want to ask to llm, if you want to ask multiple questions, use -p multiple times", required=True, ) @click.option( @@ -223,9 +242,10 @@ def keep_responsing( default="openai:text-embedding-ada-002", help="embeddings for storing the documents", ) -@click.option( - "--chunk_size", "-c", default=1000, help="chunk size for storing the documents" -) +@click.option("--chunk_size", + "-c", + default=1000, + help="chunk size for storing the documents") @click.option( "--model", "-m", @@ -255,14 +275,17 @@ def keep_responsing( "--record_exp", "-r", default="", - help="input the experiment name if you want to record the experiment using aiido", -) -@click.option( - "--system_prompt", "-sys", default="", help="system prompt for the llm model" -) -@click.option( - "--max_doc_len", "-md", default=1500, help="max token for the llm model input" -) + help= + "input the experiment name if you want to record the experiment using aiido", +) +@click.option("--system_prompt", + "-sys", + default="", + help="system prompt for the llm model") +@click.option("--max_doc_len", + "-md", + default=1500, + help="max token for the llm model input") def chain_of_thought( doc_path: str, prompt, @@ -302,12 +325,14 @@ def chain_of_thought( @click.option( "--doc_path", "-d", - help="document directory path, parse all .txt, .pdf, .docx files in the directory", + help= + "document directory path, parse all .txt, .pdf, .docx files in the directory", required=True, ) -@click.option( - "-question_num", "-qn", default=10, help="number of questions you want to generate" -) +@click.option("-question_num", + "-qn", + default=10, + help="number of questions you want to generate") @click.option( "-question_type", "--qt", @@ -320,9 +345,10 @@ def chain_of_thought( default="openai:text-embedding-ada-002", help="embeddings for storing the documents", ) -@click.option( - "--chunk_size", "-c", default=1000, help="chunk size for storing the documents" -) 
+@click.option("--chunk_size", + "-c", + default=1000, + help="chunk size for storing the documents") @click.option("--topk", "-k", default=2, help="select topK relevant documents") @click.option( "--threshold", @@ -346,7 +372,8 @@ def chain_of_thought( "--record_exp", "-r", default="", - help="input the experiment name if you want to record the experiment using aiido", + help= + "input the experiment name if you want to record the experiment using aiido", ) def auto_create_questionset( doc_path: str, @@ -384,13 +411,15 @@ def auto_create_questionset( @click.option( "--question_path", "-qp", - help="document directory path, parse all .txt, .pdf, .docx files in the directory", + help= + "document directory path, parse all .txt, .pdf, .docx files in the directory", required=True, ) @click.option( "--doc_path", "-d", - help="document directory path, parse all .txt, .pdf, .docx files in the directory", + help= + "document directory path, parse all .txt, .pdf, .docx files in the directory", required=True, ) @click.option( @@ -405,9 +434,10 @@ def auto_create_questionset( default="openai:text-embedding-ada-002", help="embeddings for storing the documents", ) -@click.option( - "--chunk_size", "-c", default=1000, help="chunk size for storing the documents" -) +@click.option("--chunk_size", + "-c", + default=1000, + help="chunk size for storing the documents") @click.option( "--model", "-m", @@ -437,11 +467,13 @@ def auto_create_questionset( "--record_exp", "-r", default="", - help="input the experiment name if you want to record the experiment using aiido", -) -@click.option( - "--max_doc_len", "-md", default=1500, help="max token for the llm model input" + help= + "input the experiment name if you want to record the experiment using aiido", ) +@click.option("--max_doc_len", + "-md", + default=1500, + help="max token for the llm model input") def auto_evaluation( question_path: str, doc_path: str, @@ -510,8 +542,7 @@ def ui(): # make a folder `docs/Default` if not os.path.exists("docs") or not os.path.exists( - os.path.join("docs", "Default") - ): + os.path.join("docs", "Default")): os.makedirs(os.path.join(".", "docs", "Default")) else: pass @@ -548,6 +579,5 @@ def ui(): akasha.add_command(auto_evaluation) akasha.add_command(ui) - if __name__ == "__main__": akasha() diff --git a/example.py b/example.py index 08d6fdf..ac2708e 100644 --- a/example.py +++ b/example.py @@ -108,7 +108,9 @@ def QA(doc_path="./docs/mic/"): ## ask_whole_file response = qa.ask_whole_file( file_path="docs/mic/20230317_5軸工具機因應市場訴求改變的發展態勢.pdf", - prompt=f'''五軸是甚麼?''') + prompt=f'''五軸是甚麼?''', + model="openai:gpt-3.5-turbo-16k", + ) print(response) ## ask_self @@ -197,12 +199,19 @@ def SUM(file_name: str = "./docs/mic/20230531_智慧製造需求下之邊緣運 ### JSON FORMATTER ### def JSON(): - formatter = [ + formatter1 = [ prompts.OutputSchema(name="學歷", description="受試者的就讀大學", type="str"), prompts.OutputSchema(name="經驗", description="受試者的工作經驗", type="str"), prompts.OutputSchema(name="專長", description="受試者的專長能力", type="list"), prompts.OutputSchema(name="年資", description="受試者的總工作年數", type="int") ] + formatter2 = prompts.JSON_formatter_list(names=["學歷","經驗","專長","年資"], types=["str","str","list","int"],\ + descriptions=["受試者的就讀大學","受試者的工作經驗","受試者的專長能力","受試者的總工作年數"]) + + formatter3 = prompts.JSON_formatter_dict([{ "name": "學歷", "description": "受試者的就讀大學", "type": "str" },\ + { "name": "經驗", "description": "受試者的工作經驗", "type": "str" },\ + { "name": "專長", "description": "受試者的專長能力", "type": "list" },\ + { "name": "年資", "description": "受試者的總工作年數", 
"type": "int" }]) ak = akasha.Doc_QA( topK=10, threshold=0.0, @@ -211,8 +220,11 @@ def JSON(): response = ak.ask_whole_file(file_path="docs/resume_pool/A.docx", system_prompt="用中文回答" + - prompts.JSON_formatter(formatter), + prompts.JSON_formatter(formatter1), prompt=f'''以上是受試者的履歷,請回答該受試者的學歷、經驗、專長、年資''') parse_json = akasha.helper.extract_json(response) print(parse_json, type(parse_json)) + + +JSON() diff --git a/readme.md b/readme.md index beb66b2..8cf567b 100644 --- a/readme.md +++ b/readme.md @@ -1,7 +1,7 @@ # akasha [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![pypi package : 0.8.18](https://img.shields.io/badge/pypi%20package-0.8.18-blue)](https://pypi.org/project/akasha-terminal/) +[![pypi package : 0.8.19](https://img.shields.io/badge/pypi%20package-0.8.19-blue)](https://pypi.org/project/akasha-terminal/) [![python version : 3.8 3.9 3.10](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10-blue)](https://www.python.org/downloads/release/python-380/) ![GitLab CI](https://img.shields.io/badge/gitlab%20ci-%23181717.svg?style=for-the-badge&logo=gitlab&logoColor=white) @@ -536,7 +536,6 @@ Args: **chunk_size (int, optional)**: chunk size of texts from documents. Defaults to 1000.\n **model (str, optional)**: llm model to use. Defaults to "gpt-3.5-turbo".\n **verbose (bool, optional)**: show log texts or not. Defaults to False.\n - **topK (int, optional)**: search top k number of similar documents. Defaults to 2.\n **threshold (float, optional)**: the similarity threshold of searching. Defaults to 0.2.\n **language (str, optional)**: the language of documents and prompt, use to make sure docs won't exceed max token size of llm input.\n @@ -549,6 +548,9 @@ Args: in searching relevant documents. Defaults to "".\n **max_doc_len (int, optional)**: max document size of llm input. Defaults to 3000.\n **temperature (float, optional)**: temperature of llm model from 0.0 to 1.0 . Defaults to 0.0.\n + **use_chroma (bool, optional)**: use chroma db name instead of documents path to load data or not. Defaults to False. + **use_rerank (bool, optional)**: use rerank model to re-rank the selected documents or not. Defaults to False. + **ignore_check (bool, optional)**: speed up loading data if the chroma db is already existed. Defaults to False. 
""" ``` @@ -752,7 +754,7 @@ eva = eval.Model_Eval(question_style="essay", search_type='merge',\ eva.auto_create_questionset(doc_path="doc/mic/", question_num=10, output_file_path="questionset/mic_essay.txt") -bert_score, rouge, llm_score = eva.auto_evaluation(questionset_path="questionset/mic_essay.txt", doc_path="doc/mic/", question_style = "essay", record_exp="exp_mic_auto_evaluation",topK=3,search_type="svm") +bert_score, rouge, llm_score = eva.auto_evaluation(questionset_path="questionset/mic_essay.txt", doc_path="doc/mic/", question_style = "essay", record_exp="exp_mic_auto_evaluation",search_type="svm") # bert_score = 0.782 # rouge = 0.81 @@ -774,7 +776,7 @@ eva = eval.Model_Eval(search_type='merge', question_type = "irrelevant", model=" eva.auto_create_questionset(doc_path="doc/mic/", question_num=10, output_file_path="questionset/mic_irre.txt") -bert_score, rouge, llm_score = eva.auto_evaluation(questionset_path="questionset/mic_irre.txt", doc_path="doc/mic/", question_style = "essay", record_exp="exp_mic_auto_evaluation",topK=3,search_type="svm") +bert_score, rouge, llm_score = eva.auto_evaluation(questionset_path="questionset/mic_irre.txt", doc_path="doc/mic/", question_style = "essay", record_exp="exp_mic_auto_evaluation",search_type="svm") ``` @@ -795,7 +797,7 @@ eva = eval.Model_Eval(search_type='merge', question_type = "irrelevant", model=" eva.create_topic_questionset(doc_path="doc/mic/", topic= "工業4.0", question_num=3, output_file_path="questionset/mic_topic_irre.txt") -bert_score, rouge, llm_score = eva.auto_evaluation(questionset_path="questionset/mic_topic_irre.txt", doc_path="doc/mic/", question_style = "essay", record_exp="exp_mic_auto_evaluation",topK=3,search_type="svm") +bert_score, rouge, llm_score = eva.auto_evaluation(questionset_path="questionset/mic_topic_irre.txt", doc_path="doc/mic/", question_style = "essay", record_exp="exp_mic_auto_evaluation",search_type="svm") ``` @@ -811,7 +813,7 @@ bert_score, rouge, llm_score = eva.auto_evaluation(questionset_path="questionset ## Find Optimum Combination To test all available combinations and find the best parameters, you can use function **optimum_combination** , you can give different -embeddings, document chunk sizes, models, document similarity searching type and number of most relative documents (topK), and the function will +embeddings, document chunk sizes, models, document similarity searching type, and the function will test all combinations to find the best combination based on the given question set and documents. 
 Noted that best score combination is the highest correct rate combination, and best cost-effective

@@ -834,7 +836,7 @@ model_list = ["openai:gpt-3.5-turbo","hf:FlagAlpha/Llama2-Chinese-13b-Chat-4bit"
 
 eva = eval.Model_Eval(question_style="single_choice")
 
 eva.optimum_combination("question_pvc.txt", dir_path, embeddings_list = embeddings_list, model_list = model_list,
-	chunk_size_list=[200, 400, 600], search_type_list=["merge","tfidf",],record_exp=exp_name,topK_list=[2,3])
+	chunk_size_list=[200, 400, 600], search_type_list=["merge","tfidf",],record_exp=exp_name)
 
 ```
 
 
 Best correct rate: 1.000
 Best score combination:
 
-embeddings: openai:text-embedding-ada-002, chunk size: 400, model: openai:gpt-3.5-turbo, topK: 3, search type: merge
+embeddings: openai:text-embedding-ada-002, chunk size: 400, model: openai:gpt-3.5-turbo, search type: merge
 
 
-embeddings: openai:text-embedding-ada-002, chunk size: 400, model: openai:gpt-3.5-turbo, topK: 3, search type: tfidf
+embeddings: openai:text-embedding-ada-002, chunk size: 400, model: openai:gpt-3.5-turbo, search type: tfidf
 
 
 Best cost-effective:
 
-embeddings: hf:shibing624/text2vec-base-chinese, chunk size: 400, model: openai:gpt-3.5-turbo, topK: 2, search type: tfidf
+embeddings: hf:shibing624/text2vec-base-chinese, chunk size: 400, model: openai:gpt-3.5-turbo, search type: tfidf
 
 ```
 
@@ -872,7 +874,6 @@ embeddings: hf:shibing624/text2vec-base-chinese, chunk size: 400, model: openai:
     **chunk_size (int, optional)**: chunk size of texts from documents. Defaults to 1000.
     **model (str, optional)**: llm model to use. Defaults to "gpt-3.5-turbo".
     **verbose (bool, optional)**: show log texts or not. Defaults to False.
-    **topK (int, optional)**: search top k number of similar documents. Defaults to 2.
     **threshold (float, optional)**: the similarity threshold of searching. Defaults to 0.2.
     **language (str, optional)**: the language of documents and prompt, use to make sure docs won't exceed
         max token size of llm input.
@@ -886,6 +887,7 @@ embeddings: hf:shibing624/text2vec-base-chinese, chunk size: 400, model: openai:
     **max_doc_len (int, optional)**: max document size of llm input. Defaults to 3000.
     **temperature (float, optional)**: temperature of llm model from 0.0 to 1.0 . Defaults to 0.0.
     **question_type (str, optional)**: the type of question you want to generate, "essay" or "single_choice". Defaults to "essay".
+    **use_rerank (bool, optional)**: use rerank model to re-rank the selected documents or not. Defaults to False.
     """
 ```
 
@@ -1139,7 +1141,7 @@ Options:
   -e, --embeddings TEXT      embeddings for storing the documents
   -c, --chunk_size INTEGER   chunk size for storing the documents
   -m, --model TEXT           llm model for generating the response
-  -k, --topk INTEGER         select topK relevant documents
+  -ur, --use_rerank BOOLEAN  use rerank to sort the documents
   -t, --threshold FLOAT      threshold score for selecting the relevant
                              documents
   -l, --language TEXT        language for the documents, default is 'ch' for
diff --git a/setup.py b/setup.py
index 58634a2..7f8ff9b 100644
--- a/setup.py
+++ b/setup.py
@@ -39,7 +39,7 @@
 
 setup(
     name="akasha-terminal",
-    version="0.8.18",
+    version="0.8.19",
     description="document QA package using langchain and chromadb",
     long_description=long_description,
     long_description_content_type="text/markdown",
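
For reviewers, a minimal usage sketch (not part of the diff) of how the pieces introduced in this patch fit together: `use_rerank` and `prompt_format_type` on `Doc_QA`, and `JSON_formatter_dict` building `OutputSchema` objects from plain dicts. The document path, prompt text, and schema fields below are placeholders; the call pattern loosely mirrors `example.py`.

```python
import akasha
import akasha.prompts as prompts

# Build OutputSchema objects from plain dicts (new in this patch); each dict
# needs "name" and "description", with an optional "type" (defaults to "str").
schemas = prompts.JSON_formatter_dict([
    {"name": "title", "description": "title of the document", "type": "str"},
    {"name": "keywords", "description": "main keywords of the document", "type": "list"},
])

qa = akasha.Doc_QA(
    model="openai:gpt-3.5-turbo",
    use_rerank=True,            # over-fetch and re-rank retrieved chunks (new)
    prompt_format_type="gpt",   # or "llama" for [INST]-style prompts (new)
    # topK is deprecated: passing it now only raises a DeprecationWarning
)

response = qa.get_response(
    doc_path="./docs/mic/",     # placeholder document directory
    prompt="請摘要這些文件並以JSON格式輸出",
    system_prompt=prompts.JSON_formatter(schemas),
)
print(akasha.helper.extract_json(response))
```

As far as this diff shows, the number of chunks fed to the model is now governed by `max_doc_len` (plus the fixed over-fetch size that `use_rerank` switches on inside `get_docs`) rather than by `topK`.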