From 0197839594f0c8977fce7432f4d17cd50fdcc8c7 Mon Sep 17 00:00:00 2001 From: eren23 Date: Sun, 9 Apr 2023 00:17:16 +0300 Subject: [PATCH 1/2] initial chromadb addition --- .gitignore | 2 +- examples/chroma_example.ipynb | 384 +++++++++++++++++++++++++++ knowledgegpt/extractors/helpers.py | 2 +- knowledgegpt/utils/utils_distance.py | 21 +- 4 files changed, 406 insertions(+), 3 deletions(-) create mode 100644 examples/chroma_example.ipynb diff --git a/.gitignore b/.gitignore index 30d13dc..839ea8f 100644 --- a/.gitignore +++ b/.gitignore @@ -25,4 +25,4 @@ examples/local_example_config.py examples/calculated_indexes/* static_files/*_test.* examples/example_config.py - +.chroma \ No newline at end of file diff --git a/examples/chroma_example.ipynb b/examples/chroma_example.ipynb new file mode 100644 index 0000000..de133e6 --- /dev/null +++ b/examples/chroma_example.ipynb @@ -0,0 +1,384 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/eren/opt/anaconda3/envs/knowledgegpt-env/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/Users/eren/opt/anaconda3/envs/knowledgegpt-env/lib/python3.9/site-packages/pydub/utils.py:170: RuntimeWarning: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work\n", + " warn(\"Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work\", RuntimeWarning)\n" + ] + } + ], + "source": [ + "from knowledgegpt.extractors.base_extractor import BaseExtractor\n", + "from knowledgegpt.utils.utils_scrape import scrape_content" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import openai\n", + "from example_config import SECRET_KEY\n", + "openai.api_key = SECRET_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = scrape_content(\"https://en.wikipedia.org/wiki/Bombard_(weapon)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "basic_extractor_chroma = BaseExtractor(df, embedding_extractor=\"hf\", model_lang=\"en\", is_turbo=True, index_type=\"chroma\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing embeddings...\n", + "model_lang en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using embedded DuckDB without persistence: data will be transient\n", + "No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DONE, CHROMA\n", + "Selected 3 document sections:\n", + "0\n", + "11\n", + "32\n", + "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n", + "\n", + "Context:\n", + "\n", + "* Bombard (weapon) - Wikipedia\n", + "* Bombard (weapon)\n", + "* Wikimedia Commons has media related to Bombards (weapon).\n", + "\n", + " Q: What is a bombard? Where were they used?\n", + " A:\n", + "all_done!\n" + ] + } + ], + "source": [ + "answer, prompt, messages =basic_extractor_chroma.extract(\"What is a bombard? Where were they used?\", max_tokens=200)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'A bombard is a type of weapon. It is a large caliber, muzzle-loading artillery piece used in the Middle Ages and the early modern period. They were used in various parts of the world, including Europe and Asia.'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "answer" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "What is a bombard? Where were they used? What is the stronger aspects\n", + "all_done!\n" + ] + } + ], + "source": [ + "answer, prompt, messages =basic_extractor_chroma.extract(\"What is a bombard? Where were they used? What is the stronger aspects\", max_tokens=400,)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'A bombard is a type of weapon, a large caliber, muzzle-loading artillery piece used in the Middle Ages and the early modern period. They were used in various parts of the world, including Europe and Asia. \\n\\nIn terms of stronger aspects, bombards were known for their ability to fire large projectiles over long distances, making them effective siege weapons. They were also capable of causing significant damage to fortifications and other structures. However, they were heavy and difficult to move, and their rate of fire was relatively slow compared to other types of artillery.'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "answer" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model_lang en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using embedded DuckDB without persistence: data will be transient\n", + "No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DONE, CHROMA\n", + "Selected 4 document sections:\n", + "0\n", + "11\n", + "32\n", + "16\n", + "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n", + "\n", + "Context:\n", + "\n", + "* Bombard (weapon) - Wikipedia\n", + "* Bombard (weapon)\n", + "* Wikimedia Commons has media related to Bombards (weapon).\n", + "* The bombard is a type of cannon or mortar which was used throughout the Middle Ages and the early modern period. Bombards were mainly large calibre, muzzle-loading artillery pieces used during sieges to shoot round stone projectiles at the walls of enemy fortifications, enabling troops to break in. Most bombards were made of iron and used gunpowder to launch the projectiles.[1] There are many examples of bombards, including Mons Meg, the Dardanelles Gun, and the handheld bombard. Bombard mortar and granite ball projectile of the Knights of Saint John of Jerusalem, Rhodes, 1480–1500. Founded at the request of Pierre d'Aubusson, the bombard was used for close defense of the walls (100–200 meters) at the Siege of Rhodes. It fired 260 kg granite balls. The bombard weighs about 3,325 kg. Musée de l'Armée.The weapon provided the name to the Royal Artillery rank of bombardier and the word bombardment.\n", + "\n", + " Q: What is a bombard? Where were they used? What is the stronger aspects\n", + " A:\n", + "all_done!\n" + ] + } + ], + "source": [ + "answer, prompt, messages =basic_extractor_chroma.extract(\"What is a bombard? Where were they used? What is the stronger aspects\", max_tokens=400, context_restarter=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model_lang en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using embedded DuckDB without persistence: data will be transient\n", + "No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DONE, CHROMA\n", + "Selected 10 document sections:\n", + "11\n", + "0\n", + "45\n", + "30\n", + "37\n", + "42\n", + "27\n", + "32\n", + "20\n", + "26\n", + "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n", + "\n", + "Context:\n", + "\n", + "* Bombard (weapon)\n", + "* Bombard (weapon) - Wikipedia\n", + "* Retrieved from \"https://en.wikipedia.org/w/index.php?title=Bombard_(weapon)&oldid=1141599306\"\n", + "* Bombard in its siege position, Denmark.\n", + "* ^ Gwei-Djen, Lu; Needham, Joseph; Chi-Hsing, Phan (July 1988). \"The Oldest Representation of a Bombard\". Technology and Culture. 29 (3): 594–605. doi:10.2307/3105275. JSTOR 3105275.\n", + "* ^ \"Bodiam Bombard | Kent and Sussex Courier\". Archived from the original on 2015-09-23. Retrieved 2015-04-13.\n", + "* Early Ming bombard with two pair of trunnions, 1377 AD.\n", + "* Wikimedia Commons has media related to Bombards (weapon).\n", + "* \"Hand bombard\", 1390–1400\n", + "* Bombard from the beginning of the 15th century, the only surviving bombard used by Teutonic Knights, now exposed in Kwidzyn Castle.\n", + "\n", + " Q: What is the first release date for Bombard?\n", + " A:\n", + "all_done!\n" + ] + } + ], + "source": [ + "answer, prompt, messages =basic_extractor_chroma.extract(\"What is the first release date for Bombard?\", max_tokens=400, context_restarter=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"I don't know. The provided context does not contain information about the release date of Bombard.\"" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "answer" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model_lang en\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using embedded DuckDB without persistence: data will be transient\n", + "No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DONE, CHROMA\n", + "Selected 3 document sections:\n", + "0\n", + "11\n", + "37\n", + "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say \"I don't know.\"\n", + "\n", + "Context:\n", + "\n", + "* Bombard (weapon) - Wikipedia\n", + "* Bombard (weapon)\n", + "* ^ Gwei-Djen, Lu; Needham, Joseph; Chi-Hsing, Phan (July 1988). \"The Oldest Representation of a Bombard\". Technology and Culture. 29 (3): 594–605. doi:10.2307/3105275. JSTOR 3105275.\n", + "\n", + " Q: What is the time period the Bombard was used most actively?\n", + " A:\n", + "all_done!\n" + ] + } + ], + "source": [ + "answer, prompt, messages =basic_extractor_chroma.extract(\"What is the time period the Bombard was used most actively?\", max_tokens=800, context_restarter=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The Bombard was used most actively throughout the Middle Ages and the early modern period.'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "answer" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "knowledgegpt-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d0285ce201951a37668925b1b7de032ac1583adb61d048d8a5dd45351727e09e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/knowledgegpt/extractors/helpers.py b/knowledgegpt/extractors/helpers.py index d7e94e2..2186267 100644 --- a/knowledgegpt/extractors/helpers.py +++ b/knowledgegpt/extractors/helpers.py @@ -24,7 +24,7 @@ def check_model_lang(model_lang, model_lang_acceptable_list=None): def check_index_type(index_type, index_type_acceptable_list=None): if index_type_acceptable_list is None: - index_type_acceptable_list = ["basic", "faiss"] + index_type_acceptable_list = ["basic", "faiss", "chroma"] if not isinstance(index_type, str): raise Exception("Index Type must be a string") diff --git a/knowledgegpt/utils/utils_distance.py b/knowledgegpt/utils/utils_distance.py index 52f1daa..14dbd96 100644 --- a/knowledgegpt/utils/utils_distance.py +++ b/knowledgegpt/utils/utils_distance.py @@ -42,7 +42,7 @@ def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items() ], reverse=True) - else: + elif index_type == "faiss": import faiss if embedding_type == "hf": @@ -68,5 +68,24 @@ def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, # print("document_similarities", document_similarities) if not verbose: print("DONE, FAISS") + else: + import chromadb + client = chromadb.Client() + + collection = client.create_collection("chroma_collection") + + collection.add( + embeddings=list(contexts.values()), + ids=[str(i) for i in list(contexts.keys())] + ) + + query_result = collection.query( + query_embeddings=[query_embedding], + n_results=len(contexts), + ) + + document_similarities = [(query_result["distances"][0][i], int(query_result["ids"][0][i])) for i in range(len(query_result["ids"][0]))] + if not verbose: + print("DONE, CHROMA") return document_similarities From 3ed52c748e361258632736b5ac742f4aa4d4dc8a Mon Sep 17 00:00:00 2001 From: eren23 Date: Sun, 9 Apr 2023 00:23:43 +0300 Subject: [PATCH 2/2] version change --- knowledgegpt/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/knowledgegpt/__init__.py b/knowledgegpt/__init__.py index a15fb29..89556dc 100644 --- a/knowledgegpt/__init__.py +++ b/knowledgegpt/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.0.7b" +__version__ = "0.0.8b" from .extractors.yt_subs_extractor import YTSubsExtractor from .extractors.yt_audio_extractor import YoutubeAudioExtractor