"""This example shows how to easily create multiple embeddings over the same library with llmware
This recipe can be especially useful when trying to compare the effectiveness of a particular
embedding model for a specific domain or library corpus and to run other comparative experiments
without being 'locked-in' to a particular model.
Note: the example uses four different embedding models:
1. mini-lm-sbert - a favorite small, fast Sentence Transformer included in the llmware model catalog by default
2. text-embedding-ada-002 - the popular OpenAI embedding model
3. industry-bert-sec - an industry fine-tuned embedding model, in the llmware model catalog
4. all-mpnet-base-v2 - one of the most popular Sentence Transformers (which we will register and add to the
model catalog on the fly
To use OpenAI Ada will require an Open API key - if you do not have one, feel free to comment out or
select a different model. Any Sentence Transformer or Huggingface embedding model can be used.
"""
import os

from llmware.setup import Setup
from llmware.library import Library
from llmware.retrieval import Query
from llmware.models import ModelCatalog
from llmware.configs import LLMWareConfig

os.environ["USER_MANAGED_OPENAI_API_KEY"] = "<INSERT YOUR OPENAI API KEY HERE>"
os.environ["TOKENIZERS_PARALLELISM"] = "false"   # avoid a HuggingFace tokenizer warning
# Note: this will build a small library that will be used in the embedding examples

def build_lib(library_name, folder="Agreements"):

    # Step 1 - create the library, which is the main 'organizing construct' in llmware
    print("\nupdate: creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # Step 2 - pull down the sample files from S3 through the .load_sample_files() command
    #   -- note: if you need to refresh the sample files, set 'over_write=True'
    print("update: downloading sample files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point the ".add_files" method to the folder of documents that was just created
    #   this method parses the documents, chunks the text, and captures the results in the text collection
    #   database (e.g., MongoDB)
    print("update: parsing and text indexing files")

    # options:   Agreements | UN-Resolutions-500
    library.add_files(input_folder_path=os.path.join(sample_files_path, folder))

    return library
# use multiple embedding models on the same library and the same vector db

def multiple_embeddings_same_db_same_lib(document_folder=None, sample_query=None, vector_db=None, base_library_name=None):

    print("\nupdate: Step 1 - starting here - building library - parsing PDFs into text chunks")

    lib = build_lib(base_library_name, folder=document_folder)

    # optional - check the status of the library card and embedding
    lib_card = lib.get_library_card()
    print("update: library card - ", lib_card)

    print("\nupdate: Step 2 - starting to install embeddings")

    # alt embedding models - "mini-lm-sbert" | industry-bert-contracts | text-embedding-ada-002
    # note: if you want to use text-embedding-ada-002, you will need an OpenAI key and set it in the
    #   os.environ variable, e.g., os.environ["USER_MANAGED_OPENAI_API_KEY"] = "<insert your key>"

    # note: batch size can be configured based on the memory of the machine and optimized for performance
    #   -- generally, between 100-500 is a safe range to balance performance/memory

    print(f"\nupdate: Embedding #1 - mini-lm-sbert - {vector_db}")
    lib.install_new_embedding(embedding_model_name="mini-lm-sbert", vector_db=vector_db, batch_size=200)

    print(f"\nupdate: Embedding #2 - text-embedding-ada-002 - {vector_db}")
    lib.install_new_embedding(embedding_model_name="text-embedding-ada-002", vector_db=vector_db, batch_size=500)

    print(f"\nupdate: Embedding #3 - industry-bert-sec - {vector_db}")
    lib.install_new_embedding(embedding_model_name="industry-bert-sec", vector_db=vector_db, batch_size=100)

    # for the last embedding, we will register a popular open source sentence transformer model to use
    #   -- see "using_sentence_transformer.py" for more details
    ModelCatalog().register_sentence_transformer_model(model_name="all-mpnet-base-v2",
                                                       embedding_dims=768, context_window=384)

    # use it directly now as an embedding model
    print(f"\nupdate: Embedding #4 - all-mpnet-base-v2 - {vector_db}")
    lib.install_new_embedding(embedding_model_name="all-mpnet-base-v2", vector_db=vector_db, batch_size=300)

    # optional - check the embeddings on the library
    print("\nupdate: Embedding record of the Library")
    emb_record = lib.get_embedding_status()

    for j, entries in enumerate(emb_record):
        print("update: embeddings on library: ", j, entries)

    # Using the Embeddings to Execute Queries
    #
    # create the query object:
    #   1.  if no embedding_model or vector_db is passed in the constructor, then it selects the LAST embedding
    #       record, which is the most recent embedding on the library, and uses that combination of model + vector db
    #
    #   2.  if only embedding_model_name is passed, then it looks up the first instance of that embedding model
    #       in the embedding record, and will use the associated vector db
    #
    #   3.  if both embedding_model_name and vector_db are passed in the constructor, then it looks up that
    #       combination in the embedding record
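
    # illustrative sketch of cases 1 and 3 (kept commented out - not needed for this example); the keyword
    #   names below simply mirror the ones used elsewhere in this script and are assumptions about the constructor:
    #
    #   query_latest = Query(lib)                                                                   # case 1
    #   query_pinned = Query(lib, embedding_model_name="industry-bert-sec", vector_db=vector_db)    # case 3
    #
    # the live code below uses case 2 - looking up each embedding model by name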
    query1 = Query(lib, embedding_model_name="mini-lm-sbert")
    query2 = Query(lib, embedding_model_name="text-embedding-ada-002")

    # execute a semantic query against any of the query objects
    minilm_results = query1.semantic_query(sample_query, result_count=12)
    ada_results = query2.semantic_query(sample_query, result_count=12)

    print("\n\nupdate: Sample Query using Embeddings")

    print("\nupdate: Embedding Model #1 - MiniLM SBERT Results")
    for i, qr1 in enumerate(minilm_results):
        print("update: minilm semantic query results: ", i, qr1["distance"], qr1)

    print("\nupdate: Embedding Model #2 - Ada Results")
    for j, qr2 in enumerate(ada_results):
        print("update: ada semantic query results: ", j, qr2["distance"], qr2)

    return 0
if __name__ == "__main__":

    # document folder options:   Agreements  |  UN-Resolutions-500
    #   note: Agreements = ~15 contracts = ~1272 embeddings - takes ~5 minutes to run (without GPU)
    #   note: UN-Resolutions-500 = 500 documents = ~12500 embeddings - takes ~15-20 minutes to run (without GPU)
    #   -- a good sample query for UN-Resolutions is, e.g., "what are key initiatives to promote sustainability?"
    #
    # try substituting a different vector db, e.g., "pg_vector" | "redis" | "faiss"
    #   (an alternative, commented-out invocation is sketched at the end of this block)

    # please note: using multiple embeddings on a library requires either Mongo or Postgres as the text collection database
    LLMWareConfig().set_active_db("mongo")

    multiple_embeddings_same_db_same_lib(document_folder="Agreements",
                                         sample_query="what is the sale bonus?",
                                         vector_db="milvus",
                                         base_library_name="multi_embeddings_test_lib_0")
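
    # optional - an alternative run over the larger UN-Resolutions-500 set with a different vector db, using the
    #   options noted above (the library name here is just an illustrative placeholder):
    #
    # multiple_embeddings_same_db_same_lib(document_folder="UN-Resolutions-500",
    #                                      sample_query="what are key initiatives to promote sustainability?",
    #                                      vector_db="pg_vector",
    #                                      base_library_name="multi_embeddings_test_lib_un_500")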