forked from llmware-ai/llmware
-
Notifications
You must be signed in to change notification settings - Fork 0
/
using_sentence_transformer.py
113 lines (77 loc) · 4.76 KB
/
using_sentence_transformer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""This example shows how to use sentence transformers as a vector embedding model with llmware.
To use models from the SentenceTransformer catalog, you may need to install as follows:
pip3 install sentence-transformers
"""
import os
from llmware.setup import Setup
from llmware.library import Library
from llmware.retrieval import Query
from llmware.models import ModelCatalog
from llmware.configs import LLMWareConfig
from importlib import util
# Warn up front if the optional sentence-transformers package is missing;
# the embedding steps below depend on it.
if util.find_spec("sentence_transformers") is None:
    print("\nto run this example, you should install the SentenceTransformer library with: "
          "pip3 install sentence-transformers.")
def build_lib(library_name, folder="Agreements"):

    """Create an llmware library, pull down the sample documents, and parse the
    files in *folder* into text chunks indexed in the library's database.

    Returns the populated Library object.
    """

    # a Library is the main 'organizing construct' in llmware
    print("\nupdate: Step 1 - Creating library: {}".format(library_name))
    new_library = Library().create_new_library(library_name)

    # fetch the sample files from S3 (cached locally - set over_write=True to refresh)
    print("update: Step 2 - Downloading Sample Files")
    samples_path = Setup().load_sample_files(over_write=False)

    # parse the documents, chunk the text, and capture the chunks in the database
    # folder options: Agreements | UN-Resolutions-500
    print("update: Step 3 - Parsing and Text Indexing Files")
    new_library.add_files(input_folder_path=os.path.join(samples_path, folder))

    return new_library
# ---- example script starts here ----

print("update: Step 1- starting here- building library- parsing PDFs into text chunks")

# use sqlite for the text collection database, then build and populate the library
LLMWareConfig().set_active_db("sqlite")
lib = build_lib("st_embedding_0")

# Register a model from the SentenceTransformer catalog with llmware.
# "all-MiniLM-L6-v2" is listed at https://www.sbert.net/docs/pretrained_models.html
# Key registration inputs (all from the sbert model card):
#   model_name     - an existing pre-trained SentenceTransformer model
#   embedding_dims - the model's output dimensions
#   context_window - the model's maximum input size
# Behind the scenes, "model_location" = "st_repo" is the reserved word that tells
# llmware to look in sentence-transformers, and "model_family" =
# "LLMWareSemanticModel" knows how to load and embed with it.
# Another model to try: "all-mpnet-base-v2" (embedding_dims=768, context_window=384)

sentence_transformer_pretrained_model_name = "all-MiniLM-L6-v2"
embedding_dims = 384
context_window = 256

ModelCatalog().register_sentence_transformer_model(
    model_name=sentence_transformer_pretrained_model_name,
    embedding_dims=embedding_dims,
    context_window=context_window)

# equivalent lower-level registration, kept for reference:
"""
ModelCatalog().add_model_list({"model_name": sentence_transformer_pretrained_model_name,
                               "embedding_dims":embedding_dims,
                               "context_window":context_window,
                               "model_category": "embedding",
                               "model_family": "LLMWareSemanticModel",
                               "display_name": "MySentenceTransformer", "model_location": "st_repo"})
"""

# confirm that the model has been added to the catalog
mc = ModelCatalog().list_all_models()
model_card = ModelCatalog().lookup_model_card(sentence_transformer_pretrained_model_name)
print("update: model card - ", model_card)

# embed the library with the newly-registered model, storing vectors in milvus
lib.install_new_embedding(embedding_model_name=sentence_transformer_pretrained_model_name,
                          vector_db="milvus", batch_size=300)

# optional - check the library card after the embedding run
lib_card = lib.get_library_card()
print("update: -- after embedding process - check updated library card - ", lib_card)

# build the Query object (embedding_model_name is only required when the
# library carries more than one embedding)
query_st = Query(lib, embedding_model_name=sentence_transformer_pretrained_model_name)

# run a semantic query against the new embeddings
my_search_results = query_st.semantic_query("What is the sale bonus?", result_count=24)

for i, qr in enumerate(my_search_results):
    print("update: semantic query results: ", i, qr)

# to remove the embedding, uncomment the line below (pass model_name and vector_db):
# lib.delete_installed_embedding(sentence_transformer_pretrained_model_name, "milvus")

# optional - review the embedding records on the library
emb_record = lib.get_embedding_status()

for j, entries in enumerate(emb_record):
    print("update: embeddings on library: ", j, entries)