# using_redis.py
# (example script from a fork of llmware-ai/llmware)
"""This example shows how to use Redis as a vector embedding database with llmware"""
""" (A) Python Dependencies -
As a first step, you should pip install dependencies not included in the llmware package:
-- pip3 install redis
(B) Installing Redis -
If you need help installing Redis, please see the official redis implementation docs (or many widely available tutorials), e.g.,:
-- https://redis.io/docs/install/install-redis/
-- for a fast development install with docker-compose:
-- please see docker-compose-redis-stack.yaml in the llmware repository
(C) Configurations -
-- set os.environ variables so the Redis connection settings are passed in 'automatically' when installing the embedding, e.g.:
-- os.environ["USER_MANAGED_REDIS_HOST"] = "localhost"
-- os.environ["USER_MANAGED_REDIS_PORT"] = "6379"   (os.environ values must be strings)
"""
import os
from llmware.setup import Setup
from llmware.library import Library
from llmware.retrieval import Query
def build_lib(library_name, folder="Agreements"):
    """Create a new library and parse one folder of the sample files into it.

    Args:
        library_name: name handed to ``Library().create_new_library``.
        folder: sub-folder of the downloaded sample files to ingest
            (options noted for this example: "Agreements" | "UN-Resolutions-500").

    Returns:
        The object returned by ``create_new_library``, after ``add_files``
        has been called on it.
    """

    # Step 1 - the library is the main 'organizing construct' in llmware
    print("\nupdate: Step 1 - Creating library: {}".format(library_name))
    new_library = Library().create_new_library(library_name)

    # Step 2 - fetch the sample files via .load_sample_files()
    #   --note: pass 'over_write=True' instead to refresh an existing local copy
    print("update: Step 2 - Downloading Sample Files")
    samples_root = Setup().load_sample_files(over_write=False)

    # Step 3 - hand the chosen sample folder to .add_files, which parses the
    #   documents into text chunks and stores them in the library
    print("update: Step 3 - Parsing and Text Indexing Files")
    documents_dir = os.path.join(samples_root, folder)
    new_library.add_files(input_folder_path=documents_dir)

    return new_library
# ---- script body: everything below runs top to bottom when the file is executed ----

# Build the library first (build_lib prints its own step-by-step progress).
print("update: Step 1- starting here- building library- parsing PDFs into text chunks")
lib = build_lib("redis_lib_1114_0")

# Optional: show the library card as it looks before any embedding exists.
lib_card = lib.get_library_card()
print("update: -- before embedding process - check library card - ", lib_card)

print("update: Step 2 - starting to install embeddings")

# Embedding model alternatives: "mini-lm-sbert" | industry-bert-contracts | text-embedding-ada-002
#   note: text-embedding-ada-002 needs an OpenAI key in the environment,
#   e.g., os.environ["USER_MANAGED_OPENAI_API_KEY"] = "<insert your key>"
# Batch sizes from 100-500 usually give good performance and work on most environments.
lib.install_new_embedding(
    embedding_model_name="industry-bert-contracts",
    vector_db="redis",
    batch_size=300,
)

# Optional: show the library card again now that the embedding is installed.
lib_card = lib.get_library_card()
print("update: -- after embedding process - check updated library card - ", lib_card)

# Create the query object.
#   note: embedding_model_name is optional, but useful when several embeddings
#   exist on the same library (see the other example scripts).
query_pgv = Query(lib, embedding_model_name="industry-bert-contracts")

# Run a semantic query; the same query object can be reused for further queries.
my_search_results = query_pgv.semantic_query(
    "What is the sale bonus?",
    result_count=24,
)

for i, qr in enumerate(my_search_results):
    print("update: semantic query results: ", i, qr)

# To delete the embedding, uncomment the line below:
# lib.delete_installed_embedding("industry-bert-contracts", "redis")

# Optional: list the embedding records registered on the library.
emb_record = lib.get_embedding_status()

for j, entries in enumerate(emb_record):
    print("update: embeddings on library: ", j, entries)