forked from milvus-io/pymilvus
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added a simple example of performing sparse/dense hybrid search (milvus-io#1990). Signed-off-by: Buqian Zheng <zhengbuqian@gmail.com>
- Loading branch information
1 parent
73ada97
commit 584d04b
Showing
1 changed file
with
115 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
# A demo showing hybrid semantic search with dense and sparse vectors using Milvus.
# You can optionally choose to use the BGE-M3 model to embed the text as dense
# and sparse vectors, or simply use random generated vectors as the example.

# To use BGE-M3 model, you need to install the optional `model` module in pymilvus:
# pip install pymilvus[model]
# Toggle: True downloads/runs the real BGE-M3 embedder; False falls back to
# the random-vector embedder defined below (no extra dependencies needed).
use_bge_m3 = True
|
||
# The overall steps are as follows: | ||
# 1. embed the text as dense and sparse vectors | ||
# 2. setup a Milvus collection to store the dense and sparse vectors | ||
# 3. insert the data to Milvus | ||
# 4. search and inspect the result! | ||
import random | ||
import string | ||
import numpy as np | ||
|
||
from pymilvus import ( | ||
utility, | ||
FieldSchema, CollectionSchema, DataType, | ||
Collection, AnnSearchRequest, RRFRanker, connections, | ||
) | ||
|
||
# 1. prepare a small corpus to search
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]
# Pad the corpus with 1000 random "sentences", each made of ten gibberish
# lowercase words of 1-8 letters, so the search has noise to sift through.
for _ in range(1000):
    words = []
    for _ in range(10):
        word_len = random.randint(1, 8)
        words.append(''.join(random.choice(string.ascii_lowercase) for _ in range(word_len)))
    docs.append(' '.join(words))
query = "Who started AI research?"
|
||
def random_embedding(texts, dim=768):
    """Embed *texts* with random vectors, mimicking a real embedding model.

    Args:
        texts: sequence of strings to "embed".
        dim: width of the dense vectors (default 768, matching BGE-M3-like
            models so it can stand in for a real embedder).

    Returns:
        dict with:
          - "dense": float array of shape (len(texts), dim) with values in [0, 1)
          - "sparse": one dict per text mapping 20-30 random dimension indices
            (drawn without replacement from range(1000)) to random weights
    """
    rng = np.random.default_rng()
    return {
        # Use the Generator API consistently (the original mixed it with the
        # legacy np.random.rand global state).
        "dense": rng.random((len(texts), dim)),
        "sparse": [
            {d: rng.random() for d in random.sample(range(1000), random.randint(20, 30))}
            for _ in texts
        ],
    }
|
||
# Default to the toy random embedder; 768 matches its default dense width.
dense_dim = 768
ef = random_embedding

if use_bge_m3:
    # BGE-M3 model can embed texts as dense and sparse vectors.
    # It is included in the optional `model` module in pymilvus, to install it,
    # simply run "pip install pymilvus[model]".
    from pymilvus.model.hybrid import BGEM3EmbeddingFunction
    ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
    # Real model decides the dense dimension; read it back for the schema below.
    dense_dim = ef.dim["dense"]

# Both embedders return {"dense": ..., "sparse": ...} keyed outputs.
docs_embeddings = ef(docs)
query_embeddings = ef([query])
|
||
# 2. setup Milvus collection and index
# Requires a running Milvus instance reachable on localhost:19530.
connections.connect("default", host="localhost", port="19530")

# Specify the data schema for the new Collection.
fields = [
    # Use auto generated id as primary key
    FieldSchema(name="pk", dtype=DataType.VARCHAR,
                is_primary=True, auto_id=True, max_length=100),
    # Store the original text to retrieve based on semantically distance
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512),
    # Milvus now supports both sparse and dense vectors, we can store each in
    # a separate field to conduct hybrid search on both vectors.
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    # Dense width must match whichever embedder was selected above.
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR,
                dim=dense_dim),
]
schema = CollectionSchema(fields, "")
col_name = 'hybrid_demo'
# Now we can create the new collection with above name and schema.
# Strong consistency makes freshly-inserted rows visible to the search below.
col = Collection(col_name, schema, consistency_level="Strong")

# We need to create indices for the vector fields. The indices will be loaded
# into memory for efficient search.
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
col.create_index("sparse_vector", sparse_index)
dense_index = {"index_type": "FLAT", "metric_type": "L2"}
col.create_index("dense_vector", dense_index)
col.load()
|
||
# 3. insert text and sparse/dense vector representations into the collection
# Column order matches the non-auto-id fields of the schema: text, sparse, dense.
entities = [docs, docs_embeddings["sparse"], docs_embeddings["dense"]]
col.insert(entities)
# flush() seals the segment so the inserted rows are persisted and searchable.
col.flush()
|
||
# 4. search and inspect the result!
k = 2  # we want to get the top 2 docs closest to the query

# Prepare the search requests for both vector fields.
# Metric types must match the ones used when creating the indices above.
sparse_search_params = {"metric_type": "IP"}
sparse_req = AnnSearchRequest(query_embeddings["sparse"],
                              "sparse_vector", sparse_search_params, limit=k)
dense_search_params = {"metric_type": "L2"}
dense_req = AnnSearchRequest(query_embeddings["dense"],
                             "dense_vector", dense_search_params, limit=k)

# Search topK docs based on dense and sparse vectors and rerank with RRF
# (Reciprocal Rank Fusion merges the two per-field rankings into one).
res = col.hybrid_search([sparse_req, dense_req], rerank=RRFRanker(),
                        limit=k, output_fields=['text'])

# Currently Milvus only support 1 query in the same hybrid search request, so
# we inspect res[0] directly. In future release Milvus will accept batch
# hybrid search queries in the same call.
for hit in res[0]:
    print(f'text: {hit.fields["text"]} distance {hit.distance}')
|
||
# If you are using BGE-M3 to generate the embedding, you should see the following:
# text: Alan Turing was the first person to conduct substantial research in AI. distance 0.032786883413791656
# text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.016129031777381897

# Drop the collection to clean up the data (removes rows and indices).
utility.drop_collection(col_name)