Skip to content

Commit

Permalink
feat: support relyt vector database (#3367)
Browse files Browse the repository at this point in the history
Co-authored-by: jingsi <jingsi@leadincloud.com>
  • Loading branch information
klaus-xiong and jingsi authored Apr 15, 2024
1 parent 92f8c40 commit 3339783
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 2 deletions.
9 changes: 8 additions & 1 deletion api/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ AZURE_BLOB_ACCOUNT_URL=https://<your_account_name>.blob.core.windows.net
WEB_API_CORS_ALLOW_ORIGINS=http://127.0.0.1:3000,*
CONSOLE_CORS_ALLOW_ORIGINS=http://127.0.0.1:3000,*

# Vector database configuration, support: weaviate, qdrant, milvus
# Vector database configuration, support: weaviate, qdrant, milvus, relyt
VECTOR_STORE=weaviate

# Weaviate configuration
Expand All @@ -78,6 +78,13 @@ MILVUS_USER=root
MILVUS_PASSWORD=Milvus
MILVUS_SECURE=false

# Relyt configuration
RELYT_HOST=127.0.0.1
RELYT_PORT=5432
RELYT_USER=postgres
RELYT_PASSWORD=postgres
RELYT_DATABASE=postgres

# Upload configuration
UPLOAD_FILE_SIZE_LIMIT=15
UPLOAD_FILE_BATCH_LIMIT=5
Expand Down
8 changes: 8 additions & 0 deletions api/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,14 @@ def migrate_knowledge_vector_database():
"vector_store": {"class_prefix": collection_name}
}
dataset.index_struct = json.dumps(index_struct_dict)
elif vector_type == "relyt":
dataset_id = dataset.id
collection_name = Dataset.gen_collection_name_by_id(dataset_id)
index_struct_dict = {
"type": 'relyt',
"vector_store": {"class_prefix": collection_name}
}
dataset.index_struct = json.dumps(index_struct_dict)
else:
raise ValueError(f"Vector store {config.get('VECTOR_STORE')} is not supported.")

Expand Down
9 changes: 8 additions & 1 deletion api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def __init__(self):

# ------------------------
# Vector Store Configurations.
# Currently, only support: qdrant, milvus, zilliz, weaviate
# Currently, only support: qdrant, milvus, zilliz, weaviate, relyt
# ------------------------
self.VECTOR_STORE = get_env('VECTOR_STORE')
self.KEYWORD_STORE = get_env('KEYWORD_STORE')
Expand All @@ -221,6 +221,13 @@ def __init__(self):
self.WEAVIATE_GRPC_ENABLED = get_bool_env('WEAVIATE_GRPC_ENABLED')
self.WEAVIATE_BATCH_SIZE = int(get_env('WEAVIATE_BATCH_SIZE'))

# relyt settings
self.RELYT_HOST = get_env('RELYT_HOST')
self.RELYT_PORT = get_env('RELYT_PORT')
self.RELYT_USER = get_env('RELYT_USER')
self.RELYT_PASSWORD = get_env('RELYT_PASSWORD')
self.RELYT_DATABASE = get_env('RELYT_DATABASE')

# ------------------------
# Mail Configurations.
# ------------------------
Expand Down
Empty file.
169 changes: 169 additions & 0 deletions api/core/rag/datasource/vdb/relyt/relyt_vector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
import logging
from typing import Any

from pgvecto_rs.sdk import PGVectoRs, Record
from pydantic import BaseModel, root_validator
from sqlalchemy import text as sql_text
from sqlalchemy.orm import Session

from core.rag.datasource.vdb.vector_base import BaseVector
from core.rag.models.document import Document
from extensions.ext_redis import redis_client

logger = logging.getLogger(__name__)

class RelytConfig(BaseModel):
host: str
port: int
user: str
password: str
database: str

@root_validator()
def validate_config(cls, values: dict) -> dict:
if not values['host']:
raise ValueError("config RELYT_HOST is required")
if not values['port']:
raise ValueError("config RELYT_PORT is required")
if not values['user']:
raise ValueError("config RELYT_USER is required")
if not values['password']:
raise ValueError("config RELYT_PASSWORD is required")
if not values['database']:
raise ValueError("config RELYT_DATABASE is required")
return values


class RelytVector(BaseVector):

def __init__(self, collection_name: str, config: RelytConfig, dim: int):
super().__init__(collection_name)
self._client_config = config
self._url = f"postgresql+psycopg2://{config.user}:{config.password}@{config.host}:{config.port}/{config.database}"
self._client = PGVectoRs(
db_url=self._url,
collection_name=self._collection_name,
dimension=dim
)
self._fields = []

def get_type(self) -> str:
return 'relyt'

def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
index_params = {}
metadatas = [d.metadata for d in texts]
self.create_collection(len(embeddings[0]))
self.add_texts(texts, embeddings)

def create_collection(self, dimension: int):
lock_name = 'vector_indexing_lock_{}'.format(self._collection_name)
with redis_client.lock(lock_name, timeout=20):
collection_exist_cache_key = 'vector_indexing_{}'.format(self._collection_name)
if redis_client.get(collection_exist_cache_key):
return
index_name = f"{self._collection_name}_embedding_index"
with Session(self._client._engine) as session:
drop_statement = sql_text(f"DROP TABLE IF EXISTS collection_{self._collection_name}")
session.execute(drop_statement)
create_statement = sql_text(f"""
CREATE TABLE IF NOT EXISTS collection_{self._collection_name} (
id UUID PRIMARY KEY,
text TEXT NOT NULL,
meta JSONB NOT NULL,
embedding vector({dimension}) NOT NULL
) using heap;
""")
session.execute(create_statement)
index_statement = sql_text(f"""
CREATE INDEX {index_name}
ON collection_{self._collection_name} USING vectors(embedding vector_l2_ops)
WITH (options = $$
optimizing.optimizing_threads = 30
segment.max_growing_segment_size = 2000
segment.max_sealed_segment_size = 30000000
[indexing.hnsw]
m=30
ef_construction=500
$$);
""")
session.execute(index_statement)
session.commit()
redis_client.set(collection_exist_cache_key, 1, ex=3600)

def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
records = [Record.from_text(d.page_content, e, d.metadata) for d, e in zip(documents, embeddings)]
pks = [str(r.id) for r in records]
self._client.insert(records)
return pks

def delete_by_document_id(self, document_id: str):
ids = self.get_ids_by_metadata_field('document_id', document_id)
if ids:
self._client.delete_by_ids(ids)

def get_ids_by_metadata_field(self, key: str, value: str):
result = None
with Session(self._client._engine) as session:
select_statement = sql_text(
f"SELECT id FROM collection_{self._collection_name} WHERE meta->>'{key}' = '{value}'; "
)
result = session.execute(select_statement).fetchall()
if result:
return [item[0] for item in result]
else:
return None

def delete_by_metadata_field(self, key: str, value: str):

ids = self.get_ids_by_metadata_field(key, value)
if ids:
self._client.delete_by_ids(ids)

def delete_by_ids(self, doc_ids: list[str]) -> None:
with Session(self._client._engine) as session:
select_statement = sql_text(
f"SELECT id FROM collection_{self._collection_name} WHERE meta->>'doc_id' in ('{doc_ids}'); "
)
result = session.execute(select_statement).fetchall()
if result:
ids = [item[0] for item in result]
self._client.delete_by_ids(ids)

def delete(self) -> None:
with Session(self._client._engine) as session:
session.execute(sql_text(f"DROP TABLE IF EXISTS collection_{self._collection_name}"))
session.commit()

def text_exists(self, id: str) -> bool:
with Session(self._client._engine) as session:
select_statement = sql_text(
f"SELECT id FROM collection_{self._collection_name} WHERE meta->>'doc_id' = '{id}' limit 1; "
)
result = session.execute(select_statement).fetchall()
return len(result) > 0

def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
from pgvecto_rs.sdk import filters
filter_condition = filters.meta_contains(kwargs.get('filter'))
results = self._client.search(
top_k=int(kwargs.get('top_k')),
embedding=query_vector,
filter=filter_condition
)

# Organize results.
docs = []
for record, dis in results:
metadata = record.meta
metadata['score'] = dis
score_threshold = kwargs.get('score_threshold') if kwargs.get('score_threshold') else 0.0
if dis > score_threshold:
doc = Document(page_content=record.text,
metadata=metadata)
docs.append(doc)
return docs

def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
# milvus/zilliz/relyt doesn't support bm25 search
return []
25 changes: 25 additions & 0 deletions api/core/rag/datasource/vdb/vector_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,31 @@ def _init_vector(self) -> BaseVector:
database=config.get('MILVUS_DATABASE'),
)
)
elif vector_type == "relyt":
from core.rag.datasource.vdb.relyt.relyt_vector import RelytConfig, RelytVector
if self._dataset.index_struct_dict:
class_prefix: str = self._dataset.index_struct_dict['vector_store']['class_prefix']
collection_name = class_prefix
else:
dataset_id = self._dataset.id
collection_name = Dataset.gen_collection_name_by_id(dataset_id)
index_struct_dict = {
"type": 'relyt',
"vector_store": {"class_prefix": collection_name}
}
self._dataset.index_struct = json.dumps(index_struct_dict)
dim = len(self._embeddings.embed_query("hello relyt"))
return RelytVector(
collection_name=collection_name,
config=RelytConfig(
host=config.get('RELYT_HOST'),
port=config.get('RELYT_PORT'),
user=config.get('RELYT_USER'),
password=config.get('RELYT_PASSWORD'),
database=config.get('RELYT_DATABASE'),
),
dim=dim
)
else:
raise ValueError(f"Vector store {config.get('VECTOR_STORE')} is not supported.")

Expand Down
1 change: 1 addition & 0 deletions api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,4 @@ azure-storage-blob==12.9.0
azure-identity==1.15.0
lxml==5.1.0
xlrd~=2.0.1
pgvecto-rs==0.1.4
6 changes: 6 additions & 0 deletions docker/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,12 @@ services:
# the api-key for resend (https://resend.com)
RESEND_API_KEY: ''
RESEND_API_URL: https://api.resend.com
# relyt configurations
RELYT_HOST: db
RELYT_PORT: 5432
RELYT_USER: postgres
RELYT_PASSWORD: difyai123456
RELYT_DATABASE: postgres
depends_on:
- db
- redis
Expand Down

0 comments on commit 3339783

Please sign in to comment.