nv-morpheus · rapids-bot · Oct 13, 2023 · Sep 26, 2023 · Sep 26, 2023 · Sep 27, 2023
@@ -108,3 +108,6 @@ dependencies:
     - pip:
         # Add additional dev dependencies here
         - pytest-kafka==0.6.0
+        - pymilvus==2.3.1
+        - llama-index==0.8.21
+        - llama-cpp-python
diff --git a/morpheus/controllers/milvus_langchain_controller.py b/morpheus/controllers/milvus_langchain_controller.py
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import typing
+
+import pandas as pd
+from langchain.document_loaders import DataFrameLoader
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import Milvus
+
+
+class MilvusLangChainController:
+    """
+    Load DataFrame rows as LangChain documents, store them in a Milvus vector store and query that store.
+
+    Parameters
+    ----------
+    host : str
+        Host placed in the Milvus `connection_args` when documents are stored.
+    port : str
+        Port placed in the Milvus `connection_args` when documents are stored.
+    **kwargs : typing.Any
+        Extra keyword arguments; they are only kept on the instance.
+    """
+
+    def __init__(self, host: str, port: str, **kwargs: typing.Any):
+        self._host = host
+        self._port = port
+        self._kwargs = kwargs
+        # Maps the `embedding` selector accepted by `store` to the embeddings class to instantiate.
+        self._embeddings_map = {"openai": OpenAIEmbeddings, "huggingface": HuggingFaceEmbeddings}
+
+    def transform(self, df: pd.DataFrame, document_column: str, **kwargs: typing.Any):
+        """
+        Convert a DataFrame into a list of LangChain documents, optionally splitting them into chunks.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Source data; one document is loaded per row.
+        document_column : str
+            Name of the column holding the document text.
+        **kwargs : typing.Any
+            `split` (truthy to enable chunking), `chunk_size` (default 1024) and `chunk_overlap` (default 0).
+
+        Returns
+        -------
+        list
+            The loaded (and possibly split) documents.
+        """
+        loader = DataFrameLoader(data_frame=df, page_content_column=document_column)
+        documents = loader.load()
+
+        # Honour the value of `split`, not just its presence, so `split=False` really disables chunking.
+        if kwargs.get("split", False):
+            text_splitter = CharacterTextSplitter(chunk_size=kwargs.get("chunk_size", 1024),
+                                                  chunk_overlap=kwargs.get("chunk_overlap", 0))
+            documents = text_splitter.split_documents(documents)
+
+        return documents
+
+    def store(self, documents: list, **kwargs: typing.Any):
+        """
+        Embed the documents and write them to Milvus.
+
+        Parameters
+        ----------
+        documents : list
+            Documents as returned by `transform`.
+        **kwargs : typing.Any
+            `embedding` selects the embeddings class ("openai" by default, or "huggingface"); every other
+            keyword argument is forwarded to that class' constructor.
+
+        Returns
+        -------
+        typing.Any
+            The vector store returned by `Milvus.from_documents`.
+
+        Raises
+        ------
+        KeyError
+            If `embedding` is not one of the supported selectors.
+        """
+        # Work on a copy and strip the selector: it is a routing option for this method, not a constructor
+        # argument of the embeddings class.
+        embedding_kwargs = dict(kwargs)
+        embedding = embedding_kwargs.pop("embedding", "openai")
+
+        try:
+            embedding_cls = self._embeddings_map[embedding]
+        except KeyError as exc:
+            raise KeyError(f"Unsupported embedding '{embedding}'. "
+                           f"Expected one of: {sorted(self._embeddings_map)}") from exc
+
+        embeddings = embedding_cls(**embedding_kwargs)
+        vector_store = Milvus.from_documents(documents,
+                                             embedding=embeddings,
+                                             connection_args={
+                                                 "host": self._host, "port": self._port
+                                             })
+        return vector_store
+
+    def query(self, query: str, vector_store: typing.Any, **kwargs: typing.Any):
+        """
+        Run a similarity search against a vector store.
+
+        Parameters
+        ----------
+        query : str
+            Query text.
+        vector_store : typing.Any
+            Object exposing `similarity_search`, e.g. the value returned by `store`.
+        **kwargs : typing.Any
+            Forwarded unchanged to `similarity_search`.
+
+        Returns
+        -------
+        typing.Any
+            Whatever `similarity_search` returns.
+        """
+        return vector_store.similarity_search(query, **kwargs)
diff --git a/morpheus/controllers/milvus_llamaindex_controller.py b/morpheus/controllers/milvus_llamaindex_controller.py
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import typing
+
+import pandas as pd
+from llama_index import Document
+from llama_index import StorageContext
+from llama_index import VectorStoreIndex
+from llama_index.vector_stores import MilvusVectorStore
+
+
+class MilvusLlamaIndexController:
+    """
+    Build LlamaIndex documents from a DataFrame, index them in a Milvus vector store and query the index.
+
+    Parameters
+    ----------
+    host : str
+        Host forwarded to `MilvusVectorStore`.
+    port : str
+        Port forwarded to `MilvusVectorStore`.
+    collection_name : str
+        Collection name forwarded to `MilvusVectorStore`.
+    **kwargs : typing.Any
+        Additional keyword arguments forwarded to `MilvusVectorStore`.
+    """
+
+    def __init__(self, host: str, port: str, collection_name: str, **kwargs: typing.Any):
+        self.milvus_vector_store = MilvusVectorStore(host=host, port=port, collection_name=collection_name, **kwargs)
+
+    def transform(self, df: pd.DataFrame, document_column: str, **kwargs: typing.Any):
+        """
+        Create one `Document` per DataFrame row.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Source data.
+        document_column : str
+            Name of the column holding the document text.
+        **kwargs : typing.Any
+            Accepted for interface compatibility; not used.
+
+        Returns
+        -------
+        list
+            Documents whose text comes from `document_column` and whose `node_id` is the stringified row index.
+        """
+        return [Document(text=row[document_column], node_id=str(idx)) for idx, row in df.iterrows()]
+
+    def store(self, documents: list):
+        """
+        Index the documents using the Milvus vector store created in the constructor.
+
+        Parameters
+        ----------
+        documents : list
+            Documents as returned by `transform`.
+
+        Returns
+        -------
+        typing.Any
+            The index returned by `VectorStoreIndex.from_documents`.
+        """
+        storage_context = StorageContext.from_defaults(vector_store=self.milvus_vector_store)
+        return VectorStoreIndex.from_documents(documents, storage_context=storage_context)
+
+    def query(self, query: str, vector_store: typing.Any, **kwargs: typing.Any):
+        """
+        Query an index through a query engine.
+
+        Parameters
+        ----------
+        query : str
+            Query text.
+        vector_store : typing.Any
+            Object exposing `as_query_engine`, e.g. the value returned by `store`.
+        **kwargs : typing.Any
+            Query engine options, forwarded to `as_query_engine`.
+
+        Returns
+        -------
+        typing.Any
+            The response returned by the query engine.
+        """
+        # Options configure the engine; the engine's `query` call itself only takes the query, so passing
+        # keyword arguments there fails as soon as any are supplied.
+        query_engine = vector_store.as_query_engine(**kwargs)
+        return query_engine.query(query)
diff --git a/morpheus/controllers/milvus_vector_db_controller.py b/morpheus/controllers/milvus_vector_db_controller.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import typing
+
+import pandas as pd
+from pymilvus import BulkInsertState
+from pymilvus import Collection
+from pymilvus import CollectionSchema
+from pymilvus import DataType
+from pymilvus import FieldSchema
+from pymilvus import connections
+from pymilvus import utility
+
+from morpheus.controllers.vector_db_controller import VectorDBController
+from morpheus.utils.vector_db_utils import MILVUS_DATA_TYPE_MAP
+from morpheus.utils.vector_db_utils import with_mutex
+
+logger = logging.getLogger(__name__)
+
+
+class MilvusVectorDBController(VectorDBController):
+    """
+    Vector DB controller that manages Milvus collections through the `pymilvus` client.
+
+    Parameters
+    ----------
+    host : str
+        Milvus server host.
+    port : str
+        Milvus server port.
+    alias : str, optional
+        Name of the pymilvus connection used by every operation of this controller.
+    **kwargs
+        Additional keyword arguments forwarded to `connections.connect`.
+    """
+
+    def __init__(self, host: str, port: str, alias: str = "default", **kwargs):
+        # Function-scope import so the module's import block does not need to change.
+        import threading
+
+        self._alias = alias
+        # The `with_mutex("_mutex")` decorators below name this attribute. Presumably the decorator acquires
+        # it around each call (confirm against morpheus.utils.vector_db_utils); create it only when the
+        # base class has not already provided one.
+        if not hasattr(self, "_mutex"):
+            self._mutex = threading.Lock()
+        connections.connect(host=host, port=port, alias=self._alias, **kwargs)
+
+    def transform(self, data: typing.Any, **kwargs):
+        """
+        Return `data` unchanged.
+        """
+        return data
+
+    def has_store_object(self, name) -> bool:
+        """
+        Return whether a collection called `name` exists on this controller's connection.
+        """
+        return utility.has_collection(name, using=self._alias)
+
+    def list_store_objects(self) -> list[str]:
+        """
+        Return the names of the collections available on this controller's connection.
+        """
+        return utility.list_collections(using=self._alias)
+
+    def _create_index(self, collection, field_name, index_params) -> None:
+        """
+        Create an index on `field_name` of `collection` using `index_params`.
+        """
+        collection.create_index(field_name=field_name, index_params=index_params)
+
+    def _create_schema_field(self, field_conf: dict):
+        """
+        Build a `FieldSchema` from a field configuration.
+
+        Parameters
+        ----------
+        field_conf : dict
+            Requires `name`, `dtype` (key of `MILVUS_DATA_TYPE_MAP`, case-insensitive) and `is_primary`;
+            `description` and `dim` are optional, `dim` being mandatory for vector types.
+
+        Returns
+        -------
+        FieldSchema
+            The schema for the configured field.
+
+        Raises
+        ------
+        ValueError
+            If a vector field has a missing or non-integer `dim`.
+        """
+        dtype = MILVUS_DATA_TYPE_MAP[field_conf["dtype"].lower()]
+        dim = field_conf.get("dim", None)
+
+        if dtype in (DataType.BINARY_VECTOR, DataType.FLOAT_VECTOR):
+            if not dim:
+                raise ValueError(f"Dimensions for {dtype} should not be None")
+            if not isinstance(dim, int):
+                raise ValueError(f"Dimensions for {dtype} should be an integer")
+
+        # Only pass `dim` when one was configured so fields without it do not carry a `dim=None` parameter.
+        type_params = {} if dim is None else {"dim": dim}
+
+        return FieldSchema(name=field_conf["name"],
+                           dtype=dtype,
+                           description=field_conf.get("description", ""),
+                           is_primary=field_conf["is_primary"],
+                           **type_params)
+
+    @with_mutex("_mutex")
+    def create(self, name, **kwargs):
+        """
+        Create the collection `name` if it does not exist yet; do nothing when it already exists.
+
+        Parameters
+        ----------
+        name : str
+            Collection name.
+        **kwargs
+            `collection_conf` : dict with `schema_conf` (containing `schema_fields` and optionally `auto_id`,
+            `description`) and optionally `index_conf`, `partition_conf`, `shards`, `consistency_level`.
+
+        Raises
+        ------
+        ValueError
+            If the collection has to be created but `collection_conf` or its schema fields are missing/empty.
+        """
+        if self.has_store_object(name):
+            return
+
+        collection_conf = kwargs.get("collection_conf")
+        if not collection_conf:
+            raise ValueError(f"Cannot create collection {name} without a collection_conf configuration")
+
+        index_conf = collection_conf.get("index_conf", None)
+        partition_conf = collection_conf.get("partition_conf", None)
+
+        schema_conf = collection_conf.get("schema_conf") or {}
+        schema_fields_conf = schema_conf.get("schema_fields")
+
+        if not schema_fields_conf:
+            raise ValueError("Cannot create collection as provided empty schema_fields configuration")
+
+        schema_fields = [self._create_schema_field(field_conf=field_conf) for field_conf in schema_fields_conf]
+
+        schema = CollectionSchema(fields=schema_fields,
+                                  auto_id=schema_conf.get("auto_id", False),
+                                  description=schema_conf.get("description", ""))
+
+        collection = Collection(name=name,
+                                schema=schema,
+                                using=self._alias,
+                                shards_num=collection_conf.get("shards", 2),
+                                consistency_level=collection_conf.get("consistency_level", "Strong"))
+
+        if partition_conf:
+            # Iterate over each partition configuration
+            for part in partition_conf:
+                collection.create_partition(part["name"], description=part.get("description", ""))
+
+        if index_conf:
+            self._create_index(collection=collection,
+                               field_name=index_conf["field_name"],
+                               index_params=index_conf["index_params"])
+
+    @with_mutex("_mutex")
+    def insert(self, name, data, **kwargs):
+        """
+        Insert `data` into the collection `name`.
+
+        Parameters
+        ----------
+        name : str
+            Collection name.
+        data : list or pd.DataFrame
+            A list is inserted into an existing collection; a DataFrame goes through
+            `Collection.construct_from_dataframe`.
+        **kwargs
+            `collection_conf` : dict with optional `partition_name` (default "_default"); for DataFrames also
+            `primary_field` (required), `auto_id`, `description`, `index_conf` and `params`.
+
+        Raises
+        ------
+        ValueError
+            If `data` is a list and the collection does not exist, or if `data` has an unsupported type.
+        """
+        collection_conf = kwargs.get("collection_conf") or {}
+        partition_name = collection_conf.get("partition_name", "_default")
+
+        # TODO (Bhargav): Load input data from a file
+        if isinstance(data, list):
+            if not self.has_store_object(name):
+                raise ValueError(f"Collection {name} doesn't exist.")
+            collection = Collection(name=name, using=self._alias)
+            collection.insert(data, partition_name=partition_name)
+            collection.flush()
+
+        elif isinstance(data, pd.DataFrame):
+
+            index_conf = collection_conf.get("index_conf", None)
+            # Default to this controller's connection while still letting `params` override it.
+            params = {"using": self._alias, **collection_conf.get("params", {})}
+
+            # NOTE(review): verify that `construct_from_dataframe` honours `partition_name` for the insert.
+            collection, _ = Collection.construct_from_dataframe(
+                name,
+                data,
+                primary_field=collection_conf["primary_field"],
+                auto_id=collection_conf.get("auto_id", False),
+                description=collection_conf.get("description", ""),
+                partition_name=partition_name,
+                **params
+            )
+
+            if index_conf:
+                self._create_index(collection=collection,
+                                   field_name=index_conf["field_name"],
+                                   index_params=index_conf["index_params"])
+
+            collection.flush()
+        else:
+            raise ValueError("Unsupported data type for insertion.")
+
+    @with_mutex("_mutex")
+    def search(self, name, query=None, **kwargs):
+        """
+        Load the collection `name`, run a query or a vector search on it and release it again.
+
+        Parameters
+        ----------
+        name : str
+            Collection name.
+        query : str, optional
+            Expression passed to `Collection.query`; when empty, `Collection.search` is called instead.
+        **kwargs
+            `is_partition_load` (default False) and `partitions` control what is loaded; all remaining
+            keyword arguments are forwarded to `Collection.query` / `Collection.search`.
+
+        Returns
+        -------
+        typing.Any
+            The result of the query or search.
+
+        Raises
+        ------
+        RuntimeError
+            If loading, querying or searching fails.
+        """
+        # Separate this method's own options from the arguments meant for pymilvus.
+        search_kwargs = dict(kwargs)
+        is_partition_load = search_kwargs.pop("is_partition_load", False)
+        partitions = search_kwargs.pop("partitions", None)
+
+        collection = Collection(name=name, using=self._alias)
+
+        try:
+            if is_partition_load:
+                collection.load(partitions)
+            else:
+                collection.load()
+
+            if query:
+                result = collection.query(expr=query, **search_kwargs)
+            else:
+                result = collection.search(**search_kwargs)
+
+            return result
+
+        except Exception as exec_info:
+            raise RuntimeError(f"Error while performing search: {exec_info}") from exec_info
+        finally:
+            collection.release()
+
+    @with_mutex("_mutex")
+    def drop(self, name, **kwargs):
+        """
+        Drop a collection, its index or one of its partitions.
+
+        Parameters
+        ----------
+        name : str
+            Collection name.
+        **kwargs
+            `type` : "collection" (default), "index" or "partition"; `partition_name` is required when
+            `type` is "partition".
+        """
+        # Local name avoids shadowing the `type` builtin; the keyword callers pass is unchanged.
+        drop_type = kwargs.get("type", "collection")
+
+        collection = Collection(name=name, using=self._alias)
+        if drop_type == "index":
+            collection.drop_index()
+        elif drop_type == "partition":
+            partition_name = kwargs["partition_name"]
+            collection.drop_partition(partition_name)
+        else:
+            collection.drop()
+
+    @with_mutex("_mutex")
+    def close(self):
+        """
+        Remove this controller's connection from the pymilvus connection registry.
+        """
+        connections.remove_connection(alias=self._alias)