-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
retrieve from weaviate using weaviate instances
- Loading branch information
1 parent
73b1116
commit bb5cb2c
Showing
6 changed files
with
64 additions
and
52 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Fondant component specification for the Hugging Face Hub loader.
name: Load from huggingface hub
description: Component that loads a dataset from huggingface hub
image: fndnt/load_from_hf_hub:0.6.2

# Output schema: one "text" subset exposing a single string field, "data".
produces:
  text:
    fields:
      data:
        type: string

# Arguments accepted by the component.
args:
  dataset_name:
    description: Name of dataset on the hub
    type: str
  column_name_mapping:
    description: Mapping of the consumed hub dataset to fondant column names
    type: dict
    default: {}
  image_column_names:
    description: Optional argument, a list containing the original image column names in case the
      dataset on the hub contains them. Used to format the image from HF hub format to a byte string.
    type: list
    default: []
  n_rows_to_load:
    description: Optional argument that defines the number of rows to load. Useful for testing pipeline runs on a small scale
    type: int
    # NOTE(review): YAML parses a bare `None` as the string "None", not as null;
    # presumably the framework maps it to "no value" - confirm.
    default: None
  index_column:
    description: Column to set index to in the load component, if not specified a default globally unique index will be set
    type: str
    # NOTE(review): same string-vs-null caveat as n_rows_to_load - confirm.
    default: None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,2 @@ | ||
llama-index==0.8.68 | ||
weaviate-client | ||
transformers | ||
torch | ||
fondant[docker]==0.6.2 | ||
weaviate-client==3.24.1 | ||
fondant[component]==0.7.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,55 +1,45 @@ | ||
import dask | ||
import pandas as pd | ||
from fondant.component import PandasTransformComponent | ||
from llama_index import ServiceContext, VectorStoreIndex | ||
from llama_index.embeddings import HuggingFaceEmbedding | ||
from llama_index.vector_stores import WeaviateVectorStore | ||
|
||
import weaviate | ||
|
||
dask.config.set({"dataframe.convert-string": False}) | ||
|
||
|
||
class RetrieveChunks(PandasTransformComponent):
    """Fetch the nearest stored passages from Weaviate for each row's embedding."""

    # Weaviate property that holds the chunk text. It is both requested in the
    # query and read back from every hit, so it is defined in one place.
    _TEXT_PROPERTY = "passage"

    def __init__(
        self,
        *_,
        weaviate_url: str,
        class_name: str,
        top_k: int,
    ) -> None:
        """Create the Weaviate client and store the query settings.

        Positional arguments are accepted and ignored (``*_``); all settings
        must be passed by keyword.

        Args:
            weaviate_url: URL handed to ``weaviate.Client``.
            class_name: Weaviate class to query.
            top_k: Maximum number of results requested per query.
        """
        self.client = weaviate.Client(weaviate_url)
        self.class_name = class_name
        self.k = top_k

    def retrieve_chunks(self, vector_query) -> list:
        """Return the passages closest to ``vector_query`` in Weaviate.

        Args:
            vector_query: Embedding to search with. ``transform`` feeds this
                from the ``("text", "embedding")`` column, so it is presumably
                a sequence of floats - confirm the exact container type.

        Returns:
            The ``passage`` value of each hit, in the order Weaviate returned
            them; at most ``top_k`` entries.

        Raises:
            KeyError: If the response lacks the ``data -> Get -> <class_name>``
                path or a hit has no ``passage`` property.
        """
        response = (
            self.client.query
            .get(self.class_name, [self._TEXT_PROPERTY])
            .with_near_vector({"vector": vector_query})
            .with_limit(self.k)
            # The distance is requested from Weaviate but is not currently
            # passed on to the caller.
            .with_additional(["distance"])
            .do()
        )
        hits = response["data"]["Get"][self.class_name]
        return [hit[self._TEXT_PROPERTY] for hit in hits]

    def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """Add a ``("text", "retrieved_chunks")`` column to ``dataframe``.

        Each value of the ``("text", "embedding")`` column is passed to
        ``retrieve_chunks`` and the resulting list is stored in the new
        column. The input frame is modified in place and also returned.
        """
        embeddings = dataframe[("text", "embedding")]
        dataframe[("text", "retrieved_chunks")] = embeddings.apply(
            self.retrieve_chunks
        )
        return dataframe