Deep Lake mini upgrades (#3375)
Improvements
* set default num_workers for ingestion to 0
* upgraded notebooks to avoid dataset creation ambiguity
* added `DeepLake.force_delete_by_path`
* bumped deeplake to 3.3.0
* pass a `creds` arg through to the Deep Lake object to allow custom S3 credentials (see the sketch below)
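
A minimal sketch of how the new options might be used together (assuming the `deeplake` extra is installed; the S3 bucket and credential values below are placeholders, not part of this PR):

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

embeddings = OpenAIEmbeddings()

# The creds dict is forwarded to the underlying Deep Lake dataset,
# so a custom S3 bucket can be used (placeholder values shown).
db = DeepLake(
    dataset_path="s3://my-bucket/langchain-store",
    embedding_function=embeddings,
    creds={
        "aws_access_key_id": "...",
        "aws_secret_access_key": "...",
    },
)
db.add_texts(["hello deep lake"])

# New helper: delete a dataset by path without constructing the class first.
DeepLake.force_delete_by_path("./deeplake/")
```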

Notes
* please double-check that poetry is not messed up (thanks!)

Asks
* It would be great to create a shared Slack channel for quick questions

---------

Co-authored-by: Davit Buniatyan <d@activeloop.ai>
2 people authored and vowelparrot committed Apr 26, 2023
1 parent 27f1463 commit bf0bbc8
Showing 7 changed files with 475 additions and 164 deletions.
484 changes: 398 additions & 86 deletions docs/modules/indexes/vectorstores/examples/deeplake.ipynb

Large diffs are not rendered by default.

84 changes: 25 additions & 59 deletions docs/use_cases/code/twitter-the-algorithm-analysis-deeplake.ipynb
@@ -40,8 +40,24 @@
"from langchain.vectorstores import DeepLake\n",
"\n",
"os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n",
"os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')\n",
"embeddings = OpenAIEmbeddings()"
"os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embeddings = OpenAIEmbeddings(disallowed_special=())"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"disallowed_special=() is required to avoid `Exception: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte` from tiktoken for some repositories"
]
},
{
@@ -120,7 +136,9 @@
"metadata": {},
"outputs": [],
"source": [
"db = DeepLake.from_documents(texts, embeddings, dataset_path=\"hub://davitbun/twitter-algorithm\")"
"username = \"davitbun\" # replace with your username from app.activeloop.ai\n",
"db = DeepLake(dataset_path=f\"hub://{username}/twitter-algorithm\", embedding_function=embeddings, public=True) #dataset would be publicly available\n",
"db.add_documents(texts)"
]
},
{
@@ -133,61 +151,9 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"-"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/twitter-algorithm\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"-"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"hub://davitbun/twitter-algorithm loaded successfully.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Deep Lake Dataset in hub://davitbun/twitter-algorithm already exists, loading from the storage\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset(path='hub://davitbun/twitter-algorithm', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n",
"\n",
" tensor htype shape dtype compression\n",
" ------- ------- ------- ------- ------- \n",
" embedding generic (23152, 1536) float32 None \n",
" ids text (23152, 1) str None \n",
" metadata json (23152, 1) str None \n",
" text text (23152, 1) str None \n"
]
}
],
"outputs": [],
"source": [
"db = DeepLake(dataset_path=\"hub://davitbun/twitter-algorithm\", read_only=True, embedding_function=embeddings)"
]
@@ -203,7 +169,7 @@
"retriever.search_kwargs['distance_metric'] = 'cos'\n",
"retriever.search_kwargs['fetch_k'] = 100\n",
"retriever.search_kwargs['maximal_marginal_relevance'] = True\n",
"retriever.search_kwargs['k'] = 20"
"retriever.search_kwargs['k'] = 10"
]
},
{
@@ -241,7 +207,7 @@
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.chains import ConversationalRetrievalChain\n",
"\n",
"model = ChatOpenAI(model='gpt-4') # 'gpt-3.5-turbo',\n",
"model = ChatOpenAI(model='gpt-3.5-turbo') # switch to 'gpt-4'\n",
"qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)"
]
},
@@ -108,7 +108,7 @@
"\n",
"dataset_path = 'hub://'+org+'/data'\n",
"embeddings = OpenAIEmbeddings()\n",
"db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path)"
"db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path, overwrite=True)"
]
},
{
41 changes: 34 additions & 7 deletions langchain/vectorstores/deeplake.py
@@ -43,6 +43,9 @@ def vector_search(
returns:
nearest_indices: List, indices of nearest neighbors
"""
if data_vectors.shape[0] == 0:
return [], []

# Calculate the distance between the query_vector and all data_vectors
distances = distance_metric_map[distance_metric](query_embedding, data_vectors)
nearest_indices = np.argsort(distances)
@@ -87,7 +90,7 @@ class DeepLake(VectorStore):
vectorstore = DeepLake("langchain_store", embeddings.embed_query)
"""

_LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "mem://langchain"
_LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "./deeplake/"

def __init__(
self,
@@ -96,7 +99,7 @@ def __init__(
embedding_function: Optional[Embeddings] = None,
read_only: Optional[bool] = False,
ingestion_batch_size: int = 1024,
num_workers: int = 4,
num_workers: int = 0,
**kwargs: Any,
) -> None:
"""Initialize with Deep Lake client."""
@@ -112,8 +115,13 @@ def __init__(
"Please install it with `pip install deeplake`."
)
self._deeplake = deeplake
self.dataset_path = dataset_path
creds_args = {"creds": kwargs["creds"]} if "creds" in kwargs else {}

if deeplake.exists(dataset_path, token=token):
if (
deeplake.exists(dataset_path, token=token, **creds_args)
and "overwrite" not in kwargs
):
self.ds = deeplake.load(
dataset_path, token=token, read_only=read_only, **kwargs
)
@@ -123,6 +131,9 @@
)
self.ds.summary()
else:
if "overwrite" in kwargs:
del kwargs["overwrite"]

self.ds = deeplake.empty(
dataset_path, token=token, overwrite=True, **kwargs
)
@@ -215,14 +226,18 @@ def ingest(sample_in: list, sample_out: list) -> None:
)

batch_size = min(self.ingestion_batch_size, len(elements))
if batch_size == 0:
return []

batched = [
elements[i : i + batch_size] for i in range(0, len(elements), batch_size)
]

ingest().eval(
batched,
self.ds,
num_workers=min(self.num_workers, len(batched) // self.num_workers),
num_workers=min(self.num_workers, len(batched) // max(self.num_workers, 1)),
**kwargs,
)
self.ds.commit(allow_empty=True)
self.ds.summary()
@@ -443,8 +458,8 @@ def from_texts(
) -> DeepLake:
"""Create a Deep Lake dataset from a raw documents.
If a dataset_path is specified, the dataset will be persisted there.
Otherwise, the data will be ephemeral in-memory.
If a dataset_path is specified, the dataset will be persisted in that location,
otherwise by default at `./deeplake`
Args:
path (str, pathlib.Path): - The full path to the dataset. Can be:
@@ -493,7 +508,7 @@ def delete(
Defaults to None.
"""
if delete_all:
self.ds.delete()
self.ds.delete(large_ok=True)
return True

view = None
Expand All @@ -515,6 +530,18 @@ def delete(

return True

@classmethod
def force_delete_by_path(cls, path: str) -> None:
"""Force delete dataset by path"""
try:
import deeplake
except ImportError:
raise ValueError(
"Could not import deeplake python package. "
"Please install it with `pip install deeplake`."
)
deeplake.delete(path, large_ok=True, force=True)

def delete_dataset(self) -> None:
"""Delete the collection."""
self.delete(delete_all=True)
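For reference, a hedged sketch of the new `overwrite` behavior in the constructor (paths are placeholders): passing `overwrite=True` recreates an existing dataset, while omitting it loads the dataset if one already exists at the path.

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

embeddings = OpenAIEmbeddings()
texts = ["hello deep lake", "hello langchain"]

# Recreate the dataset at this local (placeholder) path even if it already exists.
db = DeepLake.from_texts(
    texts, embeddings, dataset_path="./deeplake/", overwrite=True
)

# Without overwrite, an existing dataset at the same path is loaded;
# read_only=True opens it without allowing writes.
db_readonly = DeepLake(
    dataset_path="./deeplake/", embedding_function=embeddings, read_only=True
)
```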
19 changes: 9 additions & 10 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -56,7 +56,7 @@ arxiv = {version = "^1.4", optional = true}
pypdf = {version = "^3.4.0", optional = true}
networkx = {version="^2.6.3", optional = true}
aleph-alpha-client = {version="^2.15.0", optional = true}
deeplake = {version = "^3.2.21", optional = true}
deeplake = {version = "^3.3.0", optional = true}
pgvector = {version = "^0.1.6", optional = true}
psycopg2-binary = {version = "^2.9.5", optional = true}
#boto3 = {version = "^1.26.96", optional = true} # TODO: fix it, commented because the version failed with deeplake
7 changes: 7 additions & 0 deletions tests/integration_tests/vectorstores/test_deeplake.py
@@ -164,3 +164,10 @@ def test_delete_dataset_by_filter(deeplake_datastore: DeepLake) -> None:
assert len(deeplake_datastore.ds) == 2

deeplake_datastore.delete_dataset()


def test_delete_by_path(deeplake_datastore: DeepLake) -> None:
"""Test delete dataset."""
path = deeplake_datastore.dataset_path
DeepLake.force_delete_by_path(path)
assert not deeplake.exists(path)
