Deep Lake mini upgrades #3375

Merged · 49 commits · Apr 24, 2023
Commits

- 97be64f · Merge pull request #1 from hwchase17/master (davidbuniat, Apr 5, 2023)
- 7ec34e1 · deeplake vector store advances (Apr 5, 2023)
- 987c377 · merge (Apr 5, 2023)
- a2cc2ec · Merge branch 'master' of https://github.com/activeloopai/langchain (Apr 5, 2023)
- b9ab944 · remove comments (Apr 5, 2023)
- a969c7a · demo update (Apr 5, 2023)
- f151697 · Merge branch 'master' of https://github.com/hwchase17/langchain (Apr 5, 2023)
- 313a620 · typo fix (Apr 5, 2023)
- 78c99c8 · mypy fixes (Apr 5, 2023)
- 1e1271b · filter fix on delete (Apr 5, 2023)
- 99379be · formatting update (Apr 5, 2023)
- be0bafb · unused imports (Apr 5, 2023)
- a8816ca · ruff fix (Apr 5, 2023)
- 4986056 · fix comments (Apr 5, 2023)
- 894d5bd · refmormat (Apr 5, 2023)
- c81bb90 · Merge branch 'hwchase17:master' into master (davidbuniat, Apr 7, 2023)
- 236002f · deeplake vectro store improved (Apr 8, 2023)
- 93acd8e · deeplake faster and custom filters (Apr 8, 2023)
- 28f89ab · dretriever example added (Apr 8, 2023)
- 5641667 · Merge branch 'hwchase17:master' into master (davidbuniat, Apr 8, 2023)
- fbf8110 · typo (Apr 8, 2023)
- 7f0b925 · Merge branch 'master' of https://github.com/activeloopai/langchain (Apr 8, 2023)
- 374491e · minor updates (Apr 8, 2023)
- b346833 · ruf fix (Apr 8, 2023)
- 166a2d6 · added use case (Apr 8, 2023)
- ed21551 · added code (Apr 8, 2023)
- e516ae8 · added retriever pointer in the docs (Apr 8, 2023)
- 598332e · Merge branch 'hwchase17:master' into master (davidbuniat, Apr 8, 2023)
- 0a34694 · merge (Apr 10, 2023)
- 40d170a · Merge branch 'hwchase17:master' into master (davidbuniat, Apr 15, 2023)
- a4e4a4d · improve token auth and tests mode on (Apr 15, 2023)
- ecd6ea8 · remove few flags (Apr 15, 2023)
- 0d42983 · tests update (Apr 15, 2023)
- c80a7d3 · remove modules notebook (Apr 15, 2023)
- 781fdc4 · reemove semi-sensitive data (Apr 15, 2023)
- 3f89c5e · Merge branch 'hwchase17:master' into master (davidbuniat, Apr 21, 2023)
- 0357e60 · Merge branch 'hwchase17:master' into master (davidbuniat, Apr 22, 2023)
- e1ee292 · upgrade deeplake version and twitter notebook (Apr 23, 2023)
- 629988d · Merge branch 'hwchase17:master' into master (davidbuniat, Apr 23, 2023)
- 061d60b · upgraded notebookss, moved to local storage instead of in-memory, set… (Apr 23, 2023)
- 1841305 · Merge branch 'master' of https://github.com/activeloopai/langchain (Apr 23, 2023)
- 6b7c3b2 · doc update (Apr 23, 2023)
- 4eeb26d · Merge branch 'hwchase17:master' into master (davidbuniat, Apr 23, 2023)
- 07fd0c2 · reformat (Apr 23, 2023)
- d270d59 · fixed typo and added assert (Apr 23, 2023)
- 396b6ee · reeformatting (Apr 23, 2023)
- 619f6e5 · Merge branch 'hwchase17:master' into master (davidbuniat, Apr 23, 2023)
- 8c7ecc3 · added disallowed_special=() to bypass utf-8 encoding issue in example (Apr 23, 2023)
- 4294a60 · creds fix for exists (Apr 24, 2023)

Files changed

484 changes: 398 additions & 86 deletions docs/modules/indexes/vectorstores/examples/deeplake.ipynb

Large diffs are not rendered by default.

84 changes: 25 additions & 59 deletions docs/use_cases/code/twitter-the-algorithm-analysis-deeplake.ipynb
@@ -40,8 +40,24 @@
 "from langchain.vectorstores import DeepLake\n",
 "\n",
 "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n",
-"os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')\n",
-"embeddings = OpenAIEmbeddings()"
+"os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"embeddings = OpenAIEmbeddings(disallowed_special=())"
+]
+},
+{
+"attachments": {},
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"`disallowed_special=()` is required to avoid `Exception: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte` from tiktoken for some repositories."
+]
+},
{
@@ -120,7 +136,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"db = DeepLake.from_documents(texts, embeddings, dataset_path=\"hub://davitbun/twitter-algorithm\")"
+"username = \"davitbun\" # replace with your username from app.activeloop.ai\n",
+"db = DeepLake(dataset_path=f\"hub://{username}/twitter-algorithm\", embedding_function=embeddings, public=True) # dataset will be publicly accessible\n",
+"db.add_documents(texts)"
 ]
 },
{
@@ -133,61 +151,9 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": null,
 "metadata": {},
-"outputs": [
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"-"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/twitter-algorithm\n",
-"\n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"-"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"hub://davitbun/twitter-algorithm loaded successfully.\n",
-"\n"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"Deep Lake Dataset in hub://davitbun/twitter-algorithm already exists, loading from the storage\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Dataset(path='hub://davitbun/twitter-algorithm', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n",
-"\n",
-" tensor htype shape dtype compression\n",
-" ------- ------- ------- ------- ------- \n",
-" embedding generic (23152, 1536) float32 None \n",
-" ids text (23152, 1) str None \n",
-" metadata json (23152, 1) str None \n",
-" text text (23152, 1) str None \n"
-]
-}
-],
+"outputs": [],
 "source": [
 "db = DeepLake(dataset_path=\"hub://davitbun/twitter-algorithm\", read_only=True, embedding_function=embeddings)"
 ]
@@ -203,7 +169,7 @@
 "retriever.search_kwargs['distance_metric'] = 'cos'\n",
 "retriever.search_kwargs['fetch_k'] = 100\n",
 "retriever.search_kwargs['maximal_marginal_relevance'] = True\n",
-"retriever.search_kwargs['k'] = 20"
+"retriever.search_kwargs['k'] = 10"
]
},
{
@@ -241,7 +207,7 @@
 "from langchain.chat_models import ChatOpenAI\n",
 "from langchain.chains import ConversationalRetrievalChain\n",
 "\n",
-"model = ChatOpenAI(model='gpt-4') # 'gpt-3.5-turbo',\n",
+"model = ChatOpenAI(model='gpt-3.5-turbo') # switch to 'gpt-4'\n",
 "qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)"
]
},
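Since the escaped notebook JSON above is hard to scan, here is the same updated setup assembled as a single plain-Python sketch. It assumes the notebook's unrendered cells supply the OpenAIEmbeddings import path shown below and a `texts` list of chunked source files; the username is a placeholder.

import getpass
import os

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
os.environ["ACTIVELOOP_TOKEN"] = getpass.getpass("Activeloop Token:")

# disallowed_special=() works around the tiktoken utf-8 decode error
# described in the markdown cell above.
embeddings = OpenAIEmbeddings(disallowed_special=())

username = "davitbun"  # replace with your username from app.activeloop.ai
db = DeepLake(
    dataset_path=f"hub://{username}/twitter-algorithm",
    embedding_function=embeddings,
    public=True,  # dataset will be publicly accessible
)
db.add_documents(texts)  # texts: documents produced by the notebook's splitter

# Query side: maximal marginal relevance over cosine distance, top 10 results.
retriever = db.as_retriever()
retriever.search_kwargs["distance_metric"] = "cos"
retriever.search_kwargs["fetch_k"] = 100
retriever.search_kwargs["maximal_marginal_relevance"] = True
retriever.search_kwargs["k"] = 10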
@@ -108,7 +108,7 @@
 "\n",
 "dataset_path = 'hub://'+org+'/data'\n",
 "embeddings = OpenAIEmbeddings()\n",
-"db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path)"
+"db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path, overwrite=True)"
]
},
{
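Note on overwrite=True: per the __init__ changes to langchain/vectorstores/deeplake.py below, an existing dataset at the path is loaded only when no overwrite flag is passed, so this cell now recreates the dataset on each run instead of appending to it.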
41 changes: 34 additions & 7 deletions langchain/vectorstores/deeplake.py
@@ -43,6 +43,9 @@ def vector_search(
     returns:
         nearest_indices: List, indices of nearest neighbors
     """
+    if data_vectors.shape[0] == 0:
+        return [], []
+
     # Calculate the distance between the query_vector and all data_vectors
     distances = distance_metric_map[distance_metric](query_embedding, data_vectors)
     nearest_indices = np.argsort(distances)
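The added guard makes searches against an empty dataset return empty results instead of failing downstream. A minimal, self-contained sketch of the same idea, with a plain L2 distance standing in for the module's distance_metric_map:

import numpy as np


def vector_search(query_embedding: np.ndarray, data_vectors: np.ndarray, k: int = 4):
    """Return (indices, distances) of the k nearest data_vectors."""
    # Nothing stored yet: short-circuit rather than ranking an empty array.
    if data_vectors.shape[0] == 0:
        return [], []
    # Simplified stand-in for distance_metric_map[distance_metric](...).
    distances = np.linalg.norm(data_vectors - query_embedding, axis=1)
    nearest_indices = np.argsort(distances)[:k]
    return nearest_indices.tolist(), distances[nearest_indices].tolist()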
@@ -87,7 +90,7 @@ class DeepLake(VectorStore):
         vectorstore = DeepLake("langchain_store", embeddings.embed_query)
     """

-    _LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "mem://langchain"
+    _LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "./deeplake/"

     def __init__(
         self,
@@ -96,7 +99,7 @@ def __init__(
         embedding_function: Optional[Embeddings] = None,
         read_only: Optional[bool] = False,
         ingestion_batch_size: int = 1024,
-        num_workers: int = 4,
+        num_workers: int = 0,
         **kwargs: Any,
     ) -> None:
         """Initialize with Deep Lake client."""
@@ -112,8 +115,13 @@ def __init__(
                 "Please install it with `pip install deeplake`."
             )
         self._deeplake = deeplake
+        self.dataset_path = dataset_path
+        creds_args = {"creds": kwargs["creds"]} if "creds" in kwargs else {}

-        if deeplake.exists(dataset_path, token=token):
+        if (
+            deeplake.exists(dataset_path, token=token, **creds_args)
+            and "overwrite" not in kwargs
+        ):
             self.ds = deeplake.load(
                 dataset_path, token=token, read_only=read_only, **kwargs
             )
@@ -123,6 +131,9 @@ def __init__(
             )
             self.ds.summary()
         else:
+            if "overwrite" in kwargs:
+                del kwargs["overwrite"]
+
             self.ds = deeplake.empty(
                 dataset_path, token=token, overwrite=True, **kwargs
             )
@@ -215,14 +226,18 @@ def ingest(sample_in: list, sample_out: list) -> None:
         )

         batch_size = min(self.ingestion_batch_size, len(elements))
+        if batch_size == 0:
+            return []
+
         batched = [
             elements[i : i + batch_size] for i in range(0, len(elements), batch_size)
         ]

         ingest().eval(
             batched,
             self.ds,
-            num_workers=min(self.num_workers, len(batched) // self.num_workers),
+            num_workers=min(self.num_workers, len(batched) // max(self.num_workers, 1)),
+            **kwargs,
         )
         self.ds.commit(allow_empty=True)
         self.ds.summary()
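The max(self.num_workers, 1) clamp matters because the new num_workers default is 0, which the old expression divided by. A quick illustration of the fixed arithmetic:

num_workers = 0  # the new default in __init__
batched = [["doc-1"], ["doc-2"], ["doc-3"]]

# Old form: len(batched) // num_workers -> ZeroDivisionError when num_workers == 0.
workers = min(num_workers, len(batched) // max(num_workers, 1))
print(workers)  # 0, so ingestion proceeds without extra worker processes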
@@ -443,8 +458,8 @@ def from_texts(
     ) -> DeepLake:
         """Create a Deep Lake dataset from raw documents.

-        If a dataset_path is specified, the dataset will be persisted there.
-        Otherwise, the data will be ephemeral in-memory.
+        If a dataset_path is specified, the dataset will be persisted in that
+        location; otherwise it defaults to `./deeplake`.

         Args:
             path (str, pathlib.Path): - The full path to the dataset. Can be:
@@ -493,7 +508,7 @@ def delete(
                 Defaults to None.
         """
         if delete_all:
-            self.ds.delete()
+            self.ds.delete(large_ok=True)
             return True

         view = None
@@ -515,6 +530,18 @@

         return True

+    @classmethod
+    def force_delete_by_path(cls, path: str) -> None:
+        """Force delete dataset by path"""
+        try:
+            import deeplake
+        except ImportError:
+            raise ValueError(
+                "Could not import deeplake python package. "
+                "Please install it with `pip install deeplake`."
+            )
+        deeplake.delete(path, large_ok=True, force=True)
+
     def delete_dataset(self) -> None:
         """Delete the collection."""
         self.delete(delete_all=True)
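A usage sketch for the new classmethod, using the new default dataset location as an example path:

from langchain.vectorstores import DeepLake

# Removes the dataset at a path without first constructing a DeepLake
# instance; the wrapped deeplake.delete(..., force=True) call also clears
# datasets left in a broken state.
DeepLake.force_delete_by_path("./deeplake/")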
19 changes: 9 additions & 10 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -56,7 +56,7 @@ arxiv = {version = "^1.4", optional = true}
 pypdf = {version = "^3.4.0", optional = true}
 networkx = {version="^2.6.3", optional = true}
 aleph-alpha-client = {version="^2.15.0", optional = true}
-deeplake = {version = "^3.2.21", optional = true}
+deeplake = {version = "^3.3.0", optional = true}
 pgvector = {version = "^0.1.6", optional = true}
 psycopg2-binary = {version = "^2.9.5", optional = true}
 #boto3 = {version = "^1.26.96", optional = true} # TODO: fix it, commented because the version failed with deeplake
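For reference, Poetry's caret constraint ^3.3.0 resolves to >=3.3.0,<4.0.0, so any later 3.x release of deeplake satisfies it.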
7 changes: 7 additions & 0 deletions tests/integration_tests/vectorstores/test_deeplake.py
@@ -164,3 +164,10 @@ def test_delete_dataset_by_filter(deeplake_datastore: DeepLake) -> None:
     assert len(deeplake_datastore.ds) == 2

     deeplake_datastore.delete_dataset()
+
+
+def test_delete_by_path(deeplake_datastore: DeepLake) -> None:
+    """Test force-deleting a dataset by its path."""
+    path = deeplake_datastore.dataset_path
+    DeepLake.force_delete_by_path(path)
+    assert not deeplake.exists(path)
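Assuming Deep Lake integration-test credentials and the deeplake_datastore fixture are configured, the new check can be run on its own with pytest tests/integration_tests/vectorstores/test_deeplake.py::test_delete_by_path.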