Skip to content

Commit

Permalink
DynamicMemoryIndex bug fixes (#404)
Browse files Browse the repository at this point in the history
* While simply creating a unit test to repro Issue #400, I found a number of bugs that I needed to address just to get it to work the way I had intended. This does not yet have what I would consider a comprehensive suite of test coverage for the DynamicMemoryIndex, but we at least do save it with the metadata file, we can load it correctly, and saving *always* consolidate_deletes() prior to save if any item has been marked for deletion prior to save.

* We actually cannot save without compacting before save anyway. Removing the parameter from save() and hardcoding it to True until we can actually support it.

* Addressing some PR comments and readying a 0.5.0.rc5 release
  • Loading branch information
daxpryce authored Jul 26, 2023
1 parent e1a8d78 commit 44445de
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "diskannpy"
version = "0.5.0.rc4"
version = "0.5.0.rc5"

description = "DiskANN Python extension module"
readme = "python/README.md"
Expand Down
2 changes: 2 additions & 0 deletions python/include/dynamic_memory_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ class DynamicMemoryIndex
uint64_t num_queries, uint64_t knn, uint64_t complexity,
uint32_t num_threads);
void consolidate_delete();
size_t num_points();


private:
const uint32_t _initial_search_complexity;
Expand Down
28 changes: 25 additions & 3 deletions python/src/_dynamic_memory_index.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

import os
import warnings

import numpy as np
Expand All @@ -21,12 +22,14 @@
_assert,
_assert_2d,
_assert_dtype,
_assert_existing_directory,
_assert_is_nonnegative_uint32,
_assert_is_positive_uint32,
_castable_dtype_or_raise,
_ensure_index_metadata,
_valid_metric,
_valid_index_prefix,
_write_index_metadata
)
from ._diskannpy import defaults

Expand Down Expand Up @@ -158,6 +161,7 @@ def __init__(
"""

dap_metric = _valid_metric(distance_metric)
self._dap_metric = dap_metric
_assert_dtype(vector_dtype)
_assert_is_positive_uint32(dimensions, "dimensions")

Expand Down Expand Up @@ -199,6 +203,7 @@ def __init__(
search_threads=search_threads,
concurrent_consolidation=concurrent_consolidation
)
self._points_deleted = False

def search(
self, query: VectorLike, k_neighbors: int, complexity: int
Expand Down Expand Up @@ -293,16 +298,31 @@ def batch_search(
num_threads=num_threads,
)

def save(self, save_path: str, compact_before_save: bool = True):
def save(self, save_path: str, index_prefix: str = "ann"):
"""
Saves this index to file.
:param save_path: The path to save these index files to.
:type save_path: str
:param compact_before_save:
:param index_prefix: The prefix to use for the index files. Default is "ann".
:type index_prefix: str
"""
if save_path == "":
raise ValueError("save_path cannot be empty")
self._index.save(save_path=save_path, compact_before_save=compact_before_save)
if index_prefix == "":
raise ValueError("index_prefix cannot be empty")
_assert_existing_directory(save_path, "save_path")
save_path = os.path.join(save_path, index_prefix)
if self._points_deleted is True:
warnings.warn(
"DynamicMemoryIndex.save() currently requires DynamicMemoryIndex.consolidate_delete() to be called "
"prior to save when items have been marked for deletion. This is being done automatically now, though"
"it will increase the time it takes to save; on large sets of data it can take a substantial amount of "
"time. In the future, we will implement a faster save with unconsolidated deletes, but for now this is "
"required."
)
self._index.consolidate_delete()
self._index.save(save_path=save_path, compact_before_save=True) # we do not yet support uncompacted saves
_write_index_metadata(save_path, self._vector_dtype, self._dap_metric, self._index.num_points(), self._dimensions)

def insert(self, vector: VectorLike, vector_id: VectorIdentifier):
"""
Expand Down Expand Up @@ -349,10 +369,12 @@ def mark_deleted(self, vector_id: VectorIdentifier):
:type vector_id: int
"""
_assert_is_positive_uint32(vector_id, "vector_id")
self._points_deleted = True
self._index.mark_deleted(np.uintc(vector_id))

def consolidate_delete(self):
"""
This method actually restructures the DiskANN index to remove the items that have been marked for deletion.
"""
self._index.consolidate_delete()
self._points_deleted = False
5 changes: 5 additions & 0 deletions python/src/dynamic_memory_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,11 @@ template <class DT> void DynamicMemoryIndex<DT>::consolidate_delete()
_index.consolidate_deletes(_write_parameters);
}

template <class DT> size_t DynamicMemoryIndex<DT>::num_points()
{
return _index.get_num_points();
}

template class DynamicMemoryIndex<float>;
template class DynamicMemoryIndex<uint8_t>;
template class DynamicMemoryIndex<int8_t>;
Expand Down
3 changes: 2 additions & 1 deletion python/src/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ template <typename T> inline void add_variant(py::module_ &m, const Variant &var
.def("save", &diskannpy::DynamicMemoryIndex<T>::save, "save_path"_a = "", "compact_before_save"_a = false)
.def("insert", &diskannpy::DynamicMemoryIndex<T>::insert, "vector"_a, "id"_a)
.def("mark_deleted", &diskannpy::DynamicMemoryIndex<T>::mark_deleted, "id"_a)
.def("consolidate_delete", &diskannpy::DynamicMemoryIndex<T>::consolidate_delete);
.def("consolidate_delete", &diskannpy::DynamicMemoryIndex<T>::consolidate_delete)
.def("num_points", &diskannpy::DynamicMemoryIndex<T>::num_points);

py::class_<diskannpy::StaticDiskIndex<T>>(m, variant.static_disk_index_name.c_str())
.def(py::init<const diskann::Metric, const std::string &, const uint32_t, const size_t, const uint32_t>(),
Expand Down
45 changes: 44 additions & 1 deletion python/tests/test_dynamic_memory_index.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

import os
import shutil
import tempfile
import unittest

import diskannpy as dap
Expand Down Expand Up @@ -296,3 +296,46 @@ def test_value_ranges_batch_search(self):
index.batch_search(
queries=np.array([[]], dtype=np.single), **kwargs
)

# Issue #400
def test_issue400(self):
_, _, _, index_vectors, ann_dir, _, generated_tags = self._test_matrix[0]

deletion_tag = generated_tags[10] # arbitrary choice
deletion_vector = index_vectors[10]

index = dap.DynamicMemoryIndex.from_file(
index_directory=ann_dir,
num_threads=16,
initial_search_complexity=32,
max_vectors=10100,
complexity=64,
graph_degree=32
)
index.insert(np.array([1.0] * 10, dtype=np.single), 10099)
index.insert(np.array([2.0] * 10, dtype=np.single), 10050)
index.insert(np.array([3.0] * 10, dtype=np.single), 10053)
tags, distances = index.search(np.array([3.0] * 10, dtype=np.single), k_neighbors=5, complexity=64)
self.assertIn(10053, tags)
tags, distances = index.search(deletion_vector, k_neighbors=5, complexity=64)
self.assertIn(deletion_tag, tags, "deletion_tag should exist, as we have not deleted yet")
index.mark_deleted(deletion_tag)
tags, distances = index.search(deletion_vector, k_neighbors=5, complexity=64)
self.assertNotIn(deletion_tag, tags, "deletion_tag should not exist, as we have marked it for deletion")
with tempfile.TemporaryDirectory() as tmpdir:
index.save(tmpdir)

index2 = dap.DynamicMemoryIndex.from_file(
index_directory=tmpdir,
num_threads=16,
initial_search_complexity=32,
max_vectors=10100,
complexity=64,
graph_degree=32
)
tags, distances = index2.search(deletion_vector, k_neighbors=5, complexity=64)
self.assertNotIn(
deletion_tag,
tags,
"deletion_tag should not exist, as we saved and reloaded the index without it"
)

0 comments on commit 44445de

Please sign in to comment.