Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DynamicMemoryIndex bug fixes #404

Merged
merged 4 commits into from
Jul 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "diskannpy"
version = "0.5.0.rc4"
version = "0.5.0.rc5"

description = "DiskANN Python extension module"
readme = "python/README.md"
Expand Down
2 changes: 2 additions & 0 deletions python/include/dynamic_memory_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ class DynamicMemoryIndex
uint64_t num_queries, uint64_t knn, uint64_t complexity,
uint32_t num_threads);
void consolidate_delete();
size_t num_points();


private:
const uint32_t _initial_search_complexity;
Expand Down
28 changes: 25 additions & 3 deletions python/src/_dynamic_memory_index.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

import os
import warnings

import numpy as np
Expand All @@ -21,12 +22,14 @@
_assert,
_assert_2d,
_assert_dtype,
_assert_existing_directory,
_assert_is_nonnegative_uint32,
_assert_is_positive_uint32,
_castable_dtype_or_raise,
_ensure_index_metadata,
_valid_metric,
_valid_index_prefix,
_write_index_metadata
)
from ._diskannpy import defaults

Expand Down Expand Up @@ -158,6 +161,7 @@ def __init__(
"""

dap_metric = _valid_metric(distance_metric)
self._dap_metric = dap_metric
_assert_dtype(vector_dtype)
_assert_is_positive_uint32(dimensions, "dimensions")

Expand Down Expand Up @@ -199,6 +203,7 @@ def __init__(
search_threads=search_threads,
concurrent_consolidation=concurrent_consolidation
)
self._points_deleted = False

def search(
self, query: VectorLike, k_neighbors: int, complexity: int
Expand Down Expand Up @@ -293,16 +298,31 @@ def batch_search(
num_threads=num_threads,
)

def save(self, save_path: str, compact_before_save: bool = True):
def save(self, save_path: str, index_prefix: str = "ann"):
"""
Saves this index to file.
:param save_path: The path to save these index files to.
:type save_path: str
:param compact_before_save:
:param index_prefix: The prefix to use for the index files. Default is "ann".
:type index_prefix: str
"""
if save_path == "":
raise ValueError("save_path cannot be empty")
self._index.save(save_path=save_path, compact_before_save=compact_before_save)
if index_prefix == "":
raise ValueError("index_prefix cannot be empty")
_assert_existing_directory(save_path, "save_path")
save_path = os.path.join(save_path, index_prefix)
if self._points_deleted is True:
warnings.warn(
"DynamicMemoryIndex.save() currently requires DynamicMemoryIndex.consolidate_delete() to be called "
"prior to save when items have been marked for deletion. This is being done automatically now, though"
"it will increase the time it takes to save; on large sets of data it can take a substantial amount of "
"time. In the future, we will implement a faster save with unconsolidated deletes, but for now this is "
"required."
)
daxpryce marked this conversation as resolved.
Show resolved Hide resolved
self._index.consolidate_delete()
self._index.save(save_path=save_path, compact_before_save=True) # we do not yet support uncompacted saves
_write_index_metadata(save_path, self._vector_dtype, self._dap_metric, self._index.num_points(), self._dimensions)

def insert(self, vector: VectorLike, vector_id: VectorIdentifier):
"""
Expand Down Expand Up @@ -349,10 +369,12 @@ def mark_deleted(self, vector_id: VectorIdentifier):
:type vector_id: int
"""
_assert_is_positive_uint32(vector_id, "vector_id")
self._points_deleted = True
self._index.mark_deleted(np.uintc(vector_id))

def consolidate_delete(self):
"""
This method actually restructures the DiskANN index to remove the items that have been marked for deletion.
"""
self._index.consolidate_delete()
self._points_deleted = False
5 changes: 5 additions & 0 deletions python/src/dynamic_memory_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,11 @@ template <class DT> void DynamicMemoryIndex<DT>::consolidate_delete()
_index.consolidate_deletes(_write_parameters);
}

template <class DT> size_t DynamicMemoryIndex<DT>::num_points()
{
return _index.get_num_points();
}

template class DynamicMemoryIndex<float>;
template class DynamicMemoryIndex<uint8_t>;
template class DynamicMemoryIndex<int8_t>;
Expand Down
3 changes: 2 additions & 1 deletion python/src/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ template <typename T> inline void add_variant(py::module_ &m, const Variant &var
.def("save", &diskannpy::DynamicMemoryIndex<T>::save, "save_path"_a = "", "compact_before_save"_a = false)
.def("insert", &diskannpy::DynamicMemoryIndex<T>::insert, "vector"_a, "id"_a)
.def("mark_deleted", &diskannpy::DynamicMemoryIndex<T>::mark_deleted, "id"_a)
.def("consolidate_delete", &diskannpy::DynamicMemoryIndex<T>::consolidate_delete);
.def("consolidate_delete", &diskannpy::DynamicMemoryIndex<T>::consolidate_delete)
.def("num_points", &diskannpy::DynamicMemoryIndex<T>::num_points);

py::class_<diskannpy::StaticDiskIndex<T>>(m, variant.static_disk_index_name.c_str())
.def(py::init<const diskann::Metric, const std::string &, const uint32_t, const size_t, const uint32_t>(),
Expand Down
45 changes: 44 additions & 1 deletion python/tests/test_dynamic_memory_index.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

import os
import shutil
import tempfile
import unittest

import diskannpy as dap
Expand Down Expand Up @@ -296,3 +296,46 @@ def test_value_ranges_batch_search(self):
index.batch_search(
queries=np.array([[]], dtype=np.single), **kwargs
)

# Issue #400
def test_issue400(self):
_, _, _, index_vectors, ann_dir, _, generated_tags = self._test_matrix[0]

deletion_tag = generated_tags[10] # arbitrary choice
deletion_vector = index_vectors[10]

index = dap.DynamicMemoryIndex.from_file(
index_directory=ann_dir,
num_threads=16,
initial_search_complexity=32,
max_vectors=10100,
complexity=64,
graph_degree=32
)
index.insert(np.array([1.0] * 10, dtype=np.single), 10099)
index.insert(np.array([2.0] * 10, dtype=np.single), 10050)
index.insert(np.array([3.0] * 10, dtype=np.single), 10053)
tags, distances = index.search(np.array([3.0] * 10, dtype=np.single), k_neighbors=5, complexity=64)
self.assertIn(10053, tags)
tags, distances = index.search(deletion_vector, k_neighbors=5, complexity=64)
self.assertIn(deletion_tag, tags, "deletion_tag should exist, as we have not deleted yet")
index.mark_deleted(deletion_tag)
tags, distances = index.search(deletion_vector, k_neighbors=5, complexity=64)
self.assertNotIn(deletion_tag, tags, "deletion_tag should not exist, as we have marked it for deletion")
with tempfile.TemporaryDirectory() as tmpdir:
index.save(tmpdir)

index2 = dap.DynamicMemoryIndex.from_file(
index_directory=tmpdir,
num_threads=16,
initial_search_complexity=32,
max_vectors=10100,
complexity=64,
graph_degree=32
)
tags, distances = index2.search(deletion_vector, k_neighbors=5, complexity=64)
self.assertNotIn(
deletion_tag,
tags,
"deletion_tag should not exist, as we saved and reloaded the index without it"
)