Skip to content

Commit

Permalink
Rebasing main's latest commits onto ravi/filter_support_rebased (#225)
Browse files Browse the repository at this point in the history
- add code for two variants of filtered index, readme and CI tests

- add utils for synthetic label generation and CI tests.


* Add co-authors

Co-authored-by: ravishankar <rakri@microsoft.com>
Co-authored-by: Varun Sivashankar <t-varunsi@microsoft.com>

---------

Co-authored-by: ravishankar <rakri@microsoft.com>
Co-authored-by: David Kaczynski <dkaczynski@microsoft.com>
Co-authored-by: Siddharth Gollapudi <t-gollapudis@microsoft.com>
Co-authored-by: Neelam Mahapatro <nmahapatro@microsoft.com>
Co-authored-by: Harsha Vardhan Simhadri <harshasi@microsoft.com>
Co-authored-by: Harsha Vardhan Simhadri <harsha-simhadri@users.noreply.github.com>
Co-authored-by: REDMOND\patelyash <patelyash@microsoft.com>
Co-authored-by: Varun Sivashankar <t-varunsi@microsoft.com>
  • Loading branch information
9 people authored Mar 15, 2023
1 parent 5ba6a5d commit 5ec769a
Show file tree
Hide file tree
Showing 33 changed files with 4,149 additions and 893 deletions.
42 changes: 40 additions & 2 deletions .github/workflows/pr-test.yml

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions CMakeSettings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"configurations": [
{
"name": "x64-Release",
"generator": "Ninja",
"configurationType": "Release",
"inheritEnvironments": [ "msvc_x64" ],
"buildRoot": "${projectDir}\\out\\build\\${name}",
"installRoot": "${projectDir}\\out\\install\\${name}",
"cmakeCommandArgs": "",
"buildCommandArgs": "",
"ctestCommandArgs": ""
},
{
"name": "WSL-GCC-Release",
"generator": "Ninja",
"configurationType": "RelWithDebInfo",
"buildRoot": "${projectDir}\\out\\build\\${name}",
"installRoot": "${projectDir}\\out\\install\\${name}",
"cmakeExecutable": "cmake",
"cmakeCommandArgs": "",
"buildCommandArgs": "",
"ctestCommandArgs": "",
"inheritEnvironments": [ "linux_x64" ],
"wslPath": "${defaultWSLPath}"
}
]
}
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,17 @@ Please see the following pages on using the compiled code:
- [Commandline interface for building and searching SSD-based indices](workflows/SSD_index.md)
- [Commandline interface for building and searching in-memory indices](workflows/in_memory_index.md)
- [Commandline examples for using in-memory streaming indices](workflows/dynamic_index.md)
- [Commandline interface for building and searching in-memory indices with label data and filters](workflows/filtered_in_memory.md)
- To be added: Python interfaces and docker files

Please cite this software in your work as:

```
@misc{diskann-github,
author = {Simhadri, Harsha Vardhan and Krishnaswamy, Ravishankar and Srinivasa, Gopal and Subramanya, Suhas Jayaram and Antonijevic, Andrija and Pryce, Dax and Kaczynski, David and Williams, Shane and Gollapudi, Siddharth and Sivashankar, Varun and Karia, Neel and Singh, Aditi and Jaiswal, Shikhar and Mahapatro, Neelam and Adams, Philip and Tower, Bryan},
title = {{DiskANN: Scalable, efficient and Feature-rich ANNS}},
url = {https://github.com/Microsoft/DiskANN},
version = {0.5},
year = {2023}
}
```
51 changes: 30 additions & 21 deletions include/disk_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ namespace diskann {
const uint32_t WARMUP_L = 20;
const uint32_t NUM_KMEANS_REPS = 12;

template<typename T>
template<typename T, typename LabelT>
class PQFlashIndex;

DISKANN_DLLEXPORT double get_memory_budget(const std::string &mem_budget_str);
Expand Down Expand Up @@ -68,38 +68,47 @@ namespace diskann {
uint64_t warmup_aligned_dim);
#endif

DISKANN_DLLEXPORT int merge_shards(const std::string &vamana_prefix,
const std::string &vamana_suffix,
const std::string &idmaps_prefix,
const std::string &idmaps_suffix,
const _u64 nshards, unsigned max_degree,
const std::string &output_vamana,
const std::string &medoids_file);
DISKANN_DLLEXPORT int merge_shards(
const std::string &vamana_prefix, const std::string &vamana_suffix,
const std::string &idmaps_prefix, const std::string &idmaps_suffix,
const _u64 nshards, unsigned max_degree, const std::string &output_vamana,
const std::string &medoids_file, bool use_filters = false,
const std::string &labels_to_medoids_file = std::string(""));

DISKANN_DLLEXPORT void extract_shard_labels(
const std::string &in_label_file, const std::string &shard_ids_bin,
const std::string &shard_label_file);

template<typename T>
DISKANN_DLLEXPORT std::string preprocess_base_file(
const std::string &infile, const std::string &indexPrefix,
diskann::Metric &distMetric);

template<typename T>
template<typename T, typename LabelT = uint32_t>
DISKANN_DLLEXPORT int build_merged_vamana_index(
std::string base_file, diskann::Metric _compareMetric, unsigned L,
unsigned R, double sampling_rate, double ram_budget,
std::string mem_index_path, std::string medoids_file,
std::string centroids_file, size_t build_pq_bytes, bool use_opq);
std::string centroids_file, size_t build_pq_bytes, bool use_opq,
bool use_filters = false, const std::string &label_file = std::string(""),
const std::string &labels_to_medoids_file = std::string(""),
const std::string &universal_label = "", const _u32 Lf = 0);

template<typename T>
template<typename T, typename LabelT>
DISKANN_DLLEXPORT uint32_t optimize_beamwidth(
std::unique_ptr<diskann::PQFlashIndex<T>> &_pFlashIndex, T *tuning_sample,
_u64 tuning_sample_num, _u64 tuning_sample_aligned_dim, uint32_t L,
uint32_t nthreads, uint32_t start_bw = 2);

template<typename T>
DISKANN_DLLEXPORT int build_disk_index(const char *dataFilePath,
const char *indexFilePath,
const char *indexBuildParameters,
diskann::Metric _compareMetric,
bool use_opq = false);
std::unique_ptr<diskann::PQFlashIndex<T, LabelT>> &_pFlashIndex,
T *tuning_sample, _u64 tuning_sample_num, _u64 tuning_sample_aligned_dim,
uint32_t L, uint32_t nthreads, uint32_t start_bw = 2);

template<typename T, typename LabelT = uint32_t>
DISKANN_DLLEXPORT int build_disk_index(
const char *dataFilePath, const char *indexFilePath,
const char *indexBuildParameters, diskann::Metric _compareMetric,
bool use_opq = false, bool use_filters = false,
const std::string &label_file =
std::string(""), // default is empty string for no label_file
// universal_label: default is empty string for no universal label
const std::string &universal_label = "", const _u32 filter_threshold = 0,
const _u32 Lf = 0);

template<typename T>
DISKANN_DLLEXPORT void create_disk_layout(
Expand Down
50 changes: 46 additions & 4 deletions include/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#define DEFAULT_MAXC 750

namespace diskann {

inline double estimate_ram_usage(_u64 size, _u32 dim, _u32 datasize,
_u32 degree) {
double size_of_data = ((double) size) * ROUND_UP(dim, 8) * datasize;
Expand Down Expand Up @@ -60,7 +61,7 @@ namespace diskann {
}
};

template<typename T, typename TagT = uint32_t>
template<typename T, typename TagT = uint32_t, typename LabelT = uint32_t>
class Index {
/**************************************************************************
*
Expand Down Expand Up @@ -129,6 +130,17 @@ namespace diskann {
Parameters &parameters,
const std::vector<TagT> &tags);

// Filtered Support
DISKANN_DLLEXPORT void build_filtered_index(
const char *filename, const std::string &label_file,
const size_t num_points_to_load, Parameters &parameters,
const std::vector<TagT> &tags = std::vector<TagT>());

DISKANN_DLLEXPORT void set_universal_label(const LabelT &label);

// Get converted integer label from string to int map (_label_map)
DISKANN_DLLEXPORT LabelT get_converted_label(const std::string &raw_label);

// Set starting point of an index before inserting any points incrementally
DISKANN_DLLEXPORT void set_start_point(T *data);
// Set starting point to a random point on a sphere of certain radius
Expand All @@ -155,6 +167,12 @@ namespace diskann {
float *distances,
std::vector<T *> &res_vectors);

// Filter support search
template<typename IndexType>
DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> search_with_filters(
const T *query, const LabelT &filter_label, const size_t K,
const unsigned L, IndexType *indices, float *distances);

// Will fail if tag already in the index or if tag=0.
DISKANN_DLLEXPORT int insert_point(const T *point, const TagT tag);

Expand All @@ -177,6 +195,8 @@ namespace diskann {
DISKANN_DLLEXPORT consolidation_report
consolidate_deletes(const Parameters &parameters);

DISKANN_DLLEXPORT void prune_all_nbrs(const Parameters &parameters);

DISKANN_DLLEXPORT bool is_index_saved();

// repositions frozen points to the end of _data - if they have been moved
Expand Down Expand Up @@ -208,8 +228,8 @@ namespace diskann {

protected:
// No copy/assign.
Index(const Index<T, TagT> &) = delete;
Index<T, TagT> &operator=(const Index<T, TagT> &) = delete;
Index(const Index<T, TagT, LabelT> &) = delete;
Index<T, TagT, LabelT> &operator=(const Index<T, TagT, LabelT> &) = delete;

// Use after _data and _nd have been populated
// Acquire exclusive _update_lock before calling
Expand All @@ -223,14 +243,23 @@ namespace diskann {
// determines the navigating node of the graph by calculating the medoid of
// the data
unsigned calculate_entry_point();

void parse_label_file(const std::string &label_file,
size_t &num_pts_labels);

std::unordered_map<std::string, LabelT> load_label_map(
const std::string &map_file);

std::pair<uint32_t, uint32_t> iterate_to_fixed_point(
const T *node_coords, const unsigned Lindex,
const std::vector<unsigned> &init_ids, InMemQueryScratch<T> *scratch,
bool use_filter, const std::vector<LabelT> &filters,
bool ret_frozen = true, bool search_invocation = false);

void search_for_point_and_prune(int location, _u32 Lindex,
std::vector<unsigned> &pruned_list,
InMemQueryScratch<T> *scratch);
InMemQueryScratch<T> *scratch,
bool use_filter = false,
_u32 filteredLindex = 0);

void prune_neighbors(const unsigned location, std::vector<Neighbor> &pool,
std::vector<unsigned> &pruned_list,
Expand Down Expand Up @@ -342,6 +371,19 @@ namespace diskann {
bool _enable_tags = false;
bool _normalize_vecs = false; // Using normalized L2 for cosine.

// Filter Support

bool _filtered_index = false;
std::vector<std::vector<LabelT>> _pts_to_labels;
tsl::robin_set<LabelT> _labels;
std::string _labels_file;
std::unordered_map<LabelT, _u32> _label_to_medoid_id;
std::unordered_map<_u32, _u32> _medoid_counts;
bool _use_universal_label = false;
LabelT _universal_label = 0;
uint32_t _filterIndexingQueueSize;
std::unordered_map<std::string, LabelT> _label_map;

// Indexing parameters
uint32_t _indexingQueueSize;
uint32_t _indexingRange;
Expand Down
45 changes: 44 additions & 1 deletion include/pq_flash_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

namespace diskann {

template<typename T>
template<typename T, typename LabelT = uint32_t>
class PQFlashIndex {
public:
DISKANN_DLLEXPORT PQFlashIndex(
Expand Down Expand Up @@ -70,11 +70,26 @@ namespace diskann {
float *res_dists, const _u64 beam_width,
const bool use_reorder_data = false, QueryStats *stats = nullptr);

DISKANN_DLLEXPORT void cached_beam_search(
const T *query, const _u64 k_search, const _u64 l_search, _u64 *res_ids,
float *res_dists, const _u64 beam_width, const bool use_filter,
const LabelT &filter_label, const bool use_reorder_data = false,
QueryStats *stats = nullptr);

DISKANN_DLLEXPORT void cached_beam_search(
const T *query, const _u64 k_search, const _u64 l_search, _u64 *res_ids,
float *res_dists, const _u64 beam_width, const _u32 io_limit,
const bool use_reorder_data = false, QueryStats *stats = nullptr);

DISKANN_DLLEXPORT void cached_beam_search(
const T *query, const _u64 k_search, const _u64 l_search, _u64 *res_ids,
float *res_dists, const _u64 beam_width, const bool use_filter,
const LabelT &filter_label, const _u32 io_limit,
const bool use_reorder_data = false, QueryStats *stats = nullptr);

DISKANN_DLLEXPORT LabelT
get_converted_label(const std::string &filter_label);

DISKANN_DLLEXPORT _u32 range_search(const T *query1, const double range,
const _u64 min_l_search,
const _u64 max_l_search,
Expand All @@ -94,12 +109,26 @@ namespace diskann {
DISKANN_DLLEXPORT void setup_thread_data(_u64 nthreads,
_u64 visited_reserve = 4096);

DISKANN_DLLEXPORT void set_universal_label(const LabelT &label);

private:
DISKANN_DLLEXPORT inline bool point_has_label(_u32 point_id, _u32 label_id);
std::unordered_map<std::string, LabelT> load_label_map(
const std::string &map_file);
DISKANN_DLLEXPORT void parse_label_file(const std::string &map_file,
size_t &num_pts_labels);
DISKANN_DLLEXPORT void get_label_file_metadata(std::string map_file,
_u32 &num_pts,
_u32 &num_total_labels);
DISKANN_DLLEXPORT inline int32_t get_filter_number(
const LabelT &filter_label);

// index info
// nhood of node `i` is in sector: [i / nnodes_per_sector]
// offset in sector: [(i % nnodes_per_sector) * max_node_len]
// nnbrs of node `i`: *(unsigned*) (buf)
// nbrs of node `i`: ((unsigned*)buf) + 1

_u64 max_node_len = 0, nnodes_per_sector = 0, max_degree = 0;

// Data used for searching with re-order vectors
Expand Down Expand Up @@ -171,6 +200,20 @@ namespace diskann {
bool reorder_data_exists = false;
_u64 reoreder_data_offset = 0;

// filter support
_u32 *_pts_to_label_offsets = nullptr;
_u32 *_pts_to_labels = nullptr;
tsl::robin_set<LabelT> _labels;
std::unordered_map<LabelT, _u32> _filter_to_medoid_id;
bool _use_universal_label;
_u32 _universal_filter_num;
std::vector<LabelT> _filter_list;
tsl::robin_set<_u32> _dummy_pts;
tsl::robin_set<_u32> _has_dummy_pts;
tsl::robin_map<_u32, _u32> _dummy_to_real_map;
tsl::robin_map<_u32, std::vector<_u32>> _real_to_dummy_map;
std::unordered_map<std::string, LabelT> _label_map;

#ifdef EXEC_ENV_OLS
// Set to a larger value than the actual header to accommodate
// any additions we make to the header. This is an outer limit
Expand Down
Loading

0 comments on commit 5ec769a

Please sign in to comment.