Skip to content

Commit

Permalink
Rakri/cosine bug fix (#450)
Browse files Browse the repository at this point in the history
* compiles, but need to verify

* fixed windows compiler warning

* minor typo

* added cosine unit test with unnormalized data

* minor typo in user prompt cosine/l2

* cosine was already supported in groundtruth, edited the message to say so

* clang-format

---------

Co-authored-by: rakri <rakri@microsoft.com>
  • Loading branch information
rakri and rakri authored Feb 6, 2024
1 parent 58de98d commit 13df0cf
Show file tree
Hide file tree
Showing 8 changed files with 106 additions and 39 deletions.
3 changes: 3 additions & 0 deletions .github/actions/generate-random/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,21 @@ runs:
echo "Generating random vectors for index"
dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_norm1.0.bin -D 10 -N 10000 --norm 1.0
dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_unnorm.bin -D 10 -N 10000 --rand_scaling 2.0
dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0
dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0
echo "Generating random vectors for query"
dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_norm1.0.bin -D 10 -N 1000 --norm 1.0
dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_unnorm.bin -D 10 -N 1000 --rand_scaling 2.0
dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0
dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0
echo "Computing ground truth for floats across l2, mips, and cosine distance functions"
dist/bin/compute_groundtruth --data_type float --dist_fn l2 --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100
dist/bin/compute_groundtruth --data_type float --dist_fn mips --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/mips_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100
dist/bin/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/cosine_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100
dist/bin/compute_groundtruth --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_unnorm.bin --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --K 100
echo "Computing ground truth for int8s across l2, mips, and cosine distance functions"
dist/bin/compute_groundtruth --data_type int8 --dist_fn l2 --base_file data/rand_int8_10D_10K_norm50.0.bin --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/disk-pq.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ jobs:
run: |
dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1
dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (one shot graph build, cosine, no diskPQ) (float)
if: success() || failure()
run: |
dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1
dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (one shot graph build, L2, no diskPQ) (int8)
if: success() || failure()
run: |
Expand Down Expand Up @@ -66,6 +71,11 @@ jobs:
run: |
dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (sharded graph build, cosine, no diskPQ) (float)
if: success() || failure()
run: |
dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
- name: build and search disk index (sharded graph build, L2, no diskPQ) (int8)
run: |
dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
Expand Down
2 changes: 2 additions & 0 deletions apps/build_disk_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ int main(int argc, char **argv)
metric = diskann::Metric::L2;
else if (dist_fn == std::string("mips"))
metric = diskann::Metric::INNER_PRODUCT;
else if (dist_fn == std::string("cosine"))
metric = diskann::Metric::COSINE;
else
{
std::cout << "Error. Only l2 and mips distance functions are supported" << std::endl;
Expand Down
3 changes: 2 additions & 1 deletion apps/utils/compute_groundtruth.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,8 @@ int main(int argc, char **argv)
desc.add_options()("help,h", "Print information on arguments");

desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(), "distance function <l2/mips>");
desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
"distance function <l2/mips/cosine>");
desc.add_options()("base_file", po::value<std::string>(&base_file)->required(),
"File containing the base vectors in binary format");
desc.add_options()("query_file", po::value<std::string>(&query_file)->required(),
Expand Down
52 changes: 40 additions & 12 deletions apps/utils/rand_data_gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,31 @@

namespace po = boost::program_options;

int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, float norm)
int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, bool normalization, float norm,
float rand_scale)
{
auto vec = new float[ndims];

std::random_device rd{};
std::mt19937 gen{rd()};
std::normal_distribution<> normal_rand{0, 1};
std::uniform_real_distribution<> unif_dis(1.0, rand_scale);

for (size_t i = 0; i < npts; i++)
{
float sum = 0;
float scale = 1.0f;
if (rand_scale > 1.0f)
scale = (float)unif_dis(gen);
for (size_t d = 0; d < ndims; ++d)
vec[d] = (float)normal_rand(gen);
for (size_t d = 0; d < ndims; ++d)
sum += vec[d] * vec[d];
for (size_t d = 0; d < ndims; ++d)
vec[d] = vec[d] * norm / std::sqrt(sum);
vec[d] = scale * (float)normal_rand(gen);
if (normalization)
{
for (size_t d = 0; d < ndims; ++d)
sum += vec[d] * vec[d];
for (size_t d = 0; d < ndims; ++d)
vec[d] = vec[d] * norm / std::sqrt(sum);
}

writer.write((char *)vec, ndims * sizeof(float));
}
Expand Down Expand Up @@ -104,8 +112,8 @@ int main(int argc, char **argv)
{
std::string data_type, output_file;
size_t ndims, npts;
float norm;

float norm, rand_scaling;
bool normalization = false;
try
{
po::options_description desc{"Arguments"};
Expand All @@ -117,7 +125,11 @@ int main(int argc, char **argv)
"File name for saving the random vectors");
desc.add_options()("ndims,D", po::value<uint64_t>(&ndims)->required(), "Dimensoinality of the vector");
desc.add_options()("npts,N", po::value<uint64_t>(&npts)->required(), "Number of vectors");
desc.add_options()("norm", po::value<float>(&norm)->required(), "Norm of the vectors");
desc.add_options()("norm", po::value<float>(&norm)->default_value(-1.0f),
"Norm of the vectors (if not specified, vectors are not normalized)");
desc.add_options()("rand_scaling", po::value<float>(&rand_scaling)->default_value(1.0f),
"Each vector will be scaled (if not explicitly normalized) by a factor randomly chosen from "
"[1, rand_scale]. Only applicable for floating point data");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
if (vm.count("help"))
Expand All @@ -139,9 +151,20 @@ int main(int argc, char **argv)
return -1;
}

if (norm <= 0.0)
if (norm > 0.0)
{
normalization = true;
}

if (rand_scaling < 1.0)
{
std::cout << "We will only scale the vector norms randomly in [1, value], so value must be >= 1." << std::endl;
return -1;
}

if ((rand_scaling > 1.0) && (normalization == true))
{
std::cerr << "Error: Norm must be a positive number" << std::endl;
std::cout << "Data cannot be normalized and randomly scaled at same time. Use one or the other." << std::endl;
return -1;
}

Expand All @@ -155,6 +178,11 @@ int main(int argc, char **argv)
<< std::endl;
return -1;
}
if (rand_scaling > 1.0)
{
std::cout << "Data scaling only supported for floating point data." << std::endl;
return -1;
}
}

try
Expand All @@ -177,7 +205,7 @@ int main(int argc, char **argv)
size_t cblk_size = std::min(npts - i * blk_size, blk_size);
if (data_type == std::string("float"))
{
ret = block_write_float(writer, ndims, cblk_size, norm);
ret = block_write_float(writer, ndims, cblk_size, normalization, norm, rand_scaling);
}
else if (data_type == std::string("int8"))
{
Expand Down
32 changes: 25 additions & 7 deletions src/disk_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1129,11 +1129,12 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
return -1;
}

if (!std::is_same<T, float>::value && compareMetric == diskann::Metric::INNER_PRODUCT)
if (!std::is_same<T, float>::value &&
(compareMetric == diskann::Metric::INNER_PRODUCT || compareMetric == diskann::Metric::COSINE))
{
std::stringstream stream;
stream << "DiskANN currently only supports floating point data for Max "
"Inner Product Search. "
stream << "Disk-index build currently only supports floating point data for Max "
"Inner Product Search/ cosine similarity. "
<< std::endl;
throw diskann::ANNException(stream.str(), -1);
}
Expand Down Expand Up @@ -1195,6 +1196,10 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
std::string disk_pq_pivots_path = index_prefix_path + "_disk.index_pq_pivots.bin";
// optional, used if disk index must store pq data
std::string disk_pq_compressed_vectors_path = index_prefix_path + "_disk.index_pq_compressed.bin";
std::string prepped_base =
index_prefix_path +
"_prepped_base.bin"; // temp file for storing pre-processed base file for cosine/ mips metrics
bool created_temp_file_for_processed_data = false;

// output a new base file which contains extra dimension with sqrt(1 -
// ||x||^2/M^2) for every x, M is max norm of all points. Extra space on
Expand All @@ -1205,14 +1210,26 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
std::cout << "Using Inner Product search, so need to pre-process base "
"data into temp file. Please ensure there is additional "
"(n*(d+1)*4) bytes for storing pre-processed base vectors, "
"apart from the intermin indices and final index."
"apart from the interim indices created by DiskANN and the final index."
<< std::endl;
std::string prepped_base = index_prefix_path + "_prepped_base.bin";
data_file_to_use = prepped_base;
float max_norm_of_base = diskann::prepare_base_for_inner_products<T>(base_file, prepped_base);
std::string norm_file = disk_index_path + "_max_base_norm.bin";
diskann::save_bin<float>(norm_file, &max_norm_of_base, 1, 1);
diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for inner product") << std::endl;
created_temp_file_for_processed_data = true;
}
else if (compareMetric == diskann::Metric::COSINE)
{
Timer timer;
std::cout << "Normalizing data for cosine to temporary file, please ensure there is additional "
"(n*d*4) bytes for storing normalized base vectors, "
"apart from the interim indices created by DiskANN and the final index."
<< std::endl;
data_file_to_use = prepped_base;
diskann::normalize_data_file(base_file, prepped_base);
diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for cosine") << std::endl;
created_temp_file_for_processed_data = true;
}

uint32_t R = (uint32_t)atoi(param_list[0].c_str());
Expand Down Expand Up @@ -1304,7 +1321,7 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
#if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD)
MallocExtension::instance()->ReleaseFreeMemory();
#endif

// Whether it is cosine or inner product, we still use the L2 metric due to the pre-processing.
timer.reset();
diskann::build_merged_vamana_index<T, LabelT>(data_file_to_use.c_str(), diskann::Metric::L2, L, R, p_val,
indexing_ram_budget, mem_index_path, medoids_path, centroids_path,
Expand Down Expand Up @@ -1345,7 +1362,8 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
std::remove(augmented_labels_file.c_str());
std::remove(labels_file_to_use.c_str());
}

if (created_temp_file_for_processed_data)
std::remove(prepped_base.c_str());
std::remove(mem_index_path.c_str());
if (use_disk_pq)
std::remove(disk_pq_compressed_vectors_path.c_str());
Expand Down
27 changes: 16 additions & 11 deletions src/pq_flash_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,16 @@ template <typename T, typename LabelT>
PQFlashIndex<T, LabelT>::PQFlashIndex(std::shared_ptr<AlignedFileReader> &fileReader, diskann::Metric m)
: reader(fileReader), metric(m), _thread_data(nullptr)
{
diskann::Metric metric_to_invoke = m;
if (m == diskann::Metric::COSINE || m == diskann::Metric::INNER_PRODUCT)
{
if (std::is_floating_point<T>::value)
{
diskann::cout << "Cosine metric chosen for (normalized) float data."
"Changing distance to L2 to boost accuracy."
diskann::cout << "Since data is floating point, we assume that it has been appropriately pre-processed "
"(normalization for cosine, and convert-to-l2 by adding extra dimension for MIPS). So we "
"shall invoke an l2 distance function."
<< std::endl;
metric = diskann::Metric::L2;
metric_to_invoke = diskann::Metric::L2;
}
else
{
Expand All @@ -49,8 +51,8 @@ PQFlashIndex<T, LabelT>::PQFlashIndex(std::shared_ptr<AlignedFileReader> &fileRe
}
}

this->_dist_cmp.reset(diskann::get_distance_function<T>(metric));
this->_dist_cmp_float.reset(diskann::get_distance_function<float>(metric));
this->_dist_cmp.reset(diskann::get_distance_function<T>(metric_to_invoke));
this->_dist_cmp_float.reset(diskann::get_distance_function<float>(metric_to_invoke));
}

template <typename T, typename LabelT> PQFlashIndex<T, LabelT>::~PQFlashIndex()
Expand Down Expand Up @@ -1292,20 +1294,23 @@ void PQFlashIndex<T, LabelT>::cached_beam_search(const T *query1, const uint64_t
float *query_float = pq_query_scratch->aligned_query_float;
float *query_rotated = pq_query_scratch->rotated_query;

// if inner product, we also normalize the query and set the last coordinate
// to 0 (this is the extra coordinate used to convert MIPS to L2 search)
if (metric == diskann::Metric::INNER_PRODUCT)
// normalization step. for cosine, we simply normalize the query
// for mips, we normalize the first d-1 dims, and add a 0 for last dim, since an extra coordinate was used to
// convert MIPS to L2 search
if (metric == diskann::Metric::INNER_PRODUCT || metric == diskann::Metric::COSINE)
{
for (size_t i = 0; i < this->_data_dim - 1; i++)
uint64_t inherent_dim = (metric == diskann::Metric::COSINE) ? this->_data_dim : (uint64_t)(this->_data_dim - 1);
for (size_t i = 0; i < inherent_dim; i++)
{
aligned_query_T[i] = query1[i];
query_norm += query1[i] * query1[i];
}
aligned_query_T[this->_data_dim - 1] = 0;
if (metric == diskann::Metric::INNER_PRODUCT)
aligned_query_T[this->_data_dim - 1] = 0;

query_norm = std::sqrt(query_norm);

for (size_t i = 0; i < this->_data_dim - 1; i++)
for (size_t i = 0; i < inherent_dim; i++)
{
aligned_query_T[i] = (T)(aligned_query_T[i] / query_norm);
}
Expand Down
Loading

0 comments on commit 13df0cf

Please sign in to comment.