Rakri/cosine bug fix (#450)

* compiles, but need to verify
* fixed windows compiler warning
* minor typo
* added cosine unit test with unnormalized data
* minor typo in user prompt cosine/l2
* cosine was already supported in groundtruth, edited the message to say so
* clang-format

---------

Co-authored-by: rakri <rakri@microsoft.com>
microsoft · Feb 6, 2024 · 13df0cf · 13df0cf
1 parent 58de98d
commit 13df0cf
Show file tree

Hide file tree

Showing 8 changed files with 106 additions and 39 deletions.
diff --git a/.github/actions/generate-random/action.yml b/.github/actions/generate-random/action.yml
@@ -9,18 +9,21 @@ runs:
         
         echo "Generating random vectors for index"
         dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_norm1.0.bin -D 10 -N 10000 --norm 1.0
+        dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_10K_unnorm.bin -D 10 -N 10000 --rand_scaling 2.0
         dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0
         dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_10K_norm50.0.bin -D 10 -N 10000 --norm 50.0
         
         echo "Generating random vectors for query"
         dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_norm1.0.bin -D 10 -N 1000 --norm 1.0
+        dist/bin/rand_data_gen --data_type float --output_file data/rand_float_10D_1K_unnorm.bin -D 10 -N 1000 --rand_scaling 2.0
         dist/bin/rand_data_gen --data_type int8 --output_file data/rand_int8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0
         dist/bin/rand_data_gen --data_type uint8 --output_file data/rand_uint8_10D_1K_norm50.0.bin -D 10 -N 1000 --norm 50.0
 
         echo "Computing ground truth for floats across l2, mips, and cosine distance functions"
         dist/bin/compute_groundtruth  --data_type float --dist_fn l2 --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100
         dist/bin/compute_groundtruth  --data_type float --dist_fn mips --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/mips_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100
         dist/bin/compute_groundtruth  --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_norm1.0.bin --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/cosine_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --K 100
+        dist/bin/compute_groundtruth  --data_type float --dist_fn cosine --base_file data/rand_float_10D_10K_unnorm.bin --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --K 100
         
         echo "Computing ground truth for int8s across l2, mips, and cosine distance functions"
         dist/bin/compute_groundtruth  --data_type int8 --dist_fn l2 --base_file data/rand_int8_10D_10K_norm50.0.bin --query_file data/rand_int8_10D_1K_norm50.0.bin --gt_file data/l2_rand_int8_10D_10K_norm50.0_10D_1K_norm50.0_gt100 --K 100

diff --git a/.github/workflows/disk-pq.yml b/.github/workflows/disk-pq.yml
@@ -34,6 +34,11 @@ jobs:
         run: |
           dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1 
           dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
+      - name: build and search disk index (one shot graph build, cosine, no diskPQ) (float)
+        if: success() || failure()
+        run: |
+          dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot -R 16 -L 32 -B 0.00003 -M 1 
+          dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_oneshot --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
       - name: build and search disk index (one shot graph build, L2, no diskPQ) (int8)
         if: success() || failure()
         run: |
@@ -66,6 +71,11 @@ jobs:
         run: |
           dist/bin/build_disk_index --data_type float --dist_fn l2 --data_path data/rand_float_10D_10K_norm1.0.bin --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
           dist/bin/search_disk_index --data_type float --dist_fn l2 --fail_if_recall_below 70 --index_path_prefix data/disk_index_l2_rand_float_10D_10K_norm1.0_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_norm1.0.bin --gt_file data/l2_rand_float_10D_10K_norm1.0_10D_1K_norm1.0_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
+      - name: build and search disk index (sharded graph build, cosine, no diskPQ) (float)
+        if: success() || failure()
+        run: |
+          dist/bin/build_disk_index --data_type float --dist_fn cosine --data_path data/rand_float_10D_10K_unnorm.bin --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006
+          dist/bin/search_disk_index --data_type float --dist_fn cosine --fail_if_recall_below 70 --index_path_prefix data/disk_index_cosine_rand_float_10D_10K_unnorm_diskfull_sharded --result_path /tmp/res --query_file data/rand_float_10D_1K_unnorm.bin --gt_file data/cosine_rand_float_10D_10K_unnorm_10D_1K_unnorm_gt100 --recall_at 5 -L 5 12 -W 2 --num_nodes_to_cache 10 -T 16
       - name: build and search disk index (sharded graph build, L2, no diskPQ) (int8)
         run: |
           dist/bin/build_disk_index --data_type int8 --dist_fn l2 --data_path data/rand_int8_10D_10K_norm50.0.bin --index_path_prefix data/disk_index_l2_rand_int8_10D_10K_norm50.0_diskfull_sharded -R 16 -L 32 -B 0.00003 -M 0.00006

diff --git a/apps/build_disk_index.cpp b/apps/build_disk_index.cpp
@@ -107,6 +107,8 @@ int main(int argc, char **argv)
         metric = diskann::Metric::L2;
     else if (dist_fn == std::string("mips"))
         metric = diskann::Metric::INNER_PRODUCT;
+    else if (dist_fn == std::string("cosine"))
+        metric = diskann::Metric::COSINE;
     else
     {
         std::cout << "Error. Only l2 and mips distance functions are supported" << std::endl;

diff --git a/apps/utils/compute_groundtruth.cpp b/apps/utils/compute_groundtruth.cpp
@@ -499,7 +499,8 @@ int main(int argc, char **argv)
         desc.add_options()("help,h", "Print information on arguments");
 
         desc.add_options()("data_type", po::value<std::string>(&data_type)->required(), "data type <int8/uint8/float>");
-        desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(), "distance function <l2/mips>");
+        desc.add_options()("dist_fn", po::value<std::string>(&dist_fn)->required(),
+                           "distance function <l2/mips/cosine>");
         desc.add_options()("base_file", po::value<std::string>(&base_file)->required(),
                            "File containing the base vectors in binary format");
         desc.add_options()("query_file", po::value<std::string>(&query_file)->required(),

diff --git a/apps/utils/rand_data_gen.cpp b/apps/utils/rand_data_gen.cpp
@@ -11,23 +11,31 @@
 
 namespace po = boost::program_options;
 
-int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, float norm)
+int block_write_float(std::ofstream &writer, size_t ndims, size_t npts, bool normalization, float norm,
+                      float rand_scale)
 {
     auto vec = new float[ndims];
 
     std::random_device rd{};
     std::mt19937 gen{rd()};
     std::normal_distribution<> normal_rand{0, 1};
+    std::uniform_real_distribution<> unif_dis(1.0, rand_scale);
 
     for (size_t i = 0; i < npts; i++)
     {
         float sum = 0;
+        float scale = 1.0f;
+        if (rand_scale > 1.0f)
+            scale = (float)unif_dis(gen);
         for (size_t d = 0; d < ndims; ++d)
-            vec[d] = (float)normal_rand(gen);
-        for (size_t d = 0; d < ndims; ++d)
-            sum += vec[d] * vec[d];
-        for (size_t d = 0; d < ndims; ++d)
-            vec[d] = vec[d] * norm / std::sqrt(sum);
+            vec[d] = scale * (float)normal_rand(gen);
+        if (normalization)
+        {
+            for (size_t d = 0; d < ndims; ++d)
+                sum += vec[d] * vec[d];
+            for (size_t d = 0; d < ndims; ++d)
+                vec[d] = vec[d] * norm / std::sqrt(sum);
+        }
 
         writer.write((char *)vec, ndims * sizeof(float));
     }
@@ -104,8 +112,8 @@ int main(int argc, char **argv)
 {
     std::string data_type, output_file;
     size_t ndims, npts;
-    float norm;
-
+    float norm, rand_scaling;
+    bool normalization = false;
     try
     {
         po::options_description desc{"Arguments"};
@@ -117,7 +125,11 @@ int main(int argc, char **argv)
                            "File name for saving the random vectors");
         desc.add_options()("ndims,D", po::value<uint64_t>(&ndims)->required(), "Dimensoinality of the vector");
         desc.add_options()("npts,N", po::value<uint64_t>(&npts)->required(), "Number of vectors");
-        desc.add_options()("norm", po::value<float>(&norm)->required(), "Norm of the vectors");
+        desc.add_options()("norm", po::value<float>(&norm)->default_value(-1.0f),
+                           "Norm of the vectors (if not specified, vectors are not normalized)");
+        desc.add_options()("rand_scaling", po::value<float>(&rand_scaling)->default_value(1.0f),
+                           "Each vector will be scaled (if not explicitly normalized) by a factor randomly chosen from "
+                           "[1, rand_scale]. Only applicable for floating point data");
         po::variables_map vm;
         po::store(po::parse_command_line(argc, argv, desc), vm);
         if (vm.count("help"))
@@ -139,9 +151,20 @@ int main(int argc, char **argv)
         return -1;
     }
 
-    if (norm <= 0.0)
+    if (norm > 0.0)
+    {
+        normalization = true;
+    }
+
+    if (rand_scaling < 1.0)
+    {
+        std::cout << "We will only scale the vector norms randomly in [1, value], so value must be >= 1." << std::endl;
+        return -1;
+    }
+
+    if ((rand_scaling > 1.0) && (normalization == true))
     {
-        std::cerr << "Error: Norm must be a positive number" << std::endl;
+        std::cout << "Data cannot be normalized and randomly scaled at same time. Use one or the other." << std::endl;
         return -1;
     }
 
@@ -155,6 +178,11 @@ int main(int argc, char **argv)
                       << std::endl;
             return -1;
         }
+        if (rand_scaling > 1.0)
+        {
+            std::cout << "Data scaling only supported for floating point data." << std::endl;
+            return -1;
+        }
     }
 
     try
@@ -177,7 +205,7 @@ int main(int argc, char **argv)
             size_t cblk_size = std::min(npts - i * blk_size, blk_size);
             if (data_type == std::string("float"))
             {
-                ret = block_write_float(writer, ndims, cblk_size, norm);
+                ret = block_write_float(writer, ndims, cblk_size, normalization, norm, rand_scaling);
             }
             else if (data_type == std::string("int8"))
             {

diff --git a/src/disk_utils.cpp b/src/disk_utils.cpp
@@ -1129,11 +1129,12 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
         return -1;
     }
 
-    if (!std::is_same<T, float>::value && compareMetric == diskann::Metric::INNER_PRODUCT)
+    if (!std::is_same<T, float>::value &&
+        (compareMetric == diskann::Metric::INNER_PRODUCT || compareMetric == diskann::Metric::COSINE))
     {
         std::stringstream stream;
-        stream << "DiskANN currently only supports floating point data for Max "
-                  "Inner Product Search. "
+        stream << "Disk-index build currently only supports floating point data for Max "
+                  "Inner Product Search/ cosine similarity. "
                << std::endl;
         throw diskann::ANNException(stream.str(), -1);
     }
@@ -1195,6 +1196,10 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
     std::string disk_pq_pivots_path = index_prefix_path + "_disk.index_pq_pivots.bin";
     // optional, used if disk index must store pq data
     std::string disk_pq_compressed_vectors_path = index_prefix_path + "_disk.index_pq_compressed.bin";
+    std::string prepped_base =
+        index_prefix_path +
+        "_prepped_base.bin"; // temp file for storing pre-processed base file for cosine/ mips metrics
+    bool created_temp_file_for_processed_data = false;
 
     // output a new base file which contains extra dimension with sqrt(1 -
     // ||x||^2/M^2) for every x, M is max norm of all points. Extra space on
@@ -1205,14 +1210,26 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
         std::cout << "Using Inner Product search, so need to pre-process base "
                      "data into temp file. Please ensure there is additional "
                      "(n*(d+1)*4) bytes for storing pre-processed base vectors, "
-                     "apart from the intermin indices and final index."
+                     "apart from the interim indices created by DiskANN and the final index."
                   << std::endl;
-        std::string prepped_base = index_prefix_path + "_prepped_base.bin";
         data_file_to_use = prepped_base;
         float max_norm_of_base = diskann::prepare_base_for_inner_products<T>(base_file, prepped_base);
         std::string norm_file = disk_index_path + "_max_base_norm.bin";
         diskann::save_bin<float>(norm_file, &max_norm_of_base, 1, 1);
         diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for inner product") << std::endl;
+        created_temp_file_for_processed_data = true;
+    }
+    else if (compareMetric == diskann::Metric::COSINE)
+    {
+        Timer timer;
+        std::cout << "Normalizing data for cosine to temporary file, please ensure there is additional "
+                     "(n*d*4) bytes for storing normalized base vectors, "
+                     "apart from the interim indices created by DiskANN and the final index."
+                  << std::endl;
+        data_file_to_use = prepped_base;
+        diskann::normalize_data_file(base_file, prepped_base);
+        diskann::cout << timer.elapsed_seconds_for_step("preprocessing data for cosine") << std::endl;
+        created_temp_file_for_processed_data = true;
     }
 
     uint32_t R = (uint32_t)atoi(param_list[0].c_str());
@@ -1304,7 +1321,7 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
 #if defined(DISKANN_RELEASE_UNUSED_TCMALLOC_MEMORY_AT_CHECKPOINTS) && defined(DISKANN_BUILD)
     MallocExtension::instance()->ReleaseFreeMemory();
 #endif
-
+    // Whether it is cosine or inner product, we still use the L2 metric due to the pre-processing.
     timer.reset();
     diskann::build_merged_vamana_index<T, LabelT>(data_file_to_use.c_str(), diskann::Metric::L2, L, R, p_val,
                                                   indexing_ram_budget, mem_index_path, medoids_path, centroids_path,
@@ -1345,7 +1362,8 @@ int build_disk_index(const char *dataFilePath, const char *indexFilePath, const
         std::remove(augmented_labels_file.c_str());
         std::remove(labels_file_to_use.c_str());
     }
-
+    if (created_temp_file_for_processed_data)
+        std::remove(prepped_base.c_str());
     std::remove(mem_index_path.c_str());
     if (use_disk_pq)
         std::remove(disk_pq_compressed_vectors_path.c_str());

diff --git a/src/pq_flash_index.cpp b/src/pq_flash_index.cpp
@@ -32,14 +32,16 @@ template <typename T, typename LabelT>
 PQFlashIndex<T, LabelT>::PQFlashIndex(std::shared_ptr<AlignedFileReader> &fileReader, diskann::Metric m)
     : reader(fileReader), metric(m), _thread_data(nullptr)
 {
+    diskann::Metric metric_to_invoke = m;
     if (m == diskann::Metric::COSINE || m == diskann::Metric::INNER_PRODUCT)
     {
         if (std::is_floating_point<T>::value)
         {
-            diskann::cout << "Cosine metric chosen for (normalized) float data."
-                             "Changing distance to L2 to boost accuracy."
+            diskann::cout << "Since data is floating point, we assume that it has been appropriately pre-processed "
+                             "(normalization for cosine, and convert-to-l2 by adding extra dimension for MIPS). So we "
+                             "shall invoke an l2 distance function."
                           << std::endl;
-            metric = diskann::Metric::L2;
+            metric_to_invoke = diskann::Metric::L2;
         }
         else
         {
@@ -49,8 +51,8 @@ PQFlashIndex<T, LabelT>::PQFlashIndex(std::shared_ptr<AlignedFileReader> &fileRe
         }
     }
 
-    this->_dist_cmp.reset(diskann::get_distance_function<T>(metric));
-    this->_dist_cmp_float.reset(diskann::get_distance_function<float>(metric));
+    this->_dist_cmp.reset(diskann::get_distance_function<T>(metric_to_invoke));
+    this->_dist_cmp_float.reset(diskann::get_distance_function<float>(metric_to_invoke));
 }
 
 template <typename T, typename LabelT> PQFlashIndex<T, LabelT>::~PQFlashIndex()
@@ -1292,20 +1294,23 @@ void PQFlashIndex<T, LabelT>::cached_beam_search(const T *query1, const uint64_t
     float *query_float = pq_query_scratch->aligned_query_float;
     float *query_rotated = pq_query_scratch->rotated_query;
 
-    // if inner product, we laso normalize the query and set the last coordinate
-    // to 0 (this is the extra coordindate used to convert MIPS to L2 search)
-    if (metric == diskann::Metric::INNER_PRODUCT)
+    // normalization step. for cosine, we simply normalize the query
+    // for mips, we normalize the first d-1 dims, and add a 0 for last dim, since an extra coordinate was used to
+    // convert MIPS to L2 search
+    if (metric == diskann::Metric::INNER_PRODUCT || metric == diskann::Metric::COSINE)
     {
-        for (size_t i = 0; i < this->_data_dim - 1; i++)
+        uint64_t inherent_dim = (metric == diskann::Metric::COSINE) ? this->_data_dim : (uint64_t)(this->_data_dim - 1);
+        for (size_t i = 0; i < inherent_dim; i++)
         {
             aligned_query_T[i] = query1[i];
             query_norm += query1[i] * query1[i];
         }
-        aligned_query_T[this->_data_dim - 1] = 0;
+        if (metric == diskann::Metric::INNER_PRODUCT)
+            aligned_query_T[this->_data_dim - 1] = 0;
 
         query_norm = std::sqrt(query_norm);
 
-        for (size_t i = 0; i < this->_data_dim - 1; i++)
+        for (size_t i = 0; i < inherent_dim; i++)
         {
             aligned_query_T[i] = (T)(aligned_query_T[i] / query_norm);
         }