Skip to content

Commit

Permalink
Merged PR 19685: Marianize LSH as operators for mmapping and use in Q…
Browse files Browse the repository at this point in the history
…uicksand

This PR turns the LSH index and search into a set of operators that live in the expression graph. This makes creation etc. thread-safe (one index per graph) and allows GPU versions to be implemented later.

This allows the LSH to be mmapped as a Marian parameter, since we now only need to turn the index into something that can be saved to disk using the existing tensors. This happens in marian_conv or the equivalent function in the Quicksand interface.
  • Loading branch information
emjotde committed Jul 9, 2021
1 parent d6c09b2 commit 35c822e
Show file tree
Hide file tree
Showing 24 changed files with 499 additions and 737 deletions.
119 changes: 0 additions & 119 deletions src/3rd_party/faiss/Index.cpp

This file was deleted.

177 changes: 0 additions & 177 deletions src/3rd_party/faiss/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,6 @@

namespace faiss {

/// Forward declarations; see AuxIndexStructures.h
struct IDSelector;
struct RangeSearchResult;
struct DistanceComputer;

/** Abstract structure for an index, supports adding vectors and searching them.
*
* All vectors provided at add or search time are 32-bit float arrays,
Expand All @@ -53,178 +48,6 @@ struct Index {
using idx_t = int64_t; ///< all indices are this type
using component_t = float; ///< alias for float
using distance_t = float; ///< alias for float

int d; ///< vector dimension
idx_t ntotal; ///< total number of indexed vectors
bool verbose; ///< verbosity flag

/// set if the Index does not require training, or if training is
/// already done
bool is_trained;

/// type of metric this index uses for search
MetricType metric_type;
float metric_arg; ///< argument of the metric type

/// Construct an index for vectors of dimension d using the given metric.
///
/// The index starts empty (ntotal = 0), non-verbose, flagged as trained,
/// and with a metric argument of 0.
explicit Index (idx_t d = 0, MetricType metric = METRIC_L2)
    : d(static_cast<int>(d)),  // the parameter is idx_t, the member is int
      ntotal(0),
      verbose(false),
      is_trained(true),
      metric_type(metric),
      metric_arg(0) {}

/// Virtual destructor (declared here without an inline body).
virtual ~Index ();


/** Perform training on a representative set of vectors.
*
* @param n number of training vectors
* @param x training vectors, size n * d
*/
virtual void train(idx_t n, const float* x);

/** Add n vectors of dimension d to the index.
*
* Vectors are implicitly assigned labels ntotal .. ntotal + n - 1.
* This function slices the input vectors into chunks smaller than
* blocksize_add and calls add_core.
* @param n number of vectors to add
* @param x input matrix, size n * d
*/
virtual void add (idx_t n, const float *x) = 0;

/** Same as add, but stores xids instead of sequential ids.
*
* The default implementation fails with an assertion, as it is
* not supported by all indexes.
*
* @param n number of vectors to add
* @param x input matrix, size n * d
* @param xids if non-null, ids to store for the vectors (size n)
*/
virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids);

/** Query the index with n vectors of dimension d.
*
* Returns at most k vectors per query. If there are not enough results
* for a query, the result array is padded with -1s.
*
* @param n number of query vectors
* @param x input vectors to search, size n * d
* @param k maximum number of results per query
* @param distances output pairwise distances, size n*k
* @param labels output labels of the NNs, size n*k
*/
virtual void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const = 0;

/** Query the index with n vectors of dimension d.
*
* Returns all vectors with distance < radius. Note that many
* indexes do not implement range_search (only the k-NN search
* is mandatory).
*
* @param n number of query vectors
* @param x input vectors to search, size n * d
* @param radius search radius
* @param result result table
*/
virtual void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const;

/** Return the indexes of the k vectors closest to the query x.
*
* This function is identical to search but only returns the labels of
* the neighbors.
* @param n number of query vectors
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
* @param k number of neighbors per query (defaults to 1)
*/
void assign (idx_t n, const float * x, idx_t * labels, idx_t k = 1);

/// Removes all elements from the database.
virtual void reset() = 0;

/** Removes IDs from the index. Not supported by all indexes.
*
* @return the number of elements removed
*/
virtual size_t remove_ids (const IDSelector & sel);

/** Reconstruct a stored vector (or an approximation if lossy coding).
*
* This function may not be defined for some indexes.
* @param key id of the vector to reconstruct
* @param recons reconstructed vector (size d)
*/
virtual void reconstruct (idx_t key, float * recons) const;

/** Reconstruct vectors i0 to i0 + ni - 1.
*
* This function may not be defined for some indexes.
* @param i0 id of the first vector to reconstruct
* @param ni number of vectors to reconstruct
* @param recons reconstructed vectors (size ni * d)
*/
virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const;

/** Similar to search, but also reconstructs the stored vectors (or an
* approximation in the case of lossy coding) for the search results.
*
* If there are not enough results for a query, the resulting arrays
* are padded with -1s.
*
* @param recons reconstructed vectors size (n, k, d)
**/
virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const;

/** Computes a residual vector after indexing encoding.
*
* The residual vector is the difference between a vector and the
* reconstruction that can be decoded from its representation in
* the index. The residual can be used for multi-stage indexing
* methods, like IndexIVF's methods.
*
* @param x input vector, size d
* @param residual output residual vector, size d
* @param key encoded index, as returned by search and assign
*/
virtual void compute_residual (const float * x,
float * residual, idx_t key) const;

/** Computes a residual vector after indexing encoding (batch form).
* Equivalent to calling compute_residual for each vector.
*
* The residual vector is the difference between a vector and the
* reconstruction that can be decoded from its representation in
* the index. The residual can be used for multi-stage indexing
* methods, like IndexIVF's methods.
*
* @param n number of vectors
* @param xs input vectors, size (n x d)
* @param residuals output residual vectors, size (n x d)
* @param keys encoded indexes, one per vector, as returned by search and assign
*/
virtual void compute_residual_n (idx_t n, const float* xs,
float* residuals,
const idx_t* keys) const;

/* The standalone codec interface */

/** Size of the produced codes in bytes. */
virtual size_t sa_code_size () const;

/** Encode a set of vectors.
*
* @param n number of vectors
* @param x input vectors, size n * d
* @param bytes output encoded vectors, size n * sa_code_size()
*/
virtual void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const;

/** Decode a set of vectors.
*
* @param n number of vectors
* @param bytes input encoded vectors, size n * sa_code_size()
* @param x output vectors, size n * d
*/
virtual void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const;


};

}
Expand Down
Loading

0 comments on commit 35c822e

Please sign in to comment.