Merged PR 19685: Marianize LSH as operators for mmapping and use in Quicksand

This PR turns the LSH index and search into a set of operators that live in the expression graph. This makes creation etc. thread-safe (one index per graph) and allows GPU versions to be implemented later. It also allows the LSH to be mmapped as a Marian parameter, since we now only need to turn the index into something that can be saved to disk using the existing tensors. This conversion happens in marian_conv or the equivalent interface function in the Quicksand interface.
marian-nmt · Jul 9, 2021 · 35c822e · 35c822e
1 parent d6c09b2
commit 35c822e
Show file tree

Hide file tree

Showing 24 changed files with 499 additions and 737 deletions.
diff --git a/src/3rd_party/faiss/Index.cpp b/src/3rd_party/faiss/Index.cpp
diff --git a/src/3rd_party/faiss/Index.h b/src/3rd_party/faiss/Index.h
@@ -39,11 +39,6 @@
 
 namespace faiss {
 
-/// Forward declarations see AuxIndexStructures.h
-struct IDSelector;
-struct RangeSearchResult;
-struct DistanceComputer;
-
 /** Abstract structure for an index, supports adding vectors and searching them.
  *
  * All vectors provided at add or search time are 32-bit float arrays,
@@ -53,178 +48,6 @@ struct Index {
     using idx_t = int64_t;  ///< all indices are this type
     using component_t = float;
     using distance_t = float;
-
-    int d;                 ///< vector dimension
-    idx_t ntotal;          ///< total nb of indexed vectors
-    bool verbose;          ///< verbosity level
-
-    /// set if the Index does not require training, or if training is
-    /// done already
-    bool is_trained;
-
-    /// type of metric this index uses for search
-    MetricType metric_type;
-    float metric_arg;     ///< argument of the metric type
-
-    explicit Index (idx_t d = 0, MetricType metric = METRIC_L2):
-                    d((int)d),
-                    ntotal(0),
-                    verbose(false),
-                    is_trained(true),
-                    metric_type (metric),
-                    metric_arg(0) {}
-
-    virtual ~Index ();
-
-
-    /** Perform training on a representative set of vectors
-     *
-     * @param n      nb of training vectors
-     * @param x      training vecors, size n * d
-     */
-    virtual void train(idx_t n, const float* x);
-
-    /** Add n vectors of dimension d to the index.
-     *
-     * Vectors are implicitly assigned labels ntotal .. ntotal + n - 1
-     * This function slices the input vectors in chuncks smaller than
-     * blocksize_add and calls add_core.
-     * @param x      input matrix, size n * d
-     */
-    virtual void add (idx_t n, const float *x) = 0;
-
-    /** Same as add, but stores xids instead of sequential ids.
-     *
-     * The default implementation fails with an assertion, as it is
-     * not supported by all indexes.
-     *
-     * @param xids if non-null, ids to store for the vectors (size n)
-     */
-    virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids);
-
-    /** query n vectors of dimension d to the index.
-     *
-     * return at most k vectors. If there are not enough results for a
-     * query, the result array is padded with -1s.
-     *
-     * @param x           input vectors to search, size n * d
-     * @param labels      output labels of the NNs, size n*k
-     * @param distances   output pairwise distances, size n*k
-     */
-    virtual void search (idx_t n, const float *x, idx_t k,
-                         float *distances, idx_t *labels) const = 0;
-
-    /** query n vectors of dimension d to the index.
-     *
-     * return all vectors with distance < radius. Note that many
-     * indexes do not implement the range_search (only the k-NN search
-     * is mandatory).
-     *
-     * @param x           input vectors to search, size n * d
-     * @param radius      search radius
-     * @param result      result table
-     */
-    virtual void range_search (idx_t n, const float *x, float radius,
-                               RangeSearchResult *result) const;
-
-    /** return the indexes of the k vectors closest to the query x.
-     *
-     * This function is identical as search but only return labels of neighbors.
-     * @param x           input vectors to search, size n * d
-     * @param labels      output labels of the NNs, size n*k
-     */
-    void assign (idx_t n, const float * x, idx_t * labels, idx_t k = 1);
-
-    /// removes all elements from the database.
-    virtual void reset() = 0;
-
-    /** removes IDs from the index. Not supported by all
-     * indexes. Returns the number of elements removed.
-     */
-    virtual size_t remove_ids (const IDSelector & sel);
-
-    /** Reconstruct a stored vector (or an approximation if lossy coding)
-     *
-     * this function may not be defined for some indexes
-     * @param key         id of the vector to reconstruct
-     * @param recons      reconstucted vector (size d)
-     */
-    virtual void reconstruct (idx_t key, float * recons) const;
-
-    /** Reconstruct vectors i0 to i0 + ni - 1
-     *
-     * this function may not be defined for some indexes
-     * @param recons      reconstucted vector (size ni * d)
-     */
-    virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const;
-
-    /** Similar to search, but also reconstructs the stored vectors (or an
-     * approximation in the case of lossy coding) for the search results.
-     *
-     * If there are not enough results for a query, the resulting arrays
-     * is padded with -1s.
-     *
-     * @param recons      reconstructed vectors size (n, k, d)
-     **/
-    virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k,
-                                         float *distances, idx_t *labels,
-                                         float *recons) const;
-
-    /** Computes a residual vector after indexing encoding.
-     *
-     * The residual vector is the difference between a vector and the
-     * reconstruction that can be decoded from its representation in
-     * the index. The residual can be used for multiple-stage indexing
-     * methods, like IndexIVF's methods.
-     *
-     * @param x           input vector, size d
-     * @param residual    output residual vector, size d
-     * @param key         encoded index, as returned by search and assign
-     */
-    virtual void compute_residual (const float * x,
-                                   float * residual, idx_t key) const;
-
-    /** Computes a residual vector after indexing encoding (batch form).
-     * Equivalent to calling compute_residual for each vector.
-     *
-     * The residual vector is the difference between a vector and the
-     * reconstruction that can be decoded from its representation in
-     * the index. The residual can be used for multiple-stage indexing
-     * methods, like IndexIVF's methods.
-     *
-     * @param n           number of vectors
-     * @param xs          input vectors, size (n x d)
-     * @param residuals   output residual vectors, size (n x d)
-     * @param keys        encoded index, as returned by search and assign
-     */
-    virtual void compute_residual_n (idx_t n, const float* xs,
-                                     float* residuals,
-                                     const idx_t* keys) const;
-
-    /* The standalone codec interface */
-
-    /** size of the produced codes in bytes */
-    virtual size_t sa_code_size () const;
-
-    /** encode a set of vectors
-     *
-     * @param n       number of vectors
-     * @param x       input vectors, size n * d
-     * @param bytes   output encoded vectors, size n * sa_code_size()
-     */
-    virtual void sa_encode (idx_t n, const float *x,
-                                  uint8_t *bytes) const;
-
-    /** encode a set of vectors
-     *
-     * @param n       number of vectors
-     * @param bytes   input encoded vectors, size n * sa_code_size()
-     * @param x       output vectors, size n * d
-     */
-    virtual void sa_decode (idx_t n, const uint8_t *bytes,
-                                    float *x) const;
-
-
 };
 
 }