community[patch]: Add pgvector index using HNSW (#5564)

jl4nz · web-flow · commit 43829d6bdcbd · 2024-06-25T10:35:30.000-07:00
* Add pgvector hnsw

* Update hnsw index name to use column name

* Fix import typo

* Fix pgvector index test

* Refactor, set dimensions for hnsw mandatory, fix docs

* Add pg-format, refactor create hnsw index with sql identifiers pg-format

* Revert docs gitignore

* Revert docs gitignore

* Revert pg-format

* Revert gitignore docs
diff --git a/docs/core_docs/docs/integrations/vectorstores/pgvector.mdx b/docs/core_docs/docs/integrations/vectorstores/pgvector.mdx
@@ -74,3 +74,24 @@ before using the constructor.
 import ConnectionReuseExample from "@examples/indexes/vector_stores/pgvector_vectorstore/pgvector_pool.ts";
 
 <CodeBlock language="typescript">{ConnectionReuseExample}</CodeBlock>
+
+### Create HNSW Index
+
+By default, the extension performs an exact nearest neighbor search using a sequential scan, which gives 100% recall. To speed up `similaritySearchVectorWithScore`, you can create an HNSW index, which enables approximate nearest neighbor (ANN) search at the cost of some recall. To create the HNSW index on your vector column, use the `createHnswIndex()` method.
+
+The method parameters include:
+
+**dimensions**: Defines the number of dimensions in your vector data type, up to 2000. For example, use 1536 for OpenAI's `text-embedding-ada-002` and Amazon's `amazon.titan-embed-text-v1` models.
+
+**m?**: The max number of connections per layer (16 by default). Index build time improves with smaller values, while higher values can speed up search queries.
+
+**efConstruction?**: The size of the dynamic candidate list for constructing the graph (64 by default). A higher value can potentially improve the index quality at the cost of index build time.
+
+**distanceFunction?**: The name of the distance function (operator class) to use for the index. It is selected automatically based on the store's `distanceStrategy`.
+
+More info is available at the [`Pgvector GitHub project`](https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw) and in the HNSW paper by Yu. A. Malkov and D. A. Yashunin (2020), [`Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs`](https://arxiv.org/pdf/1603.09320).
+
+import HnswExample from "@examples/indexes/vector_stores/pgvector_vectorstore/pgvector_hnsw.ts";
+
+<CodeBlock language="typescript">{HnswExample}</CodeBlock>
+
diff --git a/examples/src/indexes/vector_stores/pgvector_vectorstore/pgvector_hnsw.ts b/examples/src/indexes/vector_stores/pgvector_vectorstore/pgvector_hnsw.ts
@@ -0,0 +1,54 @@
+import { OpenAIEmbeddings } from "@langchain/openai";
+import {
+  DistanceStrategy,
+  PGVectorStore,
+} from "@langchain/community/vectorstores/pgvector";
+import { PoolConfig } from "pg";
+
+// First, follow set-up instructions at
+// https://js.langchain.com/docs/modules/indexes/vector_stores/integrations/pgvector
+
+// Connection, table and column settings for the vector store.
+const config = {
+  postgresConnectionOptions: {
+    type: "postgres",
+    host: "127.0.0.1",
+    port: 5433,
+    user: "myuser",
+    password: "ChangeMe",
+    database: "api",
+  } as PoolConfig,
+  tableName: "testlangchain",
+  columns: {
+    idColumnName: "id",
+    vectorColumnName: "vector",
+    contentColumnName: "content",
+    metadataColumnName: "metadata",
+  },
+  // supported distance strategies: cosine (default), innerProduct, or euclidean
+  distanceStrategy: "cosine" as DistanceStrategy,
+};
+
+// Use a single embeddings instance for both indexing and querying, so the
+// query vector is always produced by the same model as the stored vectors.
+const embeddings = new OpenAIEmbeddings();
+
+const pgvectorStore = await PGVectorStore.initialize(embeddings, config);
+
+// Create the HNSW index on the vector column. `dimensions` must match the
+// size of the embedding vectors; `m` and `efConstruction` are optional and
+// are shown here with their default values.
+await pgvectorStore.createHnswIndex({
+  dimensions: 1536,
+  efConstruction: 64,
+  m: 16,
+});
+
+await pgvectorStore.addDocuments([
+  { pageContent: "what's this", metadata: { a: 2, b: ["tag1", "tag2"] } },
+  { pageContent: "Cat drinks milk", metadata: { a: 1, b: ["tag2"] } },
+]);
+
+// Embed the query text, then search for the single closest stored vector.
+const query = await embeddings.embedQuery("water");
+const results = await pgvectorStore.similaritySearchVectorWithScore(query, 1);
+
+console.log(results);
+
+// Release the client and close the connection pool.
+await pgvectorStore.end();
diff --git a/libs/langchain-community/src/vectorstores/pgvector.ts b/libs/langchain-community/src/vectorstores/pgvector.ts
@@ -677,4 +677,55 @@ export class PGVectorStore extends VectorStore {
     this.client?.release();
     return this.pool.end();
   }
+
+  /**
+   * Creates an HNSW index on the vector column so that similarity searches
+   * can use approximate nearest neighbor (ANN) search instead of a
+   * sequential scan.
+   *
+   * The statement uses `CREATE INDEX IF NOT EXISTS`, so calling this method
+   * again when the index already exists leaves the existing index untouched.
+   * The index name is derived from the vector column name only
+   * (`<vectorColumnName>_embedding_hnsw_idx`).
+   *
+   * @param config - Index options:
+   *   - `dimensions`: number of dimensions of the vector data type, up to
+   *     2000. For example, use 1536 for OpenAI's text-embedding-ada-002 and
+   *     Amazon's amazon.titan-embed-text-v1 models.
+   *   - `m`: max number of connections per layer (16 by default). Index
+   *     build time improves with smaller values, while higher values can
+   *     speed up search queries.
+   *   - `efConstruction`: size of the dynamic candidate list for
+   *     constructing the graph (64 by default). A higher value can
+   *     potentially improve the index quality at the cost of index build
+   *     time.
+   *   - `distanceFunction`: operator class to index with (for example
+   *     `vector_cosine_ops`). When omitted, it is selected automatically
+   *     based on the store's `distanceStrategy`.
+   * @returns Promise that resolves once the index creation query succeeds.
+   * @throws Error if a numeric option is not a positive integer, if
+   *   `distanceFunction` is not a plain identifier, or if no
+   *   `distanceFunction` is given and the distance strategy is unknown.
+   *   Errors raised by the database query are logged and rethrown.
+   */
+  async createHnswIndex(config: {
+    dimensions: number;
+    m?: number;
+    efConstruction?: number;
+    distanceFunction?: string;
+  }): Promise<void> {
+    const m = config.m ?? 16;
+    const efConstruction = config.efConstruction ?? 64;
+
+    // These values are interpolated directly into the SQL text (DDL cannot
+    // take bind parameters), so make sure they really are positive integers.
+    const numericOptions: Array<[string, number]> = [
+      ["dimensions", config.dimensions],
+      ["m", m],
+      ["efConstruction", efConstruction],
+    ];
+    for (const [name, value] of numericOptions) {
+      if (!Number.isInteger(value) || value <= 0) {
+        throw new Error(
+          `Invalid HNSW index option "${name}": expected a positive integer, got ${value}`
+        );
+      }
+    }
+
+    let idxDistanceFunction: string;
+    if (config.distanceFunction) {
+      // An explicit operator class takes precedence over the strategy-based
+      // default. It is also interpolated into the SQL text, so only accept
+      // a plain identifier.
+      if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(config.distanceFunction)) {
+        throw new Error(
+          `Invalid distance function: ${config.distanceFunction}`
+        );
+      }
+      idxDistanceFunction = config.distanceFunction;
+    } else {
+      switch (this.distanceStrategy) {
+        case "cosine":
+          idxDistanceFunction = "vector_cosine_ops";
+          break;
+        case "innerProduct":
+          idxDistanceFunction = "vector_ip_ops";
+          break;
+        case "euclidean":
+          idxDistanceFunction = "vector_l2_ops";
+          break;
+        default:
+          throw new Error(
+            `Unknown distance strategy: ${this.distanceStrategy}`
+          );
+      }
+    }
+
+    const indexName = `${this.vectorColumnName}_embedding_hnsw_idx`;
+    // The column is cast to a fixed-dimension vector type inside the index
+    // expression.
+    const columnAsVector = `${this.vectorColumnName}::vector(${config.dimensions})`;
+    const createIndexQuery = `CREATE INDEX IF NOT EXISTS ${indexName}
+        ON ${this.computedTableName} USING hnsw ((${columnAsVector}) ${idxDistanceFunction})
+        WITH (
+            m=${m},
+            ef_construction=${efConstruction}
+        );`;
+
+    try {
+      await this.pool.query(createIndexQuery);
+    } catch (e) {
+      console.error(
+        `Failed to create HNSW index on table ${this.computedTableName}, error: ${e}`
+      );
+      // Propagate the failure so callers know the index was not created.
+      throw e;
+    }
+  }
 }
diff --git a/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts b/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts
@@ -2,21 +2,30 @@ import { expect, test } from "@jest/globals";
 import pg, { PoolConfig } from "pg";
 import { OpenAIEmbeddings } from "@langchain/openai";
 import { PGVectorStore, PGVectorStoreArgs } from "../../pgvector.js";
+// import { BedrockEmbeddings } from "../../../embeddings/bedrock.js";
+
+// Embeddings client shared by the test suites below, both to initialize the
+// stores and to embed query strings.
+const embeddingsEngine = new OpenAIEmbeddings();
+
+// Alternative embeddings engine, kept commented out; enabling it also
+// requires the commented-out BedrockEmbeddings import above.
+// const embeddingsEngine = new BedrockEmbeddings({
+//   region: "us-east-1",
+// });
+
+// Connection settings for the local test database, shared by the test suites
+// below.
+const postgresConnectionOptions = {
+  type: "postgres",
+  host: "127.0.0.1",
+  port: 5432,
+  user: "myuser",
+  password: "ChangeMe",
+  database: "api",
+} as PoolConfig;
 
 describe("PGVectorStore", () => {
   let pgvectorVectorStore: PGVectorStore;
   const tableName = "testlangchain";
 
   beforeAll(async () => {
-    const config = {
-      postgresConnectionOptions: {
-        type: "postgres",
-        host: "127.0.0.1",
-        port: 5432,
-        user: "myuser",
-        password: "ChangeMe",
-        database: "api",
-      } as PoolConfig,
+    const config: PGVectorStoreArgs = {
+      postgresConnectionOptions,
       tableName: "testlangchain",
       // collectionTableName: "langchain_pg_collection",
       // collectionName: "langchain",
@@ -29,7 +38,7 @@ describe("PGVectorStore", () => {
     };
 
     pgvectorVectorStore = await PGVectorStore.initialize(
-      new OpenAIEmbeddings(),
+      embeddingsEngine,
       config
     );
   });
@@ -297,14 +306,7 @@ describe("PGVectorStore with collection", () => {
 
   beforeAll(async () => {
     const config = {
-      postgresConnectionOptions: {
-        type: "postgres",
-        host: "127.0.0.1",
-        port: 5432,
-        user: "myuser",
-        password: "ChangeMe",
-        database: "api",
-      } as PoolConfig,
+      postgresConnectionOptions,
       tableName,
       collectionTableName,
       collectionName: "langchain",
@@ -317,7 +319,7 @@ describe("PGVectorStore with collection", () => {
     };
 
     pgvectorVectorStore = await PGVectorStore.initialize(
-      new OpenAIEmbeddings(),
+      embeddingsEngine,
       config
     );
   });
@@ -535,13 +537,7 @@ describe("PGVectorStore with schema", () => {
   let pool: pg.Pool;
 
   beforeAll(async () => {
-    pool = new pg.Pool({
-      host: "127.0.0.1",
-      port: 5432,
-      user: "myuser",
-      password: "ChangeMe",
-      database: "api",
-    });
+    pool = new pg.Pool(postgresConnectionOptions);
 
     const config: PGVectorStoreArgs = {
       pool,
@@ -560,7 +556,7 @@ describe("PGVectorStore with schema", () => {
     await pool.query(`CREATE SCHEMA IF NOT EXISTS ${schema}`);
 
     pgvectorVectorStore = await PGVectorStore.initialize(
-      new OpenAIEmbeddings(),
+      embeddingsEngine,
       config
     );
     computedTableName = pgvectorVectorStore.computedTableName;
@@ -773,3 +769,74 @@ describe("PGVectorStore with schema", () => {
     }
   });
 });
+
+// Integration tests for PGVectorStore.createHnswIndex(): the index must exist
+// on the table, and similarity search must still return the expected match.
+describe("PGVectorStore with HNSW index", () => {
+  let pgvectorVectorStore: PGVectorStore;
+  const tableName = "testlangchain";
+
+  beforeAll(async () => {
+    const config: PGVectorStoreArgs = {
+      postgresConnectionOptions,
+      tableName,
+      columns: {
+        idColumnName: "id",
+        vectorColumnName: "vector",
+        contentColumnName: "content",
+        metadataColumnName: "metadata",
+      },
+      distanceStrategy: "cosine",
+    };
+
+    pgvectorVectorStore = await PGVectorStore.initialize(
+      embeddingsEngine,
+      config
+    );
+
+    // Create the index
+    await pgvectorVectorStore.createHnswIndex({ dimensions: 1536 });
+  });
+
+  afterEach(async () => {
+    // Drop table, then recreate it (and its index) for the next test.
+    await pgvectorVectorStore.pool.query(`DROP TABLE "${tableName}"`);
+    await pgvectorVectorStore.ensureTableInDatabase();
+    await pgvectorVectorStore.createHnswIndex({ dimensions: 1536 });
+  });
+
+  afterAll(async () => {
+    await pgvectorVectorStore.end();
+  });
+
+  test("Ensure table has HNSW index", async () => {
+    const result = await pgvectorVectorStore.pool.query(
+      `SELECT indexname, tablename, indexdef FROM pg_indexes where indexname='vector_embedding_hnsw_idx';`
+    );
+    // Assert on the row count first so a missing index is reported as a
+    // failed expectation rather than a TypeError on `rows[0]`.
+    expect(result.rowCount).toBe(1);
+    expect(result.rows[0].indexdef).toContain("USING hnsw");
+  });
+
+  test("Test embeddings creation", async () => {
+    const documents = [
+      {
+        pageContent: "hello",
+        metadata: { a: 1 },
+      },
+      {
+        pageContent: "Cat drinks milk",
+        metadata: { a: 2 },
+      },
+      { pageContent: "hi", metadata: { a: 1 } },
+    ];
+    await pgvectorVectorStore.addDocuments(documents);
+
+    const query = await embeddingsEngine.embedQuery("milk");
+    const results = await pgvectorVectorStore.similaritySearchVectorWithScore(
+      query,
+      1
+    );
+
+    expect(results).toHaveLength(1);
+    expect(results[0][0].pageContent).toEqual("Cat drinks milk");
+  });
+});