Skip to content

Commit 43829d6

Browse files
authored
community[patch]: Add pgvector index using HNSW (#5564)
* Add pgvector hnsw * Update hnsw index name to use column name * Fix import typo * Fix pgvector index test * Refactor, set dimensions for hnsw mandatory, fix docs * Add pg-format, refactor create hnsw index with sql identifiers pg-format * Revert docs gitignore * Revert docs gitignore * Revert pg-format * Revert gitignore docs
1 parent f6ef32d commit 43829d6

File tree

4 files changed

+220
-27
lines changed

4 files changed

+220
-27
lines changed

docs/core_docs/docs/integrations/vectorstores/pgvector.mdx

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,24 @@ before using the constructor.
7474
import ConnectionReuseExample from "@examples/indexes/vector_stores/pgvector_vectorstore/pgvector_pool.ts";
7575

7676
<CodeBlock language="typescript">{ConnectionReuseExample}</CodeBlock>
77+
78+
### Create HNSW Index
79+
80+
By default, the extension performs a sequential scan, which gives 100% recall. To speed up `similaritySearchVectorWithScore` at the cost of some recall, you can create an HNSW index for approximate nearest neighbor (ANN) search. To create the HNSW index on your vector column, use the `createHnswIndex()` method.
81+
82+
The method parameters include:
83+
84+
**dimensions**: Defines the number of dimensions in your vector data type, up to 2000. For example, use 1536 for OpenAI's `text-embedding-ada-002` and Amazon's `amazon.titan-embed-text-v1` models.
85+
86+
**m?**: The max number of connections per layer (16 by default). Index build time improves with smaller values, while higher values can speed up search queries.
87+
88+
**efConstruction?**: The size of the dynamic candidate list for constructing the graph (64 by default). A higher value can potentially improve the index quality at the cost of index build time.
89+
90+
**distanceFunction?**: The name of the distance function to use. If omitted, it is selected automatically based on the `distanceStrategy`.
91+
92+
More info at the [`Pgvector GitHub project`](https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw) and in the HNSW paper by Yu. A. Malkov and D. A. Yashunin (2020), [`Efficient and robust approximate nearest neighbor search using hierarchical navigable small world graphs`](https://arxiv.org/pdf/1603.09320).
93+
94+
import HnswExample from "@examples/indexes/vector_stores/pgvector_vectorstore/pgvector_hnsw.ts";
95+
96+
<CodeBlock language="typescript">{HnswExample}</CodeBlock>
97+
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import { OpenAIEmbeddings } from "@langchain/openai";
2+
import {
3+
DistanceStrategy,
4+
PGVectorStore,
5+
} from "@langchain/community/vectorstores/pgvector";
6+
import { PoolConfig } from "pg";
7+
8+
// Prerequisite: complete the set-up instructions at
// https://js.langchain.com/docs/modules/indexes/vector_stores/integrations/pgvector

// Connection settings plus the table/column layout the store should use.
const storeConfig = {
  postgresConnectionOptions: {
    type: "postgres",
    host: "127.0.0.1",
    port: 5433,
    user: "myuser",
    password: "ChangeMe",
    database: "api",
  } as PoolConfig,
  tableName: "testlangchain",
  columns: {
    idColumnName: "id",
    vectorColumnName: "vector",
    contentColumnName: "content",
    metadataColumnName: "metadata",
  },
  // supported distance strategies: cosine (default), innerProduct, or euclidean
  distanceStrategy: "cosine" as DistanceStrategy,
};

const vectorStore = await PGVectorStore.initialize(
  new OpenAIEmbeddings(),
  storeConfig
);

// Build the HNSW index on the vector column before inserting any rows.
const hnswOptions = {
  dimensions: 1536,
  efConstruction: 64,
  m: 16,
};
await vectorStore.createHnswIndex(hnswOptions);

// Two sample documents to search over.
const sampleDocuments = [
  { pageContent: "what's this", metadata: { a: 2, b: ["tag1", "tag2"] } },
  { pageContent: "Cat drinks milk", metadata: { a: 1, b: ["tag2"] } },
];
await vectorStore.addDocuments(sampleDocuments);

// Embed the query text, then fetch the single closest document with its score.
const queryEmbeddings = new OpenAIEmbeddings();
const queryVector = await queryEmbeddings.embedQuery("water");
const matches = await vectorStore.similaritySearchVectorWithScore(
  queryVector,
  1
);

console.log(matches);

// Release the underlying connection pool.
await vectorStore.end();

libs/langchain-community/src/vectorstores/pgvector.ts

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,4 +677,55 @@ export class PGVectorStore extends VectorStore {
677677
this.client?.release();
678678
return this.pool.end();
679679
}
680+
681+
/**
 * Creates an HNSW index on the vector column (a no-op if an index with the
 * same name already exists, because the statement uses `IF NOT EXISTS`).
 *
 * The index is named `<vectorColumnName>_embedding_hnsw_idx`.
 *
 * @param config - Index options.
 * @param config.dimensions - Number of dimensions of the stored vectors, up to 2000. For example, use 1536 for OpenAI's text-embedding-ada-002 and Amazon's amazon.titan-embed-text-v1 models.
 * @param config.m - The max number of connections per layer (16 by default). Index build time improves with smaller values, while higher values can speed up search queries.
 * @param config.efConstruction - The size of the dynamic candidate list for constructing the graph (64 by default). A higher value can potentially improve the index quality at the cost of index build time.
 * @param config.distanceFunction - Operator class to index with (e.g. `vector_cosine_ops`). When omitted, it is derived from the store's `distanceStrategy`.
 * @returns Promise that resolves once index creation has been attempted. A failing `CREATE INDEX` query is logged with `console.error` and not rethrown.
 * @throws Error if the store's distance strategy is unknown, if `distanceFunction` is not a plain SQL identifier, or if a numeric option is not a positive integer.
 */
async createHnswIndex(config: {
  dimensions: number;
  m?: number;
  efConstruction?: number;
  distanceFunction?: string;
}): Promise<void> {
  // Operator class implied by the store's distance strategy.
  let strategyDistanceFunction: string;
  switch (this.distanceStrategy) {
    case "cosine":
      strategyDistanceFunction = "vector_cosine_ops";
      break;
    case "innerProduct":
      strategyDistanceFunction = "vector_ip_ops";
      break;
    case "euclidean":
      strategyDistanceFunction = "vector_l2_ops";
      break;
    default:
      throw new Error(`Unknown distance strategy: ${this.distanceStrategy}`);
  }

  // An explicitly requested operator class wins over the strategy default;
  // previously the argument was read and then unconditionally overwritten.
  const idxDistanceFunction =
    config.distanceFunction || strategyDistanceFunction;

  // The operator class is spliced into the DDL text, so only accept a bare
  // SQL identifier.
  if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(idxDistanceFunction)) {
    throw new Error(`Invalid distance function: ${idxDistanceFunction}`);
  }

  // `||` (not `??`) keeps the original behaviour of treating 0 as "unset".
  const m = config.m || 16;
  const efConstruction = config.efConstruction || 64;

  // DDL cannot take bind parameters, so these values are interpolated as
  // text; make sure they really are positive integers at runtime.
  const numericOptions: Array<[string, number]> = [
    ["dimensions", config.dimensions],
    ["m", m],
    ["efConstruction", efConstruction],
  ];
  for (const [optionName, optionValue] of numericOptions) {
    if (!Number.isInteger(optionValue) || optionValue <= 0) {
      throw new Error(
        `Invalid HNSW index option ${optionName}: expected a positive integer, got ${optionValue}`
      );
    }
  }

  const createIndexQuery = `CREATE INDEX IF NOT EXISTS ${this.vectorColumnName}_embedding_hnsw_idx
      ON ${this.computedTableName} USING hnsw ((${this.vectorColumnName}::vector(${config.dimensions})) ${idxDistanceFunction})
      WITH (
          m=${m},
          ef_construction=${efConstruction}
      );`;

  try {
    await this.pool.query(createIndexQuery);
  } catch (e) {
    // Best effort: without the index, searches fall back to a sequential
    // scan, so the failure is reported but not propagated.
    console.error(
      `Failed to create HNSW index on table ${this.computedTableName}, error: ${e}`
    );
  }
}
680731
}

libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts

Lines changed: 94 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,30 @@ import { expect, test } from "@jest/globals";
22
import pg, { PoolConfig } from "pg";
33
import { OpenAIEmbeddings } from "@langchain/openai";
44
import { PGVectorStore, PGVectorStoreArgs } from "../../pgvector.js";
5+
// import { BedrockEmbeddings } from "../../../embeddings/bedrock.js";
6+
7+
const embeddingsEngine = new OpenAIEmbeddings();
8+
9+
// const embeddingsEngine = new BedrockEmbeddings({
10+
// region: "us-east-1",
11+
// });
12+
13+
const postgresConnectionOptions = {
14+
type: "postgres",
15+
host: "127.0.0.1",
16+
port: 5432,
17+
user: "myuser",
18+
password: "ChangeMe",
19+
database: "api",
20+
} as PoolConfig;
521

622
describe("PGVectorStore", () => {
723
let pgvectorVectorStore: PGVectorStore;
824
const tableName = "testlangchain";
925

1026
beforeAll(async () => {
11-
const config = {
12-
postgresConnectionOptions: {
13-
type: "postgres",
14-
host: "127.0.0.1",
15-
port: 5432,
16-
user: "myuser",
17-
password: "ChangeMe",
18-
database: "api",
19-
} as PoolConfig,
27+
const config: PGVectorStoreArgs = {
28+
postgresConnectionOptions,
2029
tableName: "testlangchain",
2130
// collectionTableName: "langchain_pg_collection",
2231
// collectionName: "langchain",
@@ -29,7 +38,7 @@ describe("PGVectorStore", () => {
2938
};
3039

3140
pgvectorVectorStore = await PGVectorStore.initialize(
32-
new OpenAIEmbeddings(),
41+
embeddingsEngine,
3342
config
3443
);
3544
});
@@ -297,14 +306,7 @@ describe("PGVectorStore with collection", () => {
297306

298307
beforeAll(async () => {
299308
const config = {
300-
postgresConnectionOptions: {
301-
type: "postgres",
302-
host: "127.0.0.1",
303-
port: 5432,
304-
user: "myuser",
305-
password: "ChangeMe",
306-
database: "api",
307-
} as PoolConfig,
309+
postgresConnectionOptions,
308310
tableName,
309311
collectionTableName,
310312
collectionName: "langchain",
@@ -317,7 +319,7 @@ describe("PGVectorStore with collection", () => {
317319
};
318320

319321
pgvectorVectorStore = await PGVectorStore.initialize(
320-
new OpenAIEmbeddings(),
322+
embeddingsEngine,
321323
config
322324
);
323325
});
@@ -535,13 +537,7 @@ describe("PGVectorStore with schema", () => {
535537
let pool: pg.Pool;
536538

537539
beforeAll(async () => {
538-
pool = new pg.Pool({
539-
host: "127.0.0.1",
540-
port: 5432,
541-
user: "myuser",
542-
password: "ChangeMe",
543-
database: "api",
544-
});
540+
pool = new pg.Pool(postgresConnectionOptions);
545541

546542
const config: PGVectorStoreArgs = {
547543
pool,
@@ -560,7 +556,7 @@ describe("PGVectorStore with schema", () => {
560556
await pool.query(`CREATE SCHEMA IF NOT EXISTS ${schema}`);
561557

562558
pgvectorVectorStore = await PGVectorStore.initialize(
563-
new OpenAIEmbeddings(),
559+
embeddingsEngine,
564560
config
565561
);
566562
computedTableName = pgvectorVectorStore.computedTableName;
@@ -773,3 +769,74 @@ describe("PGVectorStore with schema", () => {
773769
}
774770
});
775771
});
772+
773+
// Integration tests for PGVectorStore.createHnswIndex: they check that the
// index exists and that similarity search still returns the expected document
// when it is present. Requires a reachable Postgres instance with pgvector.
describe("PGVectorStore with HNSW index", () => {
  let pgvectorVectorStore: PGVectorStore;
  const tableName = "testlangchain";
  // Dimension count passed to createHnswIndex throughout this suite.
  const dimensions = 1536;

  beforeAll(async () => {
    const config: PGVectorStoreArgs = {
      postgresConnectionOptions,
      tableName,
      columns: {
        idColumnName: "id",
        vectorColumnName: "vector",
        contentColumnName: "content",
        metadataColumnName: "metadata",
      },
      distanceStrategy: "cosine",
    };

    pgvectorVectorStore = await PGVectorStore.initialize(
      embeddingsEngine,
      config
    );

    // Create the index
    await pgvectorVectorStore.createHnswIndex({ dimensions });
  });

  afterEach(async () => {
    // Drop table, then recreate it (and its index) for the next test.
    await pgvectorVectorStore.pool.query(`DROP TABLE "${tableName}"`);
    await pgvectorVectorStore.ensureTableInDatabase();
    await pgvectorVectorStore.createHnswIndex({ dimensions });
  });

  afterAll(async () => {
    await pgvectorVectorStore.end();
  });

  test("Ensure table has HNSW index", async () => {
    const result = await pgvectorVectorStore.pool.query(
      `SELECT indexname, tablename, indexdef FROM pg_indexes where indexname='vector_embedding_hnsw_idx';`
    );
    // Assert the row count first so a missing index fails with a clear
    // expectation error rather than a TypeError from destructuring undefined.
    expect(result.rowCount).toBe(1);
    const { indexdef } = result.rows[0];
    expect(indexdef).toContain("USING hnsw");
  });

  test("Test embeddings creation", async () => {
    const documents = [
      {
        pageContent: "hello",
        metadata: { a: 1 },
      },
      {
        pageContent: "Cat drinks milk",
        metadata: { a: 2 },
      },
      { pageContent: "hi", metadata: { a: 1 } },
    ];
    await pgvectorVectorStore.addDocuments(documents);

    const query = await embeddingsEngine.embedQuery("milk");
    const results = await pgvectorVectorStore.similaritySearchVectorWithScore(
      query,
      1
    );

    expect(results).toHaveLength(1);
    expect(results[0][0].pageContent).toEqual("Cat drinks milk");
  });
});

0 commit comments

Comments
 (0)