Skip to content

Commit

Permalink
langchain[minor]: Couchbase document loader (#4364)
Browse files Browse the repository at this point in the history
* added couchbase document loader

* fixed loader to use stringify

* add doc file

* updated tests

* update types as per new requirement

* update comments for typedoc

* fix formatting issues and remove print in tests

* Format

---------

Co-authored-by: jacoblee93 <jacoblee93@gmail.com>
  • Loading branch information
lokesh-couchbase and jacoblee93 authored Feb 16, 2024
1 parent 63c13f5 commit f310559
Show file tree
Hide file tree
Showing 8 changed files with 328 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
---
hide_table_of_contents: true
sidebar_class_name: node-only
---

# Couchbase

[Couchbase](http://couchbase.com/) is an award-winning distributed NoSQL cloud database that delivers unmatched versatility, performance, scalability, and financial value for all of your cloud, mobile, AI, and edge computing applications.

This guide shows how to load documents from a Couchbase database.

## Installation

```bash npm2yarn
npm install couchbase
```

## Usage

### Querying for Documents from Couchbase

For more details on connecting to a Couchbase cluster, please check the [Node.js SDK documentation](https://docs.couchbase.com/nodejs-sdk/current/howtos/managing-connections.html#connection-strings).

For help with querying for documents using SQL++ (SQL for JSON), please check the [documentation](https://docs.couchbase.com/server/current/n1ql/n1ql-language-reference/index.html).

```typescript
import { CouchbaseDocumentLoader } from "langchain/document_loaders/web/couchbase";
import { Cluster } from "couchbase";

const connectionString = "couchbase://localhost"; // valid couchbase connection string
const dbUsername = "Administrator"; // valid database user with read access to the bucket being queried
const dbPassword = "Password"; // password for the database user

// query is a valid SQL++ query
const query = `
SELECT h.* FROM \`travel-sample\`.inventory.hotel h
WHERE h.country = 'United States'
LIMIT 1
`;
```

### Connect to Couchbase Cluster

```typescript
const couchbaseClient = await Cluster.connect(connectionString, {
username: dbUsername,
password: dbPassword,
configProfile: "wanDevelopment",
});
```

### Create the Loader

```typescript
const loader = new CouchbaseDocumentLoader(
couchbaseClient, // The connected couchbase cluster client
query // A valid SQL++ query which will return the required data
);
```

### Load Documents

You can fetch the documents by calling the `load` method of the loader. It will return a list with all the documents. If you want to avoid loading everything up front, you can call the `lazyLoad` method instead, which returns an async iterable that yields documents one at a time.

```typescript
// using load method
const docs = await loader.load();
console.log(docs);
```

```typescript
// using lazyLoad
for await (const doc of loader.lazyLoad()) {
console.log(doc);
break; // break based on required condition
}
```

### Specifying Fields with Content and Metadata

The fields that are part of the Document content can be specified using the `pageContentFields` parameter.
The metadata fields for the Document can be specified using the `metadataFields` parameter.

```typescript
const loaderWithSelectedFields = new CouchbaseDocumentLoader(
couchbaseClient,
query,
// pageContentFields
[
"address",
"name",
"city",
"phone",
"country",
"geo",
"description",
"reviews",
],
["id"] // metadataFields
);

const filtered_docs = await loaderWithSelectedFields.load();
console.log(filtered_docs);
```
4 changes: 4 additions & 0 deletions langchain/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,10 @@ document_loaders/web/confluence.cjs
document_loaders/web/confluence.js
document_loaders/web/confluence.d.ts
document_loaders/web/confluence.d.cts
document_loaders/web/couchbase.cjs
document_loaders/web/couchbase.js
document_loaders/web/couchbase.d.ts
document_loaders/web/couchbase.d.cts
document_loaders/web/searchapi.cjs
document_loaders/web/searchapi.js
document_loaders/web/searchapi.d.ts
Expand Down
2 changes: 2 additions & 0 deletions langchain/langchain.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ export const config = {
"document_loaders/web/sitemap": "document_loaders/web/sitemap",
"document_loaders/web/sonix_audio": "document_loaders/web/sonix_audio",
"document_loaders/web/confluence": "document_loaders/web/confluence",
"document_loaders/web/couchbase": "document_loaders/web/couchbase",
"document_loaders/web/searchapi": "document_loaders/web/searchapi",
"document_loaders/web/serpapi": "document_loaders/web/serpapi",
"document_loaders/web/sort_xyz_blockchain":
Expand Down Expand Up @@ -644,6 +645,7 @@ export const config = {
"document_loaders/web/sitemap",
"document_loaders/web/sonix_audio",
"document_loaders/web/confluence",
"document_loaders/web/couchbase",
"document_loaders/web/youtube",
"document_loaders/fs/directory",
"document_loaders/fs/buffer",
Expand Down
18 changes: 18 additions & 0 deletions langchain/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,10 @@
"document_loaders/web/confluence.js",
"document_loaders/web/confluence.d.ts",
"document_loaders/web/confluence.d.cts",
"document_loaders/web/couchbase.cjs",
"document_loaders/web/couchbase.js",
"document_loaders/web/couchbase.d.ts",
"document_loaders/web/couchbase.d.cts",
"document_loaders/web/searchapi.cjs",
"document_loaders/web/searchapi.js",
"document_loaders/web/searchapi.d.ts",
Expand Down Expand Up @@ -1247,6 +1251,7 @@
"cheerio": "^1.0.0-rc.12",
"chromadb": "^1.5.3",
"convex": "^1.3.1",
"couchbase": "^4.2.10",
"d3-dsv": "^2.0.0",
"dotenv": "^16.0.3",
"dpdm": "^3.12.0",
Expand Down Expand Up @@ -1317,6 +1322,7 @@
"cheerio": "^1.0.0-rc.12",
"chromadb": "*",
"convex": "^1.3.1",
"couchbase": "^4.2.10",
"d3-dsv": "^2.0.0",
"epub2": "^3.0.1",
"fast-xml-parser": "^4.2.7",
Expand Down Expand Up @@ -1411,6 +1417,9 @@
"convex": {
"optional": true
},
"couchbase": {
"optional": true
},
"d3-dsv": {
"optional": true
},
Expand Down Expand Up @@ -2899,6 +2908,15 @@
"import": "./document_loaders/web/confluence.js",
"require": "./document_loaders/web/confluence.cjs"
},
"./document_loaders/web/couchbase": {
"types": {
"import": "./document_loaders/web/couchbase.d.ts",
"require": "./document_loaders/web/couchbase.d.cts",
"default": "./document_loaders/web/couchbase.d.ts"
},
"import": "./document_loaders/web/couchbase.js",
"require": "./document_loaders/web/couchbase.cjs"
},
"./document_loaders/web/searchapi": {
"types": {
"import": "./document_loaders/web/searchapi.d.ts",
Expand Down
36 changes: 36 additions & 0 deletions langchain/src/document_loaders/tests/couchbase.int.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import { test, expect } from "@jest/globals";
import { Cluster } from "couchbase";
import { CouchbaseDocumentLoader } from "../web/couchbase.js";

// Integration test: requires a reachable Couchbase cluster with the
// `travel-sample` bucket loaded. Fill in the placeholders below before running.
test("Test Couchbase Cluster connection ", async () => {
  const connectionString = "<enter-valid-couchbase-connection-string>";
  const databaseUsername = "<enter-valid-couchbase-user>";
  const databasePassword = "<enter-valid-couchbase-password>";
  const query = `
SELECT h.* FROM \`travel-sample\`.inventory.hotel h
WHERE h.country = 'United States'
LIMIT 10
`;
  const validPageContentFields = ["country", "name", "description"];
  const validMetadataFields = ["id"];

  const couchbaseClient = await Cluster.connect(connectionString, {
    username: databaseUsername,
    password: databasePassword,
    configProfile: "wanDevelopment",
  });

  try {
    const loader = new CouchbaseDocumentLoader(
      couchbaseClient,
      query,
      validPageContentFields,
      validMetadataFields
    );
    const docs = await loader.load();
    expect(docs.length).toBeGreaterThan(0);

    for (const doc of docs) {
      expect(doc.pageContent).not.toBe(""); // Assuming valid page content fields
      expect(doc.metadata).toHaveProperty("id"); // Assuming metadata has id field
      expect(doc.metadata.id).not.toBe("");
    }
  } finally {
    // Always release the cluster connection, even when an assertion fails,
    // so the test run does not leave open handles behind.
    await couchbaseClient.close();
  }
});
88 changes: 88 additions & 0 deletions langchain/src/document_loaders/web/couchbase.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import { Cluster, QueryResult } from "couchbase";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader, DocumentLoader } from "../base.js";

/**
 * Document loader that runs a SQL++ query against a Couchbase cluster and
 * converts every returned row into a `Document`.
 *
 * Each document's `pageContent` is a newline-separated list of
 * `field: <JSON-encoded value>` entries, and its `metadata` is an object
 * holding the requested metadata fields copied from the row.
 */
export class CouchbaseDocumentLoader
  extends BaseDocumentLoader
  implements DocumentLoader
{
  private readonly cluster: Cluster;

  private readonly query: string;

  private readonly pageContentFields?: string[];

  private readonly metadataFields?: string[];

  /**
   * Creates a loader bound to an already-connected Couchbase cluster.
   *
   * @param client - Connected Couchbase cluster client used to run the query.
   * @param query - SQL++ query whose result rows become documents.
   * @param pageContentFields - Row fields to include in the page content.
   *   When omitted, every key of each row is included.
   * @param metadataFields - Row fields to copy into the document metadata.
   *   When omitted, the metadata is an empty object.
   * @throws Error when `client` is not provided.
   */
  constructor(
    client: Cluster,
    query: string,
    pageContentFields?: string[],
    metadataFields?: string[]
  ) {
    super();
    if (!client) {
      throw new Error("Couchbase client cluster must be provided.");
    }
    this.cluster = client;
    this.query = query;
    this.pageContentFields = pageContentFields;
    this.metadataFields = metadataFields;
  }

  /**
   * Runs the query and collects every resulting document into an array.
   *
   * @returns A promise resolving to all documents produced by the query.
   */
  async load(): Promise<Document[]> {
    const documents: Document[] = [];
    for await (const doc of this.lazyLoad()) {
      documents.push(doc);
    }
    return documents;
  }

  /**
   * Runs the query and yields one document per result row.
   *
   * @returns An async iterable of documents, one for each row of the result.
   */
  async *lazyLoad(): AsyncIterable<Document> {
    // Run SQL++ Query
    const result: QueryResult = await this.cluster.query(this.query);

    // The metadata field list does not depend on the row, so resolve it once.
    const metadataFields = this.metadataFields ?? [];

    // `result.rows` is a plain array, so a synchronous loop is sufficient.
    for (const row of result.rows) {
      // Without an explicit selection, fall back to every key of this row.
      const pageContentFields = this.pageContentFields ?? Object.keys(row);

      // Build the metadata object in a single pass over the field names.
      const metadata = Object.fromEntries(
        metadataFields.map((field) => [field, row[field]])
      );

      // A field missing from the row is rendered as `field: undefined`,
      // because JSON.stringify(undefined) returns undefined.
      const pageContent = pageContentFields
        .map((field) => `${field}: ${JSON.stringify(row[field])}`)
        .join("\n");

      yield new Document({
        pageContent,
        metadata,
      });
    }
  }
}
1 change: 1 addition & 0 deletions langchain/src/load/import_constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ export const optionalImportEntrypoints: string[] = [
"langchain/document_loaders/web/sitemap",
"langchain/document_loaders/web/sonix_audio",
"langchain/document_loaders/web/confluence",
"langchain/document_loaders/web/couchbase",
"langchain/document_loaders/web/youtube",
"langchain/document_loaders/fs/directory",
"langchain/document_loaders/fs/buffer",
Expand Down
Loading

0 comments on commit f310559

Please sign in to comment.