Skip to content

Commit

Permalink
langchain[minor]: Multi-file loader (#5584)
Browse files Browse the repository at this point in the history
* Multi-file loader

* Update imports, add entrypoint, format

---------

Co-authored-by: jacoblee93 <jacoblee93@gmail.com>
  • Loading branch information
theogravity and jacoblee93 authored Jun 25, 2024
1 parent 5984a6d commit 3f07d61
Show file tree
Hide file tree
Showing 7 changed files with 240 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
---
sidebar_position: 2
hide_table_of_contents: true
---

# Multiple individual files

This example goes over how to load data from an explicit list of file paths. The first argument is an array of file paths, and the second argument is a map of file extensions to loader factories. Each file is passed to the loader that matches its extension, and the resulting documents are concatenated together in the order the paths were given.

Example files:

```text
src/document_loaders/example_data/example/
├── example.txt
└── example.csv
src/document_loaders/example_data/example2/
├── example.json
└── example.jsonl
```

Example code:

```typescript
import { MultiFileLoader } from "langchain/document_loaders/fs/multi_file";
import {
JSONLoader,
JSONLinesLoader,
} from "langchain/document_loaders/fs/json";
import { TextLoader } from "langchain/document_loaders/fs/text";
import { CSVLoader } from "langchain/document_loaders/fs/csv";

// Explicit list of files to load; they may live in different directories.
const filePaths = [
  "src/document_loaders/example_data/example/example.txt",
  "src/document_loaders/example_data/example/example.csv",
  "src/document_loaders/example_data/example2/example.json",
  "src/document_loaders/example_data/example2/example.jsonl",
];

// Each key is a file extension (including the leading dot); each value is a
// factory that builds the loader for a file with that extension.
const loader = new MultiFileLoader(filePaths, {
  ".json": (filePath) => new JSONLoader(filePath, "/texts"),
  ".jsonl": (filePath) => new JSONLinesLoader(filePath, "/html"),
  ".txt": (filePath) => new TextLoader(filePath),
  ".csv": (filePath) => new CSVLoader(filePath, "text"),
});

const docs = await loader.load();
console.log({ docs });
```
4 changes: 4 additions & 0 deletions langchain/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,10 @@ document_loaders/fs/directory.cjs
document_loaders/fs/directory.js
document_loaders/fs/directory.d.ts
document_loaders/fs/directory.d.cts
document_loaders/fs/multi_file.cjs
document_loaders/fs/multi_file.js
document_loaders/fs/multi_file.d.ts
document_loaders/fs/multi_file.d.cts
document_loaders/fs/buffer.cjs
document_loaders/fs/buffer.js
document_loaders/fs/buffer.d.ts
Expand Down
2 changes: 2 additions & 0 deletions langchain/langchain.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ export const config = {
"document_loaders/web/sort_xyz_blockchain",
"document_loaders/web/youtube": "document_loaders/web/youtube",
"document_loaders/fs/directory": "document_loaders/fs/directory",
"document_loaders/fs/multi_file": "document_loaders/fs/multi_file",
"document_loaders/fs/buffer": "document_loaders/fs/buffer",
"document_loaders/fs/chatgpt": "document_loaders/fs/chatgpt",
"document_loaders/fs/text": "document_loaders/fs/text",
Expand Down Expand Up @@ -254,6 +255,7 @@ export const config = {
"document_loaders/web/couchbase",
"document_loaders/web/youtube",
"document_loaders/fs/directory",
"document_loaders/fs/multi_file",
"document_loaders/fs/buffer",
"document_loaders/fs/chatgpt",
"document_loaders/fs/text",
Expand Down
13 changes: 13 additions & 0 deletions langchain/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,10 @@
"document_loaders/fs/directory.js",
"document_loaders/fs/directory.d.ts",
"document_loaders/fs/directory.d.cts",
"document_loaders/fs/multi_file.cjs",
"document_loaders/fs/multi_file.js",
"document_loaders/fs/multi_file.d.ts",
"document_loaders/fs/multi_file.d.cts",
"document_loaders/fs/buffer.cjs",
"document_loaders/fs/buffer.js",
"document_loaders/fs/buffer.d.ts",
Expand Down Expand Up @@ -1540,6 +1544,15 @@
"import": "./document_loaders/fs/directory.js",
"require": "./document_loaders/fs/directory.cjs"
},
"./document_loaders/fs/multi_file": {
"types": {
"import": "./document_loaders/fs/multi_file.d.ts",
"require": "./document_loaders/fs/multi_file.d.cts",
"default": "./document_loaders/fs/multi_file.d.ts"
},
"import": "./document_loaders/fs/multi_file.js",
"require": "./document_loaders/fs/multi_file.cjs"
},
"./document_loaders/fs/buffer": {
"types": {
"import": "./document_loaders/fs/buffer.d.ts",
Expand Down
87 changes: 87 additions & 0 deletions langchain/src/document_loaders/fs/multi_file.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import { extname, resolve } from "node:path";
import { stat } from "node:fs/promises";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "../base.js";
import { type LoadersMapping, UnknownHandling } from "./directory.js";

/**
 * Document loader that reads an explicit list of files and delegates each one
 * to a loader chosen by the file's extension. The documents produced by every
 * per-file loader are returned as a single flat array.
 * @example
 * ```typescript
 *
 * const multiFileLoader = new MultiFileLoader(
 *   ["path/to/file1.pdf", "path/to/file2.txt"],
 *   {
 *     ".pdf": (path: string) => new PDFLoader(path),
 *   },
 * );
 *
 * // No ".txt" loader is registered above, so file2.txt is handled according
 * // to the `unknown` setting (a console warning by default).
 * const docs = await multiFileLoader.load();
 * console.log({ docs });
 *
 * ```
 */
export class MultiFileLoader extends BaseDocumentLoader {
  /**
   * @param filePaths Paths of the files to load, processed in the given order.
   * @param loaders Mapping from file extension (with its leading dot, e.g.
   * `".txt"`) to a factory that creates the loader for a given file path.
   * @param unknown What to do with a file whose extension has no entry in
   * `loaders`. Defaults to `UnknownHandling.Warn`.
   * @throws Error if `loaders` has no entries, or if any of its keys does not
   * start with a dot.
   */
  constructor(
    public filePaths: string[],
    public loaders: LoadersMapping,
    public unknown: UnknownHandling = UnknownHandling.Warn
  ) {
    super();

    const extensions = Object.keys(loaders);
    if (extensions.length === 0) {
      throw new Error("Must provide at least one loader");
    }

    // Keys are compared against the result of `extname`, which includes the
    // leading dot, so a key without one could never match.
    for (const extension of extensions) {
      if (!extension.startsWith(".")) {
        throw new Error(`Extension must start with a dot: ${extension}`);
      }
    }
  }

  /**
   * Loads every configured file in order and concatenates the results.
   *
   * Each path is resolved to an absolute path and inspected with `stat`;
   * errors from `stat` are not caught here and reject the returned promise.
   * Directories are skipped with a console warning. For regular entries the
   * loader factory registered for the file's extension is invoked and its
   * documents are appended to the result. When no factory is registered, the
   * `unknown` setting decides the outcome: `Ignore` skips the file silently,
   * `Warn` logs a console warning and skips it, and `Error` throws.
   * @returns A promise that resolves to an array of loaded documents.
   * @throws Error if a file has no matching loader and `unknown` is `Error`,
   * or if `unknown` holds an unrecognised value.
   */
  public async load(): Promise<Document[]> {
    const collected: Document[] = [];

    for (const candidate of this.filePaths) {
      const absolutePath = resolve(candidate);
      const info = await stat(absolutePath);

      if (info.isDirectory()) {
        console.warn(`Ignoring directory: ${absolutePath}`);
        continue;
      }

      const createLoader = this.loaders[extname(absolutePath)];
      if (createLoader) {
        const loaded = await createLoader(absolutePath).load();
        collected.push(...loaded);
        continue;
      }

      // No loader registered for this extension: apply the `unknown` policy.
      if (this.unknown === UnknownHandling.Ignore) {
        continue;
      }
      if (this.unknown === UnknownHandling.Warn) {
        console.warn(`Unknown file type: ${absolutePath}`);
        continue;
      }
      if (this.unknown === UnknownHandling.Error) {
        throw new Error(`Unknown file type: ${absolutePath}`);
      }
      throw new Error(`Unknown unknown handling: ${this.unknown}`);
    }

    return collected;
  }
}
84 changes: 84 additions & 0 deletions langchain/src/document_loaders/tests/multi_file.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import * as url from "node:url";
import * as path from "node:path";
import { test, expect } from "@jest/globals";
import { MultiFileLoader } from "../fs/multi_file.js";
import { CSVLoader } from "../fs/csv.js";
import { PDFLoader } from "../fs/pdf.js";
import { TextLoader } from "../fs/text.js";
import { JSONLoader } from "../fs/json.js";
import { UnknownHandling } from "../fs/directory.js";

// Loads a mix of PDF, CSV, JSON and text fixtures through MultiFileLoader and
// checks both the total document count and the source recorded on each one.
test("Test MultiFileLoader", async () => {
  const exampleDataDir = path.resolve(
    path.dirname(url.fileURLToPath(import.meta.url)),
    "./example_data"
  );

  // Absolute path of a fixture inside the example data directory.
  const fixture = (fileName: string): string =>
    path.resolve(exampleDataDir, fileName);
  // An array holding `times` copies of `value`.
  const repeat = (value: string, times: number): string[] =>
    new Array<string>(times).fill(value);

  const firstPdf = fixture("1706.03762.pdf");
  const secondPdf = fixture("Jacob_Lee_Resume_2023.pdf");
  const episodeCsv = fixture(
    "Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.csv"
  );
  const episodeJson = fixture(
    "Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.json"
  );
  const complexJson = fixture("complex.json");
  const exampleTxt = fixture("example.txt");
  const separatorCsv = fixture("example_separator.csv");

  const loader = new MultiFileLoader(
    [
      firstPdf,
      secondPdf,
      episodeCsv,
      episodeJson,
      complexJson,
      exampleTxt,
      separatorCsv,
    ],
    {
      // The pipe-separated fixture needs an explicit separator option.
      ".csv": (p) =>
        p.includes("separator.csv")
          ? new CSVLoader(p, { column: "html", separator: "|" })
          : new CSVLoader(p, "html"),
      ".pdf": (p) => new PDFLoader(p),
      ".txt": (p) => new TextLoader(p),
      ".json": (p) => new JSONLoader(p),
    },
    UnknownHandling.Ignore
  );

  const docs = await loader.load();
  expect(docs.length).toBe(123);

  // Listed in the order produced by the default sort() applied to the actual
  // sources below, so the two arrays can be compared directly.
  const expectedSources = [
    // PDF
    ...repeat(firstPdf, 15),
    secondPdf,
    // CSV
    ...repeat(episodeCsv, 32),
    // JSON
    ...repeat(episodeJson, 32),
    ...repeat(complexJson, 10),
    // TXT
    exampleTxt,
    // CSV
    ...repeat(separatorCsv, 32),
  ];

  const actualSources = docs.map((d) => d.metadata.source).sort();
  expect(actualSources).toEqual(expectedSources);
});
1 change: 1 addition & 0 deletions langchain/src/load/import_constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ export const optionalImportEntrypoints: string[] = [
"langchain/document_loaders/web/couchbase",
"langchain/document_loaders/web/youtube",
"langchain/document_loaders/fs/directory",
"langchain/document_loaders/fs/multi_file",
"langchain/document_loaders/fs/buffer",
"langchain/document_loaders/fs/chatgpt",
"langchain/document_loaders/fs/text",
Expand Down

0 comments on commit 3f07d61

Please sign in to comment.