-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
langchain[minor]: Multi-file loader #5584
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
--- | ||
sidebar_position: 2 | ||
hide_table_of_contents: true | ||
--- | ||
|
||
# Multiple individual files | ||
|
||
This example goes over how to load data from multiple file paths. The second argument is a map of file extensions to loader factories. Each file will be passed to the matching loader, and the resulting documents will be concatenated together. | ||
|
||
Example files: | ||
|
||
```text | ||
src/document_loaders/example_data/example/ | ||
├── example.txt | ||
└── example.csv | ||
|
||
src/document_loaders/example_data/example2/ | ||
├── example.json | ||
└── example.jsonl | ||
``` | ||
|
||
Example code: | ||
|
||
```typescript | ||
import { MultiFileLoader } from "langchain/document_loaders/fs/multi_file"; | ||
import { | ||
JSONLoader, | ||
JSONLinesLoader, | ||
} from "langchain/document_loaders/fs/json"; | ||
import { TextLoader } from "langchain/document_loaders/fs/text"; | ||
import { CSVLoader } from "langchain/document_loaders/fs/csv"; | ||
|
||
const loader = new MultiFileLoader( | ||
[ | ||
"src/document_loaders/example_data/example/example.txt", | ||
"src/document_loaders/example_data/example/example.csv", | ||
"src/document_loaders/example_data/example2/example.json", | ||
"src/document_loaders/example_data/example2/example.jsonl", | ||
], | ||
{ | ||
".json": (path) => new JSONLoader(path, "/texts"), | ||
".jsonl": (path) => new JSONLinesLoader(path, "/html"), | ||
".txt": (path) => new TextLoader(path), | ||
".csv": (path) => new CSVLoader(path, "text"), | ||
} | ||
); | ||
const docs = await loader.load(); | ||
console.log({ docs }); | ||
``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
import type { extname as ExtnameT, resolve as ResolveT } from "node:path"; | ||
import type { stat as StatT } from "node:fs/promises"; | ||
import { Document } from "@langchain/core/documents"; | ||
import { getEnv } from "@langchain/core/utils/env"; | ||
import { BaseDocumentLoader } from "../base.js"; | ||
import { type LoadersMapping, UnknownHandling } from "./directory.js"; | ||
|
||
/** | ||
* A document loader that loads documents from multiple files. It extends the | ||
* `BaseDocumentLoader` class and implements the `load()` method. | ||
* @example | ||
* ```typescript | ||
* | ||
* const multiFileLoader = new MultiFileLoader( | ||
* ["path/to/file1.pdf", "path/to/file2.txt"], | ||
* { | ||
* ".pdf": (path: string) => new PDFLoader(path), | ||
* }, | ||
* ); | ||
* | ||
* const docs = await multiFileLoader.load(); | ||
* console.log({ docs }); | ||
* | ||
* ``` | ||
*/ | ||
export class MultiFileLoader extends BaseDocumentLoader { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This implementation shares a lot in common with |
||
/**
 * Stores the file paths, the loader mapping and the unknown-extension
 * policy, then validates the mapping's keys.
 * @param filePaths Paths of the files to load.
 * @param loaders Mapping from file extension (leading dot included) to a
 * factory that builds a loader for a given path.
 * @param unknown What to do with a file whose extension has no entry in
 * `loaders`. Defaults to `UnknownHandling.Warn`.
 * @throws Error if `loaders` has no keys, or if any key does not start
 * with a dot.
 */
constructor(
  public filePaths: string[],
  public loaders: LoadersMapping,
  public unknown: UnknownHandling = UnknownHandling.Warn
) {
  super();

  const extensions = Object.keys(loaders);
  if (extensions.length === 0) {
    throw new Error("Must provide at least one loader");
  }

  for (const extension of extensions) {
    if (!extension.startsWith(".")) {
      throw new Error(`Extension must start with a dot: ${extension}`);
    }
  }
}
|
||
/**
 * Loads documents from every path in `filePaths`, one path at a time and
 * in the order given.
 *
 * Each path is resolved to an absolute path and stat'ed. Directories are
 * skipped with a console warning. For regular entries, the loader factory
 * registered in `loaders` for the path's extension is invoked and the
 * documents it loads are appended to the result. When no factory is
 * registered, the `unknown` setting decides: `Ignore` skips the file
 * silently, `Warn` logs a warning, and `Error` throws.
 *
 * Errors raised by `stat` (for example, a missing file) or by an
 * individual loader are not caught here and reject the returned promise.
 * @returns A promise that resolves to an array of loaded documents.
 * @throws Error if a file has no matching loader and `unknown` is
 * `UnknownHandling.Error`, or if `unknown` holds an unrecognised value.
 */
public async load(): Promise<Document[]> {
  const { stat, extname, resolve } = await MultiFileLoader.imports();
  const documents: Document[] = [];

  for (const filePath of this.filePaths) {
    const fullPath = resolve(filePath);
    const fileStat = await stat(fullPath);

    if (fileStat.isDirectory()) {
      console.warn(`Ignoring directory: ${fullPath}`);
      continue;
    }

    const loaderFactory = this.loaders[extname(fullPath)];
    if (loaderFactory) {
      const loaded = await loaderFactory(fullPath).load();
      // Append element by element: `push(...loaded)` would pass every
      // document as a separate call argument and can exceed the engine's
      // argument-count limit when a loader returns a very large array.
      for (const document of loaded) {
        documents.push(document);
      }
      continue;
    }

    switch (this.unknown) {
      case UnknownHandling.Ignore:
        break;
      case UnknownHandling.Warn:
        console.warn(`Unknown file type: ${fullPath}`);
        break;
      case UnknownHandling.Error:
        throw new Error(`Unknown file type: ${fullPath}`);
      default:
        throw new Error(`Unknown unknown handling: ${this.unknown}`);
    }
  }

  return documents;
}
|
||
/**
 * Dynamically imports the `node:path` and `node:fs/promises` functions
 * that `load()` relies on.
 *
 * NOTE(review): a reviewer pointed out that this indirection is no longer
 * required and that these modules can be imported at the top of the file
 * as normal (see .github/contributing/INTEGRATIONS.md in the repository).
 * The method is kept so existing callers of `MultiFileLoader.imports()`
 * keep working; the author noted it was copied from the directory loader.
 * @returns A promise that resolves to an object containing the imported
 * `stat`, `extname` and `resolve` functions.
 * @throws Error if either module fails to import. The original error is
 * logged to the console first.
 */
static async imports(): Promise<{
  stat: typeof StatT;
  extname: typeof ExtnameT;
  resolve: typeof ResolveT;
}> {
  try {
    const { extname, resolve } = await import("node:path");
    const { stat } = await import("node:fs/promises");
    return { stat, extname, resolve };
  } catch (e) {
    console.error(e);
    // The message names both modules imported above and no longer carries
    // the unfilled "https://<link to docs>" placeholder.
    throw new Error(
      `Failed to load node:path or node:fs/promises. MultiFileLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'.`
    );
  }
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
import * as url from "node:url"; | ||
import * as path from "node:path"; | ||
import { test, expect } from "@jest/globals"; | ||
import { MultiFileLoader } from "../fs/multi_file.js"; | ||
import { CSVLoader } from "../fs/csv.js"; | ||
import { PDFLoader } from "../fs/pdf.js"; | ||
import { TextLoader } from "../fs/text.js"; | ||
import { JSONLoader } from "../fs/json.js"; | ||
import { UnknownHandling } from "../fs/directory.js"; | ||
|
||
// Loads a mixed set of fixture files (PDF, CSV, JSON, TXT) through
// MultiFileLoader and verifies both the total number of documents and the
// sorted list of `metadata.source` values.
test("Test MultiFileLoader", async () => {
  const baseDirectory = path.resolve(
    path.dirname(url.fileURLToPath(import.meta.url)),
    "./example_data"
  );

  // Absolute path of a fixture inside the example data directory.
  const fixture = (name: string): string => path.resolve(baseDirectory, name);
  // `count` copies of a fixture's absolute path.
  const repeated = (count: number, name: string): string[] =>
    new Array<string>(count).fill(fixture(name));

  const paperPdf = "1706.03762.pdf";
  const resumePdf = "Jacob_Lee_Resume_2023.pdf";
  const starWarsCsv = "Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.csv";
  const starWarsJson =
    "Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.json";
  const complexJson = "complex.json";
  const exampleTxt = "example.txt";
  const separatorCsv = "example_separator.csv";

  const filePaths = [
    paperPdf,
    resumePdf,
    starWarsCsv,
    starWarsJson,
    complexJson,
    exampleTxt,
    separatorCsv,
  ].map(fixture);

  const loader = new MultiFileLoader(
    filePaths,
    {
      // The pipe-separated fixture needs an explicit separator.
      ".csv": (p) =>
        p.includes("separator.csv")
          ? new CSVLoader(p, { column: "html", separator: "|" })
          : new CSVLoader(p, "html"),
      ".pdf": (p) => new PDFLoader(p),
      ".txt": (p) => new TextLoader(p),
      ".json": (p) => new JSONLoader(p),
    },
    UnknownHandling.Ignore
  );

  const docs = await loader.load();
  expect(docs.length).toBe(123);

  // Listed in the order produced by the default string sort applied below.
  const expectedSources = [
    ...repeated(15, paperPdf),
    fixture(resumePdf),
    // CSV
    ...repeated(32, starWarsCsv),
    // JSON
    ...repeated(32, starWarsJson),
    ...repeated(10, complexJson),
    // TXT
    fixture(exampleTxt),
    // CSV
    ...repeated(32, separatorCsv),
  ];

  const actualSources = docs.map((d) => d.metadata.source).sort();
  expect(actualSources).toEqual(expectedSources);
});
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure if these imports should be refactored / moved to some common file instead of being imported from `directory.js`. Open to suggestions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is fine