Skip to content

Commit

Permalink
langchain-community[patch]: unstructured support extractImageBlockTyp…
Browse files Browse the repository at this point in the history
…es option
  • Loading branch information
jeasonnow committed Jul 1, 2024
1 parent 22ec7c8 commit ecb03ef
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
11 changes: 11 additions & 0 deletions libs/langchain-community/src/document_loaders/fs/unstructured.ts
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ export type UnstructuredLoaderOptions = {
combineUnderNChars?: number;
newAfterNChars?: number;
maxCharacters?: number;
extractImageBlockTypes?: string[];
};

export type UnstructuredDirectoryLoaderOptions = UnstructuredLoaderOptions & {
Expand Down Expand Up @@ -178,6 +179,8 @@ export class UnstructuredLoader extends BaseDocumentLoader {

private maxCharacters?: number;

private extractImageBlockTypes?: string[];

constructor(
filepathOrBufferOptions: string | UnstructuredMemoryLoaderOptions,
unstructuredOptions: UnstructuredLoaderOptions | string = {}
Expand Down Expand Up @@ -221,6 +224,7 @@ export class UnstructuredLoader extends BaseDocumentLoader {
this.combineUnderNChars = options.combineUnderNChars;
this.newAfterNChars = options.newAfterNChars;
this.maxCharacters = options.maxCharacters;
this.extractImageBlockTypes = options.extractImageBlockTypes;
}
}

Expand Down Expand Up @@ -288,6 +292,13 @@ export class UnstructuredLoader extends BaseDocumentLoader {
formData.append("max_characters", String(this.maxCharacters));
}

if (this.extractImageBlockTypes !== undefined) {
formData.append(
"extract_image_block_types",
JSON.stringify(this.extractImageBlockTypes)
);
}

const headers = {
"UNSTRUCTURED-API-KEY": this.apiKey ?? "",
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,28 @@ test.skip("Test Unstructured base loader with fast strategy", async () => {

const loader = new UnstructuredLoader(filePath, options);
const docs = await loader.load();

expect(docs.length).toBeGreaterThan(10);
expect(typeof docs[0].pageContent).toBe("string");
});

/**
 * Skipped integration test: loads the bundled example PDF through
 * `UnstructuredLoader` with `extractImageBlockTypes` set, then checks that
 * more than 10 documents come back and that at least one of them carries
 * `metadata.category === "image"`.
 *
 * Reads the API key from `process.env.UNSTRUCTURED_API_KEY`.
 *
 * NOTE(review): both the requested block type and the asserted category use
 * the lowercase literal "image" — confirm this matches the casing the
 * Unstructured service actually accepts and returns before un-skipping.
 */
test.skip("Test Unstructured base loader with extractImageBlockTypes", async () => {
  // Resolve the fixture relative to this test module rather than the cwd.
  const currentDir = path.dirname(url.fileURLToPath(import.meta.url));
  const pdfPath = path.resolve(currentDir, "./example_data/1706.03762.pdf");

  const loaderOptions = {
    apiKey: process.env.UNSTRUCTURED_API_KEY!,
    extractImageBlockTypes: ["image"],
  };

  const loadedDocs = await new UnstructuredLoader(pdfPath, loaderOptions).load();

  expect(loadedDocs.length).toBeGreaterThan(10);

  const hasImageCategory = loadedDocs.some(
    (doc) => doc?.metadata?.category === "image"
  );
  expect(hasImageCategory).toBe(true);
});

test.skip("Test Unstructured directory loader", async () => {
const directoryPath = path.resolve(
path.dirname(url.fileURLToPath(import.meta.url)),
Expand Down

0 comments on commit ecb03ef

Please sign in to comment.