20 commits
4361527
added first version
tmccaughey Nov 17, 2025
11c958e
set limit to 500 and fixed sync
tmccaughey Nov 25, 2025
ea67374
Fixed chunk size
tmccaughey Dec 11, 2025
0dae689
Update extensions/aws/src/module.ts
tmccaughey Jan 9, 2026
f9ddef4
Update extensions/aws/src/knowledge-connectors/helpers/text_extractor.ts
tmccaughey Jan 9, 2026
7711ecd
Update extensions/aws/src/knowledge-connectors/helpers/text_chunker.ts
tmccaughey Jan 9, 2026
99d76b5
Update extensions/aws/src/knowledge-connectors/helpers/text_chunker.ts
tmccaughey Jan 9, 2026
84940b1
Update extensions/aws/src/knowledge-connectors/helpers/list_files.ts
tmccaughey Jan 9, 2026
cad3a5f
deleted creds.env as not needed
tmccaughey Jan 9, 2026
75b65f7
removed logs, adjusted processing to match manual upload functionalit…
tmccaughey Jan 9, 2026
4da3e66
Update extensions/aws/src/knowledge-connectors/helpers/list_files.ts
tmccaughey Jan 12, 2026
1609e03
Update extensions/aws/src/knowledge-connectors/s3Connector.ts
tmccaughey Jan 12, 2026
e966b24
Update extensions/aws/src/knowledge-connectors/helpers/utils/config.ts
tmccaughey Jan 12, 2026
4c59dd8
implemented feedback: - added helpers into main file - renamed file, …
tmccaughey Jan 12, 2026
184bc62
Merge branch 'feature/knowledge-connector-s3' of github.com:Cognigy/E…
tmccaughey Jan 12, 2026
e23fe89
removed default parameter as never used
tmccaughey Jan 12, 2026
2a8a18b
Update extensions/aws/src/knowledge-connectors/helpers/text_chunker.ts
tmccaughey Jan 12, 2026
e11eaef
Removed logs
tmccaughey Jan 12, 2026
58c8307
removed logs
tmccaughey Jan 12, 2026
56828f3
cleaned up files
tmccaughey Jan 12, 2026
1 change: 1 addition & 0 deletions extensions/aws/.npmrc
@@ -0,0 +1 @@
legacy-peer-deps=true
5,233 changes: 4,940 additions & 293 deletions extensions/aws/package-lock.json

Large diffs are not rendered by default.

22 changes: 16 additions & 6 deletions extensions/aws/package.json
@@ -5,7 +5,7 @@
"main": "build/module.js",
"scripts": {
"transpile": "tsc -p .",
"zip": "tar cfz aws.tar.gz build/* package.json package-lock.json README.md icon.png",
"zip": "tar cfz aws.tar.gz build/* package.json package-lock.json .npmrc README.md icon.png",
"build": "npm run transpile && npm run lint && npm run zip",
"lint": "tslint -c tslint.json src/**/*.ts"
},
@@ -16,12 +16,22 @@
"author": "Cognigy GmbH",
"license": "MIT",
"dependencies": {
"@cognigy/extension-tools": "^0.13.0",
"aws-sdk": "^2.738.0",
"tslint": "^6.1.2"
"@aws-sdk/client-s3": "^3.932.0",
"@cognigy/extension-tools": "0.17.0-rc1",
"@langchain/community": "0.3.57",
"@langchain/core": "0.3.79",
"@langchain/textsplitters": "0.1.0",
"aws-sdk": "2.738.0",
"csv-parse": "6.1.0",
"langchain": "0.3.36",
"mammoth": "1.11.0",
"officeparser": "4.1.1",
"pdf-parse": "1.1.1",
"srt-parser-2": "^1.2.3",
"tslint": "6.1.2"
},
"devDependencies": {
"@types/node": "^13.13.15",
"typescript": "^3.8.3"
"@types/node": "^24.10.1",
"typescript": "^5.9.3"
}
}
90 changes: 90 additions & 0 deletions extensions/aws/src/knowledge-connectors/helpers/chunk_extractor.ts
@@ -0,0 +1,90 @@
import type { IKnowledge } from "@cognigy/extension-tools";
import { S3Client, GetObjectCommand } from "@aws-sdk/client-s3";
import { lsExtractor } from "./text_extractor";
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

export type ChunkContent = Pick<
IKnowledge.CreateKnowledgeChunkParams,
"text" | "data"
>;

export type S3Connection = {
accessKeyId: string;
secretAccessKey: string;
region: string;
};

const MAX_CHUNKS_PER_FILE = 500; // Limit chunks per file to avoid timeout issues

// Download file from S3 and extract chunks
export const getS3FileChunks = async (
connection: S3Connection,
bucketName: string,
fileKey: string
): Promise<ChunkContent[]> => {
const s3Client = new S3Client({
region: connection.region,
credentials: {
accessKeyId: connection.accessKeyId,
secretAccessKey: connection.secretAccessKey,
},
});

// Download file from S3
const command = new GetObjectCommand({
Bucket: bucketName,
Key: fileKey,
});

const response = await s3Client.send(command);
const bodyContents = await streamToBuffer(response.Body as any);

// Save to temp file (text_extractor needs file path)
const tempDir = os.tmpdir();
const tempFileName = `${Date.now()}_${path.basename(fileKey)}`;
const tempFilePath = path.join(tempDir, tempFileName);

fs.writeFileSync(tempFilePath, bodyContents);

try {
// Extract text using lsExtractor
const fileExtension = path.extname(fileKey).slice(1); // Remove the dot

const extractedText = await lsExtractor(fileExtension, tempFilePath);

// The lsExtractor returns text that's already been chunked and joined with \n\n
// Split by \n\n to get the individual chunks back
const chunks: ChunkContent[] = extractedText
.split('\n\n')
.filter(chunk => chunk.trim().length > 0)
.slice(0, MAX_CHUNKS_PER_FILE)
.map((chunk, index) => ({
text: chunk.trim(),
data: {
title: `${fileKey} - Part ${index + 1}`,
source: fileKey,
fileType: fileExtension,
},
}));

return chunks;

} finally {
// Clean up temp file
if (fs.existsSync(tempFilePath)) {
fs.unlinkSync(tempFilePath);
}
}
};

// Helper to convert stream to buffer
async function streamToBuffer(stream: any): Promise<Buffer> {
const chunks: Uint8Array[] = [];
for await (const chunk of stream) {
chunks.push(chunk);
}
return Buffer.concat(chunks);
}
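
Reviewer note: a minimal usage sketch for getS3FileChunks; the credentials, bucket name, and object key below are placeholders, not values from this PR.

import { getS3FileChunks } from "./chunk_extractor";

const run = async () => {
  const chunks = await getS3FileChunks(
    {
      accessKeyId: "AKIA...",            // hypothetical credentials
      secretAccessKey: "example-secret",
      region: "eu-central-1",
    },
    "example-knowledge-bucket",          // hypothetical bucket
    "docs/handbook.pdf"                  // hypothetical object key
  );
  console.log(`Extracted ${chunks.length} chunks (capped at 500 per file)`);
};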
60 changes: 60 additions & 0 deletions extensions/aws/src/knowledge-connectors/helpers/list_files.ts
@@ -0,0 +1,60 @@
import { S3Client, ListObjectsV2Command } from "@aws-sdk/client-s3";
interface S3Object {
Key: string;
Size: number;
LastModified: Date;
}

interface S3Connection {
accessKeyId: string;
secretAccessKey: string;
region: string;
}

export async function getS3Object(
connection: S3Connection,
bucketName: string,
prefix?: string,
): Promise<S3Object[]> {
const s3Client = new S3Client({
region: connection.region,
credentials: {
accessKeyId: connection.accessKeyId,
secretAccessKey: connection.secretAccessKey,
},
});

try {
const command = new ListObjectsV2Command({
Bucket: bucketName,
MaxKeys: 1000,
Prefix: prefix,
});

const response = await s3Client.send(command);

if (!response.Contents) {
return [];
}

// Filter out empty files and folders
const s3Objects: S3Object[] = response.Contents
.filter(obj => obj.Key && obj.Size && obj.Size > 0)
.map(obj => ({
Key: obj.Key!,
Size: obj.Size!,
LastModified: obj.LastModified!
}));

return s3Objects;

} catch (error) {
console.error("Error listing objects from S3:", error);
throw error;
}
}
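
Reviewer note: a small sketch of how getS3Object could be used to enumerate bucket contents before extraction; the connection values, bucket, and prefix are hypothetical.

import { getS3Object } from "./list_files";

const listPdfs = async () => {
  const objects = await getS3Object(
    { accessKeyId: "AKIA...", secretAccessKey: "example-secret", region: "eu-central-1" },
    "example-knowledge-bucket",
    "manuals/"                           // optional prefix filter
  );
  return objects.filter((obj) => obj.Key.toLowerCase().endsWith(".pdf"));
};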
31 changes: 31 additions & 0 deletions extensions/aws/src/knowledge-connectors/helpers/text_chunker.ts
@@ -0,0 +1,31 @@
import * as splitters from "@langchain/textsplitters";

import { getMaxChunkSize, langchainDefaultChunkSizeInChars } from "./utils/config";

export async function splitDocs(documents: any): Promise<any[]> {
const splitter = getRecursiveCharacterTextSplitter();
const splitParagraphs = await splitter.splitDocuments(documents);
return splitParagraphs;
}

const getChunkSizeInChars = () => {
// Langchain has issues and creates chunks larger than the limit set.
// Therefore a margin is added to chunk size
const margin = 400;
const chunkMaxSize = Math.min(langchainDefaultChunkSizeInChars(), getMaxChunkSize()) - margin;
const chunkSize = chunkMaxSize > 0 ? chunkMaxSize : 1800;
return chunkSize;
};

const getRecursiveCharacterTextSplitter = () => {
const chunkSize = getChunkSizeInChars();
const chunkOverlap = 0;
const splitter = new splitters.RecursiveCharacterTextSplitter({
chunkSize,
chunkOverlap,
keepSeparator: false
});
return splitter;
};
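
Reviewer note: with neither MAX_CHUNK_SIZE nor LANGCHAIN_DEFAULT_CHUNK_SIZE_IN_CHARS set, both limits default to 2000, so the effective chunk size is min(2000, 2000) - 400 = 1600 characters. A quick sketch of splitDocs on an in-memory document (the sample text is a placeholder):

import { Document } from "@langchain/core/documents";
import { splitDocs } from "./text_chunker";

const demo = async () => {
  const longText = "Lorem ipsum ".repeat(500);          // roughly 6000 characters of filler
  const docs = [new Document({ pageContent: longText })];
  const pieces = await splitDocs(docs);
  // each piece.pageContent should stay at or below roughly 1600 characters
  console.log(pieces.map((p) => p.pageContent.length));
};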
130 changes: 130 additions & 0 deletions extensions/aws/src/knowledge-connectors/helpers/text_extractor.ts
@@ -0,0 +1,130 @@
import { TextLoader } from 'langchain/document_loaders/fs/text';
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx';
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv';
import { JSONLoader, JSONLinesLoader } from 'langchain/document_loaders/fs/json';
import { Document } from '@langchain/core/documents';

import { splitDocs } from './text_chunker';
import { BufferLoader } from 'langchain/document_loaders/fs/buffer';
import { parseOfficeAsync } from 'officeparser';

export const logger = {
log: (level: string, context: any, message: string) => {
const timestamp = new Date().toISOString();
console.log(`[${timestamp}] [${level.toUpperCase()}] ${message}`);
if (context && Object.keys(context).length > 0) {
console.log('Context:', JSON.stringify(context, null, 2));
}
}
};

export const removeUnnecessaryChars = (text: string): string => {
if (!text) return "";

return text
// Remove multiple spaces but preserve newlines
.replace(/[ \t]+/g, ' ')
// Remove multiple newlines (keep max 2)
.replace(/\n\s*\n\s*\n/g, '\n\n')
// Remove zero-width characters
.replace(/[\u200B-\u200D\uFEFF]/g, '')
// Trim whitespace
.trim();
};

export const lsExtractor = async (type: string, inputFile: string): Promise<string> => {
let documentLoader;
switch (type) {
case "txt":
documentLoader = new TextLoader(inputFile);
break;

case "pdf":
// possible config: { splitPages: true }
// https://js.langchain.com/docs/modules/indexes/document_loaders/examples/file_loaders/pdf
documentLoader = new PDFLoader(inputFile, { splitPages: false });
break;

case "docx":
// https://js.langchain.com/docs/modules/indexes/document_loaders/examples/file_loaders/docx
documentLoader = new DocxLoader(inputFile);
break;

case "csv":
// possible config: columnName
// https://js.langchain.com/docs/modules/indexes/document_loaders/examples/file_loaders/csv#usage-extracting-a-single-column
documentLoader = new CSVLoader(inputFile);
break;

case "json":
// possible config: pointer
// https://js.langchain.com/docs/modules/indexes/document_loaders/examples/file_loaders/json#using-json-pointer-example
documentLoader = new JSONLoader(inputFile);
break;

case "jsonl":
// possible config: pointer
// https://js.langchain.com/docs/modules/indexes/document_loaders/examples/file_loaders/jsonlines
documentLoader = new JSONLinesLoader(inputFile, "");
break;

case 'md':
documentLoader = new TextLoader(inputFile);
break;

case 'pptx':
// https://js.langchain.com/docs/integrations/document_loaders/file_loaders/pptx/
documentLoader = new PPTXLoader(inputFile);
break;

default:
documentLoader = new TextLoader(inputFile);
}

// load and extract document
const docs = await documentLoader.load();

// Clean up text for all file types
docs.forEach((doc) => {
doc.pageContent = removeUnnecessaryChars(doc?.pageContent);
});

// split document into paragraphs according to specified or default splitter
const splitDocuments = (
await splitDocs(docs)
).map((doc) => doc.pageContent);

// join the paragraphs into the format we want
const textParagraphs = splitDocuments.join('\n\n');

logger.log("info", null, "Successfully used langchain to extract content");

return textParagraphs;
};

/**
* Custom PPTXLoader class to handle pptx files. Implementation adapted
* from langchain's PPTXLoader, but it uses newer version of officeparser package
* to handle pptx entirely in memory, instead of writing to a temp file in the
* current directory.
*/
class PPTXLoader extends BufferLoader {
constructor(filePathOrBlob: string | Blob) {
super(filePathOrBlob);
}

async parse(raw: Buffer, metadata: Record<string, any>): Promise<Document[]> {
const pptx = await parseOfficeAsync(raw, {
outputErrorToConsole: true,
});
if (!pptx)
return [];
return [
new Document({
pageContent: pptx,
metadata,
}),
];
}
}
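
Reviewer note: a hedged example of calling lsExtractor directly on a local file; the path is hypothetical. The return value is the already-chunked text joined with "\n\n", which getS3FileChunks later splits back into individual chunks.

import { lsExtractor } from "./text_extractor";

const extract = async () => {
  const text = await lsExtractor("pdf", "/tmp/handbook.pdf");   // hypothetical path
  const chunks = text.split("\n\n");
  console.log(`Got ${chunks.length} chunks; first starts with: ${chunks[0]?.slice(0, 80)}`);
};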
9 changes: 9 additions & 0 deletions extensions/aws/src/knowledge-connectors/helpers/utils/config.ts
@@ -0,0 +1,9 @@
export const getMaxChunkSize = (defaultLimit: number = 2000): number => {
const parsed = parseInt(process.env.MAX_CHUNK_SIZE ?? "", 10);
return isNaN(parsed) ? defaultLimit : parsed;
};

export const langchainDefaultChunkSizeInChars = (defaultLimit: number = 2000): number => {
const parsed = parseInt(process.env.LANGCHAIN_DEFAULT_CHUNK_SIZE_IN_CHARS ?? "", 10);
return isNaN(parsed) ? defaultLimit : parsed;
};
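
Reviewer note: a short sketch of how the environment overrides interact with the defaults; the values are hypothetical.

process.env.MAX_CHUNK_SIZE = "1200";                       // override: getMaxChunkSize() now returns 1200
delete process.env.LANGCHAIN_DEFAULT_CHUNK_SIZE_IN_CHARS;  // unset: falls back to the 2000 default
// With these values, text_chunker computes min(2000, 1200) - 400 = 800 characters per chunk.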