Skip to content

Commit

Permalink
feat(cli): Add token counts for CLI outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
joshellington committed Aug 6, 2024
1 parent 9ba239e commit cc45bdd
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 13 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Perfect for when you need to feed your codebase to Large Language Models (LLMs)
## 🌟 Features

- **AI-Optimized**: Formats your codebase in a way that's easy for AI to understand and process.
- **Token Counting**: Provides token counts for each file and for the entire repository, which helps you stay within LLM context limits.
- **Simple to Use**: Just one command to pack your entire repository.
- **Customizable**: Easily configure what to include or exclude.
- **Git-Aware**: Automatically respects your .gitignore files.
Expand Down
9 changes: 8 additions & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@
"jschardet": "^3.1.3",
"log-update": "^6.1.0",
"picocolors": "^1.0.1",
"strip-comments": "^2.0.1"
"strip-comments": "^2.0.1",
"tiktoken": "^1.0.15"
},
"devDependencies": {
"@eslint/js": "^9.8.0",
Expand All @@ -85,4 +86,4 @@
"node": ">=16.0.0",
"yarn": ">=1.0.0"
}
}
}
27 changes: 18 additions & 9 deletions src/cli/cliOutput.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export const printSummary = (
rootDir: string,
totalFiles: number,
totalCharacters: number,
totalTokens: number,
outputPath: string,
suspiciousFilesResults: SecretLintCoreResult[],
) => {
Expand All @@ -20,10 +21,11 @@ export const printSummary = (

console.log(pc.white('📊 Pack Summary:'));
console.log(pc.dim('────────────────'));
console.log(`${pc.white('Total Files:')} ${pc.white(totalFiles.toString())}`);
console.log(`${pc.white('Total Chars:')} ${pc.white(totalCharacters.toString())}`);
console.log(`${pc.white(' Output:')} ${pc.white(relativeOutputPath)}`);
console.log(`${pc.white(' Security:')} ${pc.white(securityCheckMessage)}`);
console.log(`${pc.white(' Total Files:')} ${pc.white(totalFiles.toString())}`);
console.log(`${pc.white(' Total Chars:')} ${pc.white(totalCharacters.toString())}`);
console.log(`${pc.white(' Total Tokens:')} ${pc.white(totalTokens.toString())}`);
console.log(`${pc.white(' Output:')} ${pc.white(relativeOutputPath)}`);
console.log(`${pc.white(' Security:')} ${pc.white(securityCheckMessage)}`);
};

export const printSecurityCheck = (rootDir: string, suspiciousFilesResults: SecretLintCoreResult[]) => {
Expand All @@ -46,17 +48,24 @@ export const printSecurityCheck = (rootDir: string, suspiciousFilesResults: Secr
}
};

export const printTopFiles = (fileCharCounts: Record<string, number>, topFilesLength: number) => {
console.log(pc.white(`📈 Top ${topFilesLength} Files by Character Count:`));
console.log(pc.dim('──────────────────────────────────'));
export const printTopFiles = (
fileCharCounts: Record<string, number>,
fileTokenCounts: Record<string, number>,
topFilesLength: number,
) => {
console.log(pc.white(`📈 Top ${topFilesLength} Files by Character Count and Token Count:`));
console.log(pc.dim('──────────────────────────────────────────────────────'));

const topFiles = Object.entries(fileCharCounts)
.sort((a, b) => b[1] - a[1])
.slice(0, topFilesLength);

topFiles.forEach(([filePath, count], index) => {
topFiles.forEach(([filePath, charCount], index) => {
const tokenCount = fileTokenCounts[filePath];
const indexString = `${index + 1}.`.padEnd(3, ' ');
console.log(`${pc.white(`${indexString}`)} ${pc.white(filePath)} ${pc.dim(`(${count} chars)`)}`);
console.log(
`${pc.white(`${indexString}`)} ${pc.white(filePath)} ${pc.dim(`(${charCount} chars, ${tokenCount} tokens)`)}`,
);
});
};

Expand Down
3 changes: 2 additions & 1 deletion src/cli/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ const executeAction = async (directory: string, rootDir: string, options: CliOpt
console.log('');

if (config.output.topFilesLength > 0) {
printTopFiles(packResult.fileCharCounts, config.output.topFilesLength);
printTopFiles(packResult.fileCharCounts, packResult.fileTokenCounts, config.output.topFilesLength);
console.log('');
}

Expand All @@ -94,6 +94,7 @@ const executeAction = async (directory: string, rootDir: string, options: CliOpt
rootDir,
packResult.totalFiles,
packResult.totalCharacters,
packResult.totalTokens,
config.output.filePath,
packResult.suspiciousFilesResults,
);
Expand Down
12 changes: 12 additions & 0 deletions src/core/packager.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import * as fs from 'node:fs/promises';
import path from 'node:path';
import { Tiktoken, get_encoding } from 'tiktoken';
import type { SecretLintCoreResult } from '@secretlint/types';
import { RepopackConfigMerged } from '../config/configTypes.js';
import { sanitizeFiles as defaultSanitizeFiles } from '../utils/fileHandler.js';
Expand All @@ -15,7 +16,9 @@ export interface Dependencies {
/**
 * Metrics produced by `pack` for one packaging run.
 *
 * - `totalFiles`: number of sanitized files that were processed.
 * - `totalCharacters`: sum of `content.length` over all sanitized files.
 * - `totalTokens`: sum of the per-file token counts, obtained by encoding each
 *   file's content with tiktoken's `cl100k_base` encoding.
 * - `fileCharCounts`: character count (`content.length`) keyed by file path.
 * - `fileTokenCounts`: token count keyed by file path; `pack` fills it for the
 *   same paths as `fileCharCounts`.
 * - `suspiciousFilesResults`: secretlint results passed through to the CLI's
 *   security output. NOTE(review): how these are collected is not visible in
 *   this excerpt — confirm against the security-check step.
 */
export interface PackResult {
totalFiles: number;
totalCharacters: number;
totalTokens: number;
fileCharCounts: Record<string, number>;
fileTokenCounts: Record<string, number>;
suspiciousFilesResults: SecretLintCoreResult[];
}

Expand All @@ -40,18 +43,27 @@ export const pack = async (
const sanitizedFiles = await deps.sanitizeFiles(safeFilePaths, rootDir, config);
await deps.generateOutput(rootDir, config, sanitizedFiles, safeFilePaths);

// Setup encoding
const encoding: Tiktoken = get_encoding('cl100k_base');

// Metrics
const totalFiles = sanitizedFiles.length;
const totalCharacters = sanitizedFiles.reduce((sum, file) => sum + file.content.length, 0);
const totalTokens = sanitizedFiles.reduce((sum, file) => sum + encoding.encode(file.content).length, 0);
const fileCharCounts: Record<string, number> = {};
const fileTokenCounts: Record<string, number> = {};
sanitizedFiles.forEach((file) => {
fileCharCounts[file.path] = file.content.length;
fileTokenCounts[file.path] = encoding.encode(file.content).length;
});
encoding.free();

return {
totalFiles,
totalCharacters,
totalTokens,
fileCharCounts,
fileTokenCounts,
suspiciousFilesResults,
};
};
Expand Down
40 changes: 40 additions & 0 deletions tests/core/tokenCounter.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import { expect, test, describe, beforeAll, afterAll } from 'vitest';
import { Tiktoken, get_encoding } from 'tiktoken';

// Sanity checks for token counting with tiktoken's `cl100k_base` encoding.
describe('tiktoken', () => {
  // A single encoder instance is created once and shared by every test below.
  let encoding: Tiktoken;

  // Helper so each test reads as "text in, token count out".
  const countTokens = (text: string): number => encoding.encode(text).length;

  beforeAll(() => {
    encoding = get_encoding('cl100k_base');
  });

  // Free the encoder after the whole suite has run.
  afterAll(() => {
    encoding.free();
  });

  test('should correctly count tokens', () => {
    // Each entry pairs an input text with the token count expected for it.
    const expectations: Array<[string, number]> = [
      ['Hello, world!', 4],
      ['This is a longer sentence with more tokens.', 9],
      ['Special characters like !@#$%^&*() should be handled correctly.', 15],
      ['Numbers 123 and symbols @#$ might affect tokenization.', 12],
      ['Multi-line\ntext\nshould\nwork\ntoo.', 11],
    ];

    for (const [input, expectedTokens] of expectations) {
      expect(countTokens(input)).toBe(expectedTokens);
    }
  });

  test('should handle empty input', () => {
    // An empty string must encode to zero tokens.
    expect(countTokens('')).toBe(0);
  });

  test('should handle very long input', () => {
    // Only checks that a 1000-character input yields at least one token.
    const longText = 'a'.repeat(1000);
    expect(countTokens(longText)).toBeGreaterThan(0);
  });
});

0 comments on commit cc45bdd

Please sign in to comment.