feat(cli): Add token counts for CLI outputs

samir-byte · Aug 6, 2024 · cc45bdd · cc45bdd
1 parent 9ba239e
commit cc45bdd
Show file tree

Hide file tree

Showing 7 changed files with 84 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -14,6 +14,7 @@ Perfect for when you need to feed your codebase to Large Language Models (LLMs)
 ## 🌟 Features
 
 - **AI-Optimized**: Formats your codebase in a way that's easy for AI to understand and process.
+- **Token Counting**: Reports token counts for each file and for the entire repository, which helps you stay within LLM context limits.
 - **Simple to Use**: Just one command to pack your entire repository.
 - **Customizable**: Easily configure what to include or exclude.
 - **Git-Aware**: Automatically respects your .gitignore files.

diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -59,7 +59,8 @@
     "jschardet": "^3.1.3",
     "log-update": "^6.1.0",
     "picocolors": "^1.0.1",
-    "strip-comments": "^2.0.1"
+    "strip-comments": "^2.0.1",
+    "tiktoken": "^1.0.15"
   },
   "devDependencies": {
     "@eslint/js": "^9.8.0",
@@ -85,4 +86,4 @@
     "node": ">=16.0.0",
     "yarn": ">=1.0.0"
   }
-}
+}
diff --git a/src/cli/cliOutput.ts b/src/cli/cliOutput.ts
@@ -6,6 +6,7 @@ export const printSummary = (
   rootDir: string,
   totalFiles: number,
   totalCharacters: number,
+  totalTokens: number,
   outputPath: string,
   suspiciousFilesResults: SecretLintCoreResult[],
 ) => {
@@ -20,10 +21,11 @@ export const printSummary = (
 
   console.log(pc.white('📊 Pack Summary:'));
   console.log(pc.dim('────────────────'));
-  console.log(`${pc.white('Total Files:')} ${pc.white(totalFiles.toString())}`);
-  console.log(`${pc.white('Total Chars:')} ${pc.white(totalCharacters.toString())}`);
-  console.log(`${pc.white('     Output:')} ${pc.white(relativeOutputPath)}`);
-  console.log(`${pc.white('   Security:')} ${pc.white(securityCheckMessage)}`);
+  console.log(`${pc.white('  Total Files:')} ${pc.white(totalFiles.toString())}`);
+  console.log(`${pc.white('  Total Chars:')} ${pc.white(totalCharacters.toString())}`);
+  console.log(`${pc.white(' Total Tokens:')} ${pc.white(totalTokens.toString())}`);
+  console.log(`${pc.white('       Output:')} ${pc.white(relativeOutputPath)}`);
+  console.log(`${pc.white('     Security:')} ${pc.white(securityCheckMessage)}`);
 };
 
 export const printSecurityCheck = (rootDir: string, suspiciousFilesResults: SecretLintCoreResult[]) => {
@@ -46,17 +48,24 @@ export const printSecurityCheck = (rootDir: string, suspiciousFilesResults: Secr
   }
 };
 
-export const printTopFiles = (fileCharCounts: Record<string, number>, topFilesLength: number) => {
-  console.log(pc.white(`📈 Top ${topFilesLength} Files by Character Count:`));
-  console.log(pc.dim('──────────────────────────────────'));
+export const printTopFiles = (
+  fileCharCounts: Record<string, number>,
+  fileTokenCounts: Record<string, number>,
+  topFilesLength: number,
+) => {
+  console.log(pc.white(`📈 Top ${topFilesLength} Files by Character Count and Token Count:`));
+  console.log(pc.dim('──────────────────────────────────────────────────────'));
 
   const topFiles = Object.entries(fileCharCounts)
     .sort((a, b) => b[1] - a[1])
     .slice(0, topFilesLength);
 
-  topFiles.forEach(([filePath, count], index) => {
+  topFiles.forEach(([filePath, charCount], index) => {
+    const tokenCount = fileTokenCounts[filePath];
     const indexString = `${index + 1}.`.padEnd(3, ' ');
-    console.log(`${pc.white(`${indexString}`)} ${pc.white(filePath)} ${pc.dim(`(${count} chars)`)}`);
+    console.log(
+      `${pc.white(`${indexString}`)} ${pc.white(filePath)} ${pc.dim(`(${charCount} chars, ${tokenCount} tokens)`)}`,
+    );
   });
 };
 

diff --git a/src/cli/index.ts b/src/cli/index.ts
@@ -83,7 +83,7 @@ const executeAction = async (directory: string, rootDir: string, options: CliOpt
     console.log('');
 
     if (config.output.topFilesLength > 0) {
-      printTopFiles(packResult.fileCharCounts, config.output.topFilesLength);
+      printTopFiles(packResult.fileCharCounts, packResult.fileTokenCounts, config.output.topFilesLength);
       console.log('');
     }
 
@@ -94,6 +94,7 @@ const executeAction = async (directory: string, rootDir: string, options: CliOpt
       rootDir,
       packResult.totalFiles,
       packResult.totalCharacters,
+      packResult.totalTokens,
       config.output.filePath,
       packResult.suspiciousFilesResults,
     );

diff --git a/src/core/packager.ts b/src/core/packager.ts
@@ -1,5 +1,6 @@
 import * as fs from 'node:fs/promises';
 import path from 'node:path';
+import { Tiktoken, get_encoding } from 'tiktoken';
 import type { SecretLintCoreResult } from '@secretlint/types';
 import { RepopackConfigMerged } from '../config/configTypes.js';
 import { sanitizeFiles as defaultSanitizeFiles } from '../utils/fileHandler.js';
@@ -15,7 +16,9 @@ export interface Dependencies {
 export interface PackResult {
   totalFiles: number;
   totalCharacters: number;
+  totalTokens: number;
   fileCharCounts: Record<string, number>;
+  fileTokenCounts: Record<string, number>;
   suspiciousFilesResults: SecretLintCoreResult[];
 }
 
@@ -40,18 +43,27 @@ export const pack = async (
   const sanitizedFiles = await deps.sanitizeFiles(safeFilePaths, rootDir, config);
   await deps.generateOutput(rootDir, config, sanitizedFiles, safeFilePaths);
 
+  // Set up the tokenizer encoding used for token counts
+  const encoding: Tiktoken = get_encoding('cl100k_base');
+
   // Metrics
   const totalFiles = sanitizedFiles.length;
   const totalCharacters = sanitizedFiles.reduce((sum, file) => sum + file.content.length, 0);
+  const totalTokens = sanitizedFiles.reduce((sum, file) => sum + encoding.encode(file.content).length, 0);
   const fileCharCounts: Record<string, number> = {};
+  const fileTokenCounts: Record<string, number> = {};
   sanitizedFiles.forEach((file) => {
     fileCharCounts[file.path] = file.content.length;
+    fileTokenCounts[file.path] = encoding.encode(file.content).length;
   });
+  encoding.free();
 
   return {
     totalFiles,
     totalCharacters,
+    totalTokens,
     fileCharCounts,
+    fileTokenCounts,
     suspiciousFilesResults,
   };
 };

diff --git a/tests/core/tokenCounter.test.ts b/tests/core/tokenCounter.test.ts
@@ -0,0 +1,40 @@
+import { expect, test, describe, beforeAll, afterAll } from 'vitest';
+import { Tiktoken, get_encoding } from 'tiktoken';
+
+describe('tiktoken', () => {
+  let encoding: Tiktoken;
+
+  beforeAll(() => {
+    encoding = get_encoding('cl100k_base');
+  });
+
+  afterAll(() => {
+    encoding.free();
+  });
+
+  test('should correctly count tokens', () => {
+    const testCases = [
+      { input: 'Hello, world!', expectedTokens: 4 },
+      { input: 'This is a longer sentence with more tokens.', expectedTokens: 9 },
+      { input: 'Special characters like !@#$%^&*() should be handled correctly.', expectedTokens: 15 },
+      { input: 'Numbers 123 and symbols @#$ might affect tokenization.', expectedTokens: 12 },
+      { input: 'Multi-line\ntext\nshould\nwork\ntoo.', expectedTokens: 11 },
+    ];
+
+    testCases.forEach(({ input, expectedTokens }) => {
+      const tokenCount = encoding.encode(input).length;
+      expect(tokenCount).toBe(expectedTokens);
+    });
+  });
+
+  test('should handle empty input', () => {
+    const tokenCount = encoding.encode('').length;
+    expect(tokenCount).toBe(0);
+  });
+
+  test('should handle very long input', () => {
+    const longText = 'a'.repeat(1000);
+    const tokenCount = encoding.encode(longText).length;
+    expect(tokenCount).toBeGreaterThan(0);
+  });
+});