feat(.): added codebase analysis feature

yotamnahum · Jul 26, 2024 · aa44893 · aa44893
1 parent b37c438
commit aa44893
Show file tree

Hide file tree

Showing 6 changed files with 288 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -364,7 +364,66 @@ This command will analyze your project, count the tokens, and provide a detailed
 ![](./docs/screen-example2.png)
 
 
+## 🔥 Analyzing Codebases
 
+code2prompt now offers a powerful feature to analyze codebases and provide a summary of file extensions. Use the `--analyze` option along with the `-p` (path) option to get an overview of your project's file composition. For example:
+
+```
+code2prompt --analyze -p code2prompt
+```
+
+Result:
+
+```
+.j2: 6 files
+.json: 1 file
+.py: 33 files
+.pyc: 56 files
+
+Comma-separated list of extensions:
+.j2,.json,.py,.pyc
+```
+
+This command will analyze the 'code2prompt' directory and display a summary of all file extensions found, including their counts. You can choose between two output formats:
+
+- Flat format (default): Lists all unique extensions alphabetically with their file counts.
+- Tree-like format: Displays the extensions found in each directory as a directory tree (file counts are not shown in this format).
+
+To use the tree-like format, add the `--format tree` option:
+
+```
+code2prompt --analyze -p code2prompt --format tree
+```
+
+Result: 
+
+```
+└── code2prompt
+    ├── utils
+    │   ├── .py
+    │   └── __pycache__
+    │       └── .pyc
+    ├── .py
+    ├── core
+    │   ├── .py
+    │   └── __pycache__
+    │       └── .pyc
+    ├── comment_stripper
+    │   ├── .py
+    │   └── __pycache__
+    │       └── .pyc
+    ├── __pycache__
+    │   └── .pyc
+    ├── templates
+    │   └── .j2
+    └── data
+        └── .json
+
+Comma-separated list of extensions:
+.j2,.json,.py,.pyc
+```
+
+The analysis also generates a comma-separated list of file extensions, which can be easily copied and used with the `--filter` option for more targeted code processing.
 
 ## 🔥 Feature Highlight: Dynamic Variable Extraction for Prompt Generation
 

diff --git a/code2prompt/main.py b/code2prompt/main.py
@@ -1,8 +1,10 @@
 from importlib import resources
 import logging
 from pathlib import Path
+
 import click
 from tabulate import tabulate
+
 from code2prompt.utils.config import load_config, merge_options
 from code2prompt.utils.count_tokens import count_tokens
 from code2prompt.core.generate_content import generate_content
@@ -11,9 +13,12 @@
 from code2prompt.utils.create_template_directory import create_templates_directory
 from code2prompt.utils.logging_utils import setup_logger, log_token_count, log_error, log_info
 from code2prompt.utils.price_calculator import load_token_prices, calculate_prices
+from code2prompt.utils.analyzer import analyze_codebase, format_flat_output, format_tree_output, get_extension_list
 
-VERSION = "0.6.12"
+# Version number of the code2prompt tool
+VERSION = "0.6.13"
 
+# Default options for the tool
 DEFAULT_OPTIONS = {
     "path": [],
     "output": None,
@@ -33,6 +38,8 @@
     "provider": None,
     "model": None,
     "output_tokens": 1000,  # Default output token count
+    "analyze": False,
+    "format": "flat"
 }
 
 @click.command()
@@ -138,6 +145,17 @@
     default=1000,
     help="Specify the number of output tokens for price calculation.",
 )
+@click.option(
+    "--analyze",
+    is_flag=True,
+    help="Analyze the codebase and provide a summary of file extensions.",
+)
+@click.option(
+    "--format",
+    type=click.Choice(["flat", "tree"]),
+    default="flat",
+    help="Format of the analysis output (flat or tree-like).",
+)
 def create_markdown_file(**cli_options):
     """
     Creates a Markdown file based on the provided options.
@@ -149,9 +167,10 @@ def create_markdown_file(**cli_options):
 
     Args:
         **options (dict): Key-value pairs of options to customize the behavior of the function.
-        Possible keys include 'path', 'output', 'gitignore', 'filter', 'exclude', 'case_sensitive',
-        'suppress_comments', 'line_number', 'no_codeblock', 'template', 'tokens', 'encoding',
-        'create_templates', 'log_level', 'price', 'provider', 'model', and 'output_tokens'.
+            Possible keys include 'path', 'output', 'gitignore', 'filter', 'exclude',
+            'case_sensitive', 'suppress_comments', 'line_number', 'no_codeblock', 'template',
+            'tokens', 'encoding', 'create_templates', 'log_level', 'price', 'provider', 'model',
+            'output_tokens', 'analyze', and 'format'.
 
     Returns:
         None
@@ -181,6 +200,23 @@ def create_markdown_file(**cli_options):
         )
         return
 
+    if options["analyze"]:
+        # Analysis mode: summarise the file extensions under each path and
+        # return without generating any prompt output.
+        for path in options["path"]:
+            extension_counts, extension_dirs = analyze_codebase(path)
+            # analyze_codebase reports an empty tree through this sentinel key.
+            # Skip the extension list in that case; otherwise the sentinel
+            # text would be printed as if it were a list of extensions.
+            if "No files found" in extension_counts:
+                click.echo("No files found")
+                continue
+
+            if options["format"] == "flat":
+                output = format_flat_output(extension_counts)
+            else:
+                output = format_tree_output(extension_dirs)
+
+            click.echo(output)
+            click.echo("\nComma-separated list of extensions:")
+            click.echo(get_extension_list(extension_counts))
+        return
+
     all_files_data = []
     for path in options["path"]:
         files_data = process_files({**options, "path": path})
@@ -193,13 +229,11 @@ def create_markdown_file(**cli_options):
         token_count = count_tokens(content, options["encoding"])
         log_token_count(token_count)
 
-
     write_output(content, options["output"], copy_to_clipboard=True)
-    
+
     if options["price"]:
         display_price_table(options, token_count)
 
-
 def display_price_table(options, token_count):
     """
     Display a table with price estimates for the given token count.
@@ -217,7 +251,6 @@ def display_price_table(options, token_count):
         return
 
     output_token_count = options["output_tokens"]
-
     table_data = calculate_prices(token_prices, token_count, output_token_count, options["provider"], options["model"])
 
     if not table_data:
@@ -226,6 +259,7 @@ def display_price_table(options, token_count):
 
     headers = ["Provider", "Model", "Price for 1K Input Tokens", "Number of Input Tokens", "Total Price"]
     table = tabulate(table_data, headers=headers, tablefmt="grid")
+
     log_info("\n✨ Estimated Token Prices: (All prices are in USD, it is an estimate as the current token implementation is based on OpenAI's Tokenizer)")
     log_info("\n")
     log_info(table)

diff --git a/code2prompt/utils/analyzer.py b/code2prompt/utils/analyzer.py
@@ -0,0 +1,90 @@
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+def analyze_codebase(path: str) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
+    """
+    Analyze the codebase and return file extension information.
+
+    Walks ``path`` recursively. For every regular file that has a suffix, the
+    lower-cased suffix is counted and the file's parent directory is recorded.
+    Files without a suffix count as "found" but are not reported.
+
+    Args:
+        path (str): The path to the codebase directory.
+
+    Returns:
+        Tuple[Dict[str, int], Dict[str, List[str]]]: A tuple containing:
+            - A dictionary of file extensions and their counts.
+            - A dictionary of file extensions and the sorted list of
+              directories containing them.
+        When no file at all is found, ``({"No files found": 0}, {})`` is
+        returned so callers can test for that sentinel key.
+    """
+    extension_counts = defaultdict(int)
+    extension_dirs = defaultdict(set)
+    found_any_file = False
+
+    for file_path in Path(path).rglob("*"):
+        if not file_path.is_file():
+            continue
+        found_any_file = True
+        ext = file_path.suffix.lower()
+        if ext:
+            extension_counts[ext] += 1
+            extension_dirs[ext].add(str(file_path.parent))
+
+    if not found_any_file:
+        return {"No files found": 0}, {}
+
+    # Sets of strings iterate in a hash-dependent order that can change from
+    # run to run; sort so the directory lists (and anything rendered from
+    # them) are stable.
+    return (
+        dict(extension_counts),
+        {ext: sorted(dirs) for ext, dirs in extension_dirs.items()},
+    )
+
+
+def format_flat_output(extension_counts: Dict[str, int]) -> str:
+    """
+    Format the analysis results in a flat structure.
+
+    Produces one ``"<ext>: <count> file(s)"`` line per extension, ordered
+    alphabetically by extension.
+
+    Args:
+        extension_counts (Dict[str, int]): A dictionary of file extensions and their counts.
+
+    Returns:
+        str: Formatted output string (empty when the dictionary is empty).
+    """
+    # Only a count of exactly one takes the singular form ("0 files", "1 file").
+    return "\n".join(
+        f"{ext}: {count} file{'' if count == 1 else 's'}"
+        for ext, count in sorted(extension_counts.items())
+    )
+
+def format_tree_output(extension_dirs: Dict[str, List[str]]) -> str:
+    """
+    Format the analysis results in a tree-like structure.
+
+    Each directory path is split into its components to build a nested
+    mapping, and every extension is attached as a leaf under the directory
+    that contains it. Siblings are rendered in sorted order so the output
+    does not depend on the order of the input.
+
+    Args:
+        extension_dirs (Dict[str, List[str]]): A dictionary of file extensions and their directories.
+
+    Returns:
+        str: Formatted output string (empty when the dictionary is empty).
+    """
+    def format_tree(node, prefix=""):
+        lines = []
+        names = sorted(node)
+        last_index = len(names) - 1
+        for i, name in enumerate(names):
+            is_last = i == last_index
+            lines.append(f"{prefix}{'└── ' if is_last else '├── '}{name}")
+            # Every value is a dict; leaves are empty, so recursing adds nothing.
+            child_prefix = prefix + ("    " if is_last else "│   ")
+            lines.extend(format_tree(node[name], child_prefix))
+        return lines
+
+    tree = {}
+    for ext, dirs in extension_dirs.items():
+        for dir_path in dirs:
+            current = tree
+            for part in Path(dir_path).parts:
+                current = current.setdefault(part, {})
+            # setdefault rather than assignment: a directory whose name equals
+            # an extension (e.g. ".git") must keep the subtree built for it.
+            current.setdefault(ext, {})
+
+    return "\n".join(format_tree(tree))
+
+def get_extension_list(extension_counts: Dict[str, int]) -> str:
+    """
+    Build the comma-separated extension list from the analysis counts.
+
+    Args:
+        extension_counts (Dict[str, int]): Mapping of file extension to file count;
+            only the keys are used.
+
+    Returns:
+        str: The extensions in sorted order, joined with "," and no spaces.
+    """
+    ordered_extensions = list(extension_counts)
+    ordered_extensions.sort()
+    return ",".join(ordered_extensions)
diff --git a/code2prompt/utils/is_filtered.py b/code2prompt/utils/is_filtered.py
@@ -4,16 +4,17 @@
 def is_filtered(file_path: Path, include_pattern: str = "", exclude_pattern: str = "", case_sensitive: bool = False) -> bool:
     """
     Determine if a file should be filtered based on include and exclude patterns.
-    
+
     Parameters:
     - file_path (Path): Path to the file to check
     - include_pattern (str): Comma-separated list of patterns to include files
     - exclude_pattern (str): Comma-separated list of patterns to exclude files
     - case_sensitive (bool): Whether to perform case-sensitive pattern matching
-    
+
     Returns:
     - bool: True if the file should be included, False if it should be filtered out
     """
+
     def match_pattern(path: str, pattern: str) -> bool:
         if "**" in pattern:
             parts = pattern.split("**")

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "code2prompt"
-version = "0.6.12"
+version = "0.6.13"
 description = "A tool to convert code snippets into AI prompts for documentation or explanation purposes."
 authors = ["Raphael MANSUY <raphael.mansuy@gmail.com>"]
 license = "MIT"