Skip to content

Commit

Permalink
feat(.): added codebase analysis feature
Browse files Browse the repository at this point in the history
  • Loading branch information
raphaelmansuy committed Jul 26, 2024
1 parent b37c438 commit aa44893
Show file tree
Hide file tree
Showing 6 changed files with 288 additions and 11 deletions.
59 changes: 59 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,66 @@ This command will analyze your project, count the tokens, and provide a detailed
![](./docs/screen-example2.png)


## 🔥 Analyzing Codebases

code2prompt now offers a powerful feature to analyze codebases and provide a summary of file extensions. Use the `--analyze` option along with the `-p` (path) option to get an overview of your project's file composition. For example:

```
code2prompt --analyze -p code2prompt
```

Result:

```
.j2: 6 files
.json: 1 file
.py: 33 files
.pyc: 56 files
Comma-separated list of extensions:
.j2,.json,.py,.pyc
```

This command will analyze the 'code2prompt' directory and display a summary of all file extensions found, including their counts. You can choose between two output formats:

- Flat format (default): Lists all unique extensions alphabetically with their file counts.
- Tree-like format: Displays the extensions found in each directory as a directory tree (file counts are not shown in this format).

To use the tree-like format, add the `--format tree` option:

```
code2prompt --analyze -p code2prompt --format tree
```

Result:

```
└── code2prompt
    ├── utils
    │   ├── .py
    │   └── __pycache__
    │       └── .pyc
    ├── .py
    ├── core
    │   ├── .py
    │   └── __pycache__
    │       └── .pyc
    ├── comment_stripper
    │   ├── .py
    │   └── __pycache__
    │       └── .pyc
    ├── __pycache__
    │   └── .pyc
    ├── templates
    │   └── .j2
    └── data
        └── .json
Comma-separated list of extensions:
.j2,.json,.py,.pyc
```

The analysis also generates a comma-separated list of file extensions, which can be easily copied and used with the `--filter` option for more targeted code processing.

## 🔥 Feature Highlight: Dynamic Variable Extraction for Prompt Generation

Expand Down
50 changes: 42 additions & 8 deletions code2prompt/main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from importlib import resources
import logging
from pathlib import Path

import click
from tabulate import tabulate

from code2prompt.utils.config import load_config, merge_options
from code2prompt.utils.count_tokens import count_tokens
from code2prompt.core.generate_content import generate_content
Expand All @@ -11,9 +13,12 @@
from code2prompt.utils.create_template_directory import create_templates_directory
from code2prompt.utils.logging_utils import setup_logger, log_token_count, log_error, log_info
from code2prompt.utils.price_calculator import load_token_prices, calculate_prices
from code2prompt.utils.analyzer import analyze_codebase, format_flat_output, format_tree_output, get_extension_list

VERSION = "0.6.12"
# Version number of the code2prompt tool
VERSION = "0.6.13"

# Default options for the tool
DEFAULT_OPTIONS = {
"path": [],
"output": None,
Expand All @@ -33,6 +38,8 @@
"provider": None,
"model": None,
"output_tokens": 1000, # Default output token count
"analyze": False,
"format": "flat"
}

@click.command()
Expand Down Expand Up @@ -138,6 +145,17 @@
default=1000,
help="Specify the number of output tokens for price calculation.",
)
@click.option(
"--analyze",
is_flag=True,
help="Analyze the codebase and provide a summary of file extensions.",
)
@click.option(
"--format",
type=click.Choice(["flat", "tree"]),
default="flat",
help="Format of the analysis output (flat or tree-like).",
)
def create_markdown_file(**cli_options):
"""
Creates a Markdown file based on the provided options.
Expand All @@ -149,9 +167,10 @@ def create_markdown_file(**cli_options):
Args:
**options (dict): Key-value pairs of options to customize the behavior of the function.
Possible keys include 'path', 'output', 'gitignore', 'filter', 'exclude', 'case_sensitive',
'suppress_comments', 'line_number', 'no_codeblock', 'template', 'tokens', 'encoding',
'create_templates', 'log_level', 'price', 'provider', 'model', and 'output_tokens'.
Possible keys include 'path', 'output', 'gitignore', 'filter', 'exclude',
'case_sensitive', 'suppress_comments', 'line_number', 'no_codeblock', 'template',
'tokens', 'encoding', 'create_templates', 'log_level', 'price', 'provider', 'model',
'output_tokens', 'analyze', and 'format'.
Returns:
None
Expand Down Expand Up @@ -181,6 +200,23 @@ def create_markdown_file(**cli_options):
)
return

if options["analyze"]:
for path in options["path"]:
extension_counts, extension_dirs = analyze_codebase(path)
if "No files found" in extension_counts:
click.echo("No files found")
else:
if options["format"] == "flat":
output = format_flat_output(extension_counts)
else:
output = format_tree_output(extension_dirs)

click.echo(output)

click.echo("\nComma-separated list of extensions:")
click.echo(get_extension_list(extension_counts))
return

all_files_data = []
for path in options["path"]:
files_data = process_files({**options, "path": path})
Expand All @@ -193,13 +229,11 @@ def create_markdown_file(**cli_options):
token_count = count_tokens(content, options["encoding"])
log_token_count(token_count)


write_output(content, options["output"], copy_to_clipboard=True)

if options["price"]:
display_price_table(options, token_count)


def display_price_table(options, token_count):
"""
Display a table with price estimates for the given token count.
Expand All @@ -217,7 +251,6 @@ def display_price_table(options, token_count):
return

output_token_count = options["output_tokens"]

table_data = calculate_prices(token_prices, token_count, output_token_count, options["provider"], options["model"])

if not table_data:
Expand All @@ -226,6 +259,7 @@ def display_price_table(options, token_count):

headers = ["Provider", "Model", "Price for 1K Input Tokens", "Number of Input Tokens", "Total Price"]
table = tabulate(table_data, headers=headers, tablefmt="grid")

log_info("\n✨ Estimated Token Prices: (All prices are in USD, it is an estimate as the current token implementation is based on OpenAI's Tokenizer)")
log_info("\n")
log_info(table)
Expand Down
90 changes: 90 additions & 0 deletions code2prompt/utils/analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple

def analyze_codebase(path: str) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
    """
    Analyze the codebase and return file extension information.

    Recursively walks ``path`` and, for every regular file that has a suffix,
    counts the lower-cased suffix and records the file's parent directory.
    Files without a suffix count as "found" but are not reported under any
    extension.

    Args:
        path (str): The path to the codebase directory.

    Returns:
        Tuple[Dict[str, int], Dict[str, List[str]]]: A tuple containing:
            - A dictionary of file extensions and their counts.
            - A dictionary of file extensions and the sorted list of
              directories containing them.
        When no file at all is found under ``path`` the sentinel
        ``({"No files found": 0}, {})`` is returned instead.
    """
    extension_counts = defaultdict(int)
    extension_dirs = defaultdict(set)

    file_count = 0
    for file_path in Path(path).rglob("*"):
        if not file_path.is_file():
            continue
        file_count += 1
        ext = file_path.suffix.lower()
        if ext:
            extension_counts[ext] += 1
            extension_dirs[ext].add(str(file_path.parent))

    if file_count == 0:
        # Sentinel kept as-is: callers test for the "No files found" key.
        return {"No files found": 0}, {}

    # Sets of str iterate in a hash-dependent order; sort so the result (and
    # anything rendered from it) is identical from one run to the next.
    return (
        dict(extension_counts),
        {ext: sorted(dirs) for ext, dirs in extension_dirs.items()},
    )


def format_flat_output(extension_counts: Dict[str, int]) -> str:
    """
    Format the analysis results in a flat structure.

    Produces one ``"<ext>: <count> file(s)"`` line per extension, ordered
    alphabetically by extension. The noun is singular only for a count of
    exactly one, so a zero count reads "0 files".

    Args:
        extension_counts (Dict[str, int]): A dictionary of file extensions and their counts.

    Returns:
        str: Formatted output string (empty when there are no extensions).
    """
    return "\n".join(
        f"{ext}: {count} file{'' if count == 1 else 's'}"
        for ext, count in sorted(extension_counts.items())
    )

def format_tree_output(extension_dirs: Dict[str, List[str]]) -> str:
    """
    Format the analysis results in a tree-like structure.

    Each directory path is split into its components to build a nested
    mapping, and every extension is attached as a leaf under the directory
    that contains it. Siblings are rendered in sorted order so the output does
    not depend on the order of the input.

    Args:
        extension_dirs (Dict[str, List[str]]): A dictionary of file extensions and their directories.

    Returns:
        str: Formatted output string (empty when ``extension_dirs`` is empty).
    """
    def format_tree(node, prefix=""):
        lines = []
        entries = sorted(node.items())
        for index, (name, children) in enumerate(entries):
            is_last = index == len(entries) - 1
            lines.append(f"{prefix}{'└── ' if is_last else '├── '}{name}")
            # The continuation prefix must be as wide as the 4-column
            # connector above, otherwise nested levels do not line up.
            lines.extend(format_tree(children, prefix + ("    " if is_last else "│   ")))
        return lines

    tree = {}
    for ext, dirs in extension_dirs.items():
        for dir_path in dirs:
            current = tree
            for part in Path(dir_path).parts:
                current = current.setdefault(part, {})
            # setdefault (not assignment) so a directory whose name equals an
            # extension key keeps the subtree already recorded for it.
            current.setdefault(ext, {})

    return "\n".join(format_tree(tree))

def get_extension_list(extension_counts: Dict[str, int]) -> str:
    """
    Build the comma-separated extension list shown after an analysis.

    Args:
        extension_counts (Dict[str, int]): Mapping of file extension to file count;
            only the keys are used.

    Returns:
        str: The extensions in alphabetical order, joined with "," and no spaces.
    """
    ordered_extensions = list(extension_counts)
    ordered_extensions.sort()
    return ",".join(ordered_extensions)
5 changes: 3 additions & 2 deletions code2prompt/utils/is_filtered.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
def is_filtered(file_path: Path, include_pattern: str = "", exclude_pattern: str = "", case_sensitive: bool = False) -> bool:
"""
Determine if a file should be filtered based on include and exclude patterns.
Parameters:
- file_path (Path): Path to the file to check
- include_pattern (str): Comma-separated list of patterns to include files
- exclude_pattern (str): Comma-separated list of patterns to exclude files
- case_sensitive (bool): Whether to perform case-sensitive pattern matching
Returns:
- bool: True if the file should be included, False if it should be filtered out
"""

def match_pattern(path: str, pattern: str) -> bool:
if "**" in pattern:
parts = pattern.split("**")
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "code2prompt"
version = "0.6.12"
version = "0.6.13"
description = "A tool to convert code snippets into AI prompts for documentation or explanation purposes."
authors = ["Raphael MANSUY <raphael.mansuy@gmail.com>"]
license = "MIT"
Expand Down
Loading

0 comments on commit aa44893

Please sign in to comment.