docs creation example (#49)

jayhack · web-flow · commit 30c001728c6e · 2025-02-07T16:12:32.000-08:00
* computes cyclomatic complexity

* .

* .

* Automated pre-commit update

* Remove unnecessary files from codegen

* Automated pre-commit update

* .

* .

* Automated pre-commit update

---------

Co-authored-by: jayhack &lt;2548876+jayhack@users.noreply.github.com&gt;
diff --git a/examples/document_functions/README.md b/examples/document_functions/README.md
@@ -0,0 +1,84 @@
+# Automated Function Documentation Generator
+
+This example demonstrates how to use Codegen to automatically generate comprehensive docstrings for functions by analyzing their dependencies and usage patterns within a codebase.
+
+## Overview
+
+The script uses Codegen's symbol analysis capabilities to:
+1. Identify functions without docstrings
+2. Analyze their dependencies and usages up to N degrees deep
+3. Generate contextually aware docstrings using AI
+
+## Key Features
+
+### Recursive Context Collection
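+The script recursively collects both dependencies and usages to provide comprehensive context for docstring generation. The excerpt below shows the dependency half; usages are collected by a matching loop over `symbol.usages`: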
+
+```python
+def get_extended_context(symbol: Symbol, degree: int) -> tuple[set[Symbol], set[Symbol]]:
+    """Recursively collect dependencies and usages up to the specified degree."""
+    dependencies = set()
+    usages = set()
+
+    if degree > 0:
+        for dep in symbol.dependencies:
+            if isinstance(dep, Import):
+                dep = hop_through_imports(dep)
+            if isinstance(dep, Symbol):
+                dependencies.add(dep)
+                # Recursively collect nested context
+                dep_deps, dep_usages = get_extended_context(dep, degree - 1)
+                dependencies.update(dep_deps)
+                usages.update(dep_usages)
+```
+
+### Import Resolution
+The script intelligently resolves imports to find the actual symbol definitions:
+
+```python
+def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
+    """Finds the root symbol for an import"""
+    if isinstance(imp.imported_symbol, Import):
+        return hop_through_imports(imp.imported_symbol)
+    return imp.imported_symbol
+```
+
+## Usage
+
+1. Run the script on a target repository:
+```python
+codebase = Codebase.from_repo("your/repo", commit="commit_hash")
+run(codebase)
+```
+
+2. The script will:
+   - Process each function in the codebase, excluding functions whose name or file path contains `test` or `tutorial`
+   - Skip functions that already have docstrings
+   - Generate contextually aware docstrings for undocumented functions
+   - Commit after each processed function, so an interrupted run keeps the docstrings generated so far
+
+## Example Output
+
+The script provides detailed progress information:
+```
+[1/150] Skipping my_function - already has docstring
+[2/150] Generating docstring for process_data at src/utils.py
+  ✓ Generated docstring
+[3/150] Generating docstring for validate_input at src/validation.py
+  ✗ Failed to generate docstring
+```
+
+## Features
+
+- **Intelligent Context Collection**: Analyzes both dependencies and usages to understand function purpose
+- **Import Resolution**: Follows import chains to find actual symbol definitions
+- **Incremental Commits**: Saves progress after each function for safe interruption
+- **Progress Tracking**: Detailed logging of processing status
+- **Existing Docstring Preservation**: Skips functions that are already documented
+
+## Use Cases
+
+- Documenting legacy codebases
+- Maintaining documentation standards in large projects
+- Onboarding new team members with better code documentation
+- Preparing codebases for public release
diff --git a/examples/document_functions/run.py b/examples/document_functions/run.py
@@ -0,0 +1,119 @@
+import codegen
+from codegen import Codebase
+from codegen.sdk.core.external_module import ExternalModule
+from codegen.sdk.core.import_resolution import Import
+from codegen.sdk.core.symbol import Symbol
+
+
+def hop_through_imports(imp: Import) -> Symbol | ExternalModule:
+    """Finds the root symbol for an import"""
+    if isinstance(imp.imported_symbol, Import):
+        return hop_through_imports(imp.imported_symbol)
+    return imp.imported_symbol
+
+
+def get_extended_context(symbol: Symbol, degree: int) -> tuple[set[Symbol], set[Symbol]]:
+    """Recursively collect dependencies and usages up to the specified degree.
+
+    Args:
+        symbol: The symbol to collect context for
+        degree: How many levels deep to collect dependencies and usages
+
+    Returns:
+        A tuple of (dependencies, usages) where each is a set of related Symbol objects
+    """
+    dependencies = set()
+    usages = set()
+
+    if degree > 0:
+        # Collect direct dependencies
+        for dep in symbol.dependencies:
+            # Hop through imports to find the root symbol
+            if isinstance(dep, Import):
+                dep = hop_through_imports(dep)
+
+            if isinstance(dep, Symbol) and dep not in dependencies:
+                dependencies.add(dep)
+                dep_deps, dep_usages = get_extended_context(dep, degree - 1)
+                dependencies.update(dep_deps)
+                usages.update(dep_usages)
+
+        # Collect usages in the current symbol
+        for usage in symbol.usages:
+            usage_symbol = usage.usage_symbol
+            # Hop through imports for usage symbols too
+            if isinstance(usage_symbol, Import):
+                usage_symbol = hop_through_imports(usage_symbol)
+
+            if isinstance(usage_symbol, Symbol) and usage_symbol not in usages:
+                usages.add(usage_symbol)
+                usage_deps, usage_usages = get_extended_context(usage_symbol, degree - 1)
+                dependencies.update(usage_deps)
+                usages.update(usage_usages)
+
+    return dependencies, usages
+
+
+@codegen.function("document-functions")
+def run(codebase: Codebase):
+    # Define the maximum degree of dependencies and usages to consider for context
+    N_DEGREE = 2
+
+    # Filter out test and tutorial functions first
+    functions = [f for f in codebase.functions if not any(pattern in f.name.lower() for pattern in ["test", "tutorial"]) and not any(pattern in f.filepath.lower() for pattern in ["test", "tutorial"])]
+
+    # Track progress for user feedback
+    total_functions = len(functions)
+    processed = 0
+
+    print(f"Found {total_functions} functions to process (excluding tests and tutorials)")
+
+    for function in functions:
+        processed += 1
+
+        # Skip if already has docstring
+        if function.docstring:
+            print(f"[{processed}/{total_functions}] Skipping {function.name} - already has docstring")
+            continue
+
+        print(f"[{processed}/{total_functions}] Generating docstring for {function.name} at {function.filepath}")
+
+        # Collect context using N-degree dependencies and usages
+        dependencies, usages = get_extended_context(function, N_DEGREE)
+
+        # Generate a docstring using the AI with the context
+        docstring = codebase.ai(
+            """
+            Generate a docstring for this function using the provided context.
+            The context includes:
+            - dependencies: other symbols this function depends on
+            - usages: other symbols that use this function
+        """,
+            target=function,
+            # `codebase.ai` is smart about stringifying symbols
+            context={"dependencies": list(dependencies), "usages": list(usages)},
+        )
+
+        # Set the generated docstring for the function
+        if docstring:
+            function.set_docstring(docstring)
+            print("  ✓ Generated docstring")
+        else:
+            print("  ✗ Failed to generate docstring")
+
+        # Commit after each function so work is saved incrementally
+        # This allows for:
+        # 1. Safe early termination - progress won't be lost
+        # 2. Immediate feedback - can check results while running
+        # 3. Smaller atomic changes - easier to review/revert if needed
+        codebase.commit()
+
+    print(f"\nCompleted processing {total_functions} functions")
+
+
+if __name__ == "__main__":
+    print("Parsing codebase...")
+    codebase = Codebase.from_repo("fastapi/fastapi", commit="887270ff8a54bb58c406b0651678a27589793d2f")
+
+    print("Running function...")
+    run(codebase)