Merge branch 'main' into array-comparator

KRRT7 · web-flow · commit 4b41d1480f13 · 2025-05-12T18:32:51.000-07:00
diff --git a/.github/workflows/end-to-end-test-init-optim.yaml b/.github/workflows/end-to-end-test-init-optim.yaml
@@ -20,7 +20,7 @@ jobs:
       COLUMNS: 110
       MAX_RETRIES: 3
       RETRY_DELAY: 5
-      EXPECTED_IMPROVEMENT_PCT: 300
+      EXPECTED_IMPROVEMENT_PCT: 30
       CODEFLASH_END_TO_END: 1
     steps:
       - name: 🛎️ Checkout
diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import time
+
 import json
 import os
 import platform
@@ -95,6 +97,7 @@ def optimize_python_code(
         - List[OptimizationCandidate]: A list of Optimization Candidates.
 
         """
+        start_time = time.perf_counter()
         payload = {
             "source_code": source_code,
             "dependency_code": dependency_code,
@@ -118,6 +121,8 @@ def optimize_python_code(
             optimizations_json = response.json()["optimizations"]
             logger.info(f"Generated {len(optimizations_json)} candidates.")
             console.rule()
+            end_time = time.perf_counter()
+            logger.debug(f"Optimization took {end_time - start_time:.2f} seconds.")
             return [
                 OptimizedCandidate(
                     source_code=opt["source_code"],
diff --git a/codeflash/context/code_context_extractor.py b/codeflash/context/code_context_extractor.py
@@ -217,7 +217,9 @@ def extract_code_string_context_from_files(
             continue
         try:
             qualified_helper_function_names = {func.qualified_name for func in helper_function_sources}
-            code_without_unused_defs = remove_unused_definitions_by_function_names(original_code, qualified_helper_function_names)
+            code_without_unused_defs = remove_unused_definitions_by_function_names(
+                original_code, qualified_helper_function_names
+            )
             code_context = parse_code_and_prune_cst(
                 code_without_unused_defs, code_context_type, set(), qualified_helper_function_names, remove_docstrings
             )
@@ -325,7 +327,9 @@ def extract_code_markdown_context_from_files(
             continue
         try:
             qualified_helper_function_names = {func.qualified_name for func in helper_function_sources}
-            code_without_unused_defs = remove_unused_definitions_by_function_names(original_code, qualified_helper_function_names)
+            code_without_unused_defs = remove_unused_definitions_by_function_names(
+                original_code, qualified_helper_function_names
+            )
             code_context = parse_code_and_prune_cst(
                 code_without_unused_defs, code_context_type, set(), qualified_helper_function_names, remove_docstrings
             )
@@ -403,12 +407,8 @@ def get_function_sources_from_jedi(
             for name in names:
                 try:
                     definitions: list[Name] = name.goto(follow_imports=True, follow_builtin_imports=False)
-                except Exception as e:
-                    try:
-                        logger.exception(f"Error while getting definition for {name.full_name}: {e}")
-                    except Exception as e:
-                        # name.full_name can also throw exceptions sometimes
-                        logger.exception(f"Error while getting definition: {e}")
+                except Exception:  # noqa: BLE001
+                    logger.debug(f"Error while getting definitions for {qualified_function_name}")
                     definitions = []
                 if definitions:
                     # TODO: there can be multiple definitions, see how to handle such cases
@@ -424,7 +424,12 @@ def get_function_sources_from_jedi(
                         and not belongs_to_function_qualified(definition, qualified_function_name)
                         and definition.full_name.startswith(definition.module_name)
                         # Avoid nested functions or classes. Only class.function is allowed
-                        and len((qualified_name := get_qualified_name(definition.module_name, definition.full_name)).split(".")) <= 2
+                        and len(
+                            (qualified_name := get_qualified_name(definition.module_name, definition.full_name)).split(
+                                "."
+                            )
+                        )
+                        <= 2
                     ):
                         function_source = FunctionSource(
                             file_path=definition_path,
diff --git a/codeflash/main.py b/codeflash/main.py
@@ -7,6 +7,7 @@
 from codeflash.cli_cmds.cli import parse_args, process_pyproject_config
 from codeflash.cli_cmds.cmd_init import CODEFLASH_LOGO, ask_run_end_to_end_test
 from codeflash.cli_cmds.console import paneled_text
+from codeflash.code_utils.checkpoint import ask_should_use_checkpoint_get_functions
 from codeflash.code_utils.config_parser import parse_config_file
 from codeflash.optimization import optimizer
 from codeflash.telemetry import posthog_cf
@@ -35,6 +36,7 @@ def main() -> None:
         ask_run_end_to_end_test(args)
     else:
         args = process_pyproject_config(args)
+        args.previous_checkpoint_functions = ask_should_use_checkpoint_get_functions(args)
         init_sentry(not args.disable_telemetry, exclude_errors=True)
         posthog_cf.initialize_posthog(not args.disable_telemetry)
         optimizer.run_with_args(args)
diff --git a/codeflash/optimization/optimizer.py b/codeflash/optimization/optimizer.py
@@ -16,7 +16,7 @@
 from codeflash.benchmarking.utils import print_benchmark_table, validate_and_format_benchmark_table
 from codeflash.cli_cmds.console import console, logger, progress_bar
 from codeflash.code_utils import env_utils
-from codeflash.code_utils.checkpoint import CodeflashRunCheckpoint, ask_should_use_checkpoint_get_functions
+from codeflash.code_utils.checkpoint import CodeflashRunCheckpoint
 from codeflash.code_utils.code_replacer import normalize_code, normalize_node
 from codeflash.code_utils.code_utils import cleanup_paths, get_run_tmp_file
 from codeflash.code_utils.static_analysis import analyze_imported_modules, get_first_top_level_function_or_method_ast
@@ -85,7 +85,6 @@ def run(self) -> None:
         function_optimizer = None
         file_to_funcs_to_optimize: dict[Path, list[FunctionToOptimize]]
         num_optimizable_functions: int
-        previous_checkpoint_functions = ask_should_use_checkpoint_get_functions(self.args)
         # discover functions
         (file_to_funcs_to_optimize, num_optimizable_functions) = get_functions_to_optimize(
             optimize_all=self.args.all,
@@ -96,7 +95,7 @@ def run(self) -> None:
             ignore_paths=self.args.ignore_paths,
             project_root=self.args.project_root,
             module_root=self.args.module_root,
-            previous_checkpoint_functions=previous_checkpoint_functions,
+            previous_checkpoint_functions=self.args.previous_checkpoint_functions,
         )
         function_benchmark_timings: dict[str, dict[BenchmarkKey, int]] = {}
         total_benchmark_timings: dict[BenchmarkKey, int] = {}
diff --git a/codeflash/verification/concolic_testing.py b/codeflash/verification/concolic_testing.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import time
+
 import ast
 import subprocess
 import tempfile
@@ -20,6 +22,7 @@
 def generate_concolic_tests(
     test_cfg: TestConfig, args: Namespace, function_to_optimize: FunctionToOptimize, function_to_optimize_ast: ast.AST
 ) -> tuple[dict[str, list[FunctionCalledInTest]], str]:
+    start_time = time.perf_counter()
     function_to_concolic_tests = {}
     concolic_test_suite_code = ""
     if (
@@ -84,4 +87,6 @@ def generate_concolic_tests(
         else:
             logger.debug(f"Error running CrossHair Cover {': ' + cover_result.stderr if cover_result.stderr else '.'}")
             console.rule()
+    end_time = time.perf_counter()
+    logger.debug(f"Generated concolic tests in {end_time - start_time:.2f} seconds")
     return function_to_concolic_tests, concolic_test_suite_code
diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py
@@ -107,13 +107,19 @@ def parse_sqlite_test_results(sqlite_file_path: Path, test_files: TestFiles, tes
         logger.warning(f"No test results for {sqlite_file_path} found.")
         console.rule()
         return test_results
+    db = None
     try:
         db = sqlite3.connect(sqlite_file_path)
         cur = db.cursor()
         data = cur.execute(
             "SELECT test_module_path, test_class_name, test_function_name, "
             "function_getting_tested, loop_index, iteration_id, runtime, return_value,verification_type FROM test_results"
         ).fetchall()
+    except Exception as e:
+        logger.warning(f"Failed to parse test results from {sqlite_file_path}. Exception: {e}")
+        if db is not None:
+            db.close()
+        return test_results
     finally:
         db.close()
     for val in data:
diff --git a/codeflash/verification/test_runner.py b/codeflash/verification/test_runner.py
@@ -15,8 +15,8 @@
 if TYPE_CHECKING:
     from codeflash.models.models import TestFiles
 
-BEHAVIORAL_BLOCKLISTED_PLUGINS = ["benchmark"]
-BENCHMARKING_BLOCKLISTED_PLUGINS = ["codspeed", "cov", "benchmark", "profiling"]
+BEHAVIORAL_BLOCKLISTED_PLUGINS = ["benchmark", "codspeed", "xdist", "sugar"]
+BENCHMARKING_BLOCKLISTED_PLUGINS = ["codspeed", "cov", "benchmark", "profiling", "xdist", "sugar"]
 
 
 def execute_test_subprocess(
@@ -141,6 +141,7 @@ def run_behavioral_tests(
         coverage_config_file if enable_coverage else None,
     )
 
+
 def run_line_profile_tests(
     test_paths: TestFiles,
     pytest_cmd: str,
@@ -154,7 +155,6 @@ def run_line_profile_tests(
     pytest_min_loops: int = 5,
     pytest_max_loops: int = 100_000,
     line_profiler_output_file: Path | None = None,
-
 ) -> tuple[Path, subprocess.CompletedProcess]:
     if test_framework == "pytest":
         pytest_cmd_list = (
@@ -191,7 +191,7 @@ def run_line_profile_tests(
         pytest_test_env = test_env.copy()
         pytest_test_env["PYTEST_PLUGINS"] = "codeflash.verification.pytest_plugin"
         blocklist_args = [f"-p no:{plugin}" for plugin in BENCHMARKING_BLOCKLISTED_PLUGINS]
-        pytest_test_env["LINE_PROFILE"]="1"
+        pytest_test_env["LINE_PROFILE"] = "1"
         results = execute_test_subprocess(
             pytest_cmd_list + pytest_args + blocklist_args + result_args + test_files,
             cwd=cwd,
@@ -203,6 +203,7 @@ def run_line_profile_tests(
         raise ValueError(msg)
     return line_profiler_output_file, results
 
+
 def run_benchmarking_tests(
     test_paths: TestFiles,
     pytest_cmd: str,
diff --git a/codeflash/verification/verifier.py b/codeflash/verification/verifier.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import time
+
 import ast
 from pathlib import Path
 from typing import TYPE_CHECKING
@@ -29,6 +31,7 @@ def generate_tests(
 ) -> tuple[str, str, Path] | None:
     # TODO: Sometimes this recreates the original Class definition. This overrides and messes up the original
     #  class import. Remove the recreation of the class definition
+    start_time = time.perf_counter()
     test_module_path = Path(module_name_from_file_path(test_path, test_cfg.tests_project_rootdir))
     response = aiservice_client.generate_regression_tests(
         source_code_being_tested=source_code_being_tested,
@@ -54,7 +57,8 @@ def generate_tests(
     else:
         logger.warning(f"Failed to generate and instrument tests for {function_to_optimize.function_name}")
         return None
-
+    end_time = time.perf_counter()
+    logger.debug(f"Generated tests in {end_time - start_time:.2f} seconds")
     return (
         generated_test_source,
         instrumented_behavior_test_source,
diff --git a/docs/docs/codeflash-concepts/how-codeflash-works.md b/docs/docs/codeflash-concepts/how-codeflash-works.md
@@ -21,13 +21,12 @@ Codeflash currently only runs tests that directly call the target function in th
 
 ## Optimization Generation
 
-To optimize code, Codeflash first gathers all necessary context from the codebase. It then calls our backend to generate several candidate optimizations. These are called "candidates" because their speed and correctness haven't been verified yet. Both properties will be verified in later steps.
-
+To optimize code, Codeflash first gathers all necessary context from the codebase. It also line-profiles your code to understand where the bottlenecks might reside. It then calls our backend to generate several candidate optimizations. These are called "candidates" because their speed and correctness haven't been verified yet. Both properties will be verified in later steps.
 ## Verification of correctness
 
 ![Verification](/img/codeflash_arch_diagram.gif)
 
-The goal of correctness verification is to ensure that when the original code is replaced by the new code, there are no behavioral changes in the code and the rest of the system. This means the replacement should be completely safe.
+The goal of correctness verification is to ensure that when the new code replaces the original code, there are no behavioral changes in the code and the rest of the system. This means the replacement should be completely safe.
 
 To verify correctness, Codeflash calls the function with numerous inputs, confirming that the new function behaves identically to the original.
 
@@ -60,4 +59,4 @@ Codeflash implements several techniques to measure code performance accurately.
 
 ## Creating Pull Requests
 
-Once an optimization passes all checks, Codeflash creates a pull request through the Codeflash GitHub app directly in your repository. The pull request includes the new code, the speedup percentage, an explanation of the optimization, test statistics including coverage, and the test content itself. You can review and merge the new code if it meets your standards. Feel free to modify the code as needed—we welcome your improvements!
+Once an optimization passes all checks, Codeflash creates a pull request through the Codeflash GitHub app directly in your repository. The pull request includes the new code, the speedup percentage, an explanation of the optimization, test statistics including coverage, and the test content itself. You can review, edit, and merge the new code.
diff --git a/docs/docs/intro.md b/docs/docs/intro.md
@@ -2,27 +2,26 @@
 sidebar_position: 1
 slug: /
 ---
-# Introduction
-Welcome to the Codeflash documentation!
-
 ## What is Codeflash?
 
 Welcome! Codeflash is an AI performance optimizer for Python code.
-Codeflash speeds up Python code by figuring out the best way to rewrite a particular function, while verifying the behavior of the code is unchanged.
+Codeflash speeds up Python code by figuring out the best way to rewrite your code while verifying that the behavior of the code is unchanged.
 
-The optimizations Codeflash finds are generally better algorithms, opportunities to remove wasteful compute, better logic, and utilization of more efficient library methods.
+The optimizations Codeflash finds are generally better algorithms, opportunities to remove wasteful compute, better logic, and utilization of more efficient library methods. Codeflash
+does not modify the architecture of your code, but it tries to find the most efficient implementation of that architecture.
 
 ### How does Codeflash verify correctness?
 
-Codeflash verifies the correctness of the optimizations it finds by generating and running new regression tests, as well as any existing tests you may already have.
-This offers additional confidence that the behavior of your code remains unchanged.
+Codeflash verifies the correctness of the optimizations it finds by generating and running new regression tests, as well as any existing tests you may already have. Codeflash tries to ensure that your
+code behaves the same way before and after the optimization.
+This offers high confidence that the behavior of your code remains unchanged.
 
 ### Continuous Optimization
 
-Because Codeflash is an automated process, you can install it as a GitHub action and have it run on every pull request made to your codebase.
+Because Codeflash is an automated process, you can install it as a GitHub action and have it optimize the new code on every pull request.
 When Codeflash finds an optimization, it will ask you to review it. It will write a detailed explanation of the changes it made, and include all relevant info like % speed increase and proofs of correctness.
 
-Having Codeflash installed on your Github repository gives you the peace of mind that your code is always written optimally. We call it *Continuous Optimization*.
+This is a great way to ensure that your code, your team's code and your AI Agent's code are optimized for performance before they cause a performance regression. We call this *Continuous Optimization*.
 
 ### Features
 
@@ -34,9 +33,6 @@ Having Codeflash installed on your Github repository gives you the peace of mind
 | [Optimize all code in a repo](optimizing-with-codeflash/codeflash-all)                  | Codeflash discovers all functions in a repo and optimizes all of them!                                                                                                                              |
 | [Optimize every new pull request](optimizing-with-codeflash/optimize-prs)               | Codeflash runs as a GitHub action and GitHub app and reviews all new code for Optimizations                                                                                                         |
 | [Optimize a whole workflow by Tracing it](optimizing-with-codeflash/trace-and-optimize) | End to end optimization for all the functions called in a workflow, by tracing to collect real inputs seen during execution and ensuring correctness and performance optimization with those inputs |
-| Correctness Verification                                                                | The way Codeflash gains high confidence that the newly generated optimization has the same behavior as the originally written function.                                                             |
-| Performance Measurement                                                                 | Measuring execution times on a set of inputs to estimate runtime performance.                                                                                                                       |
-
 
 ## How to use these docs
 
@@ -45,4 +41,4 @@ Start by installing Codeflash, then explore the different ways of using it to op
 
 ## Questions or Feedback?
 
-We love feedback! If you have any questions or feedback, use the Intercom button in the lower right, or drop us a note at [founders@codeflash.ai](mailto:founders@codeflash.ai) - we read every message!
+We love feedback! If you have any questions or feedback, use the Intercom button in the lower right, join our [Discord](https://www.codeflash.ai/discord), or drop us a note at [contact@codeflash.ai](mailto:contact@codeflash.ai) - we read every message!
diff --git a/docs/docs/optimizing-with-codeflash/benchmarking.md b/docs/docs/optimizing-with-codeflash/benchmarking.md
@@ -1,5 +1,5 @@
 ---
-sidebar_position: 4
+sidebar_position: 5
 ---
 # Using Benchmarks
 
diff --git a/docs/docs/optimizing-with-codeflash/trace-and-optimize.md b/docs/docs/optimizing-with-codeflash/trace-and-optimize.md
@@ -1,9 +1,9 @@
 ---
 sidebar_position: 4
 ---
-# Optimize Workflows End-to-End
+# Optimize Workflows End-to-End by Tracing Them
 
-Codeflash supports optimizing an entire Python script end to end by tracing the execution of the script and generating Replay Tests. These Replay Tests can be used to optimize all the functions called in the script.
+Codeflash supports optimizing an entire Python script end to end by tracing the execution of the script and generating Replay Tests. Tracing means following the execution of a script and capturing the inputs to all the functions called, so they can be replayed again while optimizing. These Replay Tests can be used to optimize all the functions called in the script.
 
 ## Motivation for Tracing a workflow
 
diff --git a/tests/scripts/end_to_end_test_init_optimization.py b/tests/scripts/end_to_end_test_init_optimization.py
@@ -9,7 +9,7 @@ def run_test(expected_improvement_pct: int) -> bool:
         file_path="remove_control_chars.py",
         function_name="CharacterRemover.remove_control_characters",
         test_framework="pytest",
-        min_improvement_x=1.0,
+        min_improvement_x=0.3,
         coverage_expectations=[
             CoverageExpectation(
                 function_name="CharacterRemover.remove_control_characters", expected_coverage=100.0, expected_lines=[14]