stdout comparison

codeflash-ai · alvin-r · Feb 15, 2025 · Feb 14, 2025 · Feb 14, 2025 · Feb 14, 2025
commit a441dfdce8aa8d1d09d49b8e76dcf10da5d7f9f0
diff --git a/codeflash/verification/equivalence.py b/codeflash/verification/equivalence.py
@@ -1,10 +1,27 @@
+import re
 import sys
 
 from codeflash.cli_cmds.console import logger
 from codeflash.verification.comparator import comparator
 from codeflash.verification.test_results import TestResults, TestType, VerificationType
 
 INCREASED_RECURSION_LIMIT = 5000
+percentage_pattern = re.compile(r"\.\s+\[\d+%\]")
+passed_pattern = re.compile(r"\d+\s+passed\s+in\s+\d+\.\d+s")
+not_allowed = {"test", "codeflash"}
+
+
+def cleanup_stdout(stdout: str) -> str:
+    """Return stdout without test-runner noise lines, newline-terminated; None is treated as empty."""
+    # Captured stdout may be None when a run produced no output, so guard before calling splitlines().
+    kept_lines = [
+        line
+        for line in (stdout or "").splitlines()
+        if not any(word in line for word in not_allowed)
+        and not percentage_pattern.search(line)
+        and not passed_pattern.search(line)
+    ]
+    return "\n".join(kept_lines) + "\n"
 
 
 def compare_test_results(original_results: TestResults, candidate_results: TestResults) -> bool:
@@ -22,6 +39,7 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
     for test_id in test_ids_superset:
         original_test_result = original_results.get_by_unique_invocation_loop_id(test_id)
         cdd_test_result = candidate_results.get_by_unique_invocation_loop_id(test_id)
+
         if cdd_test_result is not None and original_test_result is None:
             continue
         # If helper function instance_state verification is not present, that's ok. continue
@@ -66,6 +84,10 @@ def compare_test_results(original_results: TestResults, candidate_results: TestR
         ):
             are_equal = False
             break
+        if not comparator(cleanup_stdout(original_test_result.stdout), cleanup_stdout(cdd_test_result.stdout)):
+            are_equal = False
+            break
+
     sys.setrecursionlimit(original_recursion_limit)
     if did_all_timeout:
         return False

diff --git a/codeflash/verification/parse_test_output.py b/codeflash/verification/parse_test_output.py
@@ -39,7 +39,7 @@
 def parse_func(file_path: Path) -> XMLParser:
     """Parse the XML file with lxml.etree.XMLParser as the backend."""
     xml_parser = XMLParser(huge_tree=True)
-    return parse(file_path, xml_parser)
+    return parse(file_path, xml_parser)  # type: ignore  # noqa: PGH003, S320
 
 
 def parse_test_return_values_bin(file_location: Path, test_files: TestFiles, test_config: TestConfig) -> TestResults:
@@ -259,6 +259,10 @@ def parse_test_xml(
                     message = testcase.result[0].message.lower()
                     if "timed out" in message:
                         timed_out = True
+
+            stdout = run_result.stdout if run_result and run_result.stdout else None
+            stderr = run_result.stderr if run_result and run_result.stderr else None
+
             matches = re.findall(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!", testcase.system_out or "")
             if not matches or not len(matches):
                 test_results.add(
@@ -278,9 +282,10 @@ def parse_test_xml(
                         test_type=test_type,
                         return_value=None,
                         timed_out=timed_out,
+                        stdout=stdout,
+                        stderr=stderr,
                     )
                 )
-
             else:
                 for match in matches:
                     split_val = match[5].split(":")
@@ -306,21 +311,17 @@ def parse_test_xml(
                             test_type=test_type,
                             return_value=None,
                             timed_out=timed_out,
+                            stdout=stdout,
+                            stderr=stderr,
                         )
                     )
 
     if not test_results:
         logger.info(
             f"Tests '{[test_file.original_file_path for test_file in test_files.test_files]}' failed to run, skipping"
         )
-        if run_result is not None:
-            stdout, stderr = "", ""
-            try:
-                stdout = run_result.stdout.decode()
-                stderr = run_result.stderr.decode()
-            except AttributeError:
-                stdout = run_result.stderr
-            logger.debug(f"Test log - STDOUT : {stdout} \n STDERR : {stderr}")
+        stdout, stderr = (run_result.stdout, run_result.stderr) if run_result else (None, None)  # may be None
+        logger.debug(f"Test log - STDOUT : {stdout} \n STDERR : {stderr}")
     return test_results
 
 
@@ -335,7 +336,11 @@ def merge_test_results(
     # This is done to match the right iteration_id which might not be available in the xml
     for result in xml_test_results:
         if test_framework == "pytest":
-            if result.id.test_function_name.endswith("]") and "[" in result.id.test_function_name:  # parameterized test
+            if (
+                result.id.test_function_name
+                and result.id.test_function_name.endswith("]")
+                and "[" in result.id.test_function_name
+            ):  # parameterized test
                 test_function_name = result.id.test_function_name[: result.id.test_function_name.index("[")]
             else:
                 test_function_name = result.id.test_function_name

diff --git a/codeflash/verification/test_results.py b/codeflash/verification/test_results.py
@@ -85,8 +85,10 @@ class FunctionTestInvocation:
     test_framework: str  # unittest or pytest
     test_type: TestType
     return_value: Optional[object]  # The return value of the function invocation
-    timed_out: Optional[bool]
-    verification_type: Optional[str] = VerificationType.FUNCTION_CALL
+    timed_out: Optional[bool]  # keep this name: constructor calls pass timed_out=...
+    verification_type: Optional[VerificationType] = VerificationType.FUNCTION_CALL
+    stdout: Optional[str] = None  # captured stdout of the test run; None when there was no output
+    stderr: Optional[str] = None  # captured stderr of the test run; None when there was no output
 
     @property
     def unique_invocation_loop_id(self) -> str:

diff --git a/pyproject.toml b/pyproject.toml
@@ -174,7 +174,8 @@ ignore = [
     "TD002",
     "TD003",
     "TD004",
-    "PLR2004"
+    "PLR2004",
+    "UP007" # remove once we drop 3.9 support.
 ]
 
 [tool.ruff.lint.flake8-type-checking]