Add a script to split minimal tests

This script splits minimal tests into distinct directories depending on their metric differences
mozilla · marco-c · May 11, 2021 · Apr 20, 2021 · Apr 20, 2021 · Apr 21, 2021
commit d24569c75779be90a948df6880a4623ce2b0398d
diff --git a/split-minimal-tests.py b/split-minimal-tests.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+
+"""split-minimal-tests
+This script splits HTML minimal-tests, produced by a software called
+`json-minimal-tests`, into distinct directories depending on metric differences.
+
+Usage:
+
+./split-minimal-tests.py -i INPUT_DIR -o OUTPUT_DIR [-t MT_THRESHOLD]
+
+NOTE: OUTPUT_DIR is the path to the output directory to be created.
+This directory could contain either a series of directories, called as
+the metrics that presents differences, or be empty if no metric differences
+are found.
+MT_THRESHOLD determines the maximum number of considered minimal tests
+for a metric.
+"""
+
+import argparse
+import pathlib
+import re
+import shutil
+import typing as T
+
+# List of metrics
+# TODO: Implement a command into rust-code-analysis-cli that returns all
+# computed metrics https://github.com/mozilla/rust-code-analysis/issues/478
+METRICS = [
+    "cognitive",
+    "sloc",
+    "ploc",
+    "lloc",
+    "cloc",
+    "blank",
+    "cyclomatic",
+    "halstead",
+    "nom",
+    "nexits",
+    "nargs",
+]
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        prog="split-minimal-tests",
+        description="This tool splits HTML minimal-tests, produced by "
+        "a software called `json-minimal-tests`, into distinct directories "
+        "depending on metric differences.",
+        epilog="The source code of this program can be found on "
+        "GitHub at https://github.com/mozilla/rust-code-analysis",
+    )
+
+    # Arguments
+    parser.add_argument(
+        "--input",
+        "-i",
+        type=lambda value: pathlib.Path(value),
+        required=True,
+        help="Input directory containing HTML minimal tests.",
+    )
+
+    parser.add_argument(
+        "--output",
+        "-o",
+        type=lambda value: pathlib.Path(value),
+        required=True,
+        help="Path to the output directory.",
+    )
+
+    # Optional arguments
+    parser.add_argument(
+        "--threshold",
+        "-t",
+        type=int,
+        help="Maximum number of considered minimal tests for a metric.",
+    )
+
+    # Parse arguments
+    args = parser.parse_args()
+
+    # Create output directory
+    args.output.mkdir(parents=True, exist_ok=True)
+
+    # Save files associated to each metric
+    metrics_saver: T.Dict[str, T.List] = {metric_name: [] for metric_name in METRICS}
+
+    # Iterate over the files contained in the input directory
+    for path in args.input.glob("*.html"):
+        # Open a file
+        with open(path) as f:
+            # Read a file
+            file_str = f.read()
+
+            # Remove all code inside <pre></pre> tags
+            file_no_pre = re.sub(r"<pre>(.|\n)*?<\/pre>", "", file_str)
+
+            # Iterate over metrics
+            for metric_name, metric_files in metrics_saver.items():
+                # Check if there is a metric difference in a file
+                m = re.search(f"(\.{metric_name})", file_no_pre)
+
+                # If some errors occurred, skip to the next metric
+                if m is None:
+                    continue
+
+                # Save path if there is a metric difference in a file
+                if m.group(1):
+                    metric_files.append(path)
+
+    # Iterate over metrics to print them
+    for metric_name, metric_files in metrics_saver.items():
+        # Create path for metric directory
+        metric_path = args.output / metric_name
+
+        if metric_files:
+            # Create metric directory
+            metric_path.mkdir(parents=True, exist_ok=True)
+
+            # Save the number of files specified in the threshold
+            output_paths = (
+                metric_files[: args.threshold] if args.threshold else metric_files
+            )
+
+            for path in output_paths:
+                # Copy files in the directory
+                shutil.copy(path, metric_path)
+
+
+if __name__ == "__main__":
+    main()