Draft
Changes from all commits
Commits (35)
cb3caa2
add initial scaffold
luarss Sep 29, 2024
3cfffe3
fix args.job syntax error
luarss Oct 1, 2024
1be04bf
add smoke test timeout
luarss Oct 1, 2024
2a3ff66
consistent file names and fix smoke_test_sweep
luarss Oct 2, 2024
7486249
move resources_per_trial to sweep_parser
luarss Oct 2, 2024
5f46d5d
fix timeout test and add cpu_budget logic
luarss Oct 3, 2024
a86b771
add `--yes` to avoid prompt.
luarss Oct 5, 2024
e5c2e3e
add prototype for CPUBudgetSmokeTest
luarss Oct 5, 2024
236b1b7
fix timeout
luarss Oct 6, 2024
b319e5e
reintroduce args.jobs, and recalculate timeouts accordingly
luarss Oct 8, 2024
a2dac19
test_trial_timeout -> test_timeout
luarss Oct 14, 2024
3b358fb
fix cpu_budget default to be None and reduce timeout fail
luarss Oct 15, 2024
bc886f4
use cpu_budget default -1 to mean unlimited
luarss Oct 15, 2024
6e96a64
fix smoke timeout
luarss Oct 19, 2024
c311101
fix test timeout by timeout check in subprocess.run, also fix typo in…
luarss Oct 19, 2024
582fffe
add tooltip, update readmes
luarss Jan 16, 2025
15e2a07
fix black
luarss Jan 16, 2025
6e12df6
update test suite
luarss Jan 16, 2025
6522488
add --yes flag
luarss Jan 17, 2025
f2b7883
do not exit code
luarss Jan 18, 2025
d05ea70
fixes to smoke_test_timeout
luarss Jan 18, 2025
27b0a73
fix test_tune_resume
luarss Jan 18, 2025
0468020
shorten timeout
luarss Jan 18, 2025
220e641
fix subprocess invocation
luarss Jan 19, 2025
cd4455a
more generous timeout for resume check
luarss Jan 19, 2025
d5019d1
uniquify experiment dir
luarss Jan 20, 2025
5e27fca
Fix experiment name checks
luarss Jan 20, 2025
26ecb7c
cpubudget test fixes
luarss Jan 21, 2025
4c61459
fix lint
luarss Feb 6, 2025
81ad4e5
fix global args, restore ihp smoke test tune
luarss Feb 8, 2025
6361372
remove cpubudget from sweep test
luarss Feb 8, 2025
a06b8ae
standardise cpubudget test
luarss Feb 8, 2025
cb0f283
standardise ref_file resume tests
luarss Feb 8, 2025
e841bbc
standardise and fix smoke_test_timeout
luarss Feb 9, 2025
0693b7c
fix typo
luarss Feb 9, 2025
6 changes: 6 additions & 0 deletions flow/test/test_autotuner.sh
@@ -17,6 +17,12 @@ python3 -m unittest tools.AutoTuner.test.smoke_test_sweep.${PLATFORM}SweepSmokeT
echo "Running Autotuner smoke tests for --sample and --iteration."
python3 -m unittest tools.AutoTuner.test.smoke_test_sample_iteration.${PLATFORM}SampleIterationSmokeTest.test_sample_iteration

echo "Running Autotuner smoke tests for --cpu_budget."
python3 -m unittest tools.AutoTuner.test.smoke_test_cpubudget.${PLATFORM}CPUBudgetSmokeTest.test_cpu_budget

echo "Running Autotuner smoke tests for --timeout and --timeout_per_trial."
python3 -m unittest tools.AutoTuner.test.smoke_test_timeout.${PLATFORM}TimeoutSmokeTest.test_timeout

if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; then
echo "Running Autotuner ref file test (only once)"
python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files
103 changes: 100 additions & 3 deletions tools/AutoTuner/src/autotuner/distributed.py
@@ -74,6 +74,30 @@
# Global variable for args
args = None

TEMPLATE = """
Expected figures for this experiment.
Wall time: {runtime:.5f} seconds
Number of Samples:
Samples per minute: {num_samples_per_minute:.5f}
Design runtime of 10 min: {num_samples_10min:.5f}
Design runtime of 1h: {num_samples_1h:.5f}
Number of iterations
Design runtime of 10 min: {num_iterations_10min:.5f}
Design runtime of 1h: {num_iterations_1h:.5f}
"""


def calculate_expected_numbers(runtime, num_samples):
# Runtime - seconds
return TEMPLATE.format(
runtime=runtime,
num_samples_per_minute=num_samples / (runtime * 60),
num_samples_10min=(num_samples / (runtime * 60)) * 10,
num_samples_1h=(num_samples / (runtime * 60)) * 60,
num_iterations_10min=((num_samples / (runtime * 60)) * 10) / num_samples,
num_iterations_1h=((num_samples / (runtime * 60)) * 60) / num_samples,
)


class AutoTunerBase(tune.Trainable):
"""
@@ -233,7 +257,7 @@ def parse_arguments():
help="mode of execution", dest="mode", required=True
)
tune_parser = subparsers.add_parser("tune")
_ = subparsers.add_parser("sweep")
sweep_parser = subparsers.add_parser("sweep")

# DUT
parser.add_argument(
@@ -268,12 +292,22 @@ def parse_arguments():
" FLOW_VARIANT and to set the Ray log destination.",
)
parser.add_argument(
"--timeout",
"--timeout_per_trial",
type=float,
metavar="<float>",
default=None,
help="Time limit (in hours) for each trial run. Default is no limit.",
)
parser.add_argument(
"--timeout",
type=float,
metavar="<float>",
default=None,
help="Time limit (in hours) for the whole Autotuning process.",
)
parser.add_argument(
"-y", "--yes", action="store_true", help="Skip confirmation prompt."
)
tune_parser.add_argument(
"--resume",
action="store_true",
@@ -335,7 +369,7 @@ def parse_arguments():
help="Additional arguments given to ./build_openroad.sh.",
)

# ML
# Tune mode
tune_parser.add_argument(
"--algorithm",
type=str,
@@ -393,7 +427,22 @@ def parse_arguments():
help="Random seed. (0 means no seed.)",
)

sweep_parser.add_argument(
"--resources_per_trial",
type=int,
metavar="<int>",
default=1,
help="Number of CPUs to request for each sweep job.",
)

# Workload
parser.add_argument(
"--cpu_budget",
type=int,
metavar="<int>",
default=-1,
help="CPU Hours (-1 means no limit.)",
)
parser.add_argument(
"--jobs",
type=int,
@@ -458,8 +507,33 @@ def parse_arguments():
else:
args.experiment += f"-{args.mode}"

# Convert time to seconds
if args.timeout_per_trial is not None:
args.timeout_per_trial = round(args.timeout_per_trial * 3600)
if args.timeout is not None:
args.timeout = round(args.timeout * 3600)
args.timeout = set_timeout(args.timeout, args.timeout_per_trial)

# Calculate timeout based on cpu_budget
if args.cpu_budget != -1:
args.timeout = round(args.cpu_budget / os.cpu_count() * 3600)
args.timeout_per_trial = round(
args.cpu_budget / (args.jobs * args.resources_per_trial) * 3600
)
args.timeout = set_timeout(args.timeout, args.timeout_per_trial)
if args.mode == "tune":
template = calculate_expected_numbers(args.timeout, args.samples)
else:
template = calculate_expected_numbers(args.timeout, 1)
print(template)
if not args.yes:
print(
"[INFO TUN-0022] Tip: use the flag --yes to skip the confirmation prompt."
)
ans = input("Are you sure you want to proceed? (y/n): ")
if ans.lower() != "y":
print("Exiting AutoTuner.")
sys.exit(0)

return args
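
The hunk above boils down to two unit conversions: --timeout and --timeout_per_trial are given in hours and converted to seconds, while --cpu_budget (CPU-hours) is divided across all cores for the global limit and across jobs * resources_per_trial for the per-trial limit, with set_timeout keeping the tighter of the two. A minimal worked sketch, assuming 32 CPU-hours, 16 cores, 4 jobs and 1 CPU per trial (all assumed values, not taken from this PR):

# Illustration only; mirrors the arithmetic in parse_arguments() above.
cpu_budget = 32          # CPU-hours (assumed)
num_cpus = 16            # stand-in for os.cpu_count() (assumed)
jobs = 4                 # --jobs (assumed)
resources_per_trial = 1  # CPUs per trial (assumed)

timeout = round(cpu_budget / num_cpus * 3600)                                # 7200 s of wall time
timeout_per_trial = round(cpu_budget / (jobs * resources_per_trial) * 3600)  # 28800 s per trial
effective = min(timeout, timeout_per_trial)                                  # set_timeout() keeps 7200 s
print(effective)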

@@ -544,12 +618,26 @@ def set_training_class(function):
return None


def set_timeout(timeout, timeout_per_trial):
"""
Set timeout for experiment.
"""
return (
min(timeout, timeout_per_trial)
if (timeout and timeout_per_trial)
else (timeout or timeout_per_trial)
)
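
For reference, the helper above resolves the two limits as follows; a small sketch of its behaviour under the None/number conventions used by the argument parser:

# Both limits set: the tighter one wins.
assert set_timeout(7200, 3600) == 3600
# Only one limit set: it is passed through unchanged.
assert set_timeout(7200, None) == 7200
assert set_timeout(None, 3600) == 3600
# Neither set: no limit.
assert set_timeout(None, None) is None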


@ray.remote
def save_best(results):
"""
Save best configuration of parameters found.
"""
best_config = results.best_config
if METRIC not in results.best_result:
print("[ERROR TUN-0023] Metric not found in results.")
sys.exit(1)
best_config["best_result"] = results.best_result[METRIC]
trial_id = results.best_trial.trial_id
new_best_path = f"{LOCAL_DIR}/{args.experiment}/"
@@ -605,6 +693,14 @@ def main():

LOCAL_DIR, ORFS_FLOW_DIR, INSTALL_PATH = prepare_ray_server(args)

# Check: Experiment name must be unique.
if os.path.exists(f"./{LOCAL_DIR}/{args.experiment}"):
print(
f"[ERROR TUN-0032] Experiment {args.experiment} already exists."
" Please choose a different name."
)
sys.exit(1)

if args.mode == "tune":
best_params = set_best_params(args.platform, args.design)
search_algo = set_algorithm(args.experiment, config_dict)
@@ -617,6 +713,7 @@
name=args.experiment,
metric=METRIC,
mode="min",
time_budget_s=args.timeout,
num_samples=args.samples,
fail_fast=False,
local_dir=LOCAL_DIR,
11 changes: 4 additions & 7 deletions tools/AutoTuner/test/ref_file_check.py
@@ -3,9 +3,6 @@
import os

cur_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(cur_dir, "../src/autotuner")
orfs_dir = os.path.join(cur_dir, "../../../flow")
os.chdir(src_dir)


class RefFileCheck(unittest.TestCase):
@@ -15,19 +12,19 @@ class RefFileCheck(unittest.TestCase):

def setUp(self):
configs = [
"../../test/files/no_sdc_ref.json",
"../../test/files/no_fr_ref.json",
os.path.join(cur_dir, "../../test/files/no_sdc_ref.json"),
os.path.join(cur_dir, "../../test/files/no_fr_ref.json"),
]
self.commands = [
f"python3 distributed.py"
f"python3 -m autotuner.distributed"
f" --design {self.design}"
f" --platform {self.platform}"
f" --config {c}"
f" --yes"
f" tune --samples 1"
for c in configs
]

# Make this a test case
def test_files(self):
for c in self.commands:
out = subprocess.run(c, shell=True)
32 changes: 21 additions & 11 deletions tools/AutoTuner/test/resume_check.py
@@ -6,9 +6,6 @@
from contextlib import contextmanager

cur_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(cur_dir, "../src/autotuner")
orfs_dir = os.path.join(cur_dir, "../../../flow")
os.chdir(src_dir)


@contextmanager
@@ -30,28 +27,36 @@ class ResumeCheck(unittest.TestCase):
design = "gcd"
samples = 5
iterations = 2
timeout = 200

def setUp(self):
self.config = os.path.join(
orfs_dir, "designs", self.platform, self.design, "autotuner.json"
cur_dir,
"../../../",
"flow",
"designs",
self.platform,
self.design,
"autotuner.json",
)
self.jobs = self.samples
self.num_cpus = os.cpu_count()

# How it works: Say we have 5 samples and 5 iterations.
# How it works: Say we have 5 samples and 5 iterations and 16 cores.
# If we want to limit to only 5 trials (and avoid any parallelism magic by Ray)
# We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!)

# Cast to 1 decimal place
res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
options = ["", "--resume"]
self.commands = [
f"python3 distributed.py"
f"python3 -m autotuner.distributed"
f" --design {self.design}"
f" --platform {self.platform}"
f" --config {self.config}"
f" --jobs {self.jobs}"
f" --experiment test-resume"
f" --experiment test_resume"
f" --yes"
f" tune --iterations {self.iterations} --samples {self.samples}"
f" --resources_per_trial {res_per_trial}"
f" {c}"
@@ -65,18 +70,23 @@ def test_tune_resume(self):
# Run the first config asynchronously.
print("Running the first config")
with managed_process(self.commands[0], shell=True) as proc:
time.sleep(120)
time.sleep(self.timeout)

# Keep trying to stop the ray cluster until it is stopped
while 1:
proc = subprocess.run("ray status", shell=True)
proc = subprocess.run(
["ray", "status"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
no_nodes = proc.returncode != 0
proc = subprocess.run("ray stop", shell=True)
proc = subprocess.run(
["ray", "stop"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
successful = proc.returncode == 0

if no_nodes and successful:
print("Ray cluster successfully stopped with no remaining nodes.")
break
time.sleep(10)
time.sleep(5)

# Run the second config to completion
print("Running the second config")
5 changes: 3 additions & 2 deletions tools/AutoTuner/test/smoke_test_algo_eval.py
@@ -23,12 +23,13 @@ def setUp(self):
f"python3 -m autotuner.distributed"
f" --design {self.design}"
f" --platform {self.platform}"
f" --experiment {self.experiment}"
f" --experiment {self.experiment}-{idx}"
f" --config {self.config}"
f" --yes"
f" tune --samples 5"
f" --algorithm {a} --eval {e}"
f" --reference {self.reference}"
for a, e in self.matrix
for idx, (a, e) in enumerate(self.matrix)
]

def make_base(self):
66 changes: 66 additions & 0 deletions tools/AutoTuner/test/smoke_test_cpubudget.py
@@ -0,0 +1,66 @@
import unittest
import subprocess
import os

cur_dir = os.path.dirname(os.path.abspath(__file__))


class BaseCPUBudgetSmokeTest(unittest.TestCase):
platform = ""
design = ""

def setUp(self):
self.config = os.path.join(
cur_dir,
f"../../../flow/designs/{self.platform}/{self.design}/autotuner.json",
)
self.experiment = f"smoke-test-cpubudget-{self.platform}"

# Tests should fail with such a low budget.
self.cpu_budget = 1
self.expected_timeout = self.cpu_budget / os.cpu_count() * 3600

self.command = (
"python3 -m autotuner.distributed"
f" --design {self.design}"
f" --platform {self.platform}"
f" --experiment {self.experiment}"
f" --config {self.config}"
f" --cpu_budget {self.cpu_budget}"
f" --yes"
f" tune --samples 1"
)
self.command = self.command.split()

def test_cpu_budget(self):
if not (self.platform and self.design):
raise unittest.SkipTest("Platform and design have to be defined")
try:
_ = subprocess.run(
self.command,
stderr=subprocess.PIPE,
timeout=self.expected_timeout,
)
failed = False
except subprocess.TimeoutExpired:
failed = True
self.assertTrue(failed)


class asap7CPUBudgetSmokeTest(BaseCPUBudgetSmokeTest):
platform = "asap7"
design = "gcd"


class sky130hdCPUBudgetSmokeTest(BaseCPUBudgetSmokeTest):
platform = "sky130hd"
design = "gcd"


class ihpsg13g2CPUBudgetSmokeTest(BaseCPUBudgetSmokeTest):
platform = "ihp-sg13g2"
design = "gcd"


if __name__ == "__main__":
unittest.main()
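
The pass condition of this test is a timing argument: with cpu_budget = 1 CPU-hour the tuner caps its wall time at roughly 1 / os.cpu_count() hours, and the outer subprocess.run uses the same value as its timeout, so a real flow run is expected to still be going when that timeout expires and to raise TimeoutExpired. Rough numbers, assuming a 16-core machine (the core count is an assumption, not part of the test):

cores = 16                     # assumed; the test uses os.cpu_count()
cpu_budget = 1                 # CPU-hours, as in the test above
expected_timeout = cpu_budget / cores * 3600
print(expected_timeout)        # 225.0 s, far less than a full flow run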