Draft
Changes from all commits
Commits (35)
cb3caa2
add initial scaffold
luarss Sep 29, 2024
3cfffe3
fix args.job syntax error
luarss Oct 1, 2024
1be04bf
add smoke test timeout
luarss Oct 1, 2024
2a3ff66
consistent file names and fix smoke_test_sweep
luarss Oct 2, 2024
7486249
move resources_per_trial to sweep_parser
luarss Oct 2, 2024
5f46d5d
fix timeout test and add cpu_budget logic
luarss Oct 3, 2024
a86b771
add `--yes` to avoid prompt.
luarss Oct 5, 2024
e5c2e3e
add prototype for CPUBudgetSmokeTest
luarss Oct 5, 2024
236b1b7
fix timeout
luarss Oct 6, 2024
b319e5e
reintroduce args.jobs, and recalculate timeouts accordingly
luarss Oct 8, 2024
a2dac19
test_trial_timeout -> test_timeout
luarss Oct 14, 2024
3b358fb
fix cpu_budget default to be None and reduce timeout fail
luarss Oct 15, 2024
bc886f4
use cpu_budget default -1 to mean unlimited
luarss Oct 15, 2024
6e96a64
fix smoke timeout
luarss Oct 19, 2024
c311101
fix test timeout by timeout check in subprocess.run, also fix typo in…
luarss Oct 19, 2024
582fffe
add tooltip, update readmes
luarss Jan 16, 2025
15e2a07
fix black
luarss Jan 16, 2025
6e12df6
update test suite
luarss Jan 16, 2025
6522488
add --yes flag
luarss Jan 17, 2025
f2b7883
do not exit code
luarss Jan 18, 2025
d05ea70
fixes to smoke_test_timeout
luarss Jan 18, 2025
27b0a73
fix test_tune_resume
luarss Jan 18, 2025
0468020
shorten timeout
luarss Jan 18, 2025
220e641
fix subprocess invocation
luarss Jan 19, 2025
cd4455a
more generous timeout for resume check
luarss Jan 19, 2025
d5019d1
uniquify experiment dir
luarss Jan 20, 2025
5e27fca
Fix experiment name checks
luarss Jan 20, 2025
26ecb7c
cpubudget test fixes
luarss Jan 21, 2025
4c61459
fix lint
luarss Feb 6, 2025
81ad4e5
fix global args, restore ihp smoke test tune
luarss Feb 8, 2025
6361372
remove cpubudget from sweep test
luarss Feb 8, 2025
a06b8ae
standardise cpubudget test
luarss Feb 8, 2025
cb0f283
standardise ref_file resume tests
luarss Feb 8, 2025
e841bbc
standardise and fix smoke_test_timeout
luarss Feb 9, 2025
0693b7c
fix typo
luarss Feb 9, 2025
6 changes: 6 additions & 0 deletions flow/test/test_autotuner.sh
@@ -17,6 +17,12 @@ python3 -m unittest tools.AutoTuner.test.smoke_test_sweep.${PLATFORM}SweepSmokeT
echo "Running Autotuner smoke tests for --sample and --iteration."
python3 -m unittest tools.AutoTuner.test.smoke_test_sample_iteration.${PLATFORM}SampleIterationSmokeTest.test_sample_iteration

echo "Running Autotuner smoke tests for --cpu_budget."
python3 -m unittest tools.AutoTuner.test.smoke_test_cpubudget.${PLATFORM}CPUBudgetSmokeTest.test_cpu_budget

echo "Running Autotuner smoke tests for --timeout and --timeout_per_trial."
python3 -m unittest tools.AutoTuner.test.smoke_test_timeout.${PLATFORM}TimeoutSmokeTest.test_timeout

if [ "$PLATFORM" == "asap7" ] && [ "$DESIGN_NAME" == "gcd" ]; then
echo "Running Autotuner ref file test (only once)"
python3 -m unittest tools.AutoTuner.test.ref_file_check.RefFileCheck.test_files
103 changes: 100 additions & 3 deletions tools/AutoTuner/src/autotuner/distributed.py
@@ -74,6 +74,30 @@
# Global variable for args
args = None

TEMPLATE = """
Expected figures for this experiment.
Wall time: {runtime:.5f} seconds
Number of Samples:
Samples per minute: {num_samples_per_minute:.5f}
Design runtime of 10 min: {num_samples_10min:.5f}
Design runtime of 1h: {num_samples_1h:.5f}
Number of iterations
Design runtime of 10 min: {num_iterations_10min:.5f}
Design runtime of 1h: {num_iterations_1h:.5f}
"""


def calculate_expected_numbers(runtime, num_samples):
# Runtime - seconds
return TEMPLATE.format(
runtime=runtime,
num_samples_per_minute=num_samples / (runtime * 60),
num_samples_10min=(num_samples / (runtime * 60)) * 10,
num_samples_1h=(num_samples / (runtime * 60)) * 60,
num_iterations_10min=((num_samples / (runtime * 60)) * 10) / num_samples,
num_iterations_1h=((num_samples / (runtime * 60)) * 60) / num_samples,
)


class AutoTunerBase(tune.Trainable):
"""
@@ -233,7 +257,7 @@ def parse_arguments():
help="mode of execution", dest="mode", required=True
)
tune_parser = subparsers.add_parser("tune")
_ = subparsers.add_parser("sweep")
sweep_parser = subparsers.add_parser("sweep")

# DUT
parser.add_argument(
@@ -268,12 +292,22 @@ def parse_arguments():
" FLOW_VARIANT and to set the Ray log destination.",
)
parser.add_argument(
"--timeout",
"--timeout_per_trial",
type=float,
metavar="<float>",
default=None,
help="Time limit (in hours) for each trial run. Default is no limit.",
)
parser.add_argument(
"--timeout",
type=float,
metavar="<float>",
default=None,
help="Time limit (in hours) for the whole Autotuning process.",
)
parser.add_argument(
"-y", "--yes", action="store_true", help="Skip confirmation prompt."
)
tune_parser.add_argument(
"--resume",
action="store_true",
@@ -335,7 +369,7 @@ def parse_arguments():
help="Additional arguments given to ./build_openroad.sh.",
)

# ML
# Tune mode
tune_parser.add_argument(
"--algorithm",
type=str,
@@ -393,7 +427,22 @@ def parse_arguments():
help="Random seed. (0 means no seed.)",
)

sweep_parser.add_argument(
"--resources_per_trial",
type=int,
metavar="<int>",
default=1,
help="Number of CPUs to request for each sweep job.",
)

# Workload
parser.add_argument(
"--cpu_budget",
type=int,
metavar="<int>",
default=-1,
help="CPU Hours (-1 means no limit.)",
)
parser.add_argument(
"--jobs",
type=int,
@@ -458,8 +507,33 @@ def parse_arguments():
else:
args.experiment += f"-{args.mode}"

# Convert time to seconds
if args.timeout_per_trial is not None:
args.timeout_per_trial = round(args.timeout_per_trial * 3600)
if args.timeout is not None:
args.timeout = round(args.timeout * 3600)
args.timeout = set_timeout(args.timeout, args.timeout_per_trial)

# Calculate timeout based on cpu_budget
if args.cpu_budget != -1:
args.timeout = round(args.cpu_budget / os.cpu_count() * 3600)
args.timeout_per_trial = round(
args.cpu_budget / (args.jobs * args.resources_per_trial) * 3600
)
args.timeout = set_timeout(args.timeout, args.timeout_per_trial)
if args.mode == "tune":
template = calculate_expected_numbers(args.timeout, args.samples)
else:
template = calculate_expected_numbers(args.timeout, 1)
print(template)
if not args.yes:
print(
"[INFO TUN-0022] Tip: use the flag --yes to skip the confirmation prompt."
)
ans = input("Are you sure you want to proceed? (y/n): ")
if ans.lower() != "y":
print("Exiting AutoTuner.")
sys.exit(0)

return args
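
The hunk above boils down to two unit conversions: --timeout and --timeout_per_trial are given in hours and converted to seconds, while --cpu_budget (CPU-hours) is divided across all cores for the global limit and across jobs * resources_per_trial for the per-trial limit, with set_timeout keeping the tighter of the two. A minimal worked sketch, assuming 32 CPU-hours, 16 cores, 4 jobs and 1 CPU per trial (all assumed values, not taken from this PR):

# Illustration only; mirrors the arithmetic in parse_arguments() above.
cpu_budget = 32          # CPU-hours (assumed)
num_cpus = 16            # stand-in for os.cpu_count() (assumed)
jobs = 4                 # --jobs (assumed)
resources_per_trial = 1  # CPUs per trial (assumed)

timeout = round(cpu_budget / num_cpus * 3600)                                # 7200 s of wall time
timeout_per_trial = round(cpu_budget / (jobs * resources_per_trial) * 3600)  # 28800 s per trial
effective = min(timeout, timeout_per_trial)                                  # set_timeout() keeps 7200 s
print(effective)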

@@ -544,12 +618,26 @@ def set_training_class(function):
return None


def set_timeout(timeout, timeout_per_trial):
"""
Set timeout for experiment.
"""
return (
min(timeout, timeout_per_trial)
if (timeout and timeout_per_trial)
else (timeout or timeout_per_trial)
)
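
For reference, the helper above resolves the two limits as follows; a small sketch of its behaviour under the None/number conventions used by the argument parser:

# Both limits set: the tighter one wins.
assert set_timeout(7200, 3600) == 3600
# Only one limit set: it is passed through unchanged.
assert set_timeout(7200, None) == 7200
assert set_timeout(None, 3600) == 3600
# Neither set: no limit.
assert set_timeout(None, None) is None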


@ray.remote
def save_best(results):
"""
Save best configuration of parameters found.
"""
best_config = results.best_config
if METRIC not in results.best_result:
print("[ERROR TUN-0023] Metric not found in results.")
sys.exit(1)
best_config["best_result"] = results.best_result[METRIC]
trial_id = results.best_trial.trial_id
new_best_path = f"{LOCAL_DIR}/{args.experiment}/"
@@ -605,6 +693,14 @@ def main():

LOCAL_DIR, ORFS_FLOW_DIR, INSTALL_PATH = prepare_ray_server(args)

# Check: Experiment name must be unique.
if os.path.exists(f"./{LOCAL_DIR}/{args.experiment}"):
print(
f"[ERROR TUN-0032] Experiment {args.experiment} already exists."
" Please choose a different name."
)
sys.exit(1)

if args.mode == "tune":
best_params = set_best_params(args.platform, args.design)
search_algo = set_algorithm(args.experiment, config_dict)
@@ -617,6 +713,7 @@
name=args.experiment,
metric=METRIC,
mode="min",
time_budget_s=args.timeout,
num_samples=args.samples,
fail_fast=False,
local_dir=LOCAL_DIR,
11 changes: 4 additions & 7 deletions tools/AutoTuner/test/ref_file_check.py
@@ -3,9 +3,6 @@
import os

cur_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(cur_dir, "../src/autotuner")
orfs_dir = os.path.join(cur_dir, "../../../flow")
os.chdir(src_dir)


class RefFileCheck(unittest.TestCase):
@@ -15,19 +12,19 @@ class RefFileCheck(unittest.TestCase):

def setUp(self):
configs = [
"../../test/files/no_sdc_ref.json",
"../../test/files/no_fr_ref.json",
os.path.join(cur_dir, "../../test/files/no_sdc_ref.json"),
os.path.join(cur_dir, "../../test/files/no_fr_ref.json"),
]
self.commands = [
f"python3 distributed.py"
f"python3 -m autotuner.distributed"
f" --design {self.design}"
f" --platform {self.platform}"
f" --config {c}"
f" --yes"
f" tune --samples 1"
for c in configs
]

# Make this a test case
def test_files(self):
for c in self.commands:
out = subprocess.run(c, shell=True)
32 changes: 21 additions & 11 deletions tools/AutoTuner/test/resume_check.py
@@ -6,9 +6,6 @@
from contextlib import contextmanager

cur_dir = os.path.dirname(os.path.abspath(__file__))
src_dir = os.path.join(cur_dir, "../src/autotuner")
orfs_dir = os.path.join(cur_dir, "../../../flow")
os.chdir(src_dir)


@contextmanager
@@ -30,28 +27,36 @@ class ResumeCheck(unittest.TestCase):
design = "gcd"
samples = 5
iterations = 2
timeout = 200

def setUp(self):
self.config = os.path.join(
orfs_dir, "designs", self.platform, self.design, "autotuner.json"
cur_dir,
"../../../",
"flow",
"designs",
self.platform,
self.design,
"autotuner.json",
)
self.jobs = self.samples
self.num_cpus = os.cpu_count()

# How it works: Say we have 5 samples and 5 iterations.
# How it works: Say we have 5 samples and 5 iterations and 16 cores.
# If we want to limit to only 5 trials (and avoid any parallelism magic by Ray)
# We can set resources_per_trial = NUM_CORES/5 = 3.2 (fractional resources_per_trial are allowed!)

# Cast to 1 decimal place
res_per_trial = float("{:.1f}".format(self.num_cpus / self.samples))
options = ["", "--resume"]
self.commands = [
f"python3 distributed.py"
f"python3 -m autotuner.distributed"
f" --design {self.design}"
f" --platform {self.platform}"
f" --config {self.config}"
f" --jobs {self.jobs}"
f" --experiment test-resume"
f" --experiment test_resume"
f" --yes"
f" tune --iterations {self.iterations} --samples {self.samples}"
f" --resources_per_trial {res_per_trial}"
f" {c}"
@@ -65,18 +70,23 @@ def test_tune_resume(self):
# Run the first config asynchronously.
print("Running the first config")
with managed_process(self.commands[0], shell=True) as proc:
time.sleep(120)
time.sleep(self.timeout)

# Keep trying to stop the ray cluster until it is stopped
while 1:
proc = subprocess.run("ray status", shell=True)
proc = subprocess.run(
["ray", "status"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
no_nodes = proc.returncode != 0
proc = subprocess.run("ray stop", shell=True)
proc = subprocess.run(
["ray", "stop"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
successful = proc.returncode == 0

if no_nodes and successful:
print("Ray cluster successfully stopped with no remaining nodes.")
break
time.sleep(10)
time.sleep(5)

# Run the second config to completion
print("Running the second config")
5 changes: 3 additions & 2 deletions tools/AutoTuner/test/smoke_test_algo_eval.py
@@ -23,12 +23,13 @@ def setUp(self):
f"python3 -m autotuner.distributed"
f" --design {self.design}"
f" --platform {self.platform}"
f" --experiment {self.experiment}"
f" --experiment {self.experiment}-{idx}"
f" --config {self.config}"
f" --yes"
f" tune --samples 5"
f" --algorithm {a} --eval {e}"
f" --reference {self.reference}"
for a, e in self.matrix
for idx, (a, e) in enumerate(self.matrix)
]

def make_base(self):
66 changes: 66 additions & 0 deletions tools/AutoTuner/test/smoke_test_cpubudget.py
@@ -0,0 +1,66 @@
import unittest
import subprocess
import os

cur_dir = os.path.dirname(os.path.abspath(__file__))


class BaseCPUBudgetSmokeTest(unittest.TestCase):
platform = ""
design = ""

def setUp(self):
self.config = os.path.join(
cur_dir,
f"../../../flow/designs/{self.platform}/{self.design}/autotuner.json",
)
self.experiment = f"smoke-test-cpubudget-{self.platform}"

# Tests should fail with such a low budget.
self.cpu_budget = 1
self.expected_timeout = self.cpu_budget / os.cpu_count() * 3600

self.command = (
"python3 -m autotuner.distributed"
f" --design {self.design}"
f" --platform {self.platform}"
f" --experiment {self.experiment}"
f" --config {self.config}"
f" --cpu_budget {self.cpu_budget}"
f" --yes"
f" tune --samples 1"
)
self.command = self.command.split()

def test_cpu_budget(self):
if not (self.platform and self.design):
raise unittest.SkipTest("Platform and design have to be defined")
try:
_ = subprocess.run(
self.command,
stderr=subprocess.PIPE,
timeout=self.expected_timeout,
)
failed = False
except subprocess.TimeoutExpired:
failed = True
self.assertTrue(failed)


class asap7CPUBudgetSmokeTest(BaseCPUBudgetSmokeTest):
platform = "asap7"
design = "gcd"


class sky130hdCPUBudgetSmokeTest(BaseCPUBudgetSmokeTest):
platform = "sky130hd"
design = "gcd"


class ihpsg13g2CPUBudgetSmokeTest(BaseCPUBudgetSmokeTest):
platform = "ihp-sg13g2"
design = "gcd"


if __name__ == "__main__":
unittest.main()
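
The pass condition of this test is a timing argument: with cpu_budget = 1 CPU-hour the tuner caps its wall time at roughly 1 / os.cpu_count() hours, and the outer subprocess.run uses the same value as its timeout, so a real flow run is expected to still be going when that timeout expires and to raise TimeoutExpired. Rough numbers, assuming a 16-core machine (the core count is an assumption, not part of the test):

cores = 16                     # assumed; the test uses os.cpu_count()
cpu_budget = 1                 # CPU-hours, as in the test above
expected_timeout = cpu_budget / cores * 3600
print(expected_timeout)        # 225.0 s, far less than a full flow run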