diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 000000000..92304e6a9 --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,56 @@ +# +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +changelog: + exclude: + labels: + - ignore-for-release + categories: + - title: Features + labels: + - 'type: feature' + exclude: + labels: + - non-user-facing + - title: Bug Fixes + labels: + - 'bug: critical' + - 'bug: major' + - 'bug: minor' + exclude: + labels: + - non-user-facing + - title: API Breaks + labels: + - 'API break' + exclude: + labels: + - non-user-facing + - title: Miscellaneous Improvements + labels: + - "*" \ No newline at end of file diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index 714eb3819..65aadc03c 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -39,6 +39,18 @@ jobs: runs-on: ubuntu-latest steps: + + # Maximize the space in this image + - name: Maximize build space + uses: easimon/maximize-build-space@v10 + with: + root-reserve-mb: 30720 + remove-dotnet: true + remove-android: true + remove-haskell: true + remove-codeql: true + remove-docker-images: true + - uses: actions/checkout@v4 with: fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. diff --git a/smartsim/slurm.py b/.github/workflows/changelog.yml similarity index 72% rename from smartsim/slurm.py rename to .github/workflows/changelog.yml index 6a32d0213..cd4ab58fa 100644 --- a/smartsim/slurm.py +++ b/.github/workflows/changelog.yml @@ -1,3 +1,4 @@ +# # BSD 2-Clause License # # Copyright (c) 2021-2024, Hewlett Packard Enterprise @@ -23,23 +24,26 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +name: enforce_changelog +on: + pull_request: + push: + branches: + - develop -from warnings import simplefilter, warn +jobs: + changelog: + name: check_changelog + runs-on: ubuntu-latest -# pylint: disable-next=unused-import -from .wlm.slurm import ( - _get_alloc_cmd, - _get_system_partition_info, - get_allocation, - get_default_partition, - release_allocation, - validate, -) + steps: + - uses: actions/checkout@v4 -simplefilter("once", category=DeprecationWarning) -DEPRECATION_MSG = ( - "`smartsim.slurm` has been deprecated and will be removed in a future release.\n" - "Please update your code to use `smartsim.wlm.slurm`" -) -warn(DEPRECATION_MSG, category=DeprecationWarning, stacklevel=2) + - name: Changelog Enforcer + uses: dangoslen/changelog-enforcer@v3.6.0 + with: + changeLogPath: './doc/changelog.md' + missingUpdateErrorMessage: 'changelog.md has not been updated' diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ad9a55e03..6c1361b46 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -53,7 +53,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macos-12] + os: [ubuntu-22.04, macos-12] steps: - uses: actions/checkout@v4 @@ -98,7 +98,7 @@ jobs: - uses: actions/setup-python@v5 name: Install Python with: - python-version: '3.8' + python-version: '3.9' - name: Build sdist run: | @@ -124,3 +124,16 @@ jobs: user: __token__ password: ${{ secrets.PYPI }} #repository_url: https://test.pypi.org/legacy/ + + + createPullRequest: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Create pull request + run: | + gh pr create -B develop -H master --title 'Merge master into develop' --body 'This PR brings develop up to date with master for release.' 
+ env: + GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 79466b902..2e3463e5b 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -54,10 +54,13 @@ jobs: fail-fast: false matrix: subset: [backends, slow_tests, group_a, group_b] - os: [macos-12, ubuntu-20.04] # Operating systems + os: [macos-12, macos-14, ubuntu-22.04] # Operating systems compiler: [8] # GNU compiler version rai: [1.2.7] # Redis AI versions - py_v: ['3.8', '3.9', '3.10', '3.11'] # Python versions + py_v: ["3.9", "3.10", "3.11"] # Python versions + exclude: + - os: macos-14 + py_v: "3.9" env: SMARTSIM_REDISAI: ${{ matrix.rai }} @@ -108,8 +111,13 @@ jobs: python -m pip install .[dev,ml] - name: Install ML Runtimes with Smart (with pt, tf, and onnx support) + if: contains( matrix.os, 'ubuntu' ) || contains( matrix.os, 'macos-12') run: smart build --device cpu --onnx -v + - name: Install ML Runtimes with Smart (no ONNX,TF on Apple Silicon) + if: contains( matrix.os, 'macos-14' ) + run: smart build --device cpu --no_tf -v + - name: Run mypy run: | python -m pip install .[mypy] diff --git a/.gitignore b/.gitignore index 428e439b3..77b91d586 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ tests/test_output # Dependencies smartsim/_core/.third-party +smartsim/_core/.dragon # Docs _build @@ -22,17 +23,14 @@ venv/ .venv/ env/ .env/ +**/.env # written upon install smartsim/version.py -smartsim/_core/bin/*-server -smartsim/_core/bin/*-cli - # created upon install +smartsim/_core/bin smartsim/_core/lib -**/manifest/ -**/*.err -**/*.out -**/.smartsim/* +# optional dev tools +.pre-commit-config.yaml diff --git a/.pylintrc b/.pylintrc index f2fa17bab..aa378d039 100644 --- a/.pylintrc +++ b/.pylintrc @@ -325,7 +325,7 @@ valid-metaclass-classmethod-first-arg=mcs max-args=9 # Maximum number of locals for function / method body -max-locals=20 +max-locals=25 # Maximum number of return / yield for function / 
method body max-returns=11 diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 000000000..cecdfe3bf --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,43 @@ +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.10" + jobs: + post_checkout: + # Cancel building pull requests when there aren't changed in the docs directory or YAML file. + # You can add any other files or directories that you'd like here as well, + # like your docs requirements file, or other files that will change your docs build. + # + # If there are no changes (git diff exits with 0) we force the command to return with 183. + # This is a special exit code on Read the Docs that will cancel the build immediately. + - | + if [ "$READTHEDOCS_VERSION_TYPE" = "external" ] && git diff --quiet origin/main -- doc/ .readthedocs.yaml; + then + exit 183; + fi + pre_create_environment: + - git clone --depth 1 https://github.com/CrayLabs/SmartRedis.git smartredis + - git clone --depth 1 https://github.com/CrayLabs/SmartDashboard.git smartdashboard + post_create_environment: + - python -m pip install .[dev] + - cd smartredis; python -m pip install . + - cd smartredis/doc; doxygen Doxyfile_c; doxygen Doxyfile_cpp; doxygen Doxyfile_fortran + - ln -s smartredis/examples ./examples + - cd smartdashboard; python -m pip install . 
+ pre_build: + - pip install typing_extensions==4.8.0 + - pip install pydantic==1.10.13 + - python -m sphinx -b linkcheck doc/ $READTHEDOCS_OUTPUT/linkcheck + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: doc/conf.py + fail_on_warning: true + +python: + install: + - requirements: doc/requirements-doc.txt \ No newline at end of file diff --git a/.wci.yml b/.wci.yml index 265d59579..6194f1939 100644 --- a/.wci.yml +++ b/.wci.yml @@ -22,8 +22,8 @@ language: Python release: - version: 0.6.2 - date: 2024-02-16 + version: 0.7.0 + date: 2024-05-14 documentation: general: https://www.craylabs.org/docs/overview.html diff --git a/Makefile b/Makefile index f71f2a0b3..bddbda722 100644 --- a/Makefile +++ b/Makefile @@ -150,11 +150,11 @@ tutorials-dev: @docker compose build tutorials-dev @docker run -p 8888:8888 smartsim-tutorials:dev-latest -# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.6.2) +# help: tutorials-prod - Build and start a docker container to run the tutorials (v0.7.0) .PHONY: tutorials-prod tutorials-prod: @docker compose build tutorials-prod - @docker run -p 8888:8888 smartsim-tutorials:v0.6.2 + @docker run -p 8888:8888 smartsim-tutorials:v0.7.0 # help: diff --git a/README.md b/README.md index cfd8d4271..c0986042e 100644 --- a/README.md +++ b/README.md @@ -174,13 +174,17 @@ system with which it has a corresponding `RunSettings` class. If one can be foun ## Experiments on HPC Systems SmartSim integrates with common HPC schedulers providing batch and interactive -launch capabilities for all applications. +launch capabilities for all applications: - Slurm - LSF - PBSPro - Local (for laptops/single node, no batch) +In addition, on Slurm and PBS systems, [Dragon](https://dragonhpc.github.io/dragon/doc/_build/html/index.html) +can be used as a launcher. Please refer to the documentation for instructions on +how to insall it on your system and use it in SmartSim. 
+ ### Interactive Launch Example diff --git a/conftest.py b/conftest.py index b5a4fd70b..b0457522c 100644 --- a/conftest.py +++ b/conftest.py @@ -26,33 +26,51 @@ from __future__ import annotations +import asyncio +from collections import defaultdict +from dataclasses import dataclass import json import os -import pytest -import psutil +import pathlib import shutil +import subprocess +import signal +import socket +import sys +import tempfile +import time +import typing as t +import uuid +import warnings +from subprocess import run +import time + +import psutil +import pytest + import smartsim from smartsim import Experiment -from smartsim.entity import Model +from smartsim._core.launcher.dragon.dragonConnector import DragonConnector +from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher +from smartsim._core.config import CONFIG +from smartsim._core.config.config import Config +from smartsim._core.utils.telemetry.telemetry import JobEntity from smartsim.database import Orchestrator +from smartsim.entity import Model +from smartsim.error import SSConfigError, SSInternalError +from smartsim.log import get_logger from smartsim.settings import ( - SrunSettings, AprunSettings, + DragonRunSettings, JsrunSettings, - MpirunSettings, MpiexecSettings, + MpirunSettings, PalsMpiexecSettings, RunSettings, + SrunSettings, ) -from smartsim._core.config import CONFIG -from smartsim.error import SSConfigError -from subprocess import run -import sys -import tempfile -import typing as t -import uuid -import warnings +logger = get_logger(__name__) # pylint: disable=redefined-outer-name,invalid-name,global-statement @@ -64,9 +82,12 @@ test_num_gpus = CONFIG.test_num_gpus test_nic = CONFIG.test_interface test_alloc_specs_path = os.getenv("SMARTSIM_TEST_ALLOC_SPEC_SHEET_PATH", None) -test_port = CONFIG.test_port +test_ports = CONFIG.test_ports test_account = CONFIG.test_account or "" -test_batch_resources: t.Dict[t.Any,t.Any] = CONFIG.test_batch_resources 
+test_batch_resources: t.Dict[t.Any, t.Any] = CONFIG.test_batch_resources +test_output_dirs = 0 +mpi_app_exe = None +built_mpi_app = False # Fill this at runtime if needed test_hostlist = None @@ -91,9 +112,7 @@ def print_test_configuration() -> None: print("TEST_ALLOC_SPEC_SHEET_PATH:", test_alloc_specs_path) print("TEST_DIR:", test_output_root) print("Test output will be located in TEST_DIR if there is a failure") - print( - "TEST_PORTS:", ", ".join(str(port) for port in range(test_port, test_port + 3)) - ) + print("TEST_PORTS:", ", ".join(str(port) for port in test_ports)) if test_batch_resources: print("TEST_BATCH_RESOURCES: ") print(json.dumps(test_batch_resources, indent=2)) @@ -101,7 +120,7 @@ def print_test_configuration() -> None: def pytest_configure() -> None: pytest.test_launcher = test_launcher - pytest.wlm_options = ["slurm", "pbs", "lsf", "pals"] + pytest.wlm_options = ["slurm", "pbs", "lsf", "pals", "dragon"] account = get_account() pytest.test_account = account pytest.test_device = test_device @@ -118,6 +137,14 @@ def pytest_sessionstart( if os.path.isdir(test_output_root): shutil.rmtree(test_output_root) os.makedirs(test_output_root) + while not os.path.isdir(test_output_root): + time.sleep(0.1) + + if CONFIG.dragon_server_path is None: + dragon_server_path = os.path.join(test_output_root, "dragon_server") + os.makedirs(dragon_server_path) + os.environ["SMARTSIM_DRAGON_SERVER_PATH"] = dragon_server_path + print_test_configuration() @@ -129,12 +156,62 @@ def pytest_sessionfinish( returning the exit status to the system. 
""" if exitstatus == 0: - shutil.rmtree(test_output_root) + cleanup_attempts = 5 + while cleanup_attempts > 0: + try: + shutil.rmtree(test_output_root) + except OSError as e: + cleanup_attempts -= 1 + time.sleep(1) + if not cleanup_attempts: + raise + else: + break else: - # kill all spawned processes in case of error + # kill all spawned processes + if CONFIG.test_launcher == "dragon": + time.sleep(5) kill_all_test_spawned_processes() +def build_mpi_app() -> t.Optional[pathlib.Path]: + global built_mpi_app + built_mpi_app = True + cc = shutil.which("cc") + if cc is None: + cc = shutil.which("gcc") + if cc is None: + return None + + path_to_src = pathlib.Path(FileUtils().get_test_conf_path("mpi")) + path_to_out = pathlib.Path(test_output_root) / "apps" / "mpi_app" + os.makedirs(path_to_out.parent, exist_ok=True) + cmd = [cc, str(path_to_src / "mpi_hello.c"), "-o", str(path_to_out)] + proc = subprocess.Popen(cmd) + proc.wait(timeout=1) + if proc.returncode == 0: + return path_to_out + else: + return None + +@pytest.fixture(scope="session") +def mpi_app_path() -> t.Optional[pathlib.Path]: + """Return path to MPI app if it was built + + return None if it could not or will not be built + """ + if not CONFIG.test_mpi: + return None + + # if we already tried to build, return what we have + if built_mpi_app: + return mpi_app_exe + + # attempt to build, set global + mpi_app_exe = build_mpi_app() + return mpi_app_exe + + def kill_all_test_spawned_processes() -> None: # in case of test failure, clean up all spawned processes pid = os.getpid() @@ -150,6 +227,7 @@ def kill_all_test_spawned_processes() -> None: print("Not all processes were killed after test") + def get_hostlist() -> t.Optional[t.List[str]]: global test_hostlist if not test_hostlist: @@ -200,7 +278,43 @@ def alloc_specs() -> t.Dict[str, t.Any]: return specs -@pytest.fixture +def _reset_signal(signalnum: int): + """SmartSim will set/overwrite signals on occasion. 
This function will + return a generator that can be used as a fixture to automatically reset the + signal handler to what it was at the beginning of the test suite to keep + tests atomic. + """ + original = signal.getsignal(signalnum) + + def _reset(): + yield + signal.signal(signalnum, original) + + return _reset + + +_reset_signal_interrupt = pytest.fixture( + _reset_signal(signal.SIGINT), autouse=True, scope="function" +) + + +def _find_free_port(ports: t.Collection[int]) -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + for port in ports: + try: + sock.bind(("127.0.0.1", port)) + except socket.error: + continue + else: + _, port_ = sock.getsockname() + return int(port_) + raise SSInternalError( + "Could not find a free port out of a options: " + f"{', '.join(str(port) for port in sorted(ports))}" + ) + + +@pytest.fixture(scope="session") def wlmutils() -> t.Type[WLMUtils]: return WLMUtils @@ -217,7 +331,9 @@ def get_test_launcher() -> str: @staticmethod def get_test_port() -> int: - return test_port + # TODO: Ideally this should find a free port on the correct host(s), + # but this is good enough for now + return _find_free_port(test_ports) @staticmethod def get_test_account() -> str: @@ -246,6 +362,12 @@ def get_base_run_settings( run_args.update(kwargs) settings = RunSettings(exe, args, run_command="srun", run_args=run_args) return settings + if test_launcher == "dragon": + run_args = {"nodes": nodes} + run_args = {"ntasks": ntasks} + run_args.update(kwargs) + settings = DragonRunSettings(exe, args, run_args=run_args) + return settings if test_launcher == "pbs": if shutil.which("aprun"): run_command = "aprun" @@ -287,6 +409,11 @@ def get_run_settings( run_args = {"nodes": nodes, "ntasks": ntasks, "time": "00:10:00"} run_args.update(kwargs) return SrunSettings(exe, args, run_args=run_args) + if test_launcher == "dragon": + run_args = {"nodes": nodes} + run_args.update(kwargs) + settings = DragonRunSettings(exe, args, 
run_args=run_args) + return settings if test_launcher == "pbs": if shutil.which("aprun"): run_args = {"pes": ntasks} @@ -312,53 +439,6 @@ def get_run_settings( return RunSettings(exe, args) - @staticmethod - def get_orchestrator(nodes: int = 1, batch: bool = False) -> Orchestrator: - if test_launcher == "pbs": - if not shutil.which("aprun"): - hostlist = get_hostlist() - else: - hostlist = None - return Orchestrator( - db_nodes=nodes, - port=test_port, - batch=batch, - interface=test_nic, - launcher=test_launcher, - hosts=hostlist, - ) - if test_launcher == "pals": - hostlist = get_hostlist() - return Orchestrator( - db_nodes=nodes, - port=test_port, - batch=batch, - interface=test_nic, - launcher=test_launcher, - hosts=hostlist, - ) - if test_launcher == "slurm": - return Orchestrator( - db_nodes=nodes, - port=test_port, - batch=batch, - interface=test_nic, - launcher=test_launcher, - ) - if test_launcher == "lsf": - return Orchestrator( - db_nodes=nodes, - port=test_port, - batch=batch, - cpus_per_shard=4, - gpus_per_shard=2 if test_device == "GPU" else 0, - project=get_account(), - interface=test_nic, - launcher=test_launcher, - ) - - return Orchestrator(port=test_port, interface="lo") - @staticmethod def choose_host(rs: RunSettings) -> t.Optional[str]: if isinstance(rs, (MpirunSettings, MpiexecSettings)): @@ -368,64 +448,6 @@ def choose_host(rs: RunSettings) -> t.Optional[str]: return None -@pytest.fixture -def local_db( - request: t.Any, wlmutils: t.Type[WLMUtils], test_dir: str -) -> t.Generator[Orchestrator, None, None]: - """Yield fixture for startup and teardown of an local orchestrator""" - - exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - db = Orchestrator(port=wlmutils.get_test_port(), interface="lo") - db.set_path(test_dir) - exp.start(db) - - yield db - # pass or fail, the teardown code below is ran after the - # completion of a test case that uses this fixture - exp.stop(db) - - 
-@pytest.fixture -def db( - request: t.Any, wlmutils: t.Type[WLMUtils], test_dir: str -) -> t.Generator[Orchestrator, None, None]: - """Yield fixture for startup and teardown of an orchestrator""" - launcher = wlmutils.get_test_launcher() - - exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) - db = wlmutils.get_orchestrator() - db.set_path(test_dir) - exp.start(db) - - yield db - # pass or fail, the teardown code below is ran after the - # completion of a test case that uses this fixture - exp.stop(db) - - -@pytest.fixture -def db_cluster( - test_dir: str, wlmutils: t.Type[WLMUtils], request: t.Any -) -> t.Generator[Orchestrator, None, None]: - """ - Yield fixture for startup and teardown of a clustered orchestrator. - This should only be used in on_wlm and full_wlm tests. - """ - launcher = wlmutils.get_test_launcher() - - exp_name = request.function.__name__ - exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) - db = wlmutils.get_orchestrator(nodes=3) - db.set_path(test_dir) - exp.start(db) - - yield db - # pass or fail, the teardown code below is ran after the - # completion of a test case that uses this fixture - exp.stop(db) - @pytest.fixture(scope="function", autouse=True) def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: @@ -436,6 +458,14 @@ def environment_cleanup(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("SSKEYOUT", raising=False) +@pytest.fixture(scope="function", autouse=True) +def check_output_dir() -> None: + global test_output_dirs + assert os.path.isdir(test_output_root) + assert len(os.listdir(test_output_root)) >= test_output_dirs + test_output_dirs = len(os.listdir(test_output_root)) + + @pytest.fixture def dbutils() -> t.Type[DBUtils]: return DBUtils @@ -524,7 +554,7 @@ def _sanitize_caller_function(caller_function: str) -> str: # We split at the opening bracket, sanitize the string # to its right and then merge the function name and # the 
sanitized list with a dot. - caller_function = caller_function.replace("]","") + caller_function = caller_function.replace("]", "") caller_function_list = caller_function.split("[", maxsplit=1) def is_accepted_char(char: str) -> bool: @@ -559,7 +589,8 @@ class FileUtils: @staticmethod def get_test_output_path(caller_function: str, caller_fspath: str) -> str: caller_file_to_dir = os.path.splitext(str(caller_fspath))[0] - rel_path = os.path.relpath(caller_file_to_dir, os.path.dirname(test_output_root)) + dir_name = os.path.dirname(test_output_root) + rel_path = os.path.relpath(caller_file_to_dir, dir_name) dir_path = os.path.join(test_output_root, rel_path, caller_function) return dir_path @@ -574,15 +605,14 @@ def get_test_dir_path(dirname: str) -> str: return dir_path @staticmethod - def make_test_file(file_name: str, file_dir: str, file_content: t.Optional[str] = None) -> str: + def make_test_file( + file_name: str, file_dir: str, file_content: t.Optional[str] = None + ) -> str: """Create a dummy file in the test output directory. :param file_name: name of file to create, e.g. 
"file.txt" - :type file_name: str :param file_dir: path - :type file_dir: str :return: String path to test output file - :rtype: str """ file_path = os.path.join(file_dir, file_name) os.makedirs(file_dir) @@ -625,7 +655,7 @@ def setup_test_colo( db_args: t.Dict[str, t.Any], colo_settings: t.Optional[RunSettings] = None, colo_model_name: str = "colocated_model", - port: int = test_port, + port: t.Optional[int] = None, on_wlm: bool = False, ) -> Model: """Setup database needed for the colo pinning tests""" @@ -641,16 +671,17 @@ def setup_test_colo( if on_wlm: colo_settings.set_tasks(1) colo_settings.set_nodes(1) + colo_model = exp.create_model(colo_model_name, colo_settings) if db_type in ["tcp", "deprecated"]: - db_args["port"] = port + db_args["port"] = port if port is not None else _find_free_port(test_ports) db_args["ifname"] = "lo" if db_type == "uds" and colo_model_name is not None: tmp_dir = tempfile.gettempdir() socket_suffix = str(uuid.uuid4())[:7] - db_args["unix_socket"] = os.path.join(tmp_dir, - f"{colo_model_name}_{socket_suffix}.socket") + socket_name = f"{colo_model_name}_{socket_suffix}.socket" + db_args["unix_socket"] = os.path.join(tmp_dir, socket_name) colocate_fun: t.Dict[str, t.Callable[..., None]] = { "tcp": colo_model.colocate_db_tcp, @@ -659,16 +690,335 @@ def setup_test_colo( } with warnings.catch_warnings(): if db_type == "deprecated": - warnings.filterwarnings( - "ignore", - message="`colocate_db` has been deprecated" - ) + message = "`colocate_db` has been deprecated" + warnings.filterwarnings("ignore", message=message) colocate_fun[db_type](**db_args) # assert model will launch with colocated db assert colo_model.colocated # Check to make sure that limit_db_cpus made it into the colo settings return colo_model + +@pytest.fixture(scope="function") +def global_dragon_teardown() -> None: + """Connect to a dragon server started at the path indicated by + the environment variable SMARTSIM_DRAGON_SERVER_PATH and + force its shutdown to bring 
down the runtime and allow a subsequent + allocation of a new runtime. + """ + if test_launcher != "dragon" or CONFIG.dragon_server_path is None: + return + logger.debug(f"Tearing down Dragon infrastructure, server path: {CONFIG.dragon_server_path}") + dragon_connector = DragonConnector() + dragon_connector.ensure_connected() + dragon_connector.cleanup() + + @pytest.fixture -def config() -> smartsim._core.config.Config: +def config() -> Config: return CONFIG + + +class MockSink: + """Telemetry sink that writes console output for testing purposes""" + + def __init__(self, delay_ms: int = 0) -> None: + self._delay_ms = delay_ms + self.num_saves = 0 + self.args: t.Any = None + + async def save(self, *args: t.Any) -> None: + """Save all arguments as console logged messages""" + self.num_saves += 1 + if self._delay_ms: + # mimic slow collection.... + delay_s = self._delay_ms / 1000 + await asyncio.sleep(delay_s) + self.args = args + + +@pytest.fixture +def mock_sink() -> t.Type[MockSink]: + return MockSink + + +@pytest.fixture +def mock_con() -> t.Callable[[int, int], t.Iterable[t.Any]]: + """Generates mock db connection telemetry""" + + def _mock_con(min: int = 1, max: int = 254) -> t.Iterable[t.Any]: + for i in range(min, max): + yield [ + {"addr": f"127.0.0.{i}:1234", "id": f"ABC{i}"}, + {"addr": f"127.0.0.{i}:2345", "id": f"XYZ{i}"}, + ] + + return _mock_con + + +@pytest.fixture +def mock_mem() -> t.Callable[[int, int], t.Iterable[t.Any]]: + """Generates mock db memory usage telemetry""" + + def _mock_mem(min: int = 1, max: int = 1000) -> t.Iterable[t.Any]: + for i in range(min, max): + yield { + "total_system_memory": 1000 * i, + "used_memory": 1111 * i, + "used_memory_peak": 1234 * i, + } + + return _mock_mem + + +@pytest.fixture +def mock_redis() -> t.Callable[..., t.Any]: + def _mock_redis( + conn_side_effect=None, + mem_stats=None, + client_stats=None, + coll_side_effect=None, + ): + """Generate a mock object for the redis.Redis contract""" + + class MockConn: 
+ def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: + if conn_side_effect is not None: + conn_side_effect() + + async def info(self, *args: t.Any, **kwargs: t.Any) -> t.Dict[str, t.Any]: + if coll_side_effect: + await coll_side_effect() + + if mem_stats: + return next(mem_stats) + return { + "total_system_memory": "111", + "used_memory": "222", + "used_memory_peak": "333", + } + + async def client_list( + self, *args: t.Any, **kwargs: t.Any + ) -> t.Dict[str, t.Any]: + if coll_side_effect: + await coll_side_effect() + + if client_stats: + return next(client_stats) + return {"addr": "127.0.0.1", "id": "111"} + + async def ping(self): + return True + + return MockConn + + return _mock_redis + + +class MockCollectorEntityFunc(t.Protocol): + @staticmethod + def __call__( + host: str = "127.0.0.1", + port: int = 6379, + name: str = "", + type: str = "", + telemetry_on: bool = False, + ) -> "JobEntity": ... + + +@pytest.fixture +def mock_entity(test_dir: str) -> MockCollectorEntityFunc: + def _mock_entity( + host: str = "127.0.0.1", + port: int = 6379, + name: str = "", + type: str = "", + telemetry_on: bool = False, + ) -> "JobEntity": + test_path = pathlib.Path(test_dir) + + entity = JobEntity() + entity.name = name if name else str(uuid.uuid4()) + entity.status_dir = str(test_path / entity.name) + entity.type = type + entity.telemetry_on = True + entity.collectors = { + "client": "", + "client_count": "", + "memory": "", + } + entity.config = { + "host": host, + "port": str(port), + } + entity.telemetry_on = telemetry_on + return entity + + return _mock_entity + + +class CountingCallable: + def __init__(self) -> None: + self._num: int = 0 + self._details: t.List[t.Tuple[t.Tuple[t.Any, ...], t.Dict[str, t.Any]]] = [] + + def __call__(self, *args: t.Any, **kwargs: t.Any) -> t.Any: + self._num += 1 + self._details.append((args, kwargs)) + + @property + def num_calls(self) -> int: + return self._num + + @property + def details(self) -> 
t.List[t.Tuple[t.Tuple[t.Any, ...], t.Dict[str, t.Any]]]: + return self._details + +## Reuse database across tests + +database_registry: t.DefaultDict[str, t.Optional[Orchestrator]] = defaultdict(lambda: None) + +@pytest.fixture(scope="function") +def local_experiment(test_dir: str) -> smartsim.Experiment: + """Create a default experiment that uses the requested launcher""" + name = pathlib.Path(test_dir).stem + return smartsim.Experiment(name, exp_path=test_dir, launcher="local") + +@pytest.fixture(scope="function") +def wlm_experiment(test_dir: str, wlmutils: WLMUtils) -> smartsim.Experiment: + """Create a default experiment that uses the requested launcher""" + name = pathlib.Path(test_dir).stem + return smartsim.Experiment( + name, + exp_path=test_dir, + launcher=wlmutils.get_test_launcher() + ) + +def _cleanup_db(name: str) -> None: + global database_registry + db = database_registry[name] + if db and db.is_active(): + exp = Experiment("cleanup") + try: + db = exp.reconnect_orchestrator(db.checkpoint_file) + exp.stop(db) + except: + pass + +@dataclass +class DBConfiguration: + name: str + launcher: str + num_nodes: int + interface: t.Union[str,t.List[str]] + hostlist: t.Optional[t.List[str]] + port: int + +@dataclass +class PrepareDatabaseOutput: + orchestrator: t.Optional[Orchestrator] # The actual orchestrator object + new_db: bool # True if a new database was created when calling prepare_db + +# Reuse databases +@pytest.fixture(scope="session") +def local_db() -> t.Generator[DBConfiguration, None, None]: + name = "local_db_fixture" + config = DBConfiguration( + name, + "local", + 1, + "lo", + None, + _find_free_port(tuple(reversed(test_ports))), + ) + yield config + _cleanup_db(name) + +@pytest.fixture(scope="session") +def single_db(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: + hostlist = wlmutils.get_test_hostlist() + hostlist = hostlist[-1:] if hostlist is not None else None + name = "single_db_fixture" + config = DBConfiguration( + 
name, + wlmutils.get_test_launcher(), + 1, + wlmutils.get_test_interface(), + hostlist, + _find_free_port(tuple(reversed(test_ports))) + ) + yield config + _cleanup_db(name) + + +@pytest.fixture(scope="session") +def clustered_db(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: + hostlist = wlmutils.get_test_hostlist() + hostlist = hostlist[-4:-1] if hostlist is not None else None + name = "clustered_db_fixture" + config = DBConfiguration( + name, + wlmutils.get_test_launcher(), + 3, + wlmutils.get_test_interface(), + hostlist, + _find_free_port(tuple(reversed(test_ports))), + ) + yield config + _cleanup_db(name) + + +@pytest.fixture +def register_new_db() -> t.Callable[[DBConfiguration], Orchestrator]: + def _register_new_db( + config: DBConfiguration + ) -> Orchestrator: + exp_path = pathlib.Path(test_output_root, config.name) + exp_path.mkdir(exist_ok=True) + exp = Experiment( + config.name, + exp_path=str(exp_path), + launcher=config.launcher, + ) + orc = exp.create_database( + port=config.port, + batch=False, + interface=config.interface, + hosts=config.hostlist, + db_nodes=config.num_nodes + ) + exp.generate(orc, overwrite=True) + exp.start(orc) + global database_registry + database_registry[config.name] = orc + return orc + return _register_new_db + + +@pytest.fixture(scope="function") +def prepare_db( + register_new_db: t.Callable[ + [DBConfiguration], + Orchestrator + ] +) -> t.Callable[ + [DBConfiguration], + PrepareDatabaseOutput +]: + def _prepare_db(db_config: DBConfiguration) -> PrepareDatabaseOutput: + global database_registry + db = database_registry[db_config.name] + + new_db = False + db_up = False + + if db: + db_up = db.is_active() + + if not db_up or db is None: + db = register_new_db(db_config) + new_db = True + + return PrepareDatabaseOutput(db, new_db) + return _prepare_db diff --git a/doc/_static/version_names.json b/doc/_static/version_names.json index bbe3b332d..bc095f84a 100644 --- a/doc/_static/version_names.json +++ 
b/doc/_static/version_names.json @@ -1,7 +1,8 @@ { "version_names":[ "develop (unstable)", - "0.6.2 (stable)", + "0.7.0 (stable)", + "0.6.2", "0.6.1", "0.6.0", "0.5.1", @@ -14,6 +15,7 @@ "version_urls": [ "https://www.craylabs.org/develop/overview.html", "https://www.craylabs.org/docs/overview.html", + "https://www.craylabs.org/docs/versions/0.6.2/overview.html", "https://www.craylabs.org/docs/versions/0.6.1/overview.html", "https://www.craylabs.org/docs/versions/0.6.0/overview.html", "https://www.craylabs.org/docs/versions/0.5.1/overview.html", diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index adf7081ec..d9615e04c 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -1,17 +1,15 @@ - ************* SmartSim API ************* - .. _experiment_api: Experiment ========== - .. currentmodule:: smartsim.experiment +.. _exp_init: .. autosummary:: Experiment.__init__ @@ -27,13 +25,17 @@ Experiment Experiment.finished Experiment.get_status Experiment.reconnect_orchestrator + Experiment.preview Experiment.summary + Experiment.telemetry .. autoclass:: Experiment :show-inheritance: :members: +.. _settings-info: + Settings ======== @@ -58,6 +60,7 @@ Types of Settings: MpiexecSettings OrterunSettings JsrunSettings + DragonRunSettings SbatchSettings QsubBatchSettings BsubBatchSettings @@ -111,6 +114,7 @@ steps to a batch. .. autosummary:: SrunSettings.set_nodes + SrunSettings.set_node_feature SrunSettings.set_tasks SrunSettings.set_tasks_per_node SrunSettings.set_walltime @@ -160,6 +164,28 @@ and within batch launches (e.g., ``QsubBatchSettings``) :members: +.. _dragonsettings_api: + +DragonRunSettings +----------------- + +``DragonRunSettings`` can be used on systems that support Slurm or +PBS, if Dragon is available in the Python environment (see `_dragon_install` +for instructions on how to install it through ``smart``). + +``DragonRunSettings`` can be used in interactive sessions (on allcation) +and within batch launches (i.e. 
``SbatchSettings`` or ``QsubBatchSettings``, +for Slurm and PBS sessions, respectively). + +.. autosummary:: + DragonRunSettings.set_nodes + DragonRunSettings.set_tasks_per_node + +.. autoclass:: DragonRunSettings + :inherited-members: + :undoc-members: + :members: + .. _jsrun_api: @@ -377,23 +403,50 @@ container. :undoc-members: :members: +.. _orc_api: Orchestrator ============ .. currentmodule:: smartsim.database -.. _orc_api: +.. autosummary:: + + Orchestrator.__init__ + Orchestrator.db_identifier + Orchestrator.num_shards + Orchestrator.db_nodes + Orchestrator.hosts + Orchestrator.reset_hosts + Orchestrator.remove_stale_files + Orchestrator.get_address + Orchestrator.is_active + Orchestrator.set_cpus + Orchestrator.set_walltime + Orchestrator.set_hosts + Orchestrator.set_batch_arg + Orchestrator.set_run_arg + Orchestrator.enable_checkpoints + Orchestrator.set_max_memory + Orchestrator.set_eviction_strategy + Orchestrator.set_max_clients + Orchestrator.set_max_message_size + Orchestrator.set_db_conf + Orchestrator.telemetry + Orchestrator.checkpoint_file + Orchestrator.batch Orchestrator ------------ +.. _orchestrator_api: .. autoclass:: Orchestrator :members: :inherited-members: :undoc-members: +.. _model_api: Model ===== @@ -417,17 +470,17 @@ Model Model.disable_key_prefixing Model.query_key_prefixing +Model +----- + .. autoclass:: Model :members: :show-inheritance: :inherited-members: -.. _ensemble_api: - Ensemble ======== - .. currentmodule:: smartsim.entity.ensemble .. autosummary:: @@ -443,6 +496,11 @@ Ensemble Ensemble.query_key_prefixing Ensemble.register_incoming_entity +Ensemble +-------- + +.. _ensemble_api: + .. autoclass:: Ensemble :members: :show-inheritance: @@ -461,7 +519,6 @@ SmartSim includes built-in utilities for supporting TensorFlow, Keras, and Pytor TensorFlow ---------- - SmartSim includes built-in utilities for supporting TensorFlow and Keras in training and inference. .. 
currentmodule:: smartsim.ml.tf.utils @@ -510,13 +567,18 @@ SmartSim includes built-in utilities for supporting PyTorch in training and infe Slurm ===== - -.. currentmodule:: smartsim.slurm +.. currentmodule:: smartsim.wlm.slurm .. autosummary:: get_allocation release_allocation - -.. automodule:: smartsim.slurm + validate + get_default_partition + get_hosts + get_queue + get_tasks + get_tasks_per_node + +.. automodule:: smartsim.wlm.slurm :members: diff --git a/doc/batch_settings.rst b/doc/batch_settings.rst new file mode 100644 index 000000000..07cef4c95 --- /dev/null +++ b/doc/batch_settings.rst @@ -0,0 +1,127 @@ +.. _batch_settings_doc: + +************** +Batch Settings +************** +======== +Overview +======== +SmartSim provides functionality to launch entities (``Model`` or ``Ensemble``) +as batch jobs supported by the ``BatchSettings`` base class. While the ``BatchSettings`` base +class is not intended for direct use by users, its derived child classes offer batch +launching capabilities tailored for specific workload managers (WLMs). Each SmartSim +`launcher` interfaces with a ``BatchSettings`` subclass specific to a system's WLM: + +- The Slurm `launcher` supports: + - :ref:`SbatchSettings` +- The PBS Pro `launcher` supports: + - :ref:`QsubBatchSettings` +- The LSF `launcher` supports: + - :ref:`BsubBatchSettings` + +.. note:: + The local `launcher` does not support batch jobs. + +After creating a ``BatchSettings`` instance, users gain access to the methods +of the associated child class, providing them with the ability to further configure the batch +settings for jobs. + +In the following :ref:`Examples` subsection, we demonstrate the initialization +and configuration of a batch settings object. + +.. _batch_settings_ex: + +======== +Examples +======== +A ``BatchSettings`` child class is created using the ``Experiment.create_batch_settings`` +factory method. 
When the user initializes the ``Experiment`` at the beginning of the Python driver script, +they may specify a `launcher` argument. SmartSim will then register or detect the `launcher` and return the +corresponding supported child class when ``Experiment.create_batch_settings`` is called. This +design allows SmartSim driver scripts utilizing ``BatchSettings`` to be portable between systems, +requiring only a change in the specified `launcher` during ``Experiment`` initialization. + +Below are examples of how to initialize a ``BatchSettings`` object per `launcher`. + +.. tabs:: + + .. group-tab:: Slurm + To instantiate the ``SbatchSettings`` object, which interfaces with the Slurm job scheduler, specify + `launcher="slurm"` when initializing the ``Experiment``. Upon calling ``create_batch_settings``, + SmartSim will detect the job scheduler and return the appropriate batch settings object. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher Slurm + exp = Experiment("name-of-experiment", launcher="slurm") + + # Initialize a SbatchSettings object + sbatch_settings = exp.create_batch_settings(nodes=1, time="10:00:00") + # Set the account for the slurm batch job + sbatch_settings.set_account("12345-Cray") + # Set the partition for the slurm batch job + sbatch_settings.set_queue("default") + + The initialized ``SbatchSettings`` instance can now be passed to a SmartSim entity + (``Model`` or ``Ensemble``) via the `batch_settings` argument in ``create_batch_settings``. + + .. note:: + If `launcher="auto"`, SmartSim will detect that the ``Experiment`` is running on a Slurm based + machine and set the launcher to `"slurm"`. + + .. group-tab:: PBS Pro + To instantiate the ``QsubBatchSettings`` object, which interfaces with the PBS Pro job scheduler, specify + `launcher="pbs"` when initializing the ``Experiment``. 
Upon calling ``create_batch_settings``, + SmartSim will detect the job scheduler and return the appropriate batch settings object. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher PBS Pro + exp = Experiment("name-of-experiment", launcher="pbs") + + # Initialize a QsubBatchSettings object + qsub_batch_settings = exp.create_batch_settings(nodes=1, time="10:00:00") + # Set the account for the PBS Pro batch job + qsub_batch_settings.set_account("12345-Cray") + # Set the partition for the PBS Pro batch job + qsub_batch_settings.set_queue("default") + + The initialized ``QsubBatchSettings`` instance can now be passed to a SmartSim entity + (``Model`` or ``Ensemble``) via the `batch_settings` argument in ``create_batch_settings``. + + .. note:: + If `launcher="auto"`, SmartSim will detect that the ``Experiment`` is running on a PBS Pro based + machine and set the launcher to `"pbs"`. + + .. group-tab:: LSF + To instantiate the ``BsubBatchSettings`` object, which interfaces with the LSF job scheduler, specify + `launcher="lsf"` when initializing the ``Experiment``. Upon calling ``create_batch_settings``, + SmartSim will detect the job scheduler and return the appropriate batch settings object. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher LSF + exp = Experiment("name-of-experiment", launcher="lsf") + + # Initialize a BsubBatchSettings object + bsub_batch_settings = exp.create_batch_settings(nodes=1, time="10:00:00", batch_args={"ntasks": 1}) + # Set the account for the lsf batch job + bsub_batch_settings.set_account("12345-Cray") + # Set the partition for the lsf batch job + bsub_batch_settings.set_queue("default") + + The initialized ``BsubBatchSettings`` instance can now be passed to a SmartSim entity + (``Model`` or ``Ensemble``) via the `batch_settings` argument in ``create_batch_settings``. + + .. 
note:: + If `launcher="auto"`, SmartSim will detect that the ``Experiment`` is running on a LSF based + machine and set the launcher to `"lsf"`. + +.. warning:: + Note that initialization values provided (e.g., `nodes`, `time`, etc) will overwrite the same arguments in `batch_args` if present. \ No newline at end of file diff --git a/doc/changelog.md b/doc/changelog.md new file mode 100644 index 000000000..73ea36511 --- /dev/null +++ b/doc/changelog.md @@ -0,0 +1,952 @@ +# Changelog + +Listed here are the changes between each release of SmartSim, +SmartRedis and SmartDashboard. + +Jump to: +- {ref}`SmartRedis changelog` +- {ref}`SmartDashboard changelog` + +## SmartSim + +### 0.7.0 + +Released on 14 May, 2024 + +Description + +- Improve Dragon server shutdown +- Add dragon runtime installer +- Add launcher based on Dragon +- Reuse Orchestrators within the testing suite to improve performance. +- Fix building of documentation +- Preview entities on experiment before start +- Update authentication in release workflow +- Auto-generate type-hints into documentation +- Auto-post release PR to develop +- Bump manifest.json to version 0.0.4 +- Fix symlinking batch ensemble and model bug +- Fix noisy failing WLM test +- Remove defensive regexp in .gitignore +- Upgrade ubuntu to 22.04 +- Remove helper function `init_default` +- Fix telemetry monitor logging errors for task history +- Change default path for entities +- Drop Python 3.8 support +- Update watchdog dependency +- Historical output files stored under .smartsim directory +- Fixes unfalsifiable test that tests SmartSim's custom SIGINT signal + handler +- Add option to build Torch backend without the Intel Math Kernel + Library +- Fix ReadTheDocs build issue +- Disallow uninitialized variable use +- Promote device options to an Enum +- Update telemetry monitor, add telemetry collectors +- Add method to specify node features for a Slurm job +- Colo Orchestrator setup now blocks application start until setup + 
finished +- Refactor areas of the code where mypy potential errors +- Minor enhancements to test suite +- ExecArgs handling correction +- ReadTheDocs config file added and enabled on PRs +- Enforce changelog updates +- Fix Jupyter notebook math expressions +- Remove deprecated SmartSim modules +- SmartSim Documentation refactor +- Promote SmartSim statuses to a dedicated type +- Update the version of Redis from [7.0.4]{.title-ref} to + [7.2.4]{.title-ref} +- Increase disk space in doc builder container +- Update Experiment API typing +- Prevent duplicate entity names +- Fix publishing of development docs + +Detailed Notes + +- The Dragon server will now terminate any process which is still running + when a request of an immediate shutdown is sent. ([SmartSim-PR582](https://github.com/CrayLabs/SmartSim/pull/582)) +- Add `--dragon` option to `smart build`. Install appropriate Dragon + runtime from Dragon GitHub release assets. + ([SmartSim-PR580](https://github.com/CrayLabs/SmartSim/pull/580)) +- Add new launcher, based on [Dragon](https://dragonhpc.github.io/dragon/doc/_build/html/index.html). + The new launcher is compatible with the Slurm and PBS schedulers and can + be selected by specifying ``launcher="dragon"`` when creating an `Experiment`, + or by using ``DragonRunSettings`` to launch a job. The Dragon launcher + is at an early stage of development: early adopters are referred to the + dedicated documentation section to learn more about it. ([SmartSim-PR580](https://github.com/CrayLabs/SmartSim/pull/580)) +- Tests may now request a given configuration and will reconnect to + the existing orchestrator instead of building up and tearing down + a new one each test. + ([SmartSim-PR567](https://github.com/CrayLabs/SmartSim/pull/567)) +- Manually ensure that typing_extensions==4.6.1 in Dockerfile used to build + docs. 
This fixes the deploy_dev_docs Github action ([SmartSim-PR564](https://github.com/CrayLabs/SmartSim/pull/564)) +- Added preview functionality to Experiment, including preview of all entities, active infrastructure and + client configuration. ([SmartSim-PR525](https://github.com/CrayLabs/SmartSim/pull/525)) +- Replace the developer created token with the GH_TOKEN environment variable. + ([SmartSim-PR570](https://github.com/CrayLabs/SmartSim/pull/570)) +- Add extension to auto-generate function type-hints into documentation. + ([SmartSim-PR561](https://github.com/CrayLabs/SmartSim/pull/561)) +- Add to github release workflow to auto generate a pull request from + master into develop for release. + ([SmartSim-PR566](https://github.com/CrayLabs/SmartSim/pull/566)) +- The manifest.json version needs to match the SmartDashboard version, + which is 0.0.4 in the upcoming release. + ([SmartSim-PR563](https://github.com/CrayLabs/SmartSim/pull/563)) +- Properly symlinks batch ensembles and batch models. + ([SmartSim-PR547](https://github.com/CrayLabs/SmartSim/pull/547)) +- Remove defensive regexp in .gitignore and ensure tests write to + test_output. + ([SmartSim-PR560](https://github.com/CrayLabs/SmartSim/pull/560)) +- After dropping support for Python 3.8, ubuntu needs to be upgraded. + ([SmartSim-PR558](https://github.com/CrayLabs/SmartSim/pull/558)) +- Remove helper function `init_default` and replace with traditional + type narrowing. + ([SmartSim-PR545](https://github.com/CrayLabs/SmartSim/pull/545)) +- Ensure the telemetry monitor does not track a task_id for a managed + task. + ([SmartSim-PR557](https://github.com/CrayLabs/SmartSim/pull/557)) +- The default path for an entity is now the path to the experiment / + the entity name. create_database and create_ensemble now have path + arguments. All path arguments are compatible with relative paths. + Relative paths are relative to the CWD. 
+ ([SmartSim-PR533](https://github.com/CrayLabs/SmartSim/pull/533)) +- Python 3.8 is reaching its end-of-life in October, 2024, so it will + no longer continue to be supported. + ([SmartSim-PR544](https://github.com/CrayLabs/SmartSim/pull/544)) +- Update watchdog dependency from 3.x to 4.x, fix new type issues + ([SmartSim-PR540](https://github.com/CrayLabs/SmartSim/pull/540)) +- The dashboard needs to display historical logs, so log files are + written out under the .smartsim directory and files under the + experiment directory are symlinked to them. + ([SmartSim-PR532](https://github.com/CrayLabs/SmartSim/pull/532)) +- Add an option to smart build + \"\--torch_with_mkl\"/\"\--no_torch_with_mkl\" to prevent Torch from + trying to link in the Intel Math Kernel Library. This is needed + because on machines that have the Intel compilers installed, the + Torch will unconditionally try to link in this library, however + fails because the linking flags are incorrect. + ([SmartSim-PR538](https://github.com/CrayLabs/SmartSim/pull/538)) +- Change typing\_extensions and pydantic versions in readthedocs + environment to enable docs build. + ([SmartSim-PR537](https://github.com/CrayLabs/SmartSim/pull/537)) +- Promote devices to a dedicated Enum type throughout the SmartSim + code base. + ([SmartSim-PR527](https://github.com/CrayLabs/SmartSim/pull/527)) +- Update the telemetry monitor to enable retrieval of metrics on a + scheduled interval. Switch basic experiment tracking telemetry to + default to on. Add database metric collectors. Improve telemetry + monitor logging. Create telemetry subpackage at + [smartsim.\_core.utils.telemetry]{.title-ref}. Refactor telemetry + monitor entrypoint. + ([SmartSim-PR460](https://github.com/CrayLabs/SmartSim/pull/460)) +- Users can now specify node features for a Slurm job through + `SrunSettings.set_node_feature`. The method accepts a string or list + of strings. 
+ ([SmartSim-PR529](https://github.com/CrayLabs/SmartSim/pull/529)) +- The request to the colocated entrypoints file within the shell + script is now a blocking process. Once the Orchestrator is setup, it + returns which moves the process to the background and allows the + application to start. This prevents the application from requesting + a ML model or script that has not been uploaded to the Orchestrator + yet. + ([SmartSim-PR522](https://github.com/CrayLabs/SmartSim/pull/522)) +- Add checks and tests to ensure SmartSim users cannot initialize run + settings with a list of lists as the exe_args argument. + ([SmartSim-PR517](https://github.com/CrayLabs/SmartSim/pull/517)) +- Add readthedocs configuration file and enable readthedocs builds on + pull requests. Additionally added robots.txt file generation when + readthedocs environment detected. + ([SmartSim-PR512](https://github.com/CrayLabs/SmartSim/pull/512)) +- Add Github Actions workflow that checks if changelog is edited on + pull requests into develop. + ([SmartSim-PR518](https://github.com/CrayLabs/SmartSim/pull/518)) +- Add path to MathJax.js file so that Sphinx will use to render math + expressions. + ([SmartSim-PR516](https://github.com/CrayLabs/SmartSim/pull/516)) +- Removed deprecated SmartSim modules: slurm and mpirunSettings. + ([SmartSim-PR514](https://github.com/CrayLabs/SmartSim/pull/514)) +- Implemented new structure of SmartSim documentation. Added examples + images and further detail of SmartSim components. + ([SmartSim-PR463](https://github.com/CrayLabs/SmartSim/pull/463)) +- Promote SmartSim statuses to a dedicated type named SmartSimStatus. + ([SmartSim-PR509](https://github.com/CrayLabs/SmartSim/pull/509)) +- Update Redis version to [7.2.4]{.title-ref}. This change fixes an + issue in the Redis build scripts causing failures on Apple Silicon + hosts. 
+ ([SmartSim-PR507](https://github.com/CrayLabs/SmartSim/pull/507)) +- The container which builds the documentation for every merge to + develop was failing due to a lack of space within the container. + This was fixed by including an additional Github action that removes + some unneeded software and files that come from the default Github + Ubuntu container. + ([SmartSim-PR504](https://github.com/CrayLabs/SmartSim/pull/504)) +- Update the generic [t.Any]{.title-ref} typehints in Experiment API. + ([SmartSim-PR501](https://github.com/CrayLabs/SmartSim/pull/501)) +- The CI will fail static analysis if common erroneous truthy checks + are detected. + ([SmartSim-PR524](https://github.com/CrayLabs/SmartSim/pull/524)) +- Prevent the launch of duplicate named entities. Allow completed + entities to run. + ([SmartSim-PR480](https://github.com/CrayLabs/SmartSim/pull/480)) +- The CI will fail static analysis if a local variable used while + potentially undefined. + ([SmartSim-PR521](https://github.com/CrayLabs/SmartSim/pull/521)) +- Remove previously deprecated behavior present in test suite on + machines with Slurm and Open MPI. + ([SmartSim-PR520](https://github.com/CrayLabs/SmartSim/pull/520)) +- Experiments in the WLM tests are given explicit paths to prevent + unexpected directory creation. Ensure database are not left open on + test suite failures. Update path to pickle file in + `tests/full_wlm/test_generic_orc_launch_batch.py::test_launch_cluster_orc_reconnect` + to conform with changes made in + ([SmartSim-PR533](https://github.com/CrayLabs/SmartSim/pull/533)). + ([SmartSim-PR559](https://github.com/CrayLabs/SmartSim/pull/559)) +- When calling `Experiment.start` SmartSim would register a signal + handler that would capture an interrupt signal (\^C) to kill any + jobs launched through its `JobManager`. This would replace the + default (or user defined) signal handler. 
SmartSim will now attempt + to kill any launched jobs before calling the previously registered + signal handler. + ([SmartSim-PR535](https://github.com/CrayLabs/SmartSim/pull/535)) + +### 0.6.2 + +Released on 16 February, 2024 + +Description + +- Patch SmartSim dependency version + +Detailed Notes + +- A critical performance concern was identified and addressed in + SmartRedis. A patch fix was deployed, and SmartSim was updated to + ensure users do not inadvertently pull the unpatched version of + SmartRedis. + ([SmartSim-PR493](https://github.com/CrayLabs/SmartSim/pull/493)) + +### 0.6.1 + +Released on 15 February, 2024 + +Description + +- Duplicate for DBModel/Script prevented +- Update license to include 2024 +- Telemetry monitor is now active by default +- Add support for Mac OSX on Apple Silicon +- Remove Torch warnings during testing +- Validate Slurm timing format +- Expose Python Typehints +- Fix test_logs to prevent generation of directory +- Fix Python Typehint for colocated database settings +- Python 3.11 Support +- Quality of life [smart validate]{.title-ref} improvements +- Remove Cobalt support +- Enrich logging through context variables +- Upgrade Machine Learning dependencies +- Override sphinx-tabs background color +- Add concurrency group to test workflow +- Fix index when installing torch through smart build + +Detailed Notes + +- Modify the [git clone]{.title-ref} for both Redis and RedisAI to set + the line endings to unix-style line endings when using MacOS on ARM. + ([SmartSim-PR482](https://github.com/CrayLabs/SmartSim/pull/482)) +- Separate install instructions are now provided for Mac OSX on x64 vs + ARM64 + ([SmartSim-PR479](https://github.com/CrayLabs/SmartSim/pull/479)) +- Prevent duplicate ML model and script names being added to an + Ensemble member if the names exists. 
+ ([SmartSim-PR475](https://github.com/CrayLabs/SmartSim/pull/475)) +- Updates [Copyright (c) 2021-2023]{.title-ref} to [Copyright (c) + 2021-2024]{.title-ref} in all of the necessary files. + ([SmartSim-PR485](https://github.com/CrayLabs/SmartSim/pull/485)) +- Bug fix which prevents the expected behavior when the + [SMARTSIM_LOG_LEVEL]{.title-ref} environment variable was set to + [developer]{.title-ref}. + ([SmartSim-PR473](https://github.com/CrayLabs/SmartSim/pull/473)) +- Sets the default value of the \"enable telemetry\" flag to on. Bumps + the output [manifest.json]{.title-ref} version number to match that + of [smartdashboard]{.title-ref} and pins a watchdog version to avoid + build errors. + ([SmartSim-PR477](https://github.com/CrayLabs/SmartSim/pull/477)) +- Refactor logic of [Manifest.has_db_objects]{.title-ref} to remove + excess branching and improve readability/maintainability. + ([SmartSim-PR476](https://github.com/CrayLabs/SmartSim/pull/476)) +- SmartSim can now be built and used on platforms using Apple Silicon + (ARM64). Currently, only the PyTorch backend is supported. Note that + libtorch will be downloaded from a CrayLabs github repo. + ([SmartSim-PR465](https://github.com/CrayLabs/SmartSim/pull/465)) +- Tests that were saving Torch models were emitting warnings. These + warnings were addressed by updating the model save test function. + ([SmartSim-PR472](https://github.com/CrayLabs/SmartSim/pull/472)) +- Validate the timing format when requesting a slurm allocation. + ([SmartSim-PR471](https://github.com/CrayLabs/SmartSim/pull/471)) +- Add and ship [py.typed]{.title-ref} marker to expose inline type + hints. Fix type errors related to SmartRedis. + ([SmartSim-PR468](https://github.com/CrayLabs/SmartSim/pull/468)) +- Fix the [test_logs.py::test_context_leak]{.title-ref} test that was + erroneously creating a directory named [some value]{.title-ref} in + SmartSim\'s root directory. 
+ ([SmartSim-PR467](https://github.com/CrayLabs/SmartSim/pull/467)) +- Add Python type hinting to colocated settings. + ([SmartSim-PR462](https://github.com/CrayLabs/SmartSim/pull/462)) +- Add github actions for running black and isort checks. + ([SmartSim-PR464](https://github.com/CrayLabs/SmartSim/pull/464)) +- Relax the required version of [typing_extensions]{.title-ref}. + ([SmartSim-PR459](https://github.com/CrayLabs/SmartSim/pull/459)) +- Addition of Python 3.11 to SmartSim. + ([SmartSim-PR461](https://github.com/CrayLabs/SmartSim/pull/461)) +- Quality of life [smart validate]{.title-ref} improvements such as + setting [CUDA_VISIBLE_DEVICES]{.title-ref} environment variable + within [smart validate]{.title-ref} prior to importing any ML deps + to prevent false negatives on multi-GPU systems. Additionally, move + SmartRedis logs from standard out to dedicated log file in the + validation temporary directory as well as suppress + [sklearn]{.title-ref} deprecation warning by pinning + [KMeans]{.title-ref} constructor argument. Lastly, move TF test to + last as TF may reserve the GPUs it uses. + ([SmartSim-PR458](https://github.com/CrayLabs/SmartSim/pull/458)) +- Some actions in the current GitHub CI/CD workflows were outdated. + They were replaced with the latest versions. + ([SmartSim-PR446](https://github.com/CrayLabs/SmartSim/pull/446)) +- As the Cobalt workload manager is not used on any system we are + aware of, its support in SmartSim was terminated and classes such as + [CobaltLauncher]{.title-ref} have been removed. + ([SmartSim-PR448](https://github.com/CrayLabs/SmartSim/pull/448)) +- Experiment logs are written to a file that can be read by the + dashboard. + ([SmartSim-PR452](https://github.com/CrayLabs/SmartSim/pull/452)) +- Updated SmartSim\'s machine learning backends to PyTorch 2.0.1, + Tensorflow 2.13.1, ONNX 1.14.1, and ONNX Runtime 1.16.1. 
As a result + of this change, there is now an available ONNX wheel for use with + Python 3.10, and wheels for all of SmartSim\'s machine learning + backends with Python 3.11. + ([SmartSim-PR451](https://github.com/CrayLabs/SmartSim/pull/451)) + ([SmartSim-PR461](https://github.com/CrayLabs/SmartSim/pull/461)) +- The sphinx-tabs documentation extension uses a white background for + the tabs component. A custom CSS for those components to inherit the + overall theme color has been added. + ([SmartSim-PR453](https://github.com/CrayLabs/SmartSim/pull/453)) +- Add concurrency groups to GitHub\'s CI/CD workflows, preventing + multiple workflows from the same PR to be launched concurrently. + ([SmartSim-PR439](https://github.com/CrayLabs/SmartSim/pull/439)) +- Torch changed their preferred indexing when trying to install their + provided wheels. Updated the [pip install]{.title-ref} command + within [smart build]{.title-ref} to ensure that the appropriate + packages can be found. + ([SmartSim-PR449](https://github.com/CrayLabs/SmartSim/pull/449)) + +### 0.6.0 + +Released on 18 December, 2023 + +Description + +- Conflicting directives in the SmartSim packaging instructions were + fixed +- [sacct]{.title-ref} and [sstat]{.title-ref} errors are now fatal for + Slurm-based workflow executions +- Added documentation section about ML features and TorchScript +- Added TorchScript functions to Online Analysis tutorial +- Added multi-DB example to documentation +- Improved test stability on HPC systems +- Added support for producing & consuming telemetry outputs +- Split tests into groups for parallel execution in CI/CD pipeline +- Change signature of [Experiment.summary()]{.title-ref} +- Expose first_device parameter for scripts, functions, models +- Added support for MINBATCHTIMEOUT in model execution +- Remove support for RedisAI 1.2.5, use RedisAI 1.2.7 commit +- Add support for multiple databases + +Detailed Notes + +- Several conflicting directives between the 
[setup.py]{.title-ref} + and the [setup.cfg]{.title-ref} were fixed to mitigate warnings + issued when building the pip wheel. + ([SmartSim-PR435](https://github.com/CrayLabs/SmartSim/pull/435)) +- When the Slurm functions [sacct]{.title-ref} and [sstat]{.title-ref} + returned an error, it would be ignored and SmartSim\'s state could + become inconsistent. To prevent this, errors raised by + [sacct]{.title-ref} or [sstat]{.title-ref} now result in an + exception. + ([SmartSim-PR392](https://github.com/CrayLabs/SmartSim/pull/392)) +- A section named *ML Features* was added to documentation. It + contains multiple examples of how ML models and functions can be + added to and executed on the DB. TorchScript-based post-processing + was added to the *Online Analysis* tutorial + ([SmartSim-PR411](https://github.com/CrayLabs/SmartSim/pull/411)) +- An example of how to use multiple Orchestrators concurrently was + added to the documentation + ([SmartSim-PR409](https://github.com/CrayLabs/SmartSim/pull/409)) +- The test infrastructure was improved. Tests on HPC system are now + stable, and issues such as non-stopped [Orchestrators]{.title-ref} + or experiments created in the wrong paths have been fixed + ([SmartSim-PR381](https://github.com/CrayLabs/SmartSim/pull/381)) +- A telemetry monitor was added to check updates and produce events + for SmartDashboard + ([SmartSim-PR426](https://github.com/CrayLabs/SmartSim/pull/426)) +- Split tests into [group_a]{.title-ref}, [group_b]{.title-ref}, + [slow_tests]{.title-ref} for parallel execution in CI/CD pipeline + ([SmartSim-PR417](https://github.com/CrayLabs/SmartSim/pull/417), + [SmartSim-PR424](https://github.com/CrayLabs/SmartSim/pull/424)) +- Change [format]{.title-ref} argument to [style]{.title-ref} in + [Experiment.summary()]{.title-ref}, this is an API break + ([SmartSim-PR391](https://github.com/CrayLabs/SmartSim/pull/391)) +- Added support for first_device parameter for scripts, functions, and + models. 
This causes them to be loaded to the first num_devices + beginning with first_device + ([SmartSim-PR394](https://github.com/CrayLabs/SmartSim/pull/394)) +- Added support for MINBATCHTIMEOUT in model execution, which caps the + delay waiting for a minimium number of model execution operations to + accumulate before executing them as a batch + ([SmartSim-PR387](https://github.com/CrayLabs/SmartSim/pull/387)) +- RedisAI 1.2.5 is not supported anymore. The only RedisAI version is + now 1.2.7. Since the officially released RedisAI 1.2.7 has a bug + which breaks the build process on Mac OSX, it was decided to use + commit + [634916c](https://github.com/RedisAI/RedisAI/commit/634916c722e718cc6ea3fad46e63f7d798f9adc2) + from RedisAI\'s GitHub repository, where such bug has been fixed. + This applies to all operating systems. + ([SmartSim-PR383](https://github.com/CrayLabs/SmartSim/pull/383)) +- Add support for creation of multiple databases with unique + identifiers. + ([SmartSim-PR342](https://github.com/CrayLabs/SmartSim/pull/342)) + +### 0.5.1 + +Released on 14 September, 2023 + +Description + +- Add typehints throughout the SmartSim codebase +- Provide support for Slurm heterogeneous jobs +- Provide better support for [PalsMpiexecSettings]{.title-ref} +- Allow for easier inspection of SmartSim entities +- Log ignored error messages from [sacct]{.title-ref} +- Fix colocated db preparation bug when using + [JsrunSettings]{.title-ref} +- Fix bug when user specify CPU and devices greater than 1 +- Fix bug when get_allocation called with reserved keywords +- Enabled mypy in CI for better type safety +- Mitigate additional suppressed pylint errors +- Update linting support and apply to existing errors +- Various improvements to the [smart]{.title-ref} CLI +- Various documentation improvements +- Various test suite improvements + +Detailed Notes + +- Add methods to allow users to inspect files attached to models and + ensembles. 
+ ([SmartSim-PR352](https://github.com/CrayLabs/SmartSim/pull/352)) +- Add a [smart info]{.title-ref} target to provide rudimentary + information about the SmartSim installation. + ([SmartSim-PR350](https://github.com/CrayLabs/SmartSim/pull/350)) +- Remove unnecessary generation producing unexpected directories in + the test suite. + ([SmartSim-PR349](https://github.com/CrayLabs/SmartSim/pull/349)) +- Add support for heterogeneous jobs to [SrunSettings]{.title-ref} by + allowing users to set the [\--het-group]{.title-ref} parameter. + ([SmartSim-PR346](https://github.com/CrayLabs/SmartSim/pull/346)) +- Provide clearer guidelines on how to contribute to SmartSim. + ([SmartSim-PR344](https://github.com/CrayLabs/SmartSim/pull/344)) +- Integrate [PalsMpiexecSettings]{.title-ref} into the + [Experiment]{.title-ref} factory methods when using the + [\"pals\"]{.title-ref} launcher. + ([SmartSim-PR343](https://github.com/CrayLabs/SmartSim/pull/343)) +- Create public properties where appropriate to mitigate + [protected-access]{.title-ref} errors. + ([SmartSim-PR341](https://github.com/CrayLabs/SmartSim/pull/341)) +- Fix a failure to execute [\_prep_colocated_db]{.title-ref} due to + incorrect named attr check. + ([SmartSim-PR339](https://github.com/CrayLabs/SmartSim/pull/339)) +- Enabled and mitigated mypy [disallow_any_generics]{.title-ref} and + [warn_return_any]{.title-ref}. + ([SmartSim-PR338](https://github.com/CrayLabs/SmartSim/pull/338)) +- Add a [smart validate]{.title-ref} target to provide a simple smoke + test to assess a SmartSim build. + ([SmartSim-PR336](https://github.com/CrayLabs/SmartSim/pull/336), + [SmartSim-PR351](https://github.com/CrayLabs/SmartSim/pull/351)) +- Add typehints to [smartsim.\_core.launcher.step.\*]{.title-ref}. + ([SmartSim-PR334](https://github.com/CrayLabs/SmartSim/pull/334)) +- Log errors reported from slurm WLM when attempts to retrieve status + fail. 
+ ([SmartSim-PR331](https://github.com/CrayLabs/SmartSim/pull/331), + [SmartSim-PR332](https://github.com/CrayLabs/SmartSim/pull/332)) +- Fix incorrectly formatted positional arguments in log format + strings. + ([SmartSim-PR330](https://github.com/CrayLabs/SmartSim/pull/330)) +- Ensure that launchers pass environment variables to unmanaged job + steps. + ([SmartSim-PR329](https://github.com/CrayLabs/SmartSim/pull/329)) +- Add additional tests surrounding the [RAI_PATH]{.title-ref} + configuration environment variable. + ([SmartSim-PR328](https://github.com/CrayLabs/SmartSim/pull/328)) +- Remove unnecessary execution of unescaped shell commands. + ([SmartSim-PR327](https://github.com/CrayLabs/SmartSim/pull/327)) +- Add error if user calls get_allocation with reserved keywords in + slurm get_allocation. + ([SmartSim-PR325](https://github.com/CrayLabs/SmartSim/pull/325)) +- Add error when user requests CPU with devices greater than 1 within + add_ml_model and add_script. + ([SmartSim-PR324](https://github.com/CrayLabs/SmartSim/pull/324)) +- Update documentation surrounding ensemble key prefixing. + ([SmartSim-PR322](https://github.com/CrayLabs/SmartSim/pull/322)) +- Fix formatting of the Frontier site installation. + ([SmartSim-PR321](https://github.com/CrayLabs/SmartSim/pull/321)) +- Update pylint dependency, update .pylintrc, mitigate non-breaking + issues, suppress api breaks. + ([SmartSim-PR311](https://github.com/CrayLabs/SmartSim/pull/311)) +- Refactor the [smart]{.title-ref} CLI to use subparsers for better + documentation and extension. 
+ ([SmartSim-PR308](https://github.com/CrayLabs/SmartSim/pull/308)) + +### 0.5.0 + +Released on 6 July, 2023 + +Description + +A full list of changes and detailed notes can be found below: + +- Update SmartRedis dependency to v0.4.1 +- Fix tests for db models and scripts +- Fix add_ml_model() and add_script() documentation, tests, and code +- Remove [requirements.txt]{.title-ref} and other places where + dependencies were defined +- Replace [limit_app_cpus]{.title-ref} with + [limit_db_cpus]{.title-ref} for co-located orchestrators +- Remove wait time associated with Experiment launch summary +- Update and rename Redis conf file +- Migrate from redis-py-cluster to redis-py +- Update full test suite to not require a TF wheel at test time +- Update doc strings +- Remove deprecated code +- Relax the coloredlogs version +- Update Fortran tutorials for SmartRedis +- Add support for multiple network interface binding in Orchestrator + and Colocated DBs +- Add typehints and static analysis + +Detailed notes + +- Updates SmartRedis to the most current release + ([SmartSim-PR316](https://github.com/CrayLabs/SmartSim/pull/316)) +- Fixes and enhancements to documentation + ([SmartSim-PR317](https://github.com/CrayLabs/SmartSim/pull/317), + [SmartSim-PR314](https://github.com/CrayLabs/SmartSim/pull/314), + [SmartSim-PR287](https://github.com/CrayLabs/SmartSim/pull/287)) +- Various fixes and enhancements to the test suite + ([SmartSim-PR315](https://github.com/CrayLabs/SmartSim/pull/314), + [SmartSim-PR312](https://github.com/CrayLabs/SmartSim/pull/312), + [SmartSim-PR310](https://github.com/CrayLabs/SmartSim/pull/310), + [SmartSim-PR302](https://github.com/CrayLabs/SmartSim/pull/302), + [SmartSim-PR283](https://github.com/CrayLabs/SmartSim/pull/283)) +- Fix a defect in the tests related to database models and scripts + that was causing key collisions when testing on workload managers + ([SmartSim-PR313](https://github.com/CrayLabs/SmartSim/pull/313)) +- Remove 
[requirements.txt]{.title-ref} and other places where + dependencies were defined. + ([SmartSim-PR307](https://github.com/CrayLabs/SmartSim/pull/307)) +- Fix defect where dictionaries used to create run settings can be + changed unexpectedly due to copy-by-ref + ([SmartSim-PR305](https://github.com/CrayLabs/SmartSim/pull/305)) +- The underlying code for Model.add_ml_model() and Model.add_script() + was fixed to correctly handle multi-GPU configurations. Tests were + updated to run on non-local launchers. Documentation was updated and + fixed. Also, the default testing interface has been changed to lo + instead of ipogif. + ([SmartSim-PR304](https://github.com/CrayLabs/SmartSim/pull/304)) +- Typehints have been added. A makefile target [make + check-mypy]{.title-ref} executes static analysis with mypy. + ([SmartSim-PR295](https://github.com/CrayLabs/SmartSim/pull/295), + [SmartSim-PR301](https://github.com/CrayLabs/SmartSim/pull/301), + [SmartSim-PR303](https://github.com/CrayLabs/SmartSim/pull/303)) +- Replace [limit_app_cpus]{.title-ref} with + [limit_db_cpus]{.title-ref} for co-located orchestrators. This + resolves some incorrect behavior/assumptions about how the + application would be pinned. 
Instead, users should directly specify + the binding options in their application using the options + appropriate for their launcher + ([SmartSim-PR306](https://github.com/CrayLabs/SmartSim/pull/306)) +- Simplify code in [random_permutations]{.title-ref} parameter + generation strategy + ([SmartSim-PR300](https://github.com/CrayLabs/SmartSim/pull/300)) +- Remove wait time associated with Experiment launch summary + ([SmartSim-PR298](https://github.com/CrayLabs/SmartSim/pull/298)) +- Update Redis conf file to conform with Redis v7.0.5 conf file + ([SmartSim-PR293](https://github.com/CrayLabs/SmartSim/pull/293)) +- Migrate from redis-py-cluster to redis-py for cluster status checks + ([SmartSim-PR292](https://github.com/CrayLabs/SmartSim/pull/292)) +- Update full test suite to no longer require a tensorflow wheel to be + available at test time. + ([SmartSim-PR291](https://github.com/CrayLabs/SmartSim/pull/291)) +- Correct spelling of colocated in doc strings + ([SmartSim-PR290](https://github.com/CrayLabs/SmartSim/pull/290)) +- Deprecated launcher-specific orchestrators, constants, and ML + utilities were removed. + ([SmartSim-PR289](https://github.com/CrayLabs/SmartSim/pull/289)) +- Relax the coloredlogs version to be greater than 10.0 + ([SmartSim-PR288](https://github.com/CrayLabs/SmartSim/pull/288)) +- Update the Github Actions runner image from + [macos-10.15]{.title-ref}[ to \`macos-12]{.title-ref}\`. The former + began deprecation in May 2022 and was finally removed in May 2023. + ([SmartSim-PR285](https://github.com/CrayLabs/SmartSim/pull/285)) +- The Fortran tutorials had not been fully updated to show how to + handle return/error codes. These have now all been updated. + ([SmartSim-PR284](https://github.com/CrayLabs/SmartSim/pull/284)) +- Orchestrator and Colocated DB now accept a list of interfaces to + bind to. The argument name is still [interface]{.title-ref} for + backward compatibility reasons. 
+ ([SmartSim-PR281](https://github.com/CrayLabs/SmartSim/pull/281)) +- Typehints have been added to public APIs. A makefile target to + execute static analysis with mypy is available [make + check-mypy]{.title-ref}. + ([SmartSim-PR295](https://github.com/CrayLabs/SmartSim/pull/295)) + +### 0.4.2 + +Released on April 12, 2023 + +Description + +This release of SmartSim had a focus on polishing and extending existing +features already provided by SmartSim. Most notably, this release +provides support to allow users to colocate their models with an +orchestrator using Unix domain sockets and support for launching models +as batch jobs. + +Additionally, SmartSim has updated its tool chains to provide a better +user experience. Notably, SmartSim can now be used with Python 3.10, +Redis 7.0.5, and RedisAI 1.2.7. Furthermore, SmartSim now utilizes +SmartRedis\'s aggregation lists to streamline the use and extension of +ML data loaders, making working with popular machine learning frameworks +in SmartSim a breeze. + +A full list of changes and detailed notes can be found below: + +- Add support for colocating an orchestrator over UDS +- Add support for Python 3.10, deprecate support for Python 3.7 and + RedisAI 1.2.3 +- Drop support for Ray +- Update ML data loaders to make use of SmartRedis\'s aggregation + lists +- Allow for models to be launched independently as batch jobs +- Update to current version of Redis to 7.0.5 +- Add support for RedisAI 1.2.7, pyTorch 1.11.0, Tensorflow 2.8.0, + ONNXRuntime 1.11.1 +- Fix bug in colocated database entrypoint when loading PyTorch models +- Fix test suite behavior with environment variables + +Detailed Notes + +- Running some tests could result in some SmartSim-specific + environment variables to be set. Such environment variables are now + reset after each test execution. 
Also, a warning for environment + variable usage in Slurm was added, to make the user aware in case an + environment variable will not be assigned the desired value with + [\--export]{.title-ref}. + ([SmartSim-PR270](https://github.com/CrayLabs/SmartSim/pull/270)) +- The PyTorch and TensorFlow data loaders were updated to make use of + aggregation lists. This breaks their API, but makes them easier to + use. + ([SmartSim-PR264](https://github.com/CrayLabs/SmartSim/pull/264)) +- The support for Ray was dropped, as its most recent versions caused + problems when deployed through SmartSim. We plan to release a + separate add-on library to accomplish the same results. If you are + interested in getting the Ray launch functionality back in your + workflow, please get in touch with us! + ([SmartSim-PR263](https://github.com/CrayLabs/SmartSim/pull/263)) +- Update from Redis version 6.0.8 to 7.0.5. + ([SmartSim-PR258](https://github.com/CrayLabs/SmartSim/pull/258)) +- Adds support for Python 3.10 without the ONNX machine learning + backend. Deprecates support for Python 3.7 as it will stop receiving + security updates. Deprecates support for RedisAI 1.2.3. Update the + build process to be able to correctly fetch supported dependencies. + If a user attempts to build an unsupported dependency, an error + message is shown highlighting the discrepancy. + ([SmartSim-PR256](https://github.com/CrayLabs/SmartSim/pull/256)) +- Models were given a [batch_settings]{.title-ref} attribute. When + launching a model through [Experiment.start]{.title-ref} the + [Experiment]{.title-ref} will first check for a non-nullish value at + that attribute. If the check is satisfied, the + [Experiment]{.title-ref} will attempt to wrap the underlying run + command in a batch job using the object referenced at + [Model.batch_settings]{.title-ref} as the batch settings for the + job. If the check is not satisfied, the [Model]{.title-ref} is + launched in the traditional manner as a job step. 
+ ([SmartSim-PR245](https://github.com/CrayLabs/SmartSim/pull/245)) +- Fix bug in colocated database entrypoint stemming from uninitialized + variables. This bug affects PyTorch models being loaded into the + database. + ([SmartSim-PR237](https://github.com/CrayLabs/SmartSim/pull/237)) +- The release of RedisAI 1.2.7 allows us to update support for recent + versions of PyTorch, Tensorflow, and ONNX + ([SmartSim-PR234](https://github.com/CrayLabs/SmartSim/pull/234)) +- Make installation of correct Torch backend more reliable according + to instruction from PyTorch +- In addition to TCP, add UDS support for colocating an orchestrator + with models. Methods [Model.colocate_db_tcp]{.title-ref} and + [Model.colocate_db_uds]{.title-ref} were added to expose this + functionality. The [Model.colocate_db]{.title-ref} method remains + and uses TCP for backward compatibility + ([SmartSim-PR246](https://github.com/CrayLabs/SmartSim/pull/246)) + +### 0.4.1 + +Released on June 24, 2022 + +Description: This release of SmartSim introduces a new experimental +feature to help make SmartSim workflows more portable: the ability to +run simulation models in a container via Singularity. This feature has +been tested on a small number of platforms and we encourage users to +provide feedback on its use. + +We have also made improvements in a variety of areas: new utilities to +load scripts and machine learning models into the database directly from +SmartSim driver scripts and install-time choice to use either +[KeyDB]{.title-ref} or [Redis]{.title-ref} for the Orchestrator. The +[RunSettings]{.title-ref} API is now more consistent across subclasses. +Another key focus of this release was to aid new SmartSim users by +including more extensive tutorials and improving the documentation. The +docker image containing the SmartSim tutorials now also includes a +tutorial on online training. 
+ +Launcher improvements + +- New methods for specifying [RunSettings]{.title-ref} parameters + ([SmartSim-PR166](https://github.com/CrayLabs/SmartSim/pull/166)) + ([SmartSim-PR170](https://github.com/CrayLabs/SmartSim/pull/170)) +- Better support for [mpirun]{.title-ref}, [mpiexec]{.title-ref}, + and [orterun]{.title-ref} as launchers + ([SmartSim-PR186](https://github.com/CrayLabs/SmartSim/pull/186)) +- Experimental: add support for running models via Singularity + ([SmartSim-PR204](https://github.com/CrayLabs/SmartSim/pull/204)) + +Documentation and tutorials + +- Tutorial updates + ([SmartSim-PR155](https://github.com/CrayLabs/SmartSim/pull/155)) + ([SmartSim-PR203](https://github.com/CrayLabs/SmartSim/pull/203)) + ([SmartSim-PR208](https://github.com/CrayLabs/SmartSim/pull/208)) +- Add SmartSim Zoo info to documentation + ([SmartSim-PR175](https://github.com/CrayLabs/SmartSim/pull/175)) +- New tutorial for demonstrating online training + ([SmartSim-PR176](https://github.com/CrayLabs/SmartSim/pull/176)) + ([SmartSim-PR188](https://github.com/CrayLabs/SmartSim/pull/188)) + +General improvements and bug fixes + +- Set models and scripts at the driver level + ([SmartSim-PR185](https://github.com/CrayLabs/SmartSim/pull/185)) +- Optionally use KeyDB for the orchestrator + ([SmartSim-PR180](https://github.com/CrayLabs/SmartSim/pull/180)) +- Ability to specify system-level libraries + ([SmartSim-PR154](https://github.com/CrayLabs/SmartSim/pull/154)) + ([SmartSim-PR182](https://github.com/CrayLabs/SmartSim/pull/182)) +- Fix the handling of LSF gpus_per_shard + ([SmartSim-PR164](https://github.com/CrayLabs/SmartSim/pull/164)) +- Fix error when re-running [smart build]{.title-ref} + ([SmartSim-PR165](https://github.com/CrayLabs/SmartSim/pull/165)) +- Fix generator hanging when tagged configuration variables are + missing + ([SmartSim-PR177](https://github.com/CrayLabs/SmartSim/pull/177)) + +Dependency updates + +- CMake version from 3.10 to 3.13 + 
([SmartSim-PR152](https://github.com/CrayLabs/SmartSim/pull/152)) +- Update click to 8.0.2 + ([SmartSim-PR200](https://github.com/CrayLabs/SmartSim/pull/200)) + +### 0.4.0 + +Released on Feb 11, 2022 + +Description: In this release SmartSim continues to promote ease of use. +To this end SmartSim has introduced new portability features that allow +users to abstract away their targeted hardware, while providing even +more compatibility with existing libraries. + +A new feature, Co-located orchestrator deployments, has been added which +provides scalable online inference capabilities that overcome previous +performance limitations in separated orchestrator/application +deployments. For more information on advantages of co-located +deployments, see the Orchestrator section of the SmartSim documentation. + +The SmartSim build was significantly improved to increase customization +of build toolchain and the `smart` command line interface was expanded. + +Additional tweaks and upgrades have also been made to ensure an optimal +experience. Here is a comprehensive list of changes made in SmartSim +0.4.0. 
+ +Orchestrator Enhancements: + +- Add Orchestrator Co-location + ([SmartSim-PR139](https://github.com/CrayLabs/SmartSim/pull/139)) +- Add Orchestrator configuration file edit methods + ([SmartSim-PR109](https://github.com/CrayLabs/SmartSim/pull/109)) + +Emphasize Driver Script Portability: + +- Add ability to create run settings through an experiment + ([SmartSim-PR110](https://github.com/CrayLabs/SmartSim/pull/110)) +- Add ability to create batch settings through an experiment + ([SmartSim-PR112](https://github.com/CrayLabs/SmartSim/pull/112)) +- Add automatic launcher detection to experiment portability + functions + ([SmartSim-PR120](https://github.com/CrayLabs/SmartSim/pull/120)) + +Expand Machine Learning Library Support: + +- Data loaders for online training in Keras/TF and Pytorch + ([SmartSim-PR115](https://github.com/CrayLabs/SmartSim/pull/115)) + ([SmartSim-PR140](https://github.com/CrayLabs/SmartSim/pull/140)) +- ML backend versions updated with expanded support for multiple + versions + ([SmartSim-PR122](https://github.com/CrayLabs/SmartSim/pull/122)) +- Launch Ray internally using `RunSettings` + ([SmartSim-PR118](https://github.com/CrayLabs/SmartSim/pull/118)) +- Add Ray cluster setup and deployment to SmartSim + ([SmartSim-PR50](https://github.com/CrayLabs/SmartSim/pull/50)) + +Expand Launcher Setting Options: + +- Add ability to use base `RunSettings` on a Slurm, or PBS launchers + ([SmartSim-PR90](https://github.com/CrayLabs/SmartSim/pull/90)) +- Add ability to use base `RunSettings` on LFS launcher + ([SmartSim-PR108](https://github.com/CrayLabs/SmartSim/pull/108)) + +Deprecations and Breaking Changes + +- Orchestrator classes combined into single implementation for + portability + ([SmartSim-PR139](https://github.com/CrayLabs/SmartSim/pull/139)) +- `smartsim.constants` changed to `smartsim.status` + ([SmartSim-PR122](https://github.com/CrayLabs/SmartSim/pull/122)) +- `smartsim.tf` migrated to `smartsim.ml.tf` + 
([SmartSim-PR115](https://github.com/CrayLabs/SmartSim/pull/115)) + ([SmartSim-PR140](https://github.com/CrayLabs/SmartSim/pull/140)) +- TOML configuration option removed in favor of environment variable + approach + ([SmartSim-PR122](https://github.com/CrayLabs/SmartSim/pull/122)) + +General Improvements and Bug Fixes: + +- Improve and extend parameter handling + ([SmartSim-PR107](https://github.com/CrayLabs/SmartSim/pull/107)) + ([SmartSim-PR119](https://github.com/CrayLabs/SmartSim/pull/119)) +- Abstract away non-user facing implementation details + ([SmartSim-PR122](https://github.com/CrayLabs/SmartSim/pull/122)) +- Add various dimensions to the CI build matrix for SmartSim testing + ([SmartSim-PR130](https://github.com/CrayLabs/SmartSim/pull/130)) +- Add missing functions to LSFSettings API + ([SmartSim-PR113](https://github.com/CrayLabs/SmartSim/pull/113)) +- Add RedisAI checker for installed backends + ([SmartSim-PR137](https://github.com/CrayLabs/SmartSim/pull/137)) +- Remove heavy and unnecessary dependencies + ([SmartSim-PR116](https://github.com/CrayLabs/SmartSim/pull/116)) + ([SmartSim-PR132](https://github.com/CrayLabs/SmartSim/pull/132)) +- Fix LSFLauncher and LSFOrchestrator + ([SmartSim-PR86](https://github.com/CrayLabs/SmartSim/pull/86)) +- Fix over greedy Workload Manager Parsers + ([SmartSim-PR95](https://github.com/CrayLabs/SmartSim/pull/95)) +- Fix Slurm handling of comma-separated env vars + ([SmartSim-PR104](https://github.com/CrayLabs/SmartSim/pull/104)) +- Fix internal method calls + ([SmartSim-PR138](https://github.com/CrayLabs/SmartSim/pull/138)) + +Documentation Updates: + +- Updates to documentation build process + ([SmartSim-PR133](https://github.com/CrayLabs/SmartSim/pull/133)) + ([SmartSim-PR143](https://github.com/CrayLabs/SmartSim/pull/143)) +- Updates to documentation content + ([SmartSim-PR96](https://github.com/CrayLabs/SmartSim/pull/96)) + ([SmartSim-PR129](https://github.com/CrayLabs/SmartSim/pull/129)) + 
([SmartSim-PR136](https://github.com/CrayLabs/SmartSim/pull/136)) + ([SmartSim-PR141](https://github.com/CrayLabs/SmartSim/pull/141)) +- Update SmartSim Examples + ([SmartSim-PR68](https://github.com/CrayLabs/SmartSim/pull/68)) + ([SmartSim-PR100](https://github.com/CrayLabs/SmartSim/pull/100)) + +### 0.3.2 + +Released on August 10, 2021 + +Description: + +- Upgraded RedisAI backend to 1.2.3 + ([SmartSim-PR69](https://github.com/CrayLabs/SmartSim/pull/69)) +- PyTorch 1.7.1, TF 2.4.2, and ONNX 1.6-7 + ([SmartSim-PR69](https://github.com/CrayLabs/SmartSim/pull/69)) +- LSF launcher for IBM machines + ([SmartSim-PR62](https://github.com/CrayLabs/SmartSim/pull/62)) +- Improved code coverage by adding more unit tests + ([SmartSim-PR53](https://github.com/CrayLabs/SmartSim/pull/53)) +- Orchestrator methods to get address and check status + ([SmartSim-PR60](https://github.com/CrayLabs/SmartSim/pull/60)) +- Added Manifest object that tracks deployables in Experiments + ([SmartSim-PR61](https://github.com/CrayLabs/SmartSim/pull/61)) +- Bug fixes + ([SmartSim-PR52](https://github.com/CrayLabs/SmartSim/pull/52)) + ([SmartSim-PR58](https://github.com/CrayLabs/SmartSim/pull/58)) + ([SmartSim-PR67](https://github.com/CrayLabs/SmartSim/pull/67)) + ([SmartSim-PR73](https://github.com/CrayLabs/SmartSim/pull/73)) +- Updated documentation and examples + ([SmartSim-PR51](https://github.com/CrayLabs/SmartSim/pull/51)) + ([SmartSim-PR57](https://github.com/CrayLabs/SmartSim/pull/57)) + ([SmartSim-PR71](https://github.com/CrayLabs/SmartSim/pull/71)) +- Improved IP address acquisition + ([SmartSim-PR72](https://github.com/CrayLabs/SmartSim/pull/72)) +- Binding database to network interfaces + +### 0.3.1 + +Released on May 5, 2021 + +Description: This release was dedicated to making the install process +easier. SmartSim can be installed from PyPI now and the `smart` cli tool +makes installing the machine learning runtimes much easier. 
+ +- Pip install + ([SmartSim-PR42](https://github.com/CrayLabs/SmartSim/pull/42)) +- `smart` cli tool for ML backends + ([SmartSim-PR42](https://github.com/CrayLabs/SmartSim/pull/42)) +- Build Documentation for updated install + ([SmartSim-PR43](https://github.com/CrayLabs/SmartSim/pull/43)) +- Migrate from Jenkins to Github Actions CI + ([SmartSim-PR42](https://github.com/CrayLabs/SmartSim/pull/42)) +- Bug fix for setup.cfg + ([SmartSim-PR35](https://github.com/CrayLabs/SmartSim/pull/35)) + +### 0.3.0 + +Released on April 1, 2021 + +Description: + +- initial 0.3.0 (first public) release of SmartSim + +------------------------------------------------------------------------ + +(smartredis-changelog)= +## SmartRedis + +```{include} ../smartredis/doc/changelog.md +:start-line: 2 +``` + +------------------------------------------------------------------------ + +(smartdashboard-changelog)= +## SmartDashboard + +```{include} ../smartdashboard/doc/changelog.md +:start-line: 2 +``` diff --git a/doc/changelog.rst b/doc/changelog.rst deleted file mode 100644 index d6b735232..000000000 --- a/doc/changelog.rst +++ /dev/null @@ -1,699 +0,0 @@ -********* -Changelog -********* - -Listed here are the changes between each release of SmartSim -and SmartRedis. - -Jump to :ref:`SmartRedis Changelog ` - - -SmartSim -======== - - - -0.6.2 ------ - -Released on 16 February, 2024 - -Description - -- Patch SmartSim dependency version - - -Detailed Notes - -- A critical performance concern was identified and addressed in SmartRedis. A - patch fix was deployed, and SmartSim was updated to ensure users do not - inadvertently pull the unpatched version of SmartRedis. (SmartSim-PR493_) - - -.. 
_SmartSim-PR493: https://github.com/CrayLabs/SmartSim/pull/493 - - -0.6.1 ------ - -Released on 15 February, 2024 - -Description - -- Duplicate for DBModel/Script prevented -- Update license to include 2024 -- Telemetry monitor is now active by default -- Add support for Mac OSX on Apple Silicon -- Remove Torch warnings during testing -- Validate Slurm timing format -- Expose Python Typehints -- Fix test_logs to prevent generation of directory -- Fix Python Typehint for colocated database settings -- Python 3.11 Support -- Quality of life `smart validate` improvements -- Remove Cobalt support -- Enrich logging through context variables -- Upgrade Machine Learning dependencies -- Override sphinx-tabs background color -- Add concurrency group to test workflow -- Fix index when installing torch through smart build - - -Detailed Notes - -- Modify the `git clone` for both Redis and RedisAI to set the line endings to - unix-style line endings when using MacOS on ARM. (SmartSim-PR482_) -- Separate install instructions are now provided for Mac OSX on x64 vs ARM64 (SmartSim-PR479_) -- Prevent duplicate ML model and script names being added to an - Ensemble member if the names exists. (SmartSim-PR475_) -- Updates `Copyright (c) 2021-2023` to `Copyright (c) 2021-2024` - in all of the necessary files. (SmartSim-PR485_) -- Bug fix which prevents the expected behavior when the `SMARTSIM_LOG_LEVEL` - environment variable was set to `developer`. (SmartSim-PR473_) -- Sets the default value of the "enable telemetry" flag to on. - Bumps the output `manifest.json` version number to match that of - `smartdashboard` and pins a watchdog version to avoid build errors. - (SmartSim-PR477_) -- Refactor logic of `Manifest.has_db_objects` to remove excess branching - and improve readability/maintainability. (SmartSim-PR476_) -- SmartSim can now be built and used on platforms using Apple Silicon - (ARM64). Currently, only the PyTorch backend is supported. 
Note that libtorch - will be downloaded from a CrayLabs github repo. (SmartSim-PR465_) -- Tests that were saving Torch models were emitting warnings. These warnings - were addressed by updating the model save test function. (SmartSim-PR472_) -- Validate the timing format when requesting a slurm allocation. (SmartSim-PR471_) -- Add and ship `py.typed` marker to expose inline type hints. Fix - type errors related to SmartRedis. (SmartSim-PR468_) -- Fix the `test_logs.py::test_context_leak` test that was - erroneously creating a directory named `some value` in SmartSim's root - directory. (SmartSim-PR467_) -- Add Python type hinting to colocated settings. (SmartSim-PR462_) -- Add github actions for running black and isort checks. (SmartSim-PR464_) -- Relax the required version of `typing_extensions`. (SmartSim-PR459_) -- Addition of Python 3.11 to SmartSim. (SmartSim-PR461_) -- Quality of life `smart validate` improvements such as setting `CUDA_VISIBLE_DEVICES` - environment variable within `smart validate` prior to importing any ML deps to - prevent false negatives on multi-GPU systems. Additionally, move SmartRedis logs - from standard out to dedicated log file in the validation temporary directory as well as - suppress `sklearn` deprecation warning by pinning `KMeans` constructor - argument. Lastly, move TF test to last as TF may reserve the GPUs it uses. - (SmartSim-PR458_) -- Some actions in the current GitHub CI/CD workflows were outdated. They were - replaced with the latest versions. (SmartSim-PR446_) -- As the Cobalt workload manager is not used on any system we are aware of, - its support in SmartSim was terminated and classes such as `CobaltLauncher` have - been removed. (SmartSim-PR448_) -- Experiment logs are written to a file that can be read by the dashboard. (SmartSim-PR452_) -- Updated SmartSim's machine learning backends to PyTorch 2.0.1, Tensorflow - 2.13.1, ONNX 1.14.1, and ONNX Runtime 1.16.1. 
As a result of this change, - there is now an available ONNX wheel for use with Python 3.10, and wheels for - all of SmartSim's machine learning backends with Python 3.11. - (SmartSim-PR451_) (SmartSim-PR461_) -- The sphinx-tabs documentation extension uses a white background for the tabs component. - A custom CSS for those components to inherit the overall theme color has - been added. (SmartSim-PR453_) -- Add concurrency groups to GitHub's CI/CD workflows, preventing - multiple workflows from the same PR to be launched concurrently. - (SmartSim-PR439_) -- Torch changed their preferred indexing when trying to install - their provided wheels. Updated the `pip install` command within - `smart build` to ensure that the appropriate packages can be found. - (SmartSim-PR449_) - - -.. _SmartSim-PR485: https://github.com/CrayLabs/SmartSim/pull/485 -.. _SmartSim-PR482: https://github.com/CrayLabs/SmartSim/pull/482 -.. _SmartSim-PR479: https://github.com/CrayLabs/SmartSim/pull/479 -.. _SmartSim-PR477: https://github.com/CrayLabs/SmartSim/pull/477 -.. _SmartSim-PR476: https://github.com/CrayLabs/SmartSim/pull/476 -.. _SmartSim-PR475: https://github.com/CrayLabs/SmartSim/pull/475 -.. _SmartSim-PR473: https://github.com/CrayLabs/SmartSim/pull/473 -.. _SmartSim-PR472: https://github.com/CrayLabs/SmartSim/pull/472 -.. _SmartSim-PR471: https://github.com/CrayLabs/SmartSim/pull/471 -.. _SmartSim-PR468: https://github.com/CrayLabs/SmartSim/pull/468 -.. _SmartSim-PR467: https://github.com/CrayLabs/SmartSim/pull/467 -.. _SmartSim-PR465: https://github.com/CrayLabs/SmartSim/pull/465 -.. _SmartSim-PR464: https://github.com/CrayLabs/SmartSim/pull/464 -.. _SmartSim-PR462: https://github.com/CrayLabs/SmartSim/pull/462 -.. _SmartSim-PR461: https://github.com/CrayLabs/SmartSim/pull/461 -.. _SmartSim-PR459: https://github.com/CrayLabs/SmartSim/pull/459 -.. _SmartSim-PR458: https://github.com/CrayLabs/SmartSim/pull/458 -.. _SmartSim-PR453: https://github.com/CrayLabs/SmartSim/pull/453 -.. 
_SmartSim-PR452: https://github.com/CrayLabs/SmartSim/pull/452 -.. _SmartSim-PR451: https://github.com/CrayLabs/SmartSim/pull/451 -.. _SmartSim-PR449: https://github.com/CrayLabs/SmartSim/pull/449 -.. _SmartSim-PR448: https://github.com/CrayLabs/SmartSim/pull/448 -.. _SmartSim-PR446: https://github.com/CrayLabs/SmartSim/pull/446 -.. _SmartSim-PR439: https://github.com/CrayLabs/SmartSim/pull/439 - -0.6.0 ------ - -Released on 18 December, 2023 - -Description - -- Conflicting directives in the SmartSim packaging instructions were fixed -- `sacct` and `sstat` errors are now fatal for Slurm-based workflow executions -- Added documentation section about ML features and TorchScript -- Added TorchScript functions to Online Analysis tutorial -- Added multi-DB example to documentation -- Improved test stability on HPC systems -- Added support for producing & consuming telemetry outputs -- Split tests into groups for parallel execution in CI/CD pipeline -- Change signature of `Experiment.summary()` -- Expose first_device parameter for scripts, functions, models -- Added support for MINBATCHTIMEOUT in model execution -- Remove support for RedisAI 1.2.5, use RedisAI 1.2.7 commit -- Add support for multiple databases - -Detailed Notes - -- Several conflicting directives between the `setup.py` and the `setup.cfg` were fixed - to mitigate warnings issued when building the pip wheel. (SmartSim-PR435_) -- When the Slurm functions `sacct` and `sstat` returned an error, it would be ignored - and SmartSim's state could become inconsistent. To prevent this, errors - raised by `sacct` or `sstat` now result in an exception. (SmartSim-PR392_) -- A section named *ML Features* was added to documentation. It contains multiple - examples of how ML models and functions can be added to and executed on the DB. 
- TorchScript-based post-processing was added to the *Online Analysis* tutorial (SmartSim-PR411_) -- An example of how to use multiple Orchestrators concurrently was added to the documentation (SmartSim-PR409_) -- The test infrastructure was improved. Tests on HPC system are now stable, and issues such - as non-stopped `Orchestrators` or experiments created in the wrong paths have been fixed (SmartSim-PR381_) -- A telemetry monitor was added to check updates and produce events for SmartDashboard (SmartSim-PR426_) -- Split tests into `group_a`, `group_b`, `slow_tests` for parallel execution in CI/CD pipeline (SmartSim-PR417_, SmartSim-PR424_) -- Change `format` argument to `style` in `Experiment.summary()`, this is - an API break (SmartSim-PR391_) -- Added support for first_device parameter for scripts, functions, - and models. This causes them to be loaded to the first num_devices - beginning with first_device (SmartSim-PR394_) -- Added support for MINBATCHTIMEOUT in model execution, which caps the delay - waiting for a minimium number of model execution operations to accumulate - before executing them as a batch (SmartSim-PR387_) -- RedisAI 1.2.5 is not supported anymore. The only RedisAI version - is now 1.2.7. Since the officially released RedisAI 1.2.7 has a - bug which breaks the build process on Mac OSX, it was decided to - use commit 634916c_ from RedisAI's GitHub repository, where such - bug has been fixed. This applies to all operating systems. (SmartSim-PR383_) -- Add support for creation of multiple databases with unique identifiers. (SmartSim-PR342_) - - -.. _SmartSim-PR435: https://github.com/CrayLabs/SmartSim/pull/435 -.. _SmartSim-PR392: https://github.com/CrayLabs/SmartSim/pull/392 -.. _SmartSim-PR411: https://github.com/CrayLabs/SmartSim/pull/411 -.. _SmartSim-PR409: https://github.com/CrayLabs/SmartSim/pull/409 -.. _SmartSim-PR381: https://github.com/CrayLabs/SmartSim/pull/381 -.. _SmartSim-PR426: https://github.com/CrayLabs/SmartSim/pull/426 -.. 
_SmartSim-PR424: https://github.com/CrayLabs/SmartSim/pull/424 -.. _SmartSim-PR417: https://github.com/CrayLabs/SmartSim/pull/417 -.. _SmartSim-PR391: https://github.com/CrayLabs/SmartSim/pull/391 -.. _SmartSim-PR342: https://github.com/CrayLabs/SmartSim/pull/342 -.. _SmartSim-PR394: https://github.com/CrayLabs/SmartSim/pull/394 -.. _SmartSim-PR387: https://github.com/CrayLabs/SmartSim/pull/387 -.. _SmartSim-PR383: https://github.com/CrayLabs/SmartSim/pull/383 -.. _634916c: https://github.com/RedisAI/RedisAI/commit/634916c722e718cc6ea3fad46e63f7d798f9adc2 -.. _SmartSim-PR342: https://github.com/CrayLabs/SmartSim/pull/342 - - -0.5.1 ------ - -Released on 14 September, 2023 - -Description - -- Add typehints throughout the SmartSim codebase -- Provide support for Slurm heterogeneous jobs -- Provide better support for `PalsMpiexecSettings` -- Allow for easier inspection of SmartSim entities -- Log ignored error messages from `sacct` -- Fix colocated db preparation bug when using `JsrunSettings` -- Fix bug when user specify CPU and devices greater than 1 -- Fix bug when get_allocation called with reserved keywords -- Enabled mypy in CI for better type safety -- Mitigate additional suppressed pylint errors -- Update linting support and apply to existing errors -- Various improvements to the `smart` CLI -- Various documentation improvements -- Various test suite improvements - -Detailed Notes - -- Add methods to allow users to inspect files attached to models and ensembles. (SmartSim-PR352_) -- Add a `smart info` target to provide rudimentary information about the SmartSim installation. (SmartSim-PR350_) -- Remove unnecessary generation producing unexpected directories in the test suite. (SmartSim-PR349_) -- Add support for heterogeneous jobs to `SrunSettings` by allowing users to set the `--het-group` parameter. (SmartSim-PR346_) -- Provide clearer guidelines on how to contribute to SmartSim. 
(SmartSim-PR344_) -- Integrate `PalsMpiexecSettings` into the `Experiment` factory methods when using the `"pals"` launcher. (SmartSim-PR343_) -- Create public properties where appropriate to mitigate `protected-access` errors. (SmartSim-PR341_) -- Fix a failure to execute `_prep_colocated_db` due to incorrect named attr check. (SmartSim-PR339_) -- Enabled and mitigated mypy `disallow_any_generics` and `warn_return_any`. (SmartSim-PR338_) -- Add a `smart validate` target to provide a simple smoke test to assess a SmartSim build. (SmartSim-PR336_, SmartSim-PR351_) -- Add typehints to `smartsim._core.launcher.step.*`. (SmartSim-PR334_) -- Log errors reported from slurm WLM when attempts to retrieve status fail. (SmartSim-PR331_, SmartSim-PR332_) -- Fix incorrectly formatted positional arguments in log format strings. (SmartSim-PR330_) -- Ensure that launchers pass environment variables to unmanaged job steps. (SmartSim-PR329_) -- Add additional tests surrounding the `RAI_PATH` configuration environment variable. (SmartSim-PR328_) -- Remove unnecessary execution of unescaped shell commands. (SmartSim-PR327_) -- Add error if user calls get_allocation with reserved keywords in slurm get_allocation. (SmartSim-PR325_) -- Add error when user requests CPU with devices greater than 1 within add_ml_model and add_script. (SmartSim-PR324_) -- Update documentation surrounding ensemble key prefixing. (SmartSim-PR322_) -- Fix formatting of the Frontier site installation. (SmartSim-PR321_) -- Update pylint dependency, update .pylintrc, mitigate non-breaking issues, suppress api breaks. (SmartSim-PR311_) -- Refactor the `smart` CLI to use subparsers for better documentation and extension. (SmartSim-PR308_) - -.. _SmartSim-PR352: https://github.com/CrayLabs/SmartSim/pull/352 -.. _SmartSim-PR351: https://github.com/CrayLabs/SmartSim/pull/351 -.. _SmartSim-PR350: https://github.com/CrayLabs/SmartSim/pull/350 -.. _SmartSim-PR349: https://github.com/CrayLabs/SmartSim/pull/349 -.. 
_SmartSim-PR346: https://github.com/CrayLabs/SmartSim/pull/346 -.. _SmartSim-PR344: https://github.com/CrayLabs/SmartSim/pull/344 -.. _SmartSim-PR343: https://github.com/CrayLabs/SmartSim/pull/343 -.. _SmartSim-PR341: https://github.com/CrayLabs/SmartSim/pull/341 -.. _SmartSim-PR339: https://github.com/CrayLabs/SmartSim/pull/339 -.. _SmartSim-PR338: https://github.com/CrayLabs/SmartSim/pull/338 -.. _SmartSim-PR336: https://github.com/CrayLabs/SmartSim/pull/336 -.. _SmartSim-PR334: https://github.com/CrayLabs/SmartSim/pull/334 -.. _SmartSim-PR332: https://github.com/CrayLabs/SmartSim/pull/332 -.. _SmartSim-PR331: https://github.com/CrayLabs/SmartSim/pull/331 -.. _SmartSim-PR330: https://github.com/CrayLabs/SmartSim/pull/330 -.. _SmartSim-PR329: https://github.com/CrayLabs/SmartSim/pull/329 -.. _SmartSim-PR328: https://github.com/CrayLabs/SmartSim/pull/328 -.. _SmartSim-PR327: https://github.com/CrayLabs/SmartSim/pull/327 -.. _SmartSim-PR325: https://github.com/CrayLabs/SmartSim/pull/325 -.. _SmartSim-PR324: https://github.com/CrayLabs/SmartSim/pull/324 -.. _SmartSim-PR322: https://github.com/CrayLabs/SmartSim/pull/322 -.. _SmartSim-PR321: https://github.com/CrayLabs/SmartSim/pull/321 -.. _SmartSim-PR311: https://github.com/CrayLabs/SmartSim/pull/311 -.. 
_SmartSim-PR308: https://github.com/CrayLabs/SmartSim/pull/308 - - -0.5.0 ------ - -Released on 6 July, 2023 - -Description - -A full list of changes and detailed notes can be found below: - -- Update SmartRedis dependency to v0.4.1 -- Fix tests for db models and scripts -- Fix add_ml_model() and add_script() documentation, tests, and code -- Remove `requirements.txt` and other places where dependencies were defined -- Replace `limit_app_cpus` with `limit_db_cpus` for co-located orchestrators -- Remove wait time associated with Experiment launch summary -- Update and rename Redis conf file -- Migrate from redis-py-cluster to redis-py -- Update full test suite to not require a TF wheel at test time -- Update doc strings -- Remove deprecated code -- Relax the coloredlogs version -- Update Fortran tutorials for SmartRedis -- Add support for multiple network interface binding in Orchestrator and Colocated DBs -- Add typehints and static analysis - -Detailed notes - -- Updates SmartRedis to the most current release (SmartSim-PR316_) -- Fixes and enhancements to documentation (SmartSim-PR317_, SmartSim-PR314_, SmartSim-PR287_) -- Various fixes and enhancements to the test suite (SmartSim-PR315_, SmartSim-PR312_, SmartSim-PR310_, SmartSim-PR302_, SmartSim-PR283_) -- Fix a defect in the tests related to database models and scripts that was - causing key collisions when testing on workload managers (SmartSim-PR313_) -- Remove `requirements.txt` and other places where dependencies were defined. (SmartSim-PR307_) -- Fix defect where dictionaries used to create run settings can be changed - unexpectedly due to copy-by-ref (SmartSim-PR305_) -- The underlying code for Model.add_ml_model() and Model.add_script() was fixed - to correctly handle multi-GPU configurations. Tests were updated to run on - non-local launchers. Documentation was updated and fixed. Also, the default - testing interface has been changed to lo instead of ipogif. 
(SmartSim-PR304_) -- Typehints have been added. A makefile target `make check-mypy` executes static - analysis with mypy. (SmartSim-PR295_, SmartSim-PR301_, SmartSim-PR303_) -- Replace `limit_app_cpus` with `limit_db_cpus` for co-located orchestrators. - This resolves some incorrect behavior/assumptions about how the application - would be pinned. Instead, users should directly specify the binding options in - their application using the options appropriate for their launcher (SmartSim-PR306_) -- Simplify code in `random_permutations` parameter generation strategy (SmartSim-PR300_) -- Remove wait time associated with Experiment launch summary (SmartSim-PR298_) -- Update Redis conf file to conform with Redis v7.0.5 conf file (SmartSim-PR293_) -- Migrate from redis-py-cluster to redis-py for cluster status checks (SmartSim-PR292_) -- Update full test suite to no longer require a tensorflow wheel to be available at test time. (SmartSim-PR291_) -- Correct spelling of colocated in doc strings (SmartSim-PR290_) -- Deprecated launcher-specific orchestrators, constants, and ML - utilities were removed. (SmartSim-PR289_) -- Relax the coloredlogs version to be greater than 10.0 (SmartSim-PR288_) -- Update the Github Actions runner image from `macos-10.15`` to `macos-12``. The - former began deprecation in May 2022 and was finally removed in May 2023. (SmartSim-PR285_) -- The Fortran tutorials had not been fully updated to show how to handle - return/error codes. These have now all been updated. (SmartSim-PR284_) -- Orchestrator and Colocated DB now accept a list of interfaces to bind to. The - argument name is still `interface` for backward compatibility reasons. (SmartSim-PR281_) -- Typehints have been added to public APIs. A makefile target to execute static - analysis with mypy is available `make check-mypy`. (SmartSim-PR295_) - -.. _SmartSim-PR317: https://github.com/CrayLabs/SmartSim/pull/317 -.. _SmartSim-PR316: https://github.com/CrayLabs/SmartSim/pull/316 -.. 
_SmartSim-PR315: https://github.com/CrayLabs/SmartSim/pull/314 -.. _SmartSim-PR314: https://github.com/CrayLabs/SmartSim/pull/314 -.. _SmartSim-PR313: https://github.com/CrayLabs/SmartSim/pull/313 -.. _SmartSim-PR312: https://github.com/CrayLabs/SmartSim/pull/312 -.. _SmartSim-PR310: https://github.com/CrayLabs/SmartSim/pull/310 -.. _SmartSim-PR307: https://github.com/CrayLabs/SmartSim/pull/307 -.. _SmartSim-PR306: https://github.com/CrayLabs/SmartSim/pull/306 -.. _SmartSim-PR305: https://github.com/CrayLabs/SmartSim/pull/305 -.. _SmartSim-PR304: https://github.com/CrayLabs/SmartSim/pull/304 -.. _SmartSim-PR303: https://github.com/CrayLabs/SmartSim/pull/303 -.. _SmartSim-PR302: https://github.com/CrayLabs/SmartSim/pull/302 -.. _SmartSim-PR301: https://github.com/CrayLabs/SmartSim/pull/301 -.. _SmartSim-PR300: https://github.com/CrayLabs/SmartSim/pull/300 -.. _SmartSim-PR298: https://github.com/CrayLabs/SmartSim/pull/298 -.. _SmartSim-PR295: https://github.com/CrayLabs/SmartSim/pull/295 -.. _SmartSim-PR293: https://github.com/CrayLabs/SmartSim/pull/293 -.. _SmartSim-PR292: https://github.com/CrayLabs/SmartSim/pull/292 -.. _SmartSim-PR291: https://github.com/CrayLabs/SmartSim/pull/291 -.. _SmartSim-PR290: https://github.com/CrayLabs/SmartSim/pull/290 -.. _SmartSim-PR289: https://github.com/CrayLabs/SmartSim/pull/289 -.. _SmartSim-PR288: https://github.com/CrayLabs/SmartSim/pull/288 -.. _SmartSim-PR287: https://github.com/CrayLabs/SmartSim/pull/287 -.. _SmartSim-PR285: https://github.com/CrayLabs/SmartSim/pull/285 -.. _SmartSim-PR284: https://github.com/CrayLabs/SmartSim/pull/284 -.. _SmartSim-PR283: https://github.com/CrayLabs/SmartSim/pull/283 -.. _SmartSim-PR281: https://github.com/CrayLabs/SmartSim/pull/281 - -0.4.2 ------ - -Released on April 12, 2023 - -Description - -This release of SmartSim had a focus on polishing and extending exiting -features already provided by SmartSim. 
Most notably, this release provides -support to allow users to colocate their models with an orchestrator using -Unix domain sockets and support for launching models as batch jobs. - -Additionally, SmartSim has updated its tool chains to provide a better user -experience. Notably, SmarSim can now be used with Python 3.10, Redis 7.0.5, and -RedisAI 1.2.7. Furthermore, SmartSim now utilizes SmartRedis's aggregation lists to -streamline the use and extension of ML data loaders, making working with popular -machine learning frameworks in SmartSim a breeze. - -A full list of changes and detailed notes can be found below: - -- Add support for colocating an orchestrator over UDS -- Add support for Python 3.10, deprecate support for Python 3.7 and RedisAI 1.2.3 -- Drop support for Ray -- Update ML data loaders to make use of SmartRedis's aggregation lists -- Allow for models to be launched independently as batch jobs -- Update to current version of Redis to 7.0.5 -- Add support for RedisAI 1.2.7, pyTorch 1.11.0, Tensorflow 2.8.0, ONNXRuntime 1.11.1 -- Fix bug in colocated database entrypoint when loading PyTorch models -- Fix test suite behavior with environment variables - -Detailed Notes - -- Running some tests could result in some SmartSim-specific environment variables to be set. Such environment variables are now reset - after each test execution. Also, a warning for environment variable usage in Slurm was added, to make the user aware in case an environment - variable will not be assigned the desired value with `--export`. (SmartSim-PR270_) -- The PyTorch and TensorFlow data loaders were update to make use of aggregation lists. This breaks their API, but makes them easier to use. (SmartSim-PR264_) -- The support for Ray was dropped, as its most recent versions caused problems when deployed through SmartSim. - We plan to release a separate add-on library to accomplish the same results. 
If - you are interested in getting the Ray launch functionality back in your workflow, please get in touch with us! (SmartSim-PR263_) -- Update from Redis version 6.0.8 to 7.0.5. (SmartSim-PR258_) -- Adds support for Python 3.10 without the ONNX machine learning backend. Deprecates support for - Python 3.7 as it will stop receiving security updates. Deprecates support for RedisAI 1.2.3. - Update the build process to be able to correctly fetch supported dependencies. If a user - attempts to build an unsupported dependency, an error message is shown highlighting the - discrepancy. (SmartSim-PR256_) -- Models were given a `batch_settings` attribute. When launching a model through `Experiment.start` - the `Experiment` will first check for a non-nullish value at that attribute. If the check is - satisfied, the `Experiment` will attempt to wrap the underlying run command in a batch job using - the object referenced at `Model.batch_settings` as the batch settings for the job. If the check - is not satisfied, the `Model` is launched in the traditional manner as a job step. (SmartSim-PR245_) -- Fix bug in colocated database entrypoint stemming from uninitialized variables. This bug affects PyTorch models being loaded into the database. (SmartSim-PR237_) -- The release of RedisAI 1.2.7 allows us to update support for recent versions of PyTorch, Tensorflow, and ONNX (SmartSim-PR234_) -- Make installation of correct Torch backend more reliable according to instruction from PyTorch -- In addition to TCP, add UDS support for colocating an orchestrator with models. Methods - `Model.colocate_db_tcp` and `Model.colocate_db_uds` were added to expose this functionality. - The `Model.colocate_db` method remains and uses TCP for backward compatibility (SmartSim-PR246_) - -.. _SmartSim-PR270: https://github.com/CrayLabs/SmartSim/pull/270 -.. _SmartSim-PR264: https://github.com/CrayLabs/SmartSim/pull/264 -.. _SmartSim-PR263: https://github.com/CrayLabs/SmartSim/pull/263 -.. 
_SmartSim-PR258: https://github.com/CrayLabs/SmartSim/pull/258 -.. _SmartSim-PR256: https://github.com/CrayLabs/SmartSim/pull/256 -.. _SmartSim-PR246: https://github.com/CrayLabs/SmartSim/pull/246 -.. _SmartSim-PR245: https://github.com/CrayLabs/SmartSim/pull/245 -.. _SmartSim-PR237: https://github.com/CrayLabs/SmartSim/pull/237 -.. _SmartSim-PR234: https://github.com/CrayLabs/SmartSim/pull/234 - - -0.4.1 ------ - -Released on June 24, 2022 - -Description: -This release of SmartSim introduces a new experimental feature to help make -SmartSim workflows more portable: the ability to run simulations models in a -container via Singularity. This feature has been tested on a small number of -platforms and we encourage users to provide feedback on its use. - -We have also made improvements in a variety of areas: new utilities to load -scripts and machine learning models into the database directly from SmartSim -driver scripts and install-time choice to use either `KeyDB` or `Redis` for the -Orchestrator. The `RunSettings` API is now more consistent across subclasses. Another -key focus of this release was to aid new SmartSim users by including more -extensive tutorials and improving the documentation. The docker image containing -the SmartSim tutorials now also includes a tutorial on online training. 
- - -Launcher improvements - - - New methods for specifying `RunSettings` parameters (SmartSim-PR166_) (SmartSim-PR170_) - - Better support for `mpirun`, `mpiexec`, and `orterun` as launchers (SmartSim-PR186_) - - Experimental: add support for running models via Singularity (SmartSim-PR204_) - -Documentation and tutorials - - - Tutorial updates (SmartSim-PR155_) (SmartSim-PR203_) (SmartSim-PR208_) - - Add SmartSim Zoo info to documentation (SmartSim-PR175_) - - New tutorial for demonstrating online training (SmartSim-PR176_) (SmartSim-PR188_) - -General improvements and bug fixes - - - Set models and scripts at the driver level (SmartSim-PR185_) - - Optionally use KeyDB for the orchestrator (SmartSim-PR180_) - - Ability to specify system-level libraries (SmartSim-PR154_) (SmartSim-PR182_) - - Fix the handling of LSF gpus_per_shard (SmartSim-PR164_) - - Fix error when re-running `smart build` (SmartSim-PR165_) - - Fix generator hanging when tagged configuration variables are missing (SmartSim-PR177_) - -Dependency updates - - - CMake version from 3.10 to 3.13 (SmartSim-PR152_) - - Update click to 8.0.2 (SmartSim-PR200_) - -.. _SmartSim-PR152: https://github.com/CrayLabs/SmartSim/pull/152 -.. _SmartSim-PR154: https://github.com/CrayLabs/SmartSim/pull/154 -.. _SmartSim-PR155: https://github.com/CrayLabs/SmartSim/pull/155 -.. _SmartSim-PR164: https://github.com/CrayLabs/SmartSim/pull/164 -.. _SmartSim-PR165: https://github.com/CrayLabs/SmartSim/pull/165 -.. _SmartSim-PR166: https://github.com/CrayLabs/SmartSim/pull/166 -.. _SmartSim-PR170: https://github.com/CrayLabs/SmartSim/pull/170 -.. _SmartSim-PR175: https://github.com/CrayLabs/SmartSim/pull/175 -.. _SmartSim-PR176: https://github.com/CrayLabs/SmartSim/pull/176 -.. _SmartSim-PR177: https://github.com/CrayLabs/SmartSim/pull/177 -.. _SmartSim-PR180: https://github.com/CrayLabs/SmartSim/pull/180 -.. _SmartSim-PR182: https://github.com/CrayLabs/SmartSim/pull/182 -.. 
_SmartSim-PR185: https://github.com/CrayLabs/SmartSim/pull/185 -.. _SmartSim-PR186: https://github.com/CrayLabs/SmartSim/pull/186 -.. _SmartSim-PR188: https://github.com/CrayLabs/SmartSim/pull/188 -.. _SmartSim-PR200: https://github.com/CrayLabs/SmartSim/pull/200 -.. _SmartSim-PR203: https://github.com/CrayLabs/SmartSim/pull/203 -.. _SmartSim-PR204: https://github.com/CrayLabs/SmartSim/pull/204 -.. _SmartSim-PR208: https://github.com/CrayLabs/SmartSim/pull/208 - -0.4.0 ------ - -Released on Feb 11, 2022 - -Description: -In this release SmartSim continues to promote ease of use. -To this end SmartSim has introduced new portability features -that allow users to abstract away their targeted hardware, -while providing even more compatibility with existing -libraries. - -A new feature, Co-located orchestrator deployments has -been added which provides scalable online inference -capabilities that overcome previous performance limitations -in seperated orchestrator/application deployments. -For more information on advantages of co-located deployments, -see the Orchestrator section of the SmartSim documentation. - -The SmartSim build was significantly improved to increase -customization of build toolchain and the ``smart`` command -line inferface was expanded. - -Additional tweaks and upgrades have also been -made to ensure an optimal experience. Here is a -comprehensive list of changes made in SmartSim 0.4.0. 
- - -Orchestrator Enhancements: - - - Add Orchestrator Co-location (SmartSim-PR139_) - - Add Orchestrator configuration file edit methods (SmartSim-PR109_) - -Emphasize Driver Script Portability: - - - Add ability to create run settings through an experiment (SmartSim-PR110_) - - Add ability to create batch settings through an experiment (SmartSim-PR112_) - - Add automatic launcher detection to experiment portability functions (SmartSim-PR120_) - -Expand Machine Learning Library Support: - - - Data loaders for online training in Keras/TF and Pytorch (SmartSim-PR115_) (SmartSim-PR140_) - - ML backend versions updated with expanded support for multiple versions (SmartSim-PR122_) - - Launch Ray internally using ``RunSettings`` (SmartSim-PR118_) - - Add Ray cluster setup and deployment to SmartSim (SmartSim-PR50_) - -Expand Launcher Setting Options: - - - Add ability to use base ``RunSettings`` on a Slurm, or PBS launchers (SmartSim-PR90_) - - Add ability to use base ``RunSettings`` on LFS launcher (SmartSim-PR108_) - -Deprecations and Breaking Changes - - - Orchestrator classes combined into single implementation for portability (SmartSim-PR139_) - - ``smartsim.constants`` changed to ``smartsim.status`` (SmartSim-PR122_) - - ``smartsim.tf`` migrated to ``smartsim.ml.tf`` (SmartSim-PR115_) (SmartSim-PR140_) - - TOML configuration option removed in favor of environment variable approach (SmartSim-PR122_) - -General Improvements and Bug Fixes: - - - Improve and extend parameter handling (SmartSim-PR107_) (SmartSim-PR119_) - - Abstract away non-user facing implementation details (SmartSim-PR122_) - - Add various dimensions to the CI build matrix for SmartSim testing (SmartSim-PR130_) - - Add missing functions to LSFSettings API (SmartSim-PR113_) - - Add RedisAI checker for installed backends (SmartSim-PR137_) - - Remove heavy and unnecessary dependencies (SmartSim-PR116_) (SmartSim-PR132_) - - Fix LSFLauncher and LSFOrchestrator (SmartSim-PR86_) - - Fix over greedy 
Workload Manager Parsers (SmartSim-PR95_) - - Fix Slurm handling of comma-separated env vars (SmartSim-PR104_) - - Fix internal method calls (SmartSim-PR138_) - -Documentation Updates: - - - Updates to documentation build process (SmartSim-PR133_) (SmartSim-PR143_) - - Updates to documentation content (SmartSim-PR96_) (SmartSim-PR129_) (SmartSim-PR136_) (SmartSim-PR141_) - - Update SmartSim Examples (SmartSim-PR68_) (SmartSim-PR100_) - - -.. _SmartSim-PR50: https://github.com/CrayLabs/SmartSim/pull/50 -.. _SmartSim-PR68: https://github.com/CrayLabs/SmartSim/pull/68 -.. _SmartSim-PR86: https://github.com/CrayLabs/SmartSim/pull/86 -.. _SmartSim-PR90: https://github.com/CrayLabs/SmartSim/pull/90 -.. _SmartSim-PR95: https://github.com/CrayLabs/SmartSim/pull/95 -.. _SmartSim-PR96: https://github.com/CrayLabs/SmartSim/pull/96 -.. _SmartSim-PR100: https://github.com/CrayLabs/SmartSim/pull/100 -.. _SmartSim-PR104: https://github.com/CrayLabs/SmartSim/pull/104 -.. _SmartSim-PR107: https://github.com/CrayLabs/SmartSim/pull/107 -.. _SmartSim-PR108: https://github.com/CrayLabs/SmartSim/pull/108 -.. _SmartSim-PR109: https://github.com/CrayLabs/SmartSim/pull/109 -.. _SmartSim-PR110: https://github.com/CrayLabs/SmartSim/pull/110 -.. _SmartSim-PR112: https://github.com/CrayLabs/SmartSim/pull/112 -.. _SmartSim-PR113: https://github.com/CrayLabs/SmartSim/pull/113 -.. _SmartSim-PR115: https://github.com/CrayLabs/SmartSim/pull/115 -.. _SmartSim-PR116: https://github.com/CrayLabs/SmartSim/pull/116 -.. _SmartSim-PR118: https://github.com/CrayLabs/SmartSim/pull/118 -.. _SmartSim-PR119: https://github.com/CrayLabs/SmartSim/pull/119 -.. _SmartSim-PR120: https://github.com/CrayLabs/SmartSim/pull/120 -.. _SmartSim-PR122: https://github.com/CrayLabs/SmartSim/pull/122 -.. _SmartSim-PR129: https://github.com/CrayLabs/SmartSim/pull/129 -.. _SmartSim-PR130: https://github.com/CrayLabs/SmartSim/pull/130 -.. _SmartSim-PR132: https://github.com/CrayLabs/SmartSim/pull/132 -.. 
_SmartSim-PR133: https://github.com/CrayLabs/SmartSim/pull/133 -.. _SmartSim-PR136: https://github.com/CrayLabs/SmartSim/pull/136 -.. _SmartSim-PR137: https://github.com/CrayLabs/SmartSim/pull/137 -.. _SmartSim-PR138: https://github.com/CrayLabs/SmartSim/pull/138 -.. _SmartSim-PR139: https://github.com/CrayLabs/SmartSim/pull/139 -.. _SmartSim-PR140: https://github.com/CrayLabs/SmartSim/pull/140 -.. _SmartSim-PR141: https://github.com/CrayLabs/SmartSim/pull/141 -.. _SmartSim-PR143: https://github.com/CrayLabs/SmartSim/pull/143 - - -0.3.2 ------ - -Released on August 10, 2021 - -Description: - - - Upgraded RedisAI backend to 1.2.3 (SmartSim-PR69_) - - PyTorch 1.7.1, TF 2.4.2, and ONNX 1.6-7 (SmartSim-PR69_) - - LSF launcher for IBM machines (SmartSim-PR62_) - - Improved code coverage by adding more unit tests (SmartSim-PR53_) - - Orchestrator methods to get address and check status (SmartSim-PR60_) - - Added Manifest object that tracks deployables in Experiments (SmartSim-PR61_) - - Bug fixes (SmartSim-PR52_) (SmartSim-PR58_) (SmartSim-PR67_) (SmartSim-PR73_) - - Updated documentation and examples (SmartSim-PR51_) (SmartSim-PR57_) (SmartSim-PR71_) - - Improved IP address aquisition (SmartSim-PR72_) - - Binding database to network interfaces - -.. _SmartSim-PR51: https://github.com/CrayLabs/SmartSim/pull/51 -.. _SmartSim-PR52: https://github.com/CrayLabs/SmartSim/pull/52 -.. _SmartSim-PR53: https://github.com/CrayLabs/SmartSim/pull/53 -.. _SmartSim-PR57: https://github.com/CrayLabs/SmartSim/pull/57 -.. _SmartSim-PR58: https://github.com/CrayLabs/SmartSim/pull/58 -.. _SmartSim-PR60: https://github.com/CrayLabs/SmartSim/pull/60 -.. _SmartSim-PR61: https://github.com/CrayLabs/SmartSim/pull/61 -.. _SmartSim-PR62: https://github.com/CrayLabs/SmartSim/pull/62 -.. _SmartSim-PR67: https://github.com/CrayLabs/SmartSim/pull/67 -.. _SmartSim-PR69: https://github.com/CrayLabs/SmartSim/pull/69 -.. _SmartSim-PR71: https://github.com/CrayLabs/SmartSim/pull/71 -.. 
_SmartSim-PR72: https://github.com/CrayLabs/SmartSim/pull/72 -.. _SmartSim-PR73: https://github.com/CrayLabs/SmartSim/pull/73 - -0.3.1 ------ - -Released on May 5, 2021 - -Description: -This release was dedicated to making the install process -easier. SmartSim can be installed from PyPI now and the -``smart`` cli tool makes installing the machine learning -runtimes much easier. - - - Pip install (SmartSim-PR42_) - - ``smart`` cli tool for ML backends (SmartSim-PR42_) - - Build Documentation for updated install (SmartSim-PR43_) - - Migrate from Jenkins to Github Actions CI (SmartSim-PR42_) - - Bug fix for setup.cfg (SmartSim-PR35_) - -.. _SmartSim-PR43: https://github.com/CrayLabs/SmartSim/pull/43 -.. _SmartSim-PR42: https://github.com/CrayLabs/SmartSim/pull/42 -.. _SmartSim-PR35: https://github.com/CrayLabs/SmartSim/pull/35 - -0.3.0 ------ - -Released on April 1, 2021 - -Description: - - - initial 0.3.0 (first public) release of SmartSim - - ---------------------------------------------------------------- - -.. _sr_changelog: - -SmartRedis -========== - -.. include:: ../smartredis/doc/changelog.rst - :start-line: 3 diff --git a/doc/conf.py b/doc/conf.py index e489fd797..932bce013 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -14,6 +14,9 @@ import os import sys +import logging +import inspect +from sphinx.util.logging import SphinxLoggerAdapter sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- @@ -26,7 +29,7 @@ import smartsim version = smartsim.__version__ except ImportError: - version = "0.6.2" + version = "0.7.0" # The full version, including alpha/beta/rc tags release = version @@ -39,6 +42,7 @@ # ones. 
extensions = [ 'sphinx.ext.autodoc', + 'sphinx_autodoc_typehints', 'sphinx.ext.autosectionlabel', 'sphinx.ext.todo', 'sphinx.ext.coverage', @@ -52,18 +56,39 @@ 'breathe', 'nbsphinx', 'sphinx_copybutton', - 'sphinx_tabs.tabs' + 'sphinx_tabs.tabs', + 'sphinx_design', + 'sphinx.ext.mathjax', + 'myst_parser' ] - +# sphinx_autodoc_typehints configurations +always_use_bars_union = True +typehints_document_rtype = True +typehints_use_signature = True +typehints_use_signature_return = True +typehints_defaults = 'comma' + +autodoc_mock_imports = ["smartredis.smartredisPy"] suppress_warnings = ['autosectionlabel'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +linkcheck_ignore = [ + 'Redis::set_model_multigpu', +] + +# The path to the MathJax.js file that Sphinx will use to render math expressions +mathjax_path = 'https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', "**.ipynb_checkpoints"] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', "**.ipynb_checkpoints", "tutorials/ml_training/surrogate/README.md", "tutorials/online_analysis/lattice/README.md"] breathe_projects = { "c_client":"../smartredis/doc/c_client/xml", @@ -82,6 +107,12 @@ # a list of builtin themes. 
html_theme = "sphinx_book_theme" +# Check if the environment variable is set to 'True' +if os.environ.get('READTHEDOCS') == "True": + # If it is, generate the robots.txt file + with open('./robots.txt', 'w') as f: + f.write("# Disallow crawling of the Read the Docs URL\nUser-agent: *\nDisallow: /en/") + html_extra_path = ['./robots.txt'] # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -104,8 +135,43 @@ # white background with dark themes. If sphinx-tabs updates its # static/tabs.css, this may need to be updated. html_css_files = ['custom_tab_style.css'] - autoclass_content = 'both' add_module_names = False nbsphinx_execute = 'never' + +from inspect import getsourcefile + +# Get path to directory containing this file, conf.py. +DOCS_DIRECTORY = os.path.dirname(os.path.abspath(getsourcefile(lambda: 0))) + +def ensure_pandoc_installed(_): + import pypandoc + + # Download pandoc if necessary. If pandoc is already installed and on + # the PATH, the installed version will be used. Otherwise, we will + # download a copy of pandoc into docs/bin/ and add that to our PATH. 
+ pandoc_dir = os.path.join(DOCS_DIRECTORY, "bin") + # Add dir containing pandoc binary to the PATH environment variable + if pandoc_dir not in os.environ["PATH"].split(os.pathsep): + os.environ["PATH"] += os.pathsep + pandoc_dir + pypandoc.ensure_pandoc_installed( + targetfolder=pandoc_dir, + delete_installer=True, + ) + + +def setup(app): + app.connect("builder-inited", ensure_pandoc_installed) + + # Below code from https://github.com/sphinx-doc/sphinx/issues/10219 + def _is_sphinx_logger_adapter(obj): + return isinstance(obj, SphinxLoggerAdapter) + class ForwardReferenceFilter(logging.Filter): + def filter(self, record): + # Suppress the warning related to forward references + return "Cannot resolve forward reference in type annotations" not in record.getMessage() + + members = inspect.getmembers(app.extensions['sphinx_autodoc_typehints'].module, _is_sphinx_logger_adapter) + for _, adapter in members: + adapter.logger.addFilter(ForwardReferenceFilter()) diff --git a/doc/dragon.rst b/doc/dragon.rst new file mode 100644 index 000000000..0bf6a8ea3 --- /dev/null +++ b/doc/dragon.rst @@ -0,0 +1,169 @@ +****** +Dragon +****** + +======== +Overview +======== + +Dragon is a composable distributed run-time targeting HPC workflows. In SmartSim, +Dragon can be used as a launcher, within a Slurm or PBS allocation or batch job. +The SmartSim team collaborates with the Dragon team to develop an efficient +launcher which will enable fast, interactive, and customized execution of +complex workflows on large HPC systems. As Dragon is scheduler-agnostic, +the same SmartSim script using Dragon as a launcher can be run indifferently +on a Slurm or PBS system. Support for additional schedulers is coming soon. + +.. warning:: + The Dragon launcher is currently in its early development stage and should be treated as + a prototype implementation. Your assistance is invaluable in identifying any issues + encountered during usage and suggesting missing features for implementation. 
Please + provide feedback in the form of a created issue on the + `SmartSim issues GitHub page `_. + The :ref:`Known Issues section` is also a good starting + point when troubleshooting workflows run using the Dragon launcher. + +===== +Usage +===== +To use Dragon, you need to install it in your current Python environment. This can +be accomplished by providing the ``--dragon`` flag to the ``smart build`` command, as +detailed in the :ref:`Dragon Install `. Note that specifying the device +configuration is also required for a proper build. + +After installation, specify Dragon as the launcher when creating an ``Experiment``: + +.. code-block:: python + + exp = Experiment(name="dragon-example", launcher="dragon") + +Dragon introduces its own run settings class, ``DragonRunSettings``, which allows users to +specify nodes and tasks per node for a ``Model``. For instance, continuing from the previous +example: + +.. code-block:: python + + # Because "dragon" was specified as the launcher during Experiment initialization, + # create_run_settings will return a DragonRunSettings object + rs = exp.create_run_settings(exe="mpi_app", + exe_args=["--option", "value"], + env_vars={"MYVAR": "VALUE"}) + # Above we specify the executable (exe), executable arguments (exe_args) + # and environment variables (env_vars) + + # Sets the number of nodes for this job + rs.set_nodes(4) + # Set the tasks per node for this job + rs.set_tasks_per_node(3) + # Initialize the Model and pass in the DragonRunSettings object + mpi_app = exp.create_model("MPI_APP", run_settings=rs) + # Start the Model + exp.start(mpi_app) + +SmartSim supports ``DragonRunSettings`` with ``Model``, ``Ensemble`` and ``Orchestrator`` entities. +In the next sections, we detail how Dragon is integrated into SmartSim. + +For more information on HPC launchers, visit the :ref:`Run Settings` page. 
+ +================= +The Dragon Server +================= + +Dragon can initiate processes on any available resource within an allocation. To facilitate +this, SmartSim initializes the Dragon infrastructure whenever a ``Model`` is launched and maintains +it until the parent ``Experiment`` concludes. To facilitate interaction with processes managed by +Dragon, SmartSim establishes a command server within the Dragon infrastructure. This server, +known as the `Dragon Server`, is responsible for executing commands to start or stop processes +and to query their status. + +Sharing the Dragon Server across Experiments +============================================ + +Currently, SmartSim supports only one Dragon server per allocation. Consequently, +if multiple Experiments need to run within the same allocation, the Dragon server +must be shared among them. By default, the server starts from a subdirectory +of the ``Experiment`` path, where it creates a configuration file. +To enable server sharing, users can specify a custom path +from which the server should be launched. This can be achieved by setting the +environment variable ``SMARTSIM_DRAGON_SERVER_PATH`` to an existing absolute path. +Each ``Experiment`` will then search for the configuration file in the specified path +and initiate a new server instance only if the file is not found. + +Dragon's High-Speed Transport Agents +==================================== + +On systems equipped with the HPE Slingshot interconnect, Dragon utilizes High-Speed +Transport Agents (HSTA) by default for internal messaging within the infrastructure +launched by SmartSim. On systems without the HPE Slingshot interconnect, +TCP agents are employed. To specify the use of TCP agents, users must set the environment +variable ``SMARTSIM_DRAGON_TRANSPORT`` to ``tcp`` prior to executing the Experiment. +To specify HSTA, ``SMARTSIM_DRAGON_TRANSPORT`` can be set to ``hsta`` or left unset. 
+ +============= +Communication +============= + +SmartSim and the Dragon Server communicate using `ZeroMQ `_. + +Similar to other communication protocols, defining timeouts for send and receive operations +is crucial in SmartSim. SmartSim configures default timeouts that have been tested on various +systems, such as Polaris, Perlmutter, and other HPE Cray EX and Apollo systems. +However, if you encounter failed communication attempts, adjusting the timeouts may +be necessary. You can adjust these timeouts by setting the corresponding environment variables: + +- **Server Start-up Timeout**: This timeout specifies the duration the SmartSim ``Experiment`` + waits when the server is initially started. It must accommodate the time required for + Dragon to set up the infrastructure, which varies based on the system's workload manager + response time. The default timeout is `"300000"` milliseconds (i.e., five minutes), and you can override + it using the ``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` environment variable. + +- **Server Send and Receive Timeout**: This timeout dictates how long SmartSim and the Dragon + server wait to send or receive a message. The default timeout is `"30000"` milliseconds (i.e., 30 seconds), + and you can modify it using the ``SMARTSIM_DRAGON_TIMEOUT`` environment variable. + +Setting any timeout to "-1" will result in an infinite waiting time, causing the execution to +block until the communication is completed, potentially hanging indefinitely if issues occur. + +It's important to note that all communications are secured with `elliptic curve cryptography `_. +SmartSim generates the necessary key-pairs and stores them in the user's home directory by +default. However, you can specify an alternative absolute path using the ``SMARTSIM_KEY_PATH`` +environment variable. + +.. 
_dragon_known_issues: + +============ +Known issues +============ + +As previously noted, the integration of SmartSim with Dragon is still in its early +development stage, and there are known issues that may result in unexpected behavior +during runs: + +- **Incomplete cleanup of Dragon resources**: When SmartSim exits, it attempts to properly + shut down the Dragon infrastructure to clean up associated resources, such as shared memory + segments, and terminate all processes. However, in rare cases, if the execution is + abruptly interrupted (e.g., by terminating SmartSim with ``SIGKILL``), the cleanup process + may be incomplete, leaving processes like the Dragon overlay network active on the node + where SmartSim was executed (which could be a login node, particularly on Slurm systems). + If this occurs, you can use the following command to address the issue: + + .. code-block:: + + smart teardown --dragon + + This command will terminate all Dragon-related processes, release shared memory segments, + but also terminate all Python processes associated with your username. + +- **Dragon server not starting**: This issue may arise due to two main reasons: + + 1. *HSTA not available on the system*: Try setting the environment variable + ``SMARTSIM_DRAGON_TRANSPORT`` to ``tcp``. + 2. *System or Workload Manager too busy*: Attempt to mitigate this by setting the environment + variable ``SMARTSIM_DRAGON_STARTUP_TIMEOUT`` to a larger value or ``"-1"``. + +- **MPI-based applications hanging**: To run MPI-based applications on Dragon, Cray PMI or + Cray PALS must be available on the system. This limitation is currently being addressed. + + +Interested users can learn more about the Dragon project at the external +`Dragon documentation page `_. \ No newline at end of file diff --git a/doc/ensemble.rst b/doc/ensemble.rst new file mode 100644 index 000000000..93019d18d --- /dev/null +++ b/doc/ensemble.rst @@ -0,0 +1,1214 @@ +.. 
_ensemble_doc: + +******** +Ensemble +******** +======== +Overview +======== +A SmartSim ``Ensemble`` enables users to run a **group** of computational tasks together in an +``Experiment`` workflow. An ``Ensemble`` is comprised of multiple ``Model`` objects, +where each ``Ensemble`` member (SmartSim ``Model``) represents an individual application. +An ``Ensemble`` can be managed as a single entity and +launched with other :ref:`Model's` and :ref:`Orchestrators` to construct AI-enabled workflows. + +The :ref:`Ensemble API` offers key features, including methods to: + +- :ref:`Attach Configuration Files` for use at ``Ensemble`` runtime. +- :ref:`Load AI Models` (TF, TF-lite, PT, or ONNX) into the ``Orchestrator`` at ``Ensemble`` runtime. +- :ref:`Load TorchScripts` into the ``Orchestrator`` at ``Ensemble`` runtime. +- :ref:`Prevent Data Collisions` within the ``Ensemble``, which allows for reuse of application code. + +To create a SmartSim ``Ensemble``, use the ``Experiment.create_ensemble`` API function. When +initializing an ``Ensemble``, consider one of the **three** creation strategies explained +in the :ref:`Initialization` section. + +SmartSim manages ``Ensemble`` instances through the :ref:`Experiment API` by providing functions to +launch, monitor, and stop applications. + +.. _init_ensemble_strategies: + +============== +Initialization +============== +Overview +======== +The :ref:`Experiment API` is responsible for initializing all workflow entities. +An ``Ensemble`` is created using the ``Experiment.create_ensemble`` factory method, and users can customize the +``Ensemble`` creation via the factory method parameters. + +The factory method arguments for ``Ensemble`` creation can be found in the :ref:`Experiment API` +under the ``create_ensemble`` docstring. + +By using specific combinations of the factory method arguments, users can tailor +the creation of an ``Ensemble`` to align with one of the following creation strategies: + +1. 
:ref:`Parameter Expansion`: Generate a variable-sized set of unique simulation instances + configured with user-defined input parameters. +2. :ref:`Replica Creation`: Generate a specified number of ``Model`` replicas. +3. :ref:`Manually`: Attach pre-configured ``Model``'s to an ``Ensemble`` to manage as a single unit. + +.. _param_expansion_init: + +Parameter Expansion +=================== +Parameter expansion is a technique that allows users to set parameter values per ``Ensemble`` member. +This is done by specifying input to the `params` and `perm_strategy` factory method arguments during +``Ensemble`` creation (``Experiment.create_ensemble``). Users may control how the `params` values +are applied to the ``Ensemble`` through the `perm_strategy` argument. The `perm_strategy` argument +accepts three values listed below. + +**Parameter Expansion Strategy Options:** + +- `"all_perm"`: Generate all possible parameter permutations for an exhaustive exploration. This + means that every possible combination of parameters will be used in the ``Ensemble``. +- `"step"`: Create parameter sets by collecting identically indexed values across parameter lists. + This allows for discrete combinations of parameters for ``Model``'s. +- `"random"`: Enable random selection from predefined parameter spaces, offering a stochastic approach. + This means that the parameters will be chosen randomly for each ``Model``, which can be useful + for exploring a wide range of possibilities. + +-------- +Examples +-------- +This subsection contains two examples of ``Ensemble`` parameter expansion. The +:ref:`first example` illustrates parameter expansion using two parameters +while the :ref:`second example` demonstrates parameter expansion with two +parameters along with the launch of the ``Ensemble`` as a batch workload. + +.. 
_param_first_ex: + +Example 1 : Parameter Expansion Using `all_perm` Strategy + + In this example an ``Ensemble`` of four ``Model`` entities is created by expanding two parameters + using the `all_perm` strategy. All of the ``Model``'s in the ``Ensemble`` share the same ``RunSettings`` + and only differ in the value of the `params` assigned to each member. The source code example + is available in the dropdown below for convenient execution and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py + + Begin by initializing a ``RunSettings`` object to apply to + all ``Ensemble`` members: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py + :language: python + :linenos: + :lines: 6-7 + + Next, define the parameters that will be applied to the ``Ensemble``: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py + :language: python + :linenos: + :lines: 9-13 + + Finally, initialize an ``Ensemble`` by specifying the ``RunSettings``, `params` and `perm_strategy="all_perm"`: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py + :language: python + :linenos: + :lines: 15-16 + + By specifying `perm_strategy="all_perm"`, all permutations of the `params` will + be calculated and distributed across ``Ensemble`` members. Here there are four permutations of the `params` values: + + .. code-block:: bash + + ensemble member 1: ["Ellie", 2] + ensemble member 2: ["Ellie", 11] + ensemble member 3: ["John", 2] + ensemble member 4: ["John", 11] + +.. _param_second_ex: + +Example 2 : Parameter Expansion Using `step` Strategy with the ``Ensemble`` Configured For Batch Launching + + In this example an ``Ensemble`` of two ``Model`` entities is created by expanding two parameters + using the `step` strategy. 
All of the ``Model``'s in the ``Ensemble`` share the same ``RunSettings`` + and only differ in the value of the `params` assigned to each member. Lastly, the ``Ensemble`` is + submitted as a batch workload. The source code example is available in the dropdown below for + convenient execution and customization. + + .. dropdown:: Example Driver Script source code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py + + Begin by initializing and configuring a ``BatchSettings`` object to + run the ``Ensemble`` instance: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py + :language: python + :linenos: + :lines: 6-8 + + The above ``BatchSettings`` object will instruct SmartSim to run the ``Ensemble`` on two + nodes with a timeout of `10 hours`. + + Next initialize a ``RunSettings`` object to apply to all ``Ensemble`` members: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py + :language: python + :linenos: + :lines: 10-12 + + Next, define the parameters to include in ``Ensemble``: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py + :language: python + :linenos: + :lines: 14-18 + + Finally, initialize an ``Ensemble`` by passing in the ``RunSettings``, `params` and `perm_strategy="step"`: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py + :language: python + :linenos: + :lines: 20-21 + + When specifying `perm_strategy="step"`, the `params` sets are created by collecting identically + indexed values across the `param` value lists. + + .. code-block:: bash + + ensemble member 1: ["Ellie", 2] + ensemble member 2: ["John", 11] + +.. _replicas_init: + +Replicas +======== +A replica strategy involves the creation of identical ``Model``'s within an ``Ensemble``. +This strategy is particularly useful for applications that have some inherent randomness. 
+Users may use the `replicas` factory method argument to create a specified number of identical +``Model`` members during ``Ensemble`` creation (``Experiment.create_ensemble``). + +-------- +Examples +-------- +This subsection contains two examples of using the replicas creation strategy. The +:ref:`first example` illustrates creating four ``Ensemble`` member clones +while the :ref:`second example` demonstrates creating four ``Ensemble`` +member clones along with the launch of the ``Ensemble`` as a batch workload. + +.. _replicas_first_ex: + +Example 1 : ``Ensemble`` creation with replicas strategy + + In this example an ``Ensemble`` of four identical ``Model`` members is created by + specifying the number of clones to create via the `replicas` argument. + All of the ``Model``'s in the ``Ensemble`` share the same ``RunSettings``. + The source code example is available in the dropdown below for convenient execution + and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_1.py + + To create an ``Ensemble`` of identical ``Model``'s, begin by initializing a ``RunSettings`` + object: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_1.py + :language: python + :linenos: + :lines: 6-7 + + Initialize the ``Ensemble`` by specifying the ``RunSettings`` object and number of clones to `replicas`: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_1.py + :language: python + :linenos: + :lines: 9-10 + + By passing in `replicas=4`, four identical ``Ensemble`` members will be initialized. + +.. _replicas_second_ex: + +Example 2 : ``Ensemble`` Creation with Replicas Strategy and ``Ensemble`` Batch Launching + + In this example an ``Ensemble`` of four ``Model`` entities is created by specifying + the number of clones to create via the `replicas` argument. 
All of the ``Model``'s in + the ``Ensemble`` share the same ``RunSettings`` and the ``Ensemble`` is + submitted as a batch workload. The source code example is available in the dropdown below for + convenient execution and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_2.py + + To launch the ``Ensemble`` of identical ``Model``'s as a batch job, begin by initializing a ``BatchSettings`` + object: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_2.py + :language: python + :linenos: + :lines: 6-9 + + The above ``BatchSettings`` object will instruct SmartSim to run the ``Ensemble`` on four + nodes with a timeout of `10 hours`. + + Next, create a ``RunSettings`` object to apply to all ``Model`` replicas: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_2.py + :language: python + :linenos: + :lines: 10-12 + + Initialize the ``Ensemble`` by specifying the ``RunSettings`` object, ``BatchSettings`` object + and number of clones to `replicas`: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/replicas_2.py + :language: python + :linenos: + :lines: 14-15 + + By passing in `replicas=4`, four identical ``Ensemble`` members will be initialized. + +.. _append_init: + +Manually Append +=============== +Manually appending ``Model``'s to an ``Ensemble`` offers an in-depth level of customization in ``Ensemble`` design. +This approach is favorable when users have distinct requirements for individual ``Model``'s, such as variations +in parameters, run settings, or different types of simulations. + +-------- +Examples +-------- +This subsection contains an example of creating an ``Ensemble`` by manually appending ``Model``'s. +The example illustrates attaching two SmartSim ``Model``'s to the ``Ensemble``. +The ``Ensemble`` is submitted as a batch workload. 
+ +Example 1 : Append ``Model``'s to an ``Ensemble`` and Launch as a Batch Job + + In this example, we append ``Model``'s to an ``Ensemble`` for batch job execution. To do + this, we first initialize an Ensemble with a ``BatchSettings`` object. Then, manually + create ``Model``'s and add each to the ``Ensemble`` using the ``Ensemble.add_model`` function. + The source code example is available in the dropdown below for convenient execution and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py + + To create an empty ``Ensemble`` to append ``Model``'s, initialize the ``Ensemble`` with + a batch settings object: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py + :language: python + :linenos: + :lines: 6-11 + + Next, create the ``Model``'s to append to the ``Ensemble``: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py + :language: python + :linenos: + :lines: 13-20 + + Finally, append the ``Model`` objects to the ``Ensemble``: + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py + :language: python + :linenos: + :lines: 22-25 + + The new ``Ensemble`` is comprised of two appended ``Model`` members. + +.. _attach_files_ensemble: + +===== +Files +===== +Overview +======== +``Ensemble`` members often depend on external files (e.g. training datasets, evaluation datasets, etc) +to operate as intended. Users can instruct SmartSim to copy, symlink, or manipulate external files +prior to an ``Ensemble`` launch via the ``Ensemble.attach_generator_files`` function. Attached files +will be applied to all ``Ensemble`` members. + +.. note:: + Multiple calls to ``Ensemble.attach_generator_files`` will overwrite previous file configurations + on the ``Ensemble``. 
+ +To attach a file to an ``Ensemble`` for use at runtime, provide one of the following arguments to the +``Ensemble.attach_generator_files`` function: + +* `to_copy` (t.Optional[t.List[str]] = None): Files that are copied into the path of the ``Ensemble`` members. +* `to_symlink` (t.Optional[t.List[str]] = None): Files that are symlinked into the path of the ``Ensemble`` members. + A symlink, or symbolic link, is a file that points to another file or directory, allowing you to access that file + as if it were located in the same directory as the symlink. + +To specify a template file in order to programmatically replace specified parameters during generation +of ``Ensemble`` member directories, pass the following value to the ``Ensemble.attach_generator_files`` function: + +* `to_configure` (t.Optional[t.List[str]] = None): This parameter is designed for text-based ``Ensemble`` + member input files. During directory generation for ``Ensemble`` members, the linked files are parsed and replaced with + the `params` values applied to each ``Ensemble`` member. To further explain, the ``Ensemble`` + creation strategy is considered when replacing the tagged parameters in the input files. + These tagged parameters are placeholders in the text that are replaced with the actual + parameter values during the directory generation process. The default tag is a semicolon + (e.g., THERMO = ;THERMO;). + +In the :ref:`Example` subsection, we provide an example using the value `to_configure` +within ``Ensemble.attach_generator_files``. + +.. seealso:: + To add a file to a single ``Model`` that will be appended to an ``Ensemble``, refer to the :ref:`Files` + section of the ``Model`` documentation. + +.. _files_example_doc_ensem: + +Example +======= +This example demonstrates how to attach a text file to an ``Ensemble`` for parameter replacement. 
+This is accomplished using the `params` function parameter in +the ``Experiment.create_ensemble`` factory function and the `to_configure` function parameter +in ``Ensemble.attach_generator_files``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + +In this example, we have a text file named `params_inputs.txt`. Within the text, is the parameter `THERMO` +that is required by each ``Ensemble`` member at runtime: + +.. code-block:: bash + + THERMO = ;THERMO; + +In order to have the tagged parameter `;THERMO;` replaced with a usable value at runtime, two steps are required: + +1. The `THERMO` variable must be included in ``Experiment.create_ensemble`` factory method as + part of the `params` parameter. +2. The file containing the tagged parameter `;THERMO;`, `params_inputs.txt`, must be attached to the ``Ensemble`` + via the ``Ensemble.attach_generator_files`` method as part of the `to_configure` parameter. + +To encapsulate our application within an ``Ensemble``, we must create an ``Experiment`` instance +to gain access to the ``Experiment`` factory method that creates the ``Ensemble``. +Begin by importing the ``Experiment`` module and initializing an ``Experiment``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 1-4 + +To create our ``Ensemble``, we are using the `replicas` initialization strategy. +Begin by creating a simple ``RunSettings`` object to specify the path to +the executable simulation as an executable: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 6-7 + +Next, initialize an ``Ensemble`` object with ``Experiment.create_ensemble`` +by passing in `ensemble_settings`, `params={"THERMO":1}` and `replicas=2`: + +.. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 9-10 + +We now have an ``Ensemble`` instance named `example_ensemble`. Attach the above text file +to the ``Ensemble`` for use at entity runtime. To do so, we use the +``Ensemble.attach_generator_files`` function and specify the `to_configure` +parameter with the path to the text file, `params_inputs.txt`: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 12-13 + +To create an isolated directory for the ``Ensemble`` member outputs and configuration files, invoke ``Experiment.generate`` via the +``Experiment`` instance `exp` with `example_ensemble` as an input parameter: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 15-16 + +After invoking ``Experiment.generate``, the attached generator files will be available for the +application when ``exp.start(example_ensemble)`` is called. + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/file_attach.py + :language: python + :linenos: + :lines: 18-19 + +The contents of `params_inputs.txt` after ``Ensemble`` completion are: + +.. code-block:: bash + + THERMO = 1 + +.. _ensemble_ml_model_script: + +===================== +ML Models and Scripts +===================== +Overview +======== +SmartSim users have the capability to load ML models and TorchScripts into an ``Orchestrator`` +within the ``Experiment`` script for use within ``Ensemble`` members. Functions +accessible through an ``Ensemble`` object support loading ML models (TensorFlow, TensorFlow-lite, +PyTorch, and ONNX) and TorchScripts into standalone or colocated ``Orchestrators`` before +application runtime. + +.. 
seealso:: + To add an ML model or TorchScript to a single ``Model`` that will be appended to an + ``Ensemble``, refer to the :ref:`ML Models and Scripts` + section of the ``Model`` documentation. + +Depending on the planned storage method of the **ML model**, there are **two** distinct +approaches to load it into the ``Orchestrator``: + +- :ref:`From Memory` +- :ref:`From File` + +.. warning:: + Uploading an ML model :ref:`from memory` is solely supported for + standalone ``Orchestrators``. To upload an ML model to a colocated ``Orchestrator``, users + must save the ML model to disk and upload :ref:`from file`. + +Depending on the planned storage method of the **TorchScript**, there are **three** distinct +approaches to load it into the ``Orchestrator``: + +- :ref:`From Memory` +- :ref:`From File` +- :ref:`From String` + +.. warning:: + Uploading a TorchScript :ref:`from memory` is solely supported for + standalone ``Orchestrators``. To upload a TorchScript to a colocated ``Orchestrator``, users + upload :ref:`from file` or :ref:`from string`. + +Once a ML model or TorchScript is loaded into the ``Orchestrator``, ``Ensemble`` members can +leverage ML capabilities by utilizing the SmartSim client (:ref:`SmartRedis`) +to execute the stored ML models or TorchScripts. + +.. _ai_model_ensemble_doc: + +AI Models +========= +When configuring an ``Ensemble``, users can instruct SmartSim to load +Machine Learning (ML) models dynamically to the ``Orchestrator`` (colocated or standalone). ML models added +are loaded into the ``Orchestrator`` prior to the execution of the ``Ensemble``. To load an ML model +to the ``Orchestrator``, SmartSim users can serialize and provide the ML model **in-memory** or specify the **file path** +via the ``Ensemble.add_ml_model`` function. The supported ML frameworks are TensorFlow, +TensorFlow-lite, PyTorch, and ONNX. + +Users must **serialize TensorFlow ML models** before sending to an ``Orchestrator`` from memory +or from file. 
To save a TensorFlow model to memory, SmartSim offers the ``serialize_model`` +function. This function returns the TF model as a byte string with the names of the +input and output layers, which will be required upon uploading. To save a TF model to disk, +SmartSim offers the ``freeze_model`` function which returns the path to the serialized +TF model file with the names of the input and output layers. Additional TF model serialization +information and examples can be found in the :ref:`ML Features` section of SmartSim. + +.. note:: + Uploading an ML model from memory is only supported for standalone ``Orchestrators``. + +When attaching an ML model using ``Ensemble.add_ml_model``, the +following arguments are offered to customize storage and execution: + +- `name` (str): name to reference the ML model in the ``Orchestrator``. +- `backend` (str): name of the backend (TORCH, TF, TFLITE, ONNX). +- `model` (t.Optional[str] = None): An ML model in memory (only supported for non-colocated ``Orchestrators``). +- `model_path` (t.Optional[str] = None): serialized ML model. +- `device` (t.Literal["CPU", "GPU"] = "CPU"): name of device for execution, defaults to “CPU”. +- `devices_per_node` (int = 1): The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. +- `first_device` (int = 0): The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. +- `batch_size` (int = 0): batch size for execution, defaults to 0. +- `min_batch_size` (int = 0): minimum batch size for ML model execution, defaults to 0. +- `min_batch_timeout` (int = 0): time to wait for minimum batch size, defaults to 0. +- `tag` (str = ""): additional tag for ML model information, defaults to “”. +- `inputs` (t.Optional[t.List[str]] = None): ML model inputs (TF only), defaults to None. 
+- `outputs` (t.Optional[t.List[str]] = None): ML model outputs (TF only), defaults to None. + +.. seealso:: + To add an ML model to a single ``Model`` that will be appended to an + ``Ensemble``, refer to the :ref:`AI Models` + section of the ``Model`` documentation. + +.. _in_mem_ML_model_ensemble_ex: + +------------------------------------- +Example: Attach an In-Memory ML Model +------------------------------------- +This example demonstrates how to attach an in-memory ML model to a SmartSim ``Ensemble`` +to load into an ``Orchestrator`` at ``Ensemble`` runtime. The source code example is +available in the dropdown below for convenient execution and customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_mem.py + +.. note:: + This example assumes: + + - an ``Orchestrator`` is launched prior to the ``Ensemble`` execution + - an initialized ``Ensemble`` named `ensemble_instance` exists within the ``Experiment`` workflow + - a Tensorflow-based ML model was serialized using ``serialize_model`` which returns the + ML model as a byte string with the names of the input and output layers + +**Attach the ML Model to a SmartSim Ensemble** + +In this example, we have a serialized Tensorflow-based ML model that was saved to a byte string stored under `model`. +Additionally, the ``serialize_model`` function returned the names of the input and output layers stored under +`inputs` and `outputs`. Assuming an initialized ``Ensemble`` named `ensemble_instance` exists, we add the byte string TensorFlow model using +``Ensemble.add_ml_model``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_mem.py + :language: python + :linenos: + :lines: 39-40 + +In the above ``ensemble_instance.add_ml_model`` code snippet, we offer the following arguments: + +- `name` ("cnn"): A name to reference the ML model in the ``Orchestrator``. 
+- `backend` ("TF"): Indicating that the ML model is a TensorFlow model. +- `model` (model): The in-memory representation of the TensorFlow model. +- `device` ("GPU"): Specifying the device for ML model execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. +- `inputs` (inputs): The name of the ML model input nodes (TensorFlow only). +- `outputs` (outputs): The name of the ML model output nodes (TensorFlow only). + +.. warning:: + Calling `exp.start(ensemble_instance)` prior to the launch of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent standalone ``Orchestrator``. + +When the ``Ensemble`` is started via ``Experiment.start``, the ML model will be loaded to the +launched standalone ``Orchestrator``. The ML model can then be executed on the ``Orchestrator`` via a SmartSim +client (:ref:`SmartRedis`) within the application code. + +.. _from_file_ML_model_ensemble_ex: + +------------------------------------- +Example: Attach an ML Model From File +------------------------------------- +This example demonstrates how to attach a ML model from file to a SmartSim ``Ensemble`` +to load into an ``Orchestrator`` at ``Ensemble`` runtime. The source code example is +available in the dropdown below for convenient execution and customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_file.py + +.. 
note:: + This example assumes: + + - a standalone ``Orchestrator`` is launched prior to ``Ensemble`` execution + - an initialized ``Ensemble`` named `ensemble_instance` exists within the ``Experiment`` workflow + - a Tensorflow-based ML model was serialized using ``freeze_model`` which returns the + path to the serialized model file and the names of the input and output layers + +**Attach the ML Model to a SmartSim Ensemble** + +In this example, we have a serialized Tensorflow-based ML model that was saved to disk and stored under `model`. +Additionally, the ``freeze_model`` function returned the names of the input and output layers stored under +`inputs` and `outputs`. Assuming an initialized ``Ensemble`` named `ensemble_instance` exists, we add a TensorFlow model using +the ``Ensemble.add_ml_model`` function and specify the ML model path to the parameter `model_path`: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_file.py + :language: python + :linenos: + :lines: 39-40 + +In the above ``ensemble_instance.add_ml_model`` code snippet, we offer the following arguments: + +- `name` ("cnn"): A name to reference the ML model in the ``Orchestrator``. +- `backend` ("TF"): Indicating that the ML model is a TensorFlow model. +- `model_path` (model_file): The path to the serialized ML model file. +- `device` ("GPU"): Specifying the device for ML model execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. +- `inputs` (inputs): The name of the ML model input nodes (TensorFlow only). +- `outputs` (outputs): The name of the ML model output nodes (TensorFlow only). + +.. warning:: + Calling `exp.start(ensemble_instance)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent ``Orchestrator``. + +When the ``Ensemble`` is started via ``Experiment.start``, the ML model will be loaded to the +launched ``Orchestrator``. 
The ML model can then be executed on the ``Orchestrator`` via a SmartSim +client (:ref:`SmartRedis`) within the application executable. + +.. _TS_ensemble_doc: + +TorchScripts +============ +When configuring an ``Ensemble``, users can instruct SmartSim to load TorchScripts dynamically +to the ``Orchestrator``. The TorchScripts become available for each ``Ensemble`` member upon being loaded +into the ``Orchestrator`` prior to the execution of the ``Ensemble``. SmartSim users may upload +a single TorchScript function via ``Ensemble.add_function`` or alternatively upload a script +containing multiple functions via ``Ensemble.add_script``. To load a TorchScript to the +``Orchestrator``, SmartSim users can follow one of the following processes: + +- :ref:`Define a TorchScript Function In-Memory` + Use the ``Ensemble.add_function`` to instruct SmartSim to load an in-memory TorchScript to the ``Orchestrator``. +- :ref:`Define Multiple TorchScript Functions From File` + Provide file path to ``Ensemble.add_script`` to instruct SmartSim to load the TorchScript from file to the ``Orchestrator``. +- :ref:`Define a TorchScript Function as String` + Provide function string to ``Ensemble.add_script`` to instruct SmartSim to load a raw string as a TorchScript function to the ``Orchestrator``. + +.. note:: + Uploading a TorchScript :ref:`from memory` using ``Ensemble.add_function`` + is only supported for standalone ``Orchestrators``. Users uploading + TorchScripts to colocated ``Orchestrators`` should instead use the function ``Ensemble.add_script`` + to upload :ref:`from file` or as a :ref:`string`. + +Each function also provides flexible device selection, allowing users to choose between which device the TorchScript is executed on, `"GPU"` or `"CPU"`. +In environments with multiple devices, specific device numbers can be specified using the +`devices_per_node` parameter. + +.. 
note:: + If `device=GPU` is specified when attaching a TorchScript function to an ``Ensemble``, this instructs + SmartSim to execute the TorchScript on GPU nodes. However, TorchScripts loaded to an ``Orchestrator`` are + executed on the ``Orchestrator`` compute resources. Therefore, users must make sure that the device + specified is included in the ``Orchestrator`` compute resources. To further explain, if a user + specifies `device=GPU`, however, initializes ``Orchestrator`` on only CPU nodes, + the TorchScript will not run on GPU nodes as advised. + +Continue or select the respective process link to learn more on how each function (``Ensemble.add_script`` and ``Ensemble.add_function``) +dynamically loads TorchScripts to the ``Orchestrator``. + +.. seealso:: + To add a TorchScript to a single ``Model`` that will be appended to an + ``Ensemble``, refer to the :ref:`TorchScripts` + section of the ``Model`` documentation. + +.. _in_mem_TF_ensemble_doc: + +------------------------------- +Attach an In-Memory TorchScript +------------------------------- +Users can define TorchScript functions within the ``Experiment`` driver script +to attach to an ``Ensemble``. This feature is supported by ``Ensemble.add_function``. + +.. warning:: + ``Ensemble.add_function`` does **not** support loading in-memory TorchScript functions to a colocated ``Orchestrator``. + If you would like to load a TorchScript function to a colocated ``Orchestrator``, define the function + as a :ref:`raw string` or :ref:`load from file`. + +When specifying an in-memory TF function using ``Ensemble.add_function``, the +following arguments are offered: + +- `name` (str): reference name for the script inside of the ``Orchestrator``. +- `function` (t.Optional[str] = None): TorchScript function code. +- `device` (t.Literal["CPU", "GPU"] = "CPU"): device for script execution, defaults to “CPU”. +- `devices_per_node` (int = 1): The number of GPU devices available on the host. 
This parameter only applies to GPU devices and will be ignored if device is specified as CPU. +- `first_device` (int = 0): The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. + +.. _in_mem_TF_ex: + +Example: Load an In-Memory TorchScript Function +----------------------------------------------- +This example walks through the steps of instructing SmartSim to load an in-memory TorchScript function +to a standalone ``Orchestrator``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py + +.. note:: + The example assumes: + + - a standalone ``Orchestrator`` is launched prior to ``Ensemble`` execution + - an initialized ``Ensemble`` named `ensemble_instance` exists within the ``Experiment`` workflow + +**Define an In-Memory TF Function** + +To begin, define an in-memory TorchScript function within the Python driver script. +For the purpose of the example, we add a simple TorchScript function, `timestwo`: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py + :language: python + :linenos: + :lines: 3-4 + +**Attach the In-Memory TorchScript Function to a SmartSim Ensemble** + +We use the ``Ensemble.add_function`` function to instruct SmartSim to load the TorchScript function `timestwo` +onto the launched standalone ``Orchestrator``. Specify the function `timestwo` to the `function` +parameter: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py + :language: python + :linenos: + :lines: 15-16 + +In the above ``ensemble_instance.add_function`` code snippet, we offer the following arguments: + +- `name` ("example_func"): A name to uniquely identify the TorchScript within the ``Orchestrator``. 
+- `function` (timestwo): Name of the TorchScript function defined in the Python driver script. +- `device` ("GPU"): Specifying the device for TorchScript execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. + +.. warning:: + Calling `exp.start(ensemble_instance)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the TorchScript to a non-existent ``Orchestrator``. + +When the ``Ensemble`` is started via ``Experiment.start``, the TF function will be loaded to the +standalone ``Orchestrator``. The function can then be executed on the ``Orchestrator`` via a SmartSim +client (:ref:`SmartRedis`) within the application code. + +.. _TS_from_file_ensemble: + +------------------------------ +Attach a TorchScript From File +------------------------------ +Users can attach TorchScript functions from a file to an ``Ensemble`` and upload them to a +colocated or standalone ``Orchestrator``. This functionality is supported by the ``Ensemble.add_script`` +function's `script_path` parameter. + +When specifying a TorchScript using ``Ensemble.add_script``, the +following arguments are offered: + +- `name` (str): Reference name for the script inside of the ``Orchestrator``. +- `script` (t.Optional[str] = None): TorchScript code (only supported for non-colocated ``Orchestrators``). +- `script_path` (t.Optional[str] = None): path to TorchScript code. +- `device` (t.Literal["CPU", "GPU"] = "CPU"): device for script execution, defaults to “CPU”. +- `devices_per_node` (int = 1): The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. +- `first_device` (int = 0): The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. 
+ +Example: Loading a TorchScript From File +---------------------------------------- +This example walks through the steps of instructing SmartSim to load a TorchScript from file +to an ``Orchestrator``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_file.py + +.. note:: + This example assumes: + + - an ``Orchestrator`` is launched prior to ``Ensemble`` execution + - an initialized ``Ensemble`` named `ensemble_instance` exists within the ``Experiment`` workflow + +**Define a TorchScript Script** + +For the example, we create the Python script `torchscript.py`. The file contains multiple +simple torch functions, shown below: + +.. code-block:: python + + def negate(x): + return torch.neg(x) + + def random(x, y): + return torch.randn(x, y) + + def pos(z): + return torch.positive(z) + +**Attach the TorchScript Script to a SmartSim Ensemble** + +Assuming an initialized ``Ensemble`` named `ensemble_instance` exists, we add a TorchScript script using +the ``Ensemble.add_script`` function and specify the script path to the parameter `script_path`: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_file.py + :language: python + :linenos: + :lines: 12-13 + +In the above ``ensemble_instance.add_script`` code snippet, we offer the following arguments: + +- `name` ("example_script"): Reference name for the script inside of the ``Orchestrator``. +- `script_path` ("path/to/torchscript.py"): Path to the script file. +- `device` ("GPU"): device for script execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. + +.. warning:: + Calling `exp.start(ensemble_instance)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the TorchScript to a non-existent ``Orchestrator``. 
+ +When `ensemble_instance` is started via ``Experiment.start``, the TorchScript will be loaded from file to the +``Orchestrator`` that is launched prior to the start of `ensemble_instance`. + +.. _TS_raw_string_ensemble: + +--------------------------------- +Define TorchScripts as Raw String +--------------------------------- +Users can upload TorchScript functions from string to send to a colocated or +standalone ``Orchestrator``. This feature is supported by the +``Ensemble.add_script`` function's `script` parameter. + +When specifying a TorchScript using ``Ensemble.add_script``, the +following arguments are offered: + +- `name` (str): Reference name for the script inside of the ``Orchestrator``. +- `script` (t.Optional[str] = None): String of function code (e.g. TorchScript code string). +- `script_path` (t.Optional[str] = None): path to TorchScript code. +- `device` (t.Literal["CPU", "GPU"] = "CPU"): device for script execution, defaults to “CPU”. +- `devices_per_node` (int = 1): The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. +- `first_device` (int = 0): The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. + +Example: Load a TorchScript From String +--------------------------------------- +This example walks through the steps of instructing SmartSim to load a TorchScript function +from string to an ``Orchestrator`` before the execution of the associated ``Ensemble``. +The source code example is available in the dropdown below for convenient execution and customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py + +.. 
note:: + This example assumes: + + - an ``Orchestrator`` is launched prior to ``Ensemble`` execution + - an initialized ``Ensemble`` named `ensemble_instance` exists within the ``Experiment`` workflow + +**Define a String TorchScript** + +Define the TorchScript code as a variable in the Python driver script: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py + :language: python + :linenos: + :lines: 12-13 + +**Attach the TorchScript Function to a SmartSim Ensemble** + +Assuming an initialized ``Ensemble`` named `ensemble_instance` exists, we add a TorchScript using +the ``Ensemble.add_script`` function and specify the variable `torch_script_str` to the parameter +`script`: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py + :language: python + :linenos: + :lines: 15-16 + +In the above ``ensemble_instance.add_script`` code snippet, we offer the following arguments: + +- `name` ("example_script"): key to store script under. +- `script` (torch_script_str): TorchScript code. +- `device` ("GPU"): device for script execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. + +.. warning:: + Calling `exp.start(ensemble_instance)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent ``Orchestrator``. + +When the ``Ensemble`` is started via ``Experiment.start``, the TorchScript will be loaded to the +``Orchestrator`` that is launched prior to the start of the ``Ensemble``. + +.. _prefix_ensemble: + +========================= +Data Collision Prevention +========================= +Overview +======== +When multiple ``Ensemble`` members use the same code to send and access their respective data +in the ``Orchestrator``, key overlapping can occur, leading to inadvertent data access +between ``Ensemble`` members. 
To address this, SmartSim supports key prefixing +through ``Ensemble.enable_key_prefixing`` which enables key prefixing for all +``Ensemble`` members. For example, during an ``Ensemble`` simulation with prefixing enabled, SmartSim will add +the ``Ensemble`` member `name` as a prefix to the keys sent to the ``Orchestrator``. +Enabling key prefixing eliminates issues related to key overlapping, allowing ``Ensemble`` +members to use the same code without issue. + +The key components of SmartSim ``Ensemble`` prefixing functionality include: + +1. **Sending Data to the Orchestrator**: Users can send data to an ``Orchestrator`` + with the ``Ensemble`` member name prepended to the data name by utilizing SmartSim :ref:`Ensemble functions`. +2. **Retrieving Data From the Orchestrator**: Users can instruct a ``Client`` to prepend an + ``Ensemble`` member name to a key during data retrieval, polling, or check for existence on the ``Orchestrator`` + through SmartRedis :ref:`Client functions`. However, entity interaction + must be registered using :ref:`Ensemble` or :ref:`Model` functions. + +.. seealso:: + For information on prefixing ``Client`` functions, visit the :ref:`Client functions` page of the ``Model`` + documentation. + +For example, assume you have an ``Ensemble`` that was initialized using the :ref:`replicas` creation strategy. +Two identical ``Model`` objects were created named `ensemble_0` and `ensemble_1` that use the same executable application +within an ``Ensemble`` named `ensemble`. In the application code you use the function ``Client.put_tensor("tensor_0", data)``. +Without key prefixing enabled, the slower member will overwrite the data from the faster simulation. +With ``Ensemble`` key prefixing turned on, `ensemble_0` and `ensemble_1` can access +their tensor `"tensor_0"` by name without overwriting or accessing the other ``Model``'s `"tensor_0"` tensor. 
+In this scenario, the two tensors placed in the ``Orchestrator`` are named `ensemble_0.tensor_0` and `ensemble_1.tensor_0`. + +.. _model_prefix_func_ensemble: + +------------------ +Ensemble Functions +------------------ +An ``Ensemble`` object supports two prefixing functions: ``Ensemble.enable_key_prefixing`` and +``Ensemble.register_incoming_entity``. For more information on each function, reference the +:ref:`Ensemble API docs`. + +To enable prefixing on an ``Ensemble``, users must use the ``Ensemble.enable_key_prefixing`` +function in the ``Experiment`` driver script. This function activates prefixing for tensors, +``Datasets``, and lists sent to an ``Orchestrator`` for all ``Ensemble`` members. This function +also enables access to prefixing ``Client`` functions within the ``Ensemble`` members. This excludes +the ``Client.set_data_source`` function, where ``enable_key_prefixing`` is not required for access. + +.. note:: + ML model and script prefixing is not automatically enabled through ``Ensemble.enable_key_prefixing``. + Prefixing must be enabled within the ``Ensemble`` by calling the ``use_model_ensemble_prefix`` method + on the ``Client`` embedded within the member application. + +Users can enable the SmartRedis ``Client`` to interact with prefixed data, ML models and TorchScripts +using the ``Client.set_data_source``. However, for SmartSim to recognize the producer entity name +passed to the function within an application, the producer entity must be registered on the consumer +entity using ``Ensemble.register_incoming_entity``. + +If a consumer ``Ensemble`` member requests data sent to the ``Orchestrator`` by other ``Ensemble`` members, the producer members must be +registered on the consumer member. To access ``Ensemble`` members, SmartSim offers the attribute ``Ensemble.models`` that returns +a list of ``Ensemble`` members. Below we demonstrate registering producer members on a consumer member: + +.. 
code-block:: python + + # list of producer Ensemble members + list_of_ensemble_names = ["producer_0", "producer_1", "producer_2"] + + # Map each Ensemble member name to the member object + # (``Ensemble.models`` returns a list of members) + members_by_name = {member.name: member for member in ensemble.models} + + # Grab the consumer Ensemble member + ensemble_member = members_by_name["producer_3"] + # Register the producer members on the consumer member + for name in list_of_ensemble_names: + ensemble_member.register_incoming_entity(members_by_name[name]) + +For examples demonstrating how to retrieve data within the entity application that produced +the data, visit the ``Model`` :ref:`Copy/Rename/Delete Operations` subsection. + +Example: Ensemble Key Prefixing +=============================== +In this example, we create an ``Ensemble`` comprised of two ``Model`` objects that use identical code +to send data to a standalone ``Orchestrator``. To prevent key collisions and ensure data +integrity, we enable key prefixing on the ``Ensemble`` which automatically +appends the ``Ensemble`` member `name` to the data sent to the ``Orchestrator``. After the +``Ensemble`` completes, we launch a consumer ``Model`` within the ``Experiment`` driver script +to demonstrate accessing prefixed data sent to the ``Orchestrator`` by ``Ensemble`` members. + +This example consists of **three** Python scripts: + +1. :ref:`Application Producer Script`: This script is encapsulated + in a SmartSim ``Ensemble`` within the ``Experiment`` driver script. Prefixing is enabled + on the ``Ensemble``. The producer script puts NumPy tensors on an ``Orchestrator`` + launched in the ``Experiment`` driver script. The ``Ensemble`` creates two + identical ``Ensemble`` members. The producer script is executed + in both ``Ensemble`` members to send two prefixed tensors to the ``Orchestrator``. + The source code example is available in the dropdown below for convenient customization. + +.. dropdown:: Application Producer Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py + +2. 
:ref:`Application Consumer Script`: This script is encapsulated + within a SmartSim ``Model`` in the ``Experiment`` driver script. The script requests the + prefixed tensors placed by the producer script. The source code example is available in + the dropdown below for convenient customization. + +.. dropdown:: Application Consumer Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py + +3. :ref:`Experiment Driver Script`: The driver script launches the + ``Orchestrator``, the ``Ensemble`` (which sends prefixed keys to the ``Orchestrator``), + and the ``Model`` (which requests prefixed keys from the ``Orchestrator``). The + ``Experiment`` driver script is the centralized spot that controls the workflow. + The source code example is available in the dropdown below for convenient execution and + customization. + +.. dropdown:: Experiment Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + +.. _app_prod_prefix_ensemble: + +------------------------------- +The Application Producer Script +------------------------------- +In the ``Experiment`` driver script, we instruct SmartSim to create an ``Ensemble`` comprised of +two duplicate members that execute this producer script. In the producer script, a SmartRedis ``Client`` sends a +tensor to the ``Orchestrator``. Since the ``Ensemble`` members are identical and therefore use the same +application code, two tensors are sent to the ``Orchestrator``. Without prefixing enabled on the ``Ensemble``, +the keys can be overwritten. To prevent this, we enable key prefixing on the ``Ensemble`` in the driver script +via ``Ensemble.enable_key_prefixing``. When the producer script is executed by each ``Ensemble`` member, a +tensor is sent to the ``Orchestrator`` with the ``Ensemble`` member `name` prepended to the tensor `name`. 
+ +Here we provide the producer script that is applied to the ``Ensemble`` members: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py + :language: python + :linenos: + +After the completion of ``Ensemble`` members `producer_0` and `producer_1`, the contents of the ``Orchestrator`` are: + +.. code-block:: bash + + 1) "producer_0.tensor" + 2) "producer_1.tensor" + +.. _app_con_prefix_ensemble: + +------------------------------- +The Application Consumer Script +------------------------------- +In the ``Experiment`` driver script, we initialize a consumer ``Model`` that encapsulates +the consumer application to request the tensors produced from the ``Ensemble``. To do +so, we use SmartRedis key prefixing functionality to instruct the SmartRedis ``Client`` +to append the name of an ``Ensemble`` member to the key `name`. + +.. seealso:: + For more information on ``Client`` prefixing functions, visit the :ref:`Client functions` + subsection of the ``Model`` documentation. + +To begin, specify the imports and initialize a SmartRedis ``Client``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py + :language: python + :linenos: + :lines: 1-4 + +To retrieve the tensor from the first ``Ensemble`` member named `producer_0`, use +``Client.set_data_source``. Specify the name of the first ``Ensemble`` member +as an argument to the function. This instructs SmartSim to append the ``Ensemble`` member name to the data +search on the ``Orchestrator``. When ``Client.poll_tensor`` is executed, +the SmartRedis `client` will poll for key, `producer_0.tensor`: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py + :language: python + :linenos: + :lines: 6-9 + +Follow the same steps above, however, change the data source `name` to the `name` +of the second ``Ensemble`` member (`producer_1`): + +.. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py + :language: python + :linenos: + :lines: 11-14 + +We print the boolean return to verify that the tensors were found: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py + :language: python + :linenos: + :lines: 16-17 + +When the ``Experiment`` driver script is executed, the following output will appear in `consumer.out`: + +.. code-block:: bash + + Default@11-46-05:producer_0.tensor was found: True + Default@11-46-05:producer_1.tensor was found: True + +.. warning:: + For SmartSim to recognize the ``Ensemble`` member names as a valid data source + to ``Client.set_data_source``, you must register each ``Ensemble`` member + on the consumer ``Model`` in the driver script via ``Model.register_incoming_entity``. + We demonstrate this in the ``Experiment`` driver script section of the example. + +.. _exp_prefix_ensemble: + +--------------------- +The Experiment Script +--------------------- +The ``Experiment`` driver script manages all workflow components and utilizes the producer and consumer +application scripts. In the example, the ``Experiment``: + +- launches standalone ``Orchestrator`` +- launches an ``Ensemble`` via the replicas initialization strategy +- launches a consumer ``Model`` +- clobbers the ``Orchestrator`` + +To begin, add the necessary imports, initialize an ``Experiment`` instance and initialize the +standalone ``Orchestrator``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 1-9 + +We are now setup to discuss key prefixing within the ``Experiment`` driver script. +To create an ``Ensemble`` using the replicas strategy, begin by initializing a ``RunSettings`` +object to apply to all ``Ensemble`` members. Specify the path to the application +producer script: + +.. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 11-12 + +Next, initialize an ``Ensemble`` by specifying `ensemble_settings` and the number of ``Model`` `replicas` to create: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 14-15 + +Instruct SmartSim to prefix all tensors sent to the ``Orchestrator`` from the ``Ensemble`` via ``Ensemble.enable_key_prefixing``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 17-18 + +Next, initialize the consumer ``Model``. The consumer ``Model`` application requests +the prefixed tensors produced by the ``Ensemble``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 20-23 + +Next, organize the SmartSim entity output files into a single ``Experiment`` folder: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 25-26 + +Launch the ``Orchestrator``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 28-29 + +Launch the ``Ensemble``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 31-32 + +Set `block=True` so that ``Experiment.start`` waits until the last ``Ensemble`` member has finished before continuing. + +The consumer ``Model`` application script uses ``Client.set_data_source`` which +accepts the ``Ensemble`` member names when searching for prefixed +keys in the ``Orchestrator``. In order for SmartSim to recognize the ``Ensemble`` +member names as a valid data source in the consumer ``Model``, we must register +the entity interaction: + +.. 
literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 34-36 + +Launch the consumer ``Model``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 38-39 + +To finish, tear down the standalone ``Orchestrator``: + +.. literalinclude:: tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py + :language: python + :linenos: + :lines: 41-42 \ No newline at end of file diff --git a/doc/experiment.rst b/doc/experiment.rst index 986db4cad..716df1228 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -1,326 +1,534 @@ - *********** Experiments *********** +======== +Overview +======== +SmartSim helps automate the deployment of AI-enabled workflows on HPC systems. With SmartSim, users +can describe and launch combinations of applications and AI/ML infrastructure to produce novel and +scalable workflows. SmartSim supports launching these workflows on a diverse set of systems, including +local environments such as Mac or Linux, as well as HPC job schedulers (e.g. Slurm, PBS Pro, and LSF). -The Experiment acts as both a factory class for constructing the stages of an -experiment (``Model``, ``Ensemble``, ``Orchestrator``, etc.) as well as an -interface to interact with the entities created by the experiment. - -Users can initialize an :ref:`Experiment ` at the beginning of a -Jupyter notebook, interactive python session, or Python file and use the -``Experiment`` to iteratively create, configure and launch computational kernels -on the system through the specified launcher. - -.. |SmartSim Architecture| image:: images/ss-arch-overview.png - :width: 700 - :alt: Alternative text - -|SmartSim Architecture| - +The ``Experiment`` API is SmartSim's top level API that provides users with methods for creating, combining, +configuring, launching and monitoring :ref:`entities` in an AI-enabled workflow. 
More specifically, the +``Experiment`` API offers three customizable workflow components that are created and initialized via factory +methods: -The interface was designed to be simple, with as little complexity as possible, -and agnostic to the backend launching mechanism (local, Slurm, PBSPro, etc.). +* :ref:`Orchestrator` +* :ref:`Model` +* :ref:`Ensemble` -Model -===== +Settings are given to ``Model`` and ``Ensemble`` objects to provide parameters for how the job should be executed. The +:ref:`Experiment API` offers two customizable Settings objects that are created via the factory methods: -``Model(s)`` are subclasses of ``SmartSimEntity(s)`` and are created through the -Experiment API. Models represent any computational kernel. Models are flexible -enough to support many different applications, however, to be used with our -clients (SmartRedis) the application will have to be written in Python, C, C++, -or Fortran. +* :ref:`RunSettings` +* :ref:`BatchSettings` -Models are given :ref:`RunSettings ` objects that specify how a kernel -should be executed with regard to the workload manager (e.g. Slurm) and the -available compute resources on the system. +Once a workflow component is initialized (e.g. ``Orchestrator``, ``Model`` or ``Ensemble``), a user has access +to the associated entity API which supports configuring and retrieving the entities' information: -Each launcher supports specific types of ``RunSettings``. +* :ref:`Orchestrator API` +* :ref:`Model API` +* :ref:`Ensemble API` - - :ref:`SrunSettings ` for Slurm - - :ref:`AprunSettings ` for PBSPro - - :ref:`MpirunSettings ` for OpenMPI with `mpirun` on PBSPro, LSF, and Slurm - - :ref:`JsrunSettings ` for LSF +There is no limit to the number of SmartSim entities a user can +initialize within an ``Experiment``. -These settings can be manually specified by the user, or auto-detected by the -SmartSim Experiment through the ``Experiment.create_run_settings`` method. +.. 
figure:: images/Experiment.png -A simple example of using the Experiment API to create a model and run it -locally: + Sample ``Experiment`` showing a user application leveraging + machine learning infrastructure launched by SmartSim and connected + to online analysis and visualization via the in-memory ``Orchestrator``. -.. code-block:: Python +Find an example of the ``Experiment`` class and factory methods used within a +workflow in the :ref:`Example` section of this page. - from smartsim import Experiment +.. _launcher_exp_docs: - exp = Experiment("simple", launcher="local") +========= +Launchers +========= +SmartSim supports launching AI-enabled workflows on a wide variety of systems, including locally on a Mac or +Linux machine or on HPC machines with a job scheduler (e.g. Slurm, PBS Pro, and LSF). When creating a SmartSim +``Experiment``, the user has the opportunity to specify the `launcher` type or defer to automatic `launcher` selection. +`Launcher` selection determines how SmartSim translates entity configurations into system calls to launch, +manage, and monitor. Currently, SmartSim supports 7 `launcher` options: - settings = exp.create_run_settings("echo", exe_args="Hello World") - model = exp.create_model("hello_world", settings) +1. ``local`` **[default]**: for single-node, workstation, or laptop +2. ``slurm``: for systems using the Slurm scheduler +3. ``pbs``: for systems using the PBS Pro scheduler +4. ``pals``: for systems using the PALS scheduler +5. ``lsf``: for systems using the LSF scheduler +6. ``dragon``: if Dragon is installed in the current Python environment, see :ref:`Dragon Install ` +7. ``auto``: have SmartSim auto-detect the launcher to use (will not detect ``dragon``) - exp.start(model, block=True) - print(exp.get_status(model)) +The :ref:`Dragon-based launcher ` can be run on PBS- or Slurm-based systems +(MPI applications are supported only when Cray PMI or Cray PALS are available). 
-If the launcher has been specified, or auto-detected through setting -``launcher=auto`` in the Experiment initialization, the ``create_run_settings`` -method will automatically create the appropriate ``RunSettings`` object and -return it. +If the system's `launcher` cannot be found or no `launcher` argument is provided, the default value of +`"local"` will be assigned which will start all ``Experiment`` launched entities on the +localhost. -For example with Slurm +For examples specifying a `launcher` during ``Experiment`` initialization, navigate to the +``Experiment`` :ref:`__init__ special method` in the ``Experiment`` API docstring. -.. code-block:: Python +.. _entities_exp_docs: - from smartsim import Experiment - - exp = Experiment("hello_world_exp", launcher="slurm") - srun = exp.create_run_settings(exe="echo", exe_args="Hello World!") - - # helper methods for configuring run settings are available in - # each of the implementations of RunSettings - srun.set_nodes(1) - srun.set_tasks(32) +======== +Entities +======== +Entities are SmartSim API objects that can be launched and +managed on the compute system through the ``Experiment`` API. +The SmartSim entities include: + +* ``Orchestrator`` +* ``Model`` +* ``Ensemble`` + +While the ``Experiment`` object is intended to be instantiated once in the +Python driver script, there is no limit to the number of SmartSim entities +within the ``Experiment``. In the following subsections, we define the +general purpose of the three entities that can be created through the +``Experiment``. + +To create a reference to a newly instantiated entity object, use the +associated ``Experiment.create_...`` factory method shown below. + +.. 
 list-table:: Experiment API Entity Creation + :widths: 20 65 25 + :header-rows: 1 + + * - Factory Method + - Example + - Return Type + * - ``create_database`` + - ``orch = exp.create_database([port, db_nodes, ...])`` + - :ref:`Orchestrator ` + * - ``create_model`` + - ``model = exp.create_model(name, run_settings)`` + - :ref:`Model ` + * - ``create_ensemble`` + - ``ensemble = exp.create_ensemble(name[, params, ...])`` + - :ref:`Ensemble ` + +After initialization, each entity can be started, monitored, and stopped using +the ``Experiment`` post-creation methods. + +.. list-table:: Interact with Entities During the Experiment + :widths: 25 55 25 + :header-rows: 1 + + * - Method + - Example + - Description + * - ``start`` + - ``exp.start(*args[, block, summary, ...])`` + - Launch an Entity + * - ``stop`` + - ``exp.stop(*args)`` + - Stop an Entity + * - ``get_status`` + - ``exp.get_status(*args)`` + - Retrieve Entity Status + * - ``preview`` + - ``exp.preview(*args, ...)`` + - Preview an Entity + +.. _orchestrator_exp_docs: + +Orchestrator +============ +The :ref:`Orchestrator` is an in-memory database built for +a wide variety of AI-enabled workflows. The ``Orchestrator`` can be thought of as a general +feature store for numerical data, ML models, and scripts. The ``Orchestrator`` is capable +of performing inference and script evaluation using data in the feature store. +Any SmartSim ``Model`` or ``Ensemble`` member can connect to the +``Orchestrator`` via the :ref:`SmartRedis` +``Client`` library to transmit data, execute ML models, and execute scripts. + +**SmartSim Offers Two Types of Orchestrator Deployments:** + +* :ref:`Standalone Orchestrator Deployment` +* :ref:`Colocated Orchestrator Deployment` + +To create a standalone ``Orchestrator`` that does not share compute resources with other +SmartSim entities, use the ``Experiment.create_database`` factory method which +returns an ``Orchestrator`` object. 
To create a colocated ``Orchestrator`` that +shares compute resources with a ``Model``, use the ``Model.colocate_db_tcp`` +or ``Model.colocate_db_uds`` member functions accessible after a +``Model`` object has been initialized. The functions instruct +SmartSim to launch an ``Orchestrator`` on the application compute nodes. An ``Orchestrator`` object is not +returned from a ``Model.colocate_db`` instruction, and subsequent interactions with the +colocated ``Orchestrator`` are handled through the :ref:`Model API`. + +SmartSim supports :ref:`multi-database` functionality, enabling an ``Experiment`` to have +several concurrently launched ``Orchestrator(s)``. If there is a need to launch more than +one ``Orchestrator``, the ``Experiment.create_database`` and ``Model.colocate..`` +functions mandate the specification of a unique ``Orchestrator`` identifier, denoted +by the `db_identifier` argument for each ``Orchestrator``. The `db_identifier` is used +in an application script by a SmartRedis ``Client`` to connect to a specific ``Orchestrator``. + +.. _model_exp_docs: - model = exp.create_model("hello_world", srun) - exp.start(model, block=True, summary=True) +Model +===== +:ref:`Model(s)` represent a simulation model or any computational kernel, +including applications, scripts, or generally, a program. They can +interact with other SmartSim entities via data transmitted to/from +SmartSim ``Orchestrator(s)`` using a SmartRedis ``Client``. - print(exp.get_status(model)) +A ``Model`` is created through the factory method: ``Experiment.create_model``. +``Model(s)`` are initialized with ``RunSettings`` objects that specify +how a ``Model`` should be launched by a workload manager +(e.g., Slurm) and the compute resources required. +Optionally, the user may also specify a ``BatchSettings`` object if +the ``Model`` should be launched as a batch job on the WLM system. 
+The ``create_model`` factory method returns an initialized ``Model`` object that +gives you access to functions associated with the :ref:`Model API`. -The above will run ``srun -n 32 -N 1 echo Hello World!``, monitor its -execution, and inform the user when it is completed. This driver script can be -executed in an interactive allocation, or placed into a batch script as follows: +A ``Model`` supports key features, including methods to: -.. code-block:: bash +- :ref:`Attach configuration files` for use at ``Model`` runtime. +- :ref:`Colocate an Orchestrator` to a SmartSim ``Model``. +- :ref:`Load an ML model` into the ``Orchestrator`` at ``Model`` runtime. +- :ref:`Load a TorchScript function` into the ``Orchestrator`` at ``Model`` runtime. +- :ref:`Enable data collision prevention` which allows + for reuse of key names in different ``Model`` applications. - #!/bin/bash - #SBATCH --exclusive - #SBATCH --nodes=1 - #SBATCH --ntasks-per-node=32 - #SBATCH --time=00:10:00 +Visit the respective links for more information on each topic. - python /path/to/script.py +.. _ensemble_exp_docs: Ensemble ======== - -In addition to a single model, SmartSim has the ability to launch an -``Ensemble`` of ``Model`` applications simultaneously. - -An ``Ensemble`` can be constructed in three ways: - 1. Parameter expansion (by specifying ``params`` and ``perm_strat`` argument) - 2. Replica creation (by specifying ``replicas`` argument) - 3. Manually (by adding created ``Model`` objects) if launching as a batch job - -Ensembles can be given parameters and permutation strategies that define how the -``Ensemble`` will create the underlying model objects. - -Three strategies are built in: - 1. ``all_perm``: for generating all permutations of model parameters - 2. ``step``: for creating one set of parameters for each element in `n` arrays - 3. 
``random``: for random selection from predefined parameter spaces - -Here is an example that uses the ``random`` strategy to intialize four models -with random parameters within a set range. We use the ``params_as_args`` field -to specify that the randomly selected learning rate parameter should be passed -to the created models as a executable argument. - -.. code-block:: bash - - import numpy as np - from smartsim import Experiment - - exp = Experiment("Training-Run", launcher="auto") - - # setup ensemble parameter space - learning_rate = list(np.linspace(.01, .5)) - train_params = {"LR": learning_rate} - - # define how each member should run - run = exp.create_run_settings(exe="python", - exe_args="./train-model.py") - - ensemble = exp.create_ensemble("Training-Ensemble", - params=train_params, - params_as_args=["LR"], - run_settings=run, - perm_strategy="random", - n_models=4) - exp.start(ensemble, summary=True) - - -A callable function can also be supplied for custom permutation strategies. The -function should take two arguments: a list of parameter names, and a list of -lists of potential parameter values. The function should return a list of -dictionaries that will be supplied as model parameters. The length of the list -returned will determine how many ``Model`` instances are created. - -For example, the following is the built-in strategy ``all_perm``: +In addition to a single ``Model``, SmartSim allows users to create, +configure, and launch an :ref:`Ensemble` of ``Model`` objects. +``Ensemble(s)`` can be given parameters and a permutation strategy that define how the +``Ensemble`` will create the underlying ``Model`` objects. Users may also +manually create and append ``Model(s)`` to an ``Ensemble``. For information +and examples on ``Ensemble`` creation strategies, visit the :ref:`Initialization` +section within the ``Ensemble`` documentation. 
+ +An ``Ensemble`` supports key features, including methods to: + +- :ref:`Attach configuration files` for use at ``Ensemble`` runtime. +- :ref:`Load an ML model` (TF, TF-lite, PT, or ONNX) into the ``Orchestrator`` at ``Ensemble`` runtime. +- :ref:`Load a TorchScript function` into the ``Orchestrator`` at ``Ensemble`` runtime. +- :ref:`Prevent data collisions` within the ``Ensemble``, which allows for reuse of application code. + +Visit the respective links for more information on each topic. + +============== +File Structure +============== +When a user executes an ``Experiment`` script, it generates output folders in the system's directory. +By default, SmartSim creates a predefined file structure and assigns a path to each entity initialized. +However, users have the flexibility to customize this according to workflow needs. Please refer +to the respective :ref:`default` and :ref:`configure` sections below +for more details. + +.. note:: + Files added for symlinking, copying, or configuration will not be organized into the generated + directories unless ``Experiment.generate`` is invoked on the designated entity. + +.. _default_folder: + +Default +======= +By default, an ``Experiment`` folder is created in your current working directory, using the +specified `name` parameter during ``Experiment`` initialization. Each entity created by the +``Experiment`` generates an output folder under the ``Experiment`` directory, named after the +entity. These folders hold `.err` and `.out` files, containing execution-related information. + +For instance, consider the following Python script: .. 
 code-block:: python - from itertools import product - - def create_all_permutations(param_names, param_values): - perms = list(product(*param_values)) - all_permutations = [] - for p in perms: - temp_model = dict(zip(param_names, p)) - all_permutations.append(temp_model) - return all_permutations - - -After ``Ensemble`` initialization, ``Ensemble`` instances can be -passed as arguments to ``Experiment.generate()`` to write assigned -parameter values into attached and tagged configuration files. - -Launching Ensembles -------------------- - -Ensembles can be launched in previously obtained interactive allocations -and as a batch. Similar to ``RunSettings``, ``BatchSettings`` specify how -an application(s) in a batch job should be executed with regards to the system -workload manager and available compute resources. + from smartsim import Experiment + + exp = Experiment(name="experiment-example") + database = exp.create_database(port=6379, interface="ib0") + exp.start(database) + settings = exp.create_run_settings(exe="echo", exe_args="hello world") + model = exp.create_model(name="model-name", run_settings=settings) + ensemble = exp.create_ensemble(name="ensemble-name", run_settings=settings, replicas=2) + exp.start(model, ensemble) + exp.stop(database) + +When executed, this script creates the following directory structure in your +working directory: + +:: + + experiment-example + ├── orchestrator + │ ├── orchestrator_0.err + │ └── orchestrator_0.out + ├── model-name + │ ├── model-name.err + │ └── model-name.out + └── ensemble-name + ├── ensemble-name_0 + │ ├── ensemble-name_0.err + │ └── ensemble-name_0.out + ├── ensemble-name_1 + │ ├── ensemble-name_1.err + │ └── ensemble-name_1.out + +.. _config_folder: + +Configure +========= +Customizing the path of the ``Experiment`` and entity folders is possible by providing +either an absolute or relative path to the `path` argument during initialization. 
When +a relative path is provided, SmartSim executes the entity relative to the current working +directory. + +For instance, consider the following Python script: - - :ref:`SbatchSettings ` for Slurm - - :ref:`QsubBatchSettings ` for PBSPro - - :ref:`BsubBatchSettings ` for LSF - -If it only passed ``RunSettings``, ``Ensemble``, objects will require either -a ``replicas`` argument or a ``params`` argument to expand parameters -into ``Model`` instances. At launch, the ``Ensemble`` will look for -interactive allocations to launch models in. - -If it passed ``BatchSettings`` without other arguments, an empty ``Ensemble`` -will be created that ``Model`` objects can be added to manually. All ``Model`` -objects added to the ``Ensemble`` will be launched in a single batch. - -If it passed ``BatchSettings`` and ``RunSettings``, the ``BatchSettings`` will -determine the allocation settings for the entire batch, and the ``RunSettings`` -will determine how each individual ``Model`` instance is executed within -that batch. - -This is the same example as above, but tailored towards a running as a batch job -on a slurm system: - -.. 
code-block:: bash - - import numpy as np - from smartsim import Experiment - - exp = Experiment("Training-Run", launcher="slurm") - - # setup ensemble parameter space - learning_rate = list(np.linspace(.01, .5)) - train_params = {"LR": learning_rate} - - # define resources for all ensemble members - sbatch = exp.create_batch_settings(nodes=4, - time="01:00:00", - account="12345-Cray", - queue="gpu") - - # define how each member should run - srun = exp.create_run_settings(exe="python", - exe_args="./train-model.py") - srun.set_nodes(1) - srun.set_tasks(24) - - ensemble = exp.create_ensemble("Training-Ensemble", - params=train_params, - params_as_args=["LR"], - batch_settings=sbatch, - run_settings=srun, - perm_strategy="random", - n_models=4) - exp.start(ensemble, summary=True) - - -This will generate and execute a batch script that looks something like -the following: - -.. code-block:: bash - - # GENERATED - - #!/bin/bash - - #SBATCH --output=/lus/smartsim/Training-Ensemble.out - #SBATCH --error=/lus/smartsim/Training-Ensemble.err - #SBATCH --job-name=Training-Ensemble-CHTN0UI2DORX - #SBATCH --nodes=4 - #SBATCH --time=01:00:00 - #SBATCH --partition=gpu - #SBATCH --account=12345-Cray - - cd /scratch/smartsim/Training-Run ; /usr/bin/srun --output /scratch/smartsim/Training-Run/Training-Ensemble_0.out --error /scratch/smartsim/Training-Ensemble_0.err --job-name Training-Ensemble_0-CHTN0UI2E5DX --nodes=1 --ntasks=24 /scratch/pyenvs/smartsim/bin/python ./train-model.py --LR=0.17 & - - cd /scratch/smartsim/Training-Run ; /usr/bin/srun --output /scratch/smartsim/Training-Run/Training-Ensemble_1.out --error /scratch/smartsim/Training-Ensemble_1.err --job-name Training-Ensemble_1-CHTN0UI2JQR5 --nodes=1 --ntasks=24 /scratch/pyenvs/smartsim/bin/python ./train-model.py --LR=0.32 & - - cd /scratch/smartsim/Training-Run ; /usr/bin/srun --output /scratch/smartsim/Training-Run/Training-Ensemble_2.out --error /scratch/smartsim/Training-Ensemble_2.err --job-name 
Training-Ensemble_2-CHTN0UI2P2AR --nodes=1 --ntasks=24 /scratch/pyenvs/smartsim/bin/python ./train-model.py --LR=0.060000000000000005 & - - cd /scratch/smartsim/Training-Run ; /usr/bin/srun --output /scratch/smartsim/Training-Run/Training-Ensemble_3.out --error /scratch/smartsim/Training-Ensemble_3.err --job-name Training-Ensemble_3-CHTN0UI2TRE7 --nodes=1 --ntasks=24 /scratch/pyenvs/smartsim/bin/python ./train-model.py --LR=0.35000000000000003 & - - wait - -Prefixing Keys in the Orchestrator ----------------------------------- - -If each of multiple ensemble members attempt to use the same code to access their respective models -in the Orchestrator, the keys by which they do this will overlap and they can end up accessing each -others' data inadvertently. To prevent this situation, the SmartSim Entity object supports key -prefixing, which automatically prepends the name of the model to the keys by which it is accessed. -With this enabled, key overlapping is no longer an issue and ensemble members can use the same code. - -Under the hood, calling ensemble.enable_key_prefixing() causes the SSKEYOUT environment variable to -be set, which in turn causes all keys generated by an ensemble member to be prefixed with its model -name. Similarly, if the model for the ensemble member has incoming entities (such as those set via -model.register_incoming_entity() or ensemble.register_incoming_entity()), the SSKEYIN environment -variable will be set and the keys associated with those inputs will be automatically prefixed. Note -that entities must register themselves as this is not done by default. - -Finally, please note that while prefixing is enabled by default for tensors, datasets, and aggregated -lists of datasets, a SmartRedis client must manually call Client.use_model_ensemble_prefix() to -ensure that prefixes are used with models and scripts. - -We modify the example above to enable key prefixing as follows: - -.. 
 code-block:: bash - - import numpy as np - from smartsim import Experiment - - exp = Experiment("Training-Run", launcher="slurm") - - # setup ensemble parameter space - learning_rate = list(np.linspace(.01, .5)) - train_params = {"LR": learning_rate} +.. code-block:: python - # define resources for all ensemble members - sbatch = exp.create_batch_settings(nodes=4, - time="01:00:00", - account="12345-Cray", - queue="gpu") + from smartsim import Experiment + + exp = Experiment(name="experiment-example", exp_path="absolute/path/to/experiment-folder") + database = exp.create_database(port=6379, interface="ib0") + exp.start(database) + settings = exp.create_run_settings(exe="echo", exe_args="hello world") + model = exp.create_model(name="model-name", run_settings=settings, path="./model-folder") + ensemble = exp.create_ensemble(name="ensemble-name", run_settings=settings, replicas=2, path="./ensemble-folder") + exp.start(model, ensemble) + exp.stop(database) + +When executed, this script creates the following directory structure in your +working directory: + +:: + + ├── experiment-folder + | ├── orchestrator + | │ ├── orchestrator_0.err + | │ └── orchestrator_0.out + ├── model-folder + │ ├── model-name.err + │ └── model-name.out + └── ensemble-folder + ├── ensemble-name_0 + │ ├── ensemble-name_0.err + │ └── ensemble-name_0.out + ├── ensemble-name_1 + │ ├── ensemble-name_1.err + │ └── ensemble-name_1.out + +.. _exp_example: + +======= +Example +======= +.. compound:: + In the following section, we provide an example of using SmartSim to automate the + deployment of an HPC workflow consisting of a ``Model`` and standalone ``Orchestrator``. 
 + The example demonstrates: + + *Initializing* + - a workflow (``Experiment``) + - an in-memory database (standalone ``Orchestrator``) + - an application (``Model``) + *Generating* + - the ``Orchestrator`` output directory + - the ``Model`` output directory + *Previewing* + - the ``Orchestrator`` contents + - the ``Model`` contents + *Starting* + - an in-memory database (standalone ``Orchestrator``) + - an application (``Model``) + *Stopping* + - an in-memory database (standalone ``Orchestrator``) + + The example source code is available in the dropdown below for convenient execution + and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + +Initializing +============ +.. compound:: + To create a workflow, *initialize* an ``Experiment`` object + at the start of the Python driver script. This involves specifying + a name and the system launcher that will execute all entities. + Set the `launcher` argument to `auto` to instruct SmartSim to attempt + to find the machine's WLM. + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 1-8 + + We also initialize a SmartSim :ref:`logger`. We will use the logger to log the ``Experiment`` + summary. + +.. compound:: + Next, launch an in-memory database, referred to as an ``Orchestrator``. + To *initialize* an ``Orchestrator`` object, use the ``Experiment.create_database`` + factory method. Create a multi-sharded ``Orchestrator`` by setting the argument `db_nodes` to three. + SmartSim will assign a `port` to the ``Orchestrator`` and attempt to detect your machine's + network interface if not provided. + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 10-11 + +.. compound:: + Before invoking the factory method to create a ``Model``, + first create a ``RunSettings`` object. 
 ``RunSettings`` hold the + information needed to execute the ``Model`` on the machine. The ``RunSettings`` + object is initialized using the ``Experiment.create_run_settings`` method. + Specify the executable to run and arguments to pass to the executable. + + The example ``Model`` is a simple `Hello World` program + that echoes `Hello World` to stdout. + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 13-14 + + After creating the ``RunSettings`` object, initialize the ``Model`` object by passing the `name` + and `settings` to ``create_model``. + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 15-16 + +Generating +========== +.. compound:: + Next we generate the file structure for the ``Experiment``. A call to ``Experiment.generate`` + instructs SmartSim to create directories within the ``Experiment`` folder for each instance passed in. + We organize the ``Orchestrator`` and ``Model`` output files within the ``Experiment`` folder by + passing the ``Orchestrator`` and ``Model`` instances to ``exp.generate``: + + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 18-19 + + `Overwrite=True` instructs SmartSim to overwrite entity contents if files and subdirectories + already exist within the ``Experiment`` directory. + + .. note:: + If files or folders are attached to a ``Model`` or ``Ensemble`` members through ``Model.attach_generator_files`` + or ``Ensemble.attach_generator_files``, the attached files or directories will be symlinked, copied, or configured and + written into the created directory for that instance. + + The ``Experiment.generate`` call places the `.err` and `.out` log files in the entity + subdirectories within the main ``Experiment`` directory. + +Previewing +========== +.. compound:: + Optionally, users can preview an ``Experiment`` entity. 
 The ``Experiment.preview`` method displays the entity summaries during runtime + to offer additional insight into the launch details. Any instance of a ``Model``, ``Ensemble``, or ``Orchestrator`` created by the + ``Experiment`` can be passed as an argument to the preview method. Additionally, users may specify the name of a file to write preview data to + via the ``output_filename`` argument, as well as the text format through the ``output_format`` argument. Users can also specify how verbose + the preview is via the ``verbosity_level`` argument. + + The following options are available when configuring preview: + + * `verbosity_level="info"` instructs preview to display user-defined fields and entities. + * `verbosity_level="debug"` instructs preview to display user-defined fields and entities and auto-generated fields. + * `verbosity_level="developer"` instructs preview to display user-defined fields and entities, auto-generated fields, and run commands. + * `output_format="plain_text"` sets the output format. The only accepted output format is 'plain_text'. + * `output_filename="test_name.txt"` specifies name of file and extension to write preview data to. If no output filename is set, the preview will be output to stdout. + + In the example below, we preview the ``Orchestrator`` and ``Model`` entities by passing their instances to ``Experiment.preview``: + + .. 
literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 21-22 + +When executed, the preview logs the following in stdout: + +:: + + === Experiment Overview === + + Experiment Name: example-experiment + Experiment Path: absolute/path/to/SmartSim/example-experiment + Launcher: local + + === Entity Preview === + + == Orchestrators == + + = Database Identifier: orchestrator = + Path: absolute/path/to/SmartSim/example-experiment/orchestrator + Shards: 1 + TCP/IP Port(s): + 6379 + Network Interface: ib0 + Type: redis + Executable: absolute/path/to/SmartSim/smartsim/_core/bin/redis-server + + == Models == + + = Model Name: hello_world = + Path: absolute/path/to/SmartSim/example-experiment/hello_world + Executable: /bin/echo + Executable Arguments: + Hello + World + Client Configuration: + Database Identifier: orchestrator + Database Backend: redis + TCP/IP Port(s): + 6379 + Type: Standalone + Outgoing Key Collision Prevention (Key Prefixing): + Tensors: Off + Datasets: Off + ML Models/Torch Scripts: Off + Aggregation Lists: Off + +Starting +======== +.. compound:: + Next launch the components of the ``Experiment`` (``Orchestrator`` and ``Model``). + To do so, use the ``Experiment.start`` factory method and pass in the previous + ``Orchestrator`` and ``Model`` instances. - # define how each member should run - srun = exp.create_run_settings(exe="python", - exe_args="./train-model.py") - srun.set_nodes(1) - srun.set_tasks(24) + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 24-25 - ensemble = exp.create_ensemble("Training-Ensemble", - params=train_params, - params_as_args=["LR"], - batch_settings=sbatch, - run_settings=srun, - perm_strategy="random", - n_models=4) +Stopping +======== +.. compound:: + Lastly, to clean up the ``Experiment``, tear down the launched ``Orchestrator`` + using the ``Experiment.stop`` factory method. 
- # Enable key prefixing -- note that this should be done - # before starting the experiment - ensemble.enable_key_prefixing() + .. literalinclude:: tutorials/doc_examples/experiment_doc_examples/exp.py + :language: python + :linenos: + :lines: 27-28 - exp.start(ensemble, summary=True) + Notice that we use the ``Experiment.summary`` function to print + the summary of the workflow. +When you run the experiment, the following output will appear:: -Further Information -------------------- + | | Name | Entity-Type | JobID | RunID | Time | Status | Returncode | + |----|----------------|---------------|-------------|---------|---------|-----------|--------------| + | 0 | hello_world | Model | 1778304.4 | 0 | 10.0657 | Completed | 0 | + | 1 | orchestrator_0 | DBNode | 1778304.3+2 | 0 | 43.4797 | Cancelled | 0 | -For more informtion about Ensembles, please refer to the :ref:`Ensemble API documentation `. \ No newline at end of file +.. note:: + Failure to tear down the ``Orchestrator`` at the end of an ``Experiment`` + may lead to ``Orchestrator`` launch failures if another ``Experiment`` is + started on the same node. 
diff --git a/doc/images/Experiment.png b/doc/images/Experiment.png new file mode 100644 index 000000000..a103dd6dd Binary files /dev/null and b/doc/images/Experiment.png differ diff --git a/doc/images/clustered_orchestrator-1.png b/doc/images/clustered_orchestrator-1.png new file mode 100644 index 000000000..996d55e85 Binary files /dev/null and b/doc/images/clustered_orchestrator-1.png differ diff --git a/doc/images/colocated_orchestrator-1.png b/doc/images/colocated_orchestrator-1.png new file mode 100644 index 000000000..0da5d0609 Binary files /dev/null and b/doc/images/colocated_orchestrator-1.png differ diff --git a/doc/index.rst b/doc/index.rst index 91a7ee1ba..4c64712b2 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -30,9 +30,14 @@ :caption: SmartSim experiment + run_settings + batch_settings + model + ensemble orchestrator - launchers + ss_logger ml_features + dragon api/smartsim_api .. toctree:: diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index 2f43db50f..02c17e1fd 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -1,3 +1,5 @@ +.. _basic_install_SS: + ****************** Basic Installation ****************** @@ -18,7 +20,7 @@ Basic The base prerequisites to install SmartSim and SmartRedis are: - - Python 3.8-3.11 + - Python 3.9-3.11 - Pip - Cmake 3.13.x (or later) - C compiler @@ -27,7 +29,7 @@ The base prerequisites to install SmartSim and SmartRedis are: - git - `git-lfs`_ -.. _git-lfs: https://github.com/git-lfs/git-lfs?utm_source=gitlfs_site&utm_medium=installation_link&utm_campaign=gitlfs#installing +.. _git-lfs: https://github.com/git-lfs/git-lfs?utm_source=gitlfs_site&utm_medium=installation_link&utm_campaign=gitlfs .. 
note:: @@ -48,7 +50,7 @@ The machine-learning backends have additional requirements in order to use GPUs for inference - `CUDA Toolkit 11 (tested with 11.8) `_ - - `cuDNN 8 (tested with 8.9.1) `_ + - `cuDNN 8 (tested with 8.9.1) `_ - OS: Linux - GPU: Nvidia @@ -72,11 +74,11 @@ Supported Versions * - MacOS - x86_64, aarch64 - Not supported - - 3.8 - 3.11 + - 3.9 - 3.11 * - Linux - x86_64 - Nvidia - - 3.8 - 3.11 + - 3.9 - 3.11 .. note:: @@ -235,6 +237,28 @@ to building SmartSim with GPU support is to specify a different ``device`` backends look for the CUDA Toolkit and cuDNN libraries. Please see the :ref:`Platform Installation Section ` section for guidance. + +.. _dragon_install: + +Dragon Install +-------------- + +`Dragon `_ is +an HPC-native library for distributed computing. SmartSim can use Dragon as a +launcher on systems with Slurm or PBS as schedulers. To install the correct +version of Dragon, you can add the ``--dragon`` option to ``smart build``. +For example, to install dragon alongside the RedisAI CPU backends, you can run + +.. code-block:: bash + + # run one of the following + smart build --device cpu --dragon # install Dragon, PT and TF for cpu + smart build --device cpu --onnx --dragon # install Dragon and all backends (PT, TF, ONNX) on cpu + +.. note:: + Dragon is only supported on Linux systems. For further information, you + can read :ref:`the dedicated documentation page `. + ========== SmartRedis ========== @@ -254,9 +278,9 @@ SmartSim does. * - Platform - Python Versions * - MacOS - - 3.8 - 3.11 + - 3.9 - 3.11 * - Linux - - 3.8 - 3.11 + - 3.9 - 3.11 The Python client for SmartRedis is installed through ``pip`` as follows: @@ -298,7 +322,7 @@ source remains at the site of the clone instead of in site-packages. pip install -e .[dev,ml] # for bash users pip install -e .\[dev,ml\] # for zsh users -Use the now installed ``smart`` cli to install the machine learning runtimes. 
+Use the now installed ``smart`` cli to install the machine learning runtimes and dragon. .. tabs:: @@ -307,8 +331,8 @@ Use the now installed ``smart`` cli to install the machine learning runtimes. .. code-block:: bash # run one of the following - smart build --device cpu --onnx # install with cpu-only support - smart build --device gpu --onnx # install with both cpu and gpu support + smart build --device cpu --onnx --dragon # install with cpu-only support + smart build --device gpu --onnx --dragon # install with both cpu and gpu support .. tab:: MacOS (Intel x64) diff --git a/doc/installation_instructions/platform/nonroot-linux.rst b/doc/installation_instructions/platform/nonroot-linux.rst index 2c8f7933a..3070a871a 100644 --- a/doc/installation_instructions/platform/nonroot-linux.rst +++ b/doc/installation_instructions/platform/nonroot-linux.rst @@ -13,6 +13,6 @@ a user is possible. ./cuda_11.4.4_470.82.01_linux.run --toolkit --silent --toolkitpath=/path/to/install/location/ For cuDNN, follow `Nvidia's instructions -`_, +`_, and copy the cuDNN libraries to the `lib64` directory at the CUDA Toolkit location specified above. \ No newline at end of file diff --git a/doc/installation_instructions/platform/olcf-summit.rst b/doc/installation_instructions/platform/olcf-summit.rst index 5727ae8fe..236d15054 100644 --- a/doc/installation_instructions/platform/olcf-summit.rst +++ b/doc/installation_instructions/platform/olcf-summit.rst @@ -6,7 +6,7 @@ Since SmartSim does not have a built PowerPC build, the build steps for an IBM system are slightly different than other systems. Luckily for us, a conda channel with all relevant packages is maintained as part -of the `OpenCE `_ initiative. Users can follow these +of the `OpenCE `_ initiative. Users can follow these instructions to get a working SmartSim build with PyTorch and TensorFlow for GPU on Summit. Note that SmartSim and SmartRedis will be downloaded to the working directory from which these instructions are executed. 
@@ -19,7 +19,7 @@ into problems. .. code-block:: bash # setup Python and build environment - export ENV_NAME=smartsim-0.6.2 + export ENV_NAME=smartsim-0.7.0 git clone https://github.com/CrayLabs/SmartRedis.git smartredis git clone https://github.com/CrayLabs/SmartSim.git smartsim conda config --prepend channels https://ftp.osuosl.org/pub/open-ce/1.6.1/ diff --git a/doc/launchers.rst b/doc/launchers.rst deleted file mode 100644 index 22425071e..000000000 --- a/doc/launchers.rst +++ /dev/null @@ -1,248 +0,0 @@ - -********* -Launchers -********* - -SmartSim interfaces with a number of backends called `launchers` that -are responsible for constructing jobs based on run parameters and -launching them onto a system. - -The `launchers` allow SmartSim users to interact with their system -programmatically through a python interface. -Because of this, SmartSim users do not have to leave the Jupyter Notebook, -Python REPL, or Python script to launch, query, and interact with their jobs. - -SmartSim currently supports 5 `launchers`: - 1. ``local``: for single-node, workstation, or laptop - 2. ``slurm``: for systems using the Slurm scheduler - 3. ``pbs``: for systems using the PBSpro scheduler - 4. ``lsf``: for systems using the LSF scheduler - 5. ``auto``: have SmartSim auto-detect the launcher to use. - -To specify a specific launcher, one argument needs to be provided -to the ``Experiment`` initialization. - -.. 
code-block:: python - - from smartsim import Experiment - - exp = Experiment("name-of-experiment", launcher="local") # local launcher - exp = Experiment("name-of-experiment", launcher="slurm") # Slurm launcher - exp = Experiment("name-of-experiment", launcher="pbs") # PBSpro launcher - exp = Experiment("name-of-experiment", launcher="lsf") # LSF launcher - exp = Experiment("name-of-experiment", launcher="auto") # auto-detect launcher - -------------------------------------------------------------------------- - -Local -===== - - -The local launcher can be used on laptops, workstations and single -nodes of supercomputer and cluster systems. Through -launching locally, users can prototype workflows and quickly scale -them to larger systems with minimal changes. - -As with all launchers in SmartSim, the local launcher supports -asynchronous execution meaning once entities have been launched -the main thread of execution is not blocked. Daemon threads -that manage currently running jobs will be created when active -jobs are present within SmartSim. - -.. _psutil: https://github.com/giampaolo/psutil - -The local launcher uses the `psutil`_ library to execute and monitor -user-created jobs. - - -Running Locally ---------------- - -The local launcher supports the base :ref:`RunSettings API ` -which can be used to run executables as well as run executables -with arbitrary launch binaries like `mpiexec`. - -The local launcher is the default launcher for all ``Experiment`` -instances. - -The local launcher does not support batch launching. Ensembles -are always executed in parallel but launched sequentially. - ----------------------------------------------------------------------- - -Slurm -===== - -The Slurm launcher works directly with the Slurm scheduler to launch, query, -monitor and stop applications. During the course of an ``Experiment``, -launched entities can be queried for status, completion, and errors. 
- -The amount of communication between SmartSim and Slurm can be tuned -for specific guidelines of different sites by setting the -value for ``jm_interval`` in the SmartSim configuration file. - -To use the Slurm launcher, specify at ``Experiment`` initialization: - -.. code-block:: python - - from smartsim import Experiment - - exp = Experiment("NAMD-worklfow", launcher="slurm") - - -Running on Slurm ----------------- - -The Slurm launcher supports three types of ``RunSettings``: - 1. :ref:`SrunSettings ` - 2. :ref:`MpirunSettings ` - 3. :ref:`MpiexecSettings ` - -As well as batch settings for ``sbatch`` through: - 1. :ref:`SbatchSettings ` - - -Both supported ``RunSettings`` types above can be added -to a ``SbatchSettings`` batch workload through ``Ensemble`` -creation. - - -Getting Allocations -------------------- - -Slurm supports a number of user facing features that other schedulers -do not. For this reason, an extra module :ref:`smartsim.slurm ` can be -used to obtain allocations to launch on and release them after -``Experiment`` completion. - -.. code-block:: python - - from smartsim.wlm import slurm - alloc = slurm.get_allocation(nodes=1) - -The ID of the allocation is returned as a string to the user so that -they can specify what entities should run on which allocations -obtained by SmartSim. - -Additional arguments that would have been passed to the ``salloc`` -command can be passed through the ``options`` argument in a dictionary. - -Anything passed to the options will be processed as a Slurm -argument and appended to the salloc command with the appropriate -prefix (e.g. `-` or `--`). - -For arguments without a value, pass None as the value: - - `exclusive=None` - -.. 
code-block:: python - - from smartsim.wlm import slurm - salloc_options = { - "C": "haswell", - "partition": "debug", - "exclusive": None - } - alloc_id = slurm.get_slurm_allocation(nodes=128, - time="10:00:00", - options=salloc_options) - -The above code would generate a ``salloc`` command like: - -.. code-block:: bash - - salloc -N 5 -C haswell --partition debug --time 10:00:00 --exclusive - - - -Releasing Allocations ---------------------- - -The :ref:`smartsim.slurm ` interface -also supports releasing allocations obtained in an experiment. - -The example below releases the allocation in the example above. - -.. code-block:: python - - from smartsim.wlm import slurm - salloc_options = { - "C": "haswell", - "partition": "debug", - "exclusive": None - } - alloc_id = slurm.get_slurm_allocation(nodes=128, - time="10:00:00", - options=salloc_options) - - # - - slurm.release_slurm_allocation(alloc_id) - -------------------------------------------------------------------- - -PBSPro -====== - -Like the Slurm launcher, the PBSPro launcher works directly with the PBSPro -scheduler to launch, query, monitor and stop applications. - -The amount of communication between SmartSim and PBSPro can be tuned -for specific guidelines of different sites by setting the -value for ``jm_interval`` in the SmartSim configuration file. - -To use the PBSpro launcher, specify at ``Experiment`` initialization: - -.. code-block:: python - - from smartsim import Experiment - - exp = Experiment("LAMMPS-melt", launcher="pbs") - - - -Running on PBSpro ------------------ - -The PBSpro launcher supports three types of ``RunSettings``: - 1. :ref:`AprunSettings ` - 2. :ref:`MpirunSettings ` - 3. :ref:`MpiexecSettings ` - -As well as batch settings for ``qsub`` through: - 1. :ref:`QsubBatchSettings ` - -Both supported ``RunSettings`` types above can be added -to a ``QsubBatchSettings`` batch workload through ``Ensemble`` -creation. 
- ---------------------------------------------------------------------- - -LSF -=== - -The LSF Launcher works like the PBSPro launcher and -is compatible with LSF and OpenMPI workloads. - -To use the LSF launcher, specify at ``Experiment`` initialization: - -.. code-block:: python - - from smartsim import Experiment - - exp = Experiment("MOM6-double-gyre", launcher="lsf") - - -Running on LSF --------------- - -The LSF launcher supports three types of ``RunSettings``: - 1. :ref:`JsrunSettings ` - 2. :ref:`MpirunSettings ` - 3. :ref:`MpiexecSettings ` - -As well as batch settings for ``bsub`` through: - 1. :ref:`BsubBatchSettings ` - -Both supported ``RunSettings`` types above can be added -to a ``BsubBatchSettings`` batch workload through ``Ensemble`` -creation. diff --git a/doc/ml_features.rst b/doc/ml_features.rst index 6096f005e..4e0919a08 100644 --- a/doc/ml_features.rst +++ b/doc/ml_features.rst @@ -1,3 +1,5 @@ +.. _ml_features_docs: + ########### ML Features ########### @@ -303,7 +305,7 @@ with TensorFlow or PyTorch backends. .. code-block:: python - client.run_model(model_key, inputs=["mnist_imagse"], outputs=["mnist_output"]) + client.run_model(model_key, inputs=["mnist_images"], outputs=["mnist_output"]) output = client.get_tensor("mnist_output") diff --git a/doc/model.rst b/doc/model.rst new file mode 100644 index 000000000..52e1ce1c0 --- /dev/null +++ b/doc/model.rst @@ -0,0 +1,2343 @@ +.. _model_object_doc: + +***** +Model +***** +======== +Overview +======== +SmartSim ``Model`` objects enable users to execute computational tasks in an +``Experiment`` workflow, such as launching compiled applications, +running scripts, or performing general computational operations. A ``Model`` can be launched with +other SmartSim ``Model(s)`` and ``Orchestrator(s)`` to build AI-enabled workflows. 
+With the SmartSim ``Client`` (:ref:`SmartRedis`), data can be transferred from a ``Model`` +to the ``Orchestrator`` for use in an ML model (TF, TF-lite, PyTorch, or ONNX), online +training process, or additional ``Model`` applications. SmartSim ``Clients`` (SmartRedis) are available in +Python, C, C++, or Fortran. + +To initialize a SmartSim ``Model``, use the ``Experiment.create_model`` factory method. +When creating a ``Model``, a :ref:`RunSettings` object must be provided. A ``RunSettings`` +object specifies the ``Model`` executable (e.g. the full path to a compiled binary) as well as +executable arguments and launch parameters. These specifications include launch commands (e.g. `srun`, `aprun`, `mpiexec`, etc), +compute resource requirements, and application command-line arguments. + +Once a ``Model`` instance has been initialized, users have access to +the :ref:`Model API` functions to further configure the ``Model``. +The Model API functions provide users with the following capabilities: + +- :ref:`Attach Files to a SmartSim Model` +- :ref:`Colocate an Orchestrator to a SmartSim Model` +- :ref:`Attach a ML Model to the SmartSim Model` +- :ref:`Attach a TorchScript Function to the SmartSim Model` +- :ref:`Enable SmartSim Model Data Collision Prevention` + +Once the ``Model`` has been configured and launched, a user can leverage an ``Orchestrator`` within a ``Model`` +through **two** strategies: + +- :ref:`Connect to a Standalone Orchestrator` + When a ``Model`` is launched, it does not use or share compute + resources on the same host (computer/server) where a SmartSim ``Orchestrator`` is running. + Instead, it is launched on its own compute resources specified by the ``RunSettings`` object. + The ``Model`` can connect via a SmartRedis ``Client`` to a launched standalone ``Orchestrator``. 
+ +- :ref:`Connect to a Colocated Orchestrator` + When the colocated ``Model`` is started, SmartSim launches an ``Orchestrator`` on the ``Model`` compute + nodes prior to the ``Model`` execution. The ``Model`` can then connect to the colocated ``Orchestrator`` + via a SmartRedis ``Client``. + +.. note:: + For the ``Client`` connection to be successful from within the ``Model`` application, + the SmartSim ``Orchestrator`` must be launched prior to the start of the ``Model``. + +.. note:: + A ``Model`` can be launched without an ``Orchestrator`` if data transfer and ML capabilities are not + required. + +SmartSim manages ``Model`` instances through the :ref:`Experiment API` by providing functions to +launch, monitor, and stop applications. Additionally, a ``Model`` can be launched individually +or as a group via an :ref:`Ensemble`. + +============== +Initialization +============== +Overview +======== +The ``Experiment`` is responsible for initializing all SmartSim entities. +A ``Model`` is created using the ``Experiment.create_model`` factory method, and users can customize the +``Model`` via the factory method parameters. + +The key initializer arguments for ``Model`` creation can be found in the :ref:`Experiment API` +under the ``create_model`` docstring. + +A `name` and :ref:`RunSettings` reference are required to initialize a ``Model``. +Optionally, include a :ref:`BatchSettings` object to specify workload manager batch launching. + +.. note:: + ``BatchSettings`` attached to a ``Model`` are ignored when the ``Model`` is executed as part of an ``Ensemble``. + +The `params` factory method parameter for ``Model`` creation allows a user to define simulation parameters and +values through a dictionary. Using ``Model`` :ref:`file functions`, users can write these parameters to +a file in the ``Model`` working directory. 
+ +When a ``Model`` instance is passed to ``Experiment.generate``, a +directory within the Experiment directory +is created to store input and output files from the ``Model``. + +.. note:: + It is strongly recommended to invoke ``Experiment.generate`` on the ``Model`` + instance before launching the ``Model``. If a path is not specified during + ``Experiment.create_model``, calling ``Experiment.generate`` with the ``Model`` + instance will result in SmartSim generating a ``Model`` directory within the + ``Experiment`` directory. This directory will be used to store the ``Model`` outputs + and attached files. + +.. _std_model_doc: + +Example +======= +In this example, we provide a demonstration of how to initialize and launch a ``Model`` +within an ``Experiment`` workflow. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + +All workflow entities are initialized through the :ref:`Experiment API`. +Consequently, initializing a SmartSim ``Experiment`` is a prerequisite for ``Model`` +initialization. + +To initialize an instance of the ``Experiment`` class, import the SmartSim +``Experiment`` module and invoke the ``Experiment`` constructor +with a `name` and `launcher`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + :language: python + :linenos: + :lines: 1-4 + +A ``Model`` requires ``RunSettings`` objects to specify how the ``Model`` should be +executed within the workflow. We use the ``Experiment`` instance `exp` to +call the factory method ``Experiment.create_run_settings`` to initialize a ``RunSettings`` +object. Finally, we specify the executable `"echo"` to run the executable argument `"Hello World"`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + :language: python + :linenos: + :lines: 6-7 + +.. 
seealso:: + For more information on ``RunSettings`` objects, reference the :ref:`RunSettings` documentation. + +We now have a ``RunSettings`` instance named `model_settings` that contains all of the +information required to launch our application. Pass a `name` and the run settings instance +to the ``create_model`` factory method: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + :language: python + :linenos: + :lines: 9-10 + +To create an isolated output directory for the ``Model``, invoke ``Experiment.generate`` on the +``Model`` `model_instance`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + :language: python + :linenos: + :lines: 12-13 + +.. note:: + The ``Experiment.generate`` step is optional; however, this step organizes the ``Experiment`` + entity output files into individual entity folders within the ``Experiment`` folder. Continue + in the example for information on ``Model`` output generation or visit the + :ref:`Output and Error Files` section. + +All entities are launched, monitored and stopped by the ``Experiment`` instance. +To start the ``Model``, invoke ``Experiment.start`` on `model_instance`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_init.py + :language: python + :linenos: + :lines: 15-16 + +When the ``Experiment`` driver script is executed, two files from the `model_instance` will be created +in the generated ``Model`` subdirectory: + +1. `model_instance.out` : this file will hold outputs produced by the `model_instance` workload. +2. `model_instance.err` : this file will hold any errors that occurred during `model_instance` execution. + +.. _colo_model_doc: + +====================== +Colocated Orchestrator +====================== +A SmartSim ``Model`` has the capability to share compute node(s) with a SmartSim ``Orchestrator`` in +a deployment known as a colocated ``Orchestrator``. 
In this scenario, the ``Orchestrator`` and ``Model`` share +compute resources. To achieve this, users need to initialize a ``Model`` instance using the +``Experiment.create_model`` function and then utilize one of the three functions listed below to +colocate an ``Orchestrator`` with the ``Model``. This instructs SmartSim to launch an ``Orchestrator`` +on the application compute node(s) before the ``Model`` execution. + +There are **three** different Model API functions to colocate a ``Model``: + +- ``Model.colocate_db_tcp``: Colocate an ``Orchestrator`` instance and establish client communication using TCP/IP. +- ``Model.colocate_db_uds``: Colocate an ``Orchestrator`` instance and establish client communication using Unix domain sockets (UDS). +- ``Model.colocate_db``: (deprecated) An alias for `Model.colocate_db_tcp`. + +Each function initializes an unsharded ``Orchestrator`` accessible only to the ``Model`` processes on the same compute node. When the ``Model`` +is started, the ``Orchestrator`` will be launched on the same compute resource as the ``Model``. Only the colocated ``Model`` +may communicate with the ``Orchestrator`` via a SmartRedis ``Client`` by using the loopback TCP interface or +Unix Domain sockets. Extra parameters for the ``Orchestrator`` can be passed into the colocate functions above +via `kwargs`. + +.. code-block:: python + + example_kwargs = { + "maxclients": 100000, + "threads_per_queue": 1, + "inter_op_threads": 1, + "intra_op_threads": 1 + } + +For a walkthrough of how to colocate a ``Model``, navigate to the +:ref:`Colocated Orchestrator` for instructions. + +For users aiming to **optimize performance**, SmartSim offers the flexibility to specify +processor IDs to which the colocated ``Orchestrator`` should be pinned. This can be achieved using +the `custom_pinning` argument, which is recognized by both ``Model.colocate_db_uds`` and +``Model.colocate_db_tcp``. 
In systems where specific processors support ML model and +TorchScript execution, users can employ the `custom_pinning` argument to designate +these processor IDs. This ensures that the specified processors are available +when executing ML models or TorchScripts on the colocated ``Orchestrator``. +Additionally, users may use the `custom_pinning` argument to avoid reserved processors +by specifying an available processor ID or a list of available processor IDs. + +.. _files_doc: + +===== +Files +===== +Overview +======== +Applications often depend on external files (e.g. training datasets, evaluation datasets, etc) +to operate as intended. Users can instruct SmartSim to copy, symlink, or manipulate external files +prior to a ``Model`` launch via the ``Model.attach_generator_files`` function. + +.. note:: + Multiple calls to ``Model.attach_generator_files`` will overwrite previous file configurations + in the ``Model``. + +To setup the run directory for the ``Model``, users should pass the list of files to +``Model.attach_generator_files`` using the following arguments: + +* `to_copy` (t.Optional[t.List[str]] = None): Files that are copied into the path of the ``Model``. +* `to_symlink` (t.Optional[t.List[str]] = None): Files that are symlinked into the path of the ``Model``. + +User-formatted files can be attached using the `to_configure` argument. These files will be modified +during ``Model`` generation to replace tagged sections in the user-formatted files with +values from the `params` initializer argument used during ``Model`` creation: + +* `to_configure` (t.Optional[t.List[str]] = None): Designed for text-based ``Model`` input files, + `"to_configure"` is exclusive to the ``Model``. During ``Model`` directory generation, the attached + files are parsed and specified tagged parameters are replaced with the `params` values that were + specified in the ``Experiment.create_model`` factory method of the ``Model``. 
The default tag is a semicolon + (e.g., THERMO = ;THERMO;). + +In the :ref:`Example` subsection, we provide an example using the value `to_configure` +within ``attach_generator_files``. + +.. _files_example_doc: + +Example +======= +This example demonstrates how to attach a file to a ``Model`` for parameter replacement at the time +of ``Model`` directory generation. This is accomplished using the `params` function parameter in +``Experiment.create_model`` and the `to_configure` function parameter +in ``Model.attach_generator_files``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + +In this example, we have a text file named `params_inputs.txt`. Within the text file, is the parameter `THERMO` +that is required by the ``Model`` application at runtime: + +.. code-block:: bash + + THERMO = ;THERMO; + +In order to have the tagged parameter `;THERMO;` replaced with a usable value at runtime, two steps are required: + +1. The `THERMO` variable must be included in ``Experiment.create_model`` factory method as + part of the `params` initializer argument. +2. The file containing the tagged parameter `;THERMO;`, `params_inputs.txt`, must be attached to the ``Model`` + via the ``Model.attach_generator_files`` method as part of the `to_configure` function parameter. + +To encapsulate our application within a ``Model``, we must first create an ``Experiment`` instance. +Begin by importing the ``Experiment`` module and initializing an ``Experiment``: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + :language: python + :linenos: + :lines: 1-4 + +A SmartSim ``Model`` requires a ``RunSettings`` object to +specify the ``Model`` executable (e.g. the full path to a compiled binary) as well as +executable arguments and launch parameters. 
Create a simple ``RunSettings`` object +and specify the path to the executable script as an executable argument (`exe_args`): + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + :language: python + :linenos: + :lines: 6-7 + +.. seealso:: + To read more on SmartSim ``RunSettings`` objects, reference the :ref:`RunSettings` documentation. + +Next, initialize a ``Model`` object via ``Experiment.create_model``. Pass in the `model_settings` instance +and the `params` value: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + :language: python + :linenos: + :lines: 9-10 + +We now have a ``Model`` instance named `model_instance`. Attach the text file, `params_inputs.txt`, +to the ``Model`` for use at entity runtime. To do so, use the +``Model.attach_generator_files`` function and specify the `to_configure` +parameter with the path to the text file, `params_inputs.txt`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + :language: python + :linenos: + :lines: 12-13 + +To create an isolated directory for the ``Model`` outputs and configuration files, invoke ``Experiment.generate`` +on `model_instance` as an input parameter: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/model_file.py + :language: python + :linenos: + :lines: 15-16 + +The contents of `getting-started/model_name/params_inputs.txt` at runtime are: + +.. code-block:: bash + +    THERMO = 1 + +.. _model_output_files: + +====================== +Output and Error Files +====================== +By default, SmartSim stores the standard output and error of the ``Model`` in two files: + +* `.out` +* `.err` + +The files are created in the working directory of the ``Model``, and the filenames directly match the +``Model`` name. The `.out` file logs standard outputs and the +`.err` logs errors for debugging. + +.. 
note:: + Invoking ``Experiment.generate(model)`` will create a directory `model_name/` and will store + the two files within that directory. You can also specify a path for these files using the + `path` parameter when invoking ``Experiment.create_model``. + +.. _ml_script_model_doc: + +===================== +ML Models and Scripts +===================== +Overview +======== +SmartSim users have the capability to load ML models and TorchScripts into an ``Orchestrator`` +within the ``Experiment`` script for use within a ``Model``. Functions accessible through +a ``Model`` object support loading ML models (TensorFlow, TensorFlow-lite, PyTorch, and ONNX) and +TorchScripts into standalone or colocated ``Orchestrator(s)`` before application runtime. + +Users can follow **two** processes to load an ML model to the ``Orchestrator``: + +- :ref:`From Memory` +- :ref:`From File` + +.. warning:: + Uploading an ML model :ref:`from memory` is solely supported for + standalone ``Orchestrator(s)``. To upload an ML model to a colocated ``Orchestrator``, users + must save the ML model to disk and upload :ref:`from file`. + +Users can follow **three** processes to load a TorchScript to the ``Orchestrator``: + +- :ref:`From Memory` +- :ref:`From File` +- :ref:`From String` + +.. warning:: + Uploading a TorchScript :ref:`from memory` is solely supported for + standalone ``Orchestrator(s)``. To upload a TorchScript to a colocated ``Orchestrator``, users + upload :ref:`from file` or :ref:`from string`. + +Once an ML model or TorchScript is loaded into the ``Orchestrator``, ``Model`` objects can +leverage ML capabilities by utilizing the SmartSim ``Client`` (:ref:`SmartRedis`) +to execute the stored ML models and TorchScripts. + +.. _ai_model_doc: + +AI Models +========= +When configuring a ``Model``, users can instruct SmartSim to load +Machine Learning (ML) models to the ``Orchestrator``. ML models added +are loaded into the ``Orchestrator`` prior to the execution of the ``Model``. 
To load an ML model +to the ``Orchestrator``, SmartSim users can provide the ML model **in-memory** or specify the **file path** +when using the ``Model.add_ml_model`` function. SmartSim solely supports loading an ML model from memory +for use within standalone ``Orchestrator(s)``. The supported ML frameworks are TensorFlow, +TensorFlow-lite, PyTorch, and ONNX. + +The arguments that customize the storage and execution of an ML model can be found in the +:ref:`Model API` under the ``add_ml_model`` docstring. + +.. _in_mem_ML_model_ex: + +------------------------------------- +Example: Attach an In-Memory ML Model +------------------------------------- +This example demonstrates how to attach an in-memory ML model to a SmartSim ``Model`` +to load into an ``Orchestrator`` at ``Model`` runtime. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/in_mem_ml_model.py + +.. note:: + This example assumes: + + - an ``Orchestrator`` is launched prior to the ``Model`` execution (colocated or standalone) + - an initialized ``Model`` named `smartsim_model` exists within the ``Experiment`` workflow + - a Tensorflow-based ML model was serialized using ``serialize_model`` which returns + the ML model as a byte string with the names of the input and output layers + +**Attach the ML Model to a SmartSim Model** + +In this example, we have a serialized Tensorflow-based ML model that was saved to a byte string stored under `model`. +Additionally, the ``serialize_model`` function returned the names of the input and output layers stored under +`inputs` and `outputs`. Assuming an initialized ``Model`` named `smartsim_model` exists, we add the in-memory TensorFlow model using +the ``Model.add_ml_model`` function and specify the in-memory ML model to the function parameter `model`: + +.. 
literalinclude:: tutorials/doc_examples/model_doc_examples/in_mem_ml_model.py + :language: python + :linenos: + :lines: 39-40 + +In the above ``smartsim_model.add_ml_model`` code snippet, we pass in the following arguments: + +- `name` ("cnn"): A name to reference the ML model in the ``Orchestrator``. +- `backend` ("TF"): Indicating that the ML model is a TensorFlow model. +- `model` (model): The in-memory representation of the TensorFlow model. +- `device` ("GPU"): Specifying the device for ML model execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. +- `inputs` (inputs): The name of the ML model input nodes (TensorFlow only). +- `outputs` (outputs): The name of the ML model output nodes (TensorFlow only). + +.. warning:: + Calling `exp.start(smartsim_model)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent ``Orchestrator``. + +When the ``Model`` is started via ``Experiment.start``, the ML model will be loaded to the +launched ``Orchestrator``. The ML model can then be executed on the ``Orchestrator`` via a SmartSim +``Client`` (:ref:`SmartRedis`) within the application code. + +.. _from_file_ML_model_ex: + +------------------------------------- +Example: Attach an ML Model From File +------------------------------------- +This example demonstrates how to attach a ML model from file to a SmartSim ``Model`` +to load into an ``Orchestrator`` at ``Model`` runtime. +The source code example is available in the dropdown below for +convenient execution and customization. + +.. note:: + SmartSim supports loading ML models from file to standalone ``Orchestrator(s)``. + This feature is **not** supported for colocated ``Orchestrator(s)``. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/from_file_ml_model.py + +.. 
note:: + This example assumes: + + - a standalone ``Orchestrator`` is launched prior to the ``Model`` execution + - an initialized ``Model`` named `smartsim_model` exists within the ``Experiment`` workflow + - a Tensorflow-based ML model was serialized using ``freeze_model`` which returns + the path to the serialized model file and the names of the input and output layers + +**Attach the ML Model to a SmartSim Model** + +In this example, we have a serialized Tensorflow-based ML model that was saved to disk and stored under `model_file`. +Additionally, the ``freeze_model`` function returned the names of the input and output layers stored under +`inputs` and `outputs`. Assuming an initialized ``Model`` named `smartsim_model` exists, we add the TensorFlow model using +the ``Model.add_ml_model`` function and specify the TensorFlow model path to the parameter `model_path`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/from_file_ml_model.py + :language: python + :linenos: + :lines: 39-40 + +In the above ``smartsim_model.add_ml_model`` code snippet, we pass in the following arguments: + +- `name` ("cnn"): A name to reference the ML model in the ``Orchestrator``. +- `backend` ("TF"): Indicating that the ML model is a TensorFlow model. +- `model_path` (model_file): The path to the ML model script. +- `device` ("GPU"): Specifying the device for ML model execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. +- `inputs` (inputs): The name of the ML model input nodes (TensorFlow only). +- `outputs` (outputs): The name of the ML model output nodes (TensorFlow only). + +.. warning:: + Calling `exp.start(smartsim_model)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent ``Orchestrator``. + +When the ``Model`` is started via ``Experiment.start``, the ML model will be loaded to the +launched standalone ``Orchestrator``. 
The ML model can then be executed on the ``Orchestrator`` +via a SmartRedis ``Client`` (:ref:`SmartRedis`) within the application code. + +.. _TS_doc: + +TorchScripts +============ +When configuring a ``Model``, users can instruct SmartSim to load TorchScripts +to the ``Orchestrator``. TorchScripts added are loaded into the ``Orchestrator`` prior to +the execution of the ``Model``. To load a TorchScript to the ``Orchestrator``, SmartSim users +can follow one of the processes: + +- :ref:`Define a TorchScript Function In-Memory` + Use the ``Model.add_function`` to instruct SmartSim to load an in-memory TorchScript to the ``Orchestrator``. +- :ref:`Define a TorchScript Function From File` + Provide file path to ``Model.add_script`` to instruct SmartSim to load the TorchScript from file to the ``Orchestrator``. +- :ref:`Define a TorchScript Function as String` + Provide function string to ``Model.add_script`` to instruct SmartSim to load a raw string as a TorchScript function to the ``Orchestrator``. + +.. note:: + SmartSim does **not** support loading in-memory TorchScript functions to colocated ``Orchestrator(s)``. + Users should instead load TorchScripts to a colocated ``Orchestrator`` from file or as a raw string. + +Continue or select a process link to learn more on how each function (``Model.add_script`` and ``Model.add_function``) +load TorchScripts to launched ``Orchestrator(s)``. + +.. _in_mem_TF_doc: + +------------------------------- +Attach an In-Memory TorchScript +------------------------------- +Users can define TorchScript functions within the Python driver script +to attach to a ``Model``. This feature is supported by ``Model.add_function`` which provides flexible +device selection, allowing users to choose between which device the TorchScript is executed on, `"GPU"` or `"CPU"`. +In environments with multiple devices, specific device numbers can be specified using the +`devices_per_node` function parameter. + +.. 
warning:: + ``Model.add_function`` does **not** support loading in-memory TorchScript functions to a colocated ``Orchestrator``. + If you would like to load a TorchScript function to a colocated ``Orchestrator``, define the function + as a :ref:`raw string` or :ref:`load from file`. + +The arguments that customize the execution of an in-memory TorchScript function can be found in the +:ref:`Model API` under the ``add_function`` docstring. + +Example: Load a In-Memory TorchScript Function +---------------------------------------------- +This example walks through the steps of instructing SmartSim to load an in-memory TorchScript function +to a standalone ``Orchestrator``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/in_mem_script.py + +.. note:: + The example assumes: + + - a standalone ``Orchestrator`` is launched prior to the ``Model`` execution + - an initialized ``Model`` named `smartsim_model` exists within the ``Experiment`` workflow + +**Define an In-Memory TF Function** + +To begin, define an in-memory TorchScript function within the ``Experiment`` driver script. +For the purpose of the example, we add a simple TorchScript function named `timestwo`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/in_mem_script.py + :language: python + :linenos: + :lines: 3-4 + +**Attach the In-Memory TorchScript Function to a SmartSim Model** + +We use the ``Model.add_function`` function to instruct SmartSim to load the TorchScript function `timestwo` +onto the launched standalone ``Orchestrator``. Specify the function `timestwo` to the `function` +parameter: + +.. 
literalinclude:: tutorials/doc_examples/model_doc_examples/in_mem_script.py + :language: python + :linenos: + :lines: 15-16 + +In the above ``smartsim_model.add_function`` code snippet, we input the following arguments: + +- `name` ("example_func"): A name to uniquely identify the TorchScript within the ``Orchestrator``. +- `function` (timestwo): Name of the TorchScript function defined in the Python driver script. +- `device` ("CPU"): Specifying the device for TorchScript execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. + +.. warning:: + Calling `exp.start(smartsim_model)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the TorchScript to a non-existent ``Orchestrator``. + +When the ``Model`` is started via ``Experiment.start``, the TF function will be loaded to the +standalone ``Orchestrator``. The function can then be executed on the ``Orchestrator`` via a SmartRedis +``Client`` (:ref:`SmartRedis`) within the application code. + +.. _TS_from_file: + +------------------------------ +Attach a TorchScript From File +------------------------------ +Users can attach TorchScript functions from a file to a ``Model`` and upload them to a +colocated or standalone ``Orchestrator``. This functionality is supported by the ``Model.add_script`` +function's `script_path` parameter. The function supports +flexible device selection, allowing users to choose between `"GPU"` or `"CPU"` via the `device` parameter. +In environments with multiple devices, specific device numbers can be specified using the +`devices_per_node` parameter. + +The arguments that customize the storage and execution of a TorchScript script can be found in the +:ref:`Model API` under the ``add_script`` docstring. 
+ +Example: Load a TorchScript From File +------------------------------------- +This example walks through the steps of instructing SmartSim to load a TorchScript from file +to a launched ``Orchestrator``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/from_file_script.py + +.. note:: + This example assumes: + + - a ``Orchestrator`` is launched prior to the ``Model`` execution (Colocated or standalone) + - an initialized ``Model`` named `smartsim_model` exists within the ``Experiment`` workflow + +**Define a TorchScript Script** + +For the example, we create the Python script `torchscript.py`. The file contains a +simple torch function shown below: + +.. code-block:: python + + def negate(x): + return torch.neg(x) + +**Attach the TorchScript Script to a SmartSim Model** + +Assuming an initialized ``Model`` named `smartsim_model` exists, we add the TorchScript script using +``Model.add_script`` by specifying the script path to the `script_path` parameter: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/from_file_script.py + :language: python + :linenos: + :lines: 13-14 + +In the above ``smartsim_model.add_script`` code snippet, we include the following arguments: + +- `name` ("example_script"): Reference name for the script inside of the ``Orchestrator``. +- `script_path` ("path/to/torchscript.py"): Path to the script file. +- `device` ("CPU"): device for script execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. + +.. warning:: + Calling `exp.start(smartsim_model)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the ML model to a non-existent ``Orchestrator``. 
+ +When `smartsim_model` is started via ``Experiment.start``, the TorchScript will be loaded from file to the +``Orchestrator`` that is launched prior to the start of `smartsim_model`. The function can then be executed +on the ``Orchestrator`` via a SmartRedis ``Client`` (:ref:`SmartRedis`) within the application code. + +.. _TS_raw_string: + +--------------------------------- +Define TorchScripts as Raw String +--------------------------------- +Users can upload TorchScript functions from string to colocated or +standalone ``Orchestrator(s)``. This feature is supported by the +``Model.add_script`` function's `script` parameter. The function supports +flexible device selection, allowing users to choose between `"GPU"` or `"CPU"` via the `device` parameter. +In environments with multiple devices, specific device numbers can be specified using the +`devices_per_node` parameter. + +The arguments that customize the storage and execution of a TorchScript script can be found in the +:ref:`Model API` under the ``add_script`` docstring. + +Example: Load a TorchScript From String +--------------------------------------- +This example walks through the steps of instructing SmartSim to load a TorchScript +from string to a ``Orchestrator``. The source code example is available in the dropdown below for +convenient execution and customization. + +.. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/string_script.py + +.. note:: + This example assumes: + + - a ``Orchestrator`` is launched prior to the ``Model`` execution (standalone or colocated) + - an initialized ``Model`` named `smartsim_model` exists within the ``Experiment`` workflow + +**Define a String TorchScript** + +Define the TorchScript code as a variable in the ``Experiment`` driver script: + +.. 
literalinclude:: tutorials/doc_examples/model_doc_examples/string_script.py + :language: python + :linenos: + :lines: 12-13 + +**Attach the TorchScript Function to a SmartSim Model** + +Assuming an initialized ``Model`` named `smartsim_model` exists, we add the TorchScript using +the ``Model.add_script`` function and specify the variable `torch_script_str` to the parameter +`script`: + +.. literalinclude:: tutorials/doc_examples/model_doc_examples/string_script.py + :language: python + :linenos: + :lines: 15-16 + +In the above ``smartsim_model.add_script`` code snippet, we offer the following arguments: + +- `name` ("example_script"): key to store script under. +- `script` (torch_script_str): TorchScript code. +- `device` ("CPU"): device for script execution. +- `devices_per_node` (2): Use two GPUs per node. +- `first_device` (0): Start with 0 index GPU. + +.. warning:: + Calling `exp.start(smartsim_model)` prior to instantiation of an ``Orchestrator`` will result in + a failed attempt to load the TorchScript to a non-existent ``Orchestrator``. + +When the ``Model`` is started via ``Experiment.start``, the TorchScript will be loaded to the +``Orchestrator`` that is launched prior to the start of the ``Model``. + +.. _model_key_collision: + +========================= +Data Collision Prevention +========================= +Overview +======== +If an ``Experiment`` consists of multiple ``Model(s)`` that use the same key names to reference +information in the ``Orchestrator``, the names used to reference data, ML models, and scripts will be +identical, and without the use of SmartSim and SmartRedis prefix methods, ``Model(s)`` +will end up inadvertently accessing or overwriting each other’s data. To prevent this +situation, the SmartSim ``Model`` object supports key prefixing, which prepends +the name of the ``Model`` to the keys sent to the ``Orchestrator`` to create unique key names. 
+With this enabled, collision is avoided and ``Model(s)`` can use the same key names within their applications. + +The key components of SmartSim ``Model`` prefixing functionality include: + +1. **Sending Data to the Orchestrator**: Users can send data to an ``Orchestrator`` + with the ``Model`` name prepended to the data name through SmartSim :ref:`Model functions` and + SmartRedis :ref:`Client functions`. +2. **Retrieving Data from the Orchestrator**: Users can instruct a ``Client`` to prepend a + ``Model`` name to a key during data retrieval, polling, or check for existence on the ``Orchestrator`` + through SmartRedis :ref:`Client functions`. + +For example, assume you have two ``Model(s)`` in an ``Experiment``, named `model_0` and `model_1`. In each +application code you use the function ``Client.put_tensor("tensor_0", data)`` to send a tensor named `"tensor_0"` +to the same ``Orchestrator``. With ``Model`` key prefixing turned on, the `model_0` and `model_1` +applications can access their respective tensor `"tensor_0"` by name without overwriting or accessing +the other ``Model(s)`` `"tensor_0"` tensor. In this scenario, the two tensors placed in the +``Orchestrator`` are `model_0.tensor_0` and `model_1.tensor_0`. + +Enabling and Disabling +====================== +SmartSim provides support for toggling prefixing on a ``Model`` for tensors, ``Datasets``, +lists, ML models, and scripts. Prefixing functions from the SmartSim :ref:`Model API` and SmartRedis :ref:`Client API` rely on +each other to fully support SmartSim key prefixing. For example, to use the ``Client`` prefixing +functions, a user must enable prefixing on the ``Model`` through ``Model.enable_key_prefixing``. +This function enables and activates prefixing for tensors, ``Datasets`` and lists placed in an ``Orchestrator`` +by the ``Model``. 
This configuration can be toggled within the ``Model`` application through +``Client`` functions, such as disabling tensor prefixing via ``Client.use_tensor_ensemble_prefix(False)``. + +The interaction between the prefix SmartSim `Model Functions` and SmartRedis +`Client Functions` is documented below. + +.. _model_prefix_func: + +--------------- +Model Functions +--------------- +A ``Model`` object supports two prefixing functions: ``Model.enable_key_prefixing`` and +``Model.register_incoming_entity``. + +To enable prefixing on a ``Model``, users must use the ``Model.enable_key_prefixing`` +function in the ``Experiment`` driver script. The key components of this function include: + +- Activates prefixing for tensors, ``Datasets``, and lists sent to an ``Orchestrator`` from within + the ``Model`` application. +- Enables access to prefixing ``Client`` functions within the ``Model`` application. This excludes + the ``Client.set_data_source`` function, where ``enable_key_prefixing`` is not required for access. + +.. note:: + ML model and script prefixing is not automatically enabled through ``Model.enable_key_prefixing`` + and rather must be enabled within the ``Model`` application using ``Client.use_model_ensemble_prefix``. + +Users can enable a SmartRedis ``Client`` to interact with prefixed data, ML models and TorchScripts +within a ``Model`` application by specifying the producer entity name to ``Client.set_data_source``. +However, for SmartSim to recognize the entity name within the application, the producer +entity must be registered on the consumer entity using ``Model.register_incoming_entity``. +This also applies to scenarios where the ``Model`` attempts to access data placed by itself. +For more information on ``Client.set_data_source``, visit the +:ref:`Client functions` section. + +.. 
_client_prefix_func: + +---------------- +Client Functions +---------------- +A ``Client`` object supports five prefixing functions: ``Client.use_tensor_ensemble_prefix``, +``Client.use_dataset_ensemble_prefix``, ``Client.use_list_ensemble_prefix``, +``Client.use_model_ensemble_prefix`` and ``Client.set_data_source``. + +To enable or disable SmartRedis data structure prefixing for tensors, ``Datasets``, aggregation lists, ML models +and scripts, SmartRedis ``Client`` offers functions per data structure: + +- Tensor: ``Client.use_tensor_ensemble_prefix`` +- ``Dataset``: ``Client.use_dataset_ensemble_prefix`` +- Aggregation lists: ``Client.use_list_ensemble_prefix`` +- ML models/scripts: ``Client.use_model_ensemble_prefix`` + +.. warning:: + To access the ``Client`` prefixing functions, prefixing must be enabled on the + ``Model`` through ``Model.enable_key_prefixing``. This function activates prefixing + for tensors, ``Datasets`` and lists. + +Examples are provided below that show the use of these ``Client`` methods in conjunction +with the SmartSim key prefixing ``Model`` API functions. + +Users can enable the SmartSim ``Client`` to interact with prefixed data, ML models and TorchScripts +using the ``Client.set_data_source`` function. To leverage this capability: + +1. Use ``Model.register_incoming_entity`` on the ``Model`` intending to interact with prefixed data in the ``Orchestrator`` + placed by a separate ``Model``. +2. Pass the SmartSim entity (e.g., another ``Model``) to ``Model.register_incoming_entity`` in order to + reference the ``Model`` prefix in the application code. +3. In the ``Model`` application, instruct the ``Client`` to prepend the specified ``Model`` name during key searches + using ``Client.set_data_source("model_name")``. + +Examples are provided below that show the use of these ``Client`` methods in conjunction +with the SmartSim key prefixing ``Model`` API functions. + +.. 
_put_set_prefix: + +Put/Set Operations +================== +In the following tabs we provide snippets of driver script and application code to demonstrate +activating and deactivating prefixing for tensors, ``Datasets``, lists, ML models and scripts using +SmartRedis put/get semantics. + +.. tabs:: + + .. group-tab:: Tensor + **Activate Tensor Prefixing in the Driver Script** + + To activate prefixing on a ``Model`` in the driver script, a user must use the function + ``Model.enable_key_prefixing``. This functionality ensures that the ``Model`` name + is prepended to each tensor name sent to the ``Orchestrator`` from within the ``Model`` + executable code. The source code example is available in the dropdown below for + convenient execution and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py + + In the driver script snippet below, we take an initialized ``Model`` and activate tensor + prefixing through the ``enable_key_prefixing`` function: + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py + :language: python + :linenos: + :lines: 6-12 + + In the `model` application, two tensors named `tensor_1` and `tensor_2` are sent to a launched ``Orchestrator``. + The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "model_name.tensor_1" + 2) "model_name.tensor_2" + + You will notice that the ``Model`` name `model_name` has been prepended to each tensor name + and stored in the ``Orchestrator``. + + **Activate Tensor Prefixing in the Application** + + Users can further configure tensor prefixing in the application by using + the ``Client`` function ``use_tensor_ensemble_prefix``. By specifying a boolean + value to the function, users can turn prefixing on and off. + + .. 
note:: + To have access to ``Client.use_tensor_ensemble_prefix``, prefixing must be enabled + on the ``Model`` in the driver script via ``Model.enable_key_prefixing``. + + In the application snippet below, we demonstrate enabling and disabling tensor prefixing: + + .. code-block:: python + + # Disable key prefixing + client.use_tensor_ensemble_prefix(False) + # Place a tensor in the Orchestrator + client.put_tensor("tensor_1", np.array([1, 2, 3, 4])) + # Enable key prefixing + client.use_tensor_ensemble_prefix(True) + # Place a tensor in the Orchestrator + client.put_tensor("tensor_2", np.array([5, 6, 7, 8])) + + In the application, two tensors named `tensor_1` and `tensor_2` are sent to a launched ``Orchestrator``. + The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "tensor_1" + 2) "model_name.tensor_2" + + You will notice that the ``Model`` name `model_name` is **not** prefixed to `tensor_1` since + we disabled tensor prefixing before sending the tensor to the ``Orchestrator``. However, + when we enabled tensor prefixing and sent the second tensor, the ``Model`` name was prefixed + to `tensor_2`. + + .. group-tab:: Dataset + **Activate Dataset Prefixing in the Driver Script** + + To activate prefixing on a ``Model`` in the driver script, a user must use the function + ``Model.enable_key_prefixing``. This functionality ensures that the ``Model`` name + is prepended to each ``Dataset`` name sent to the ``Orchestrator`` from within the ``Model``. + The source code example is available in the dropdown below for + convenient execution and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py + + In the driver script snippet below, we take an initialized ``Model`` and activate ``Dataset`` + prefixing through the ``enable_key_prefixing`` function: + + .. 
literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py + :language: python + :linenos: + :lines: 6-12 + + In the `model` application, two Datasets named `dataset_1` and `dataset_2` are sent to a launched ``Orchestrator``. + The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "model_name.{dataset_1}.dataset_tensor_1" + 2) "model_name.{dataset_1}.meta" + 3) "model_name.{dataset_2}.dataset_tensor_2" + 4) "model_name.{dataset_2}.meta" + + You will notice that the ``Model`` name `model_name` has been prefixed to each ``Dataset`` name + and stored in the ``Orchestrator``. + + **Activate Dataset Prefixing in the Application** + + Users can further configure ``Dataset`` prefixing in the application by using + the ``Client`` function ``use_dataset_ensemble_prefix``. By specifying a boolean + value to the function, users can turn prefixing on and off. + + .. note:: + To have access to ``Client.use_dataset_ensemble_prefix``, prefixing must be enabled + on the ``Model`` in the driver script via ``Model.enable_key_prefixing``. + + In the application snippet below, we demonstrate enabling and disabling ``Dataset`` prefixing: + + .. code-block:: python + + # Disable key prefixing + client.use_dataset_ensemble_prefix(False) + # Place a Dataset in the Orchestrator + client.put_dataset(dataset_1) + # Enable key prefixing + client.use_dataset_ensemble_prefix(True) + # Place a Dataset in the Orchestrator + client.put_dataset(dataset_2) + + In the application, we have two ``Datasets`` named `dataset_1` and `dataset_2`. + We then send them to a launched ``Orchestrator``. The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. 
code-block:: bash + + 1) "{dataset_1}.dataset_tensor_1" + 2) "{dataset_1}.meta" + 3) "model_name.{dataset_2}.dataset_tensor_1" + 4) "model_name.{dataset_2}.meta" + + You will notice that the ``Model`` name `model_name` is **not** prefixed to `dataset_1` since + we disabled ``Dataset`` prefixing before sending the ``Dataset`` to the ``Orchestrator``. However, + when we enabled ``Dataset`` prefixing and sent the second ``Dataset``, the ``Model`` name was prefixed + to `dataset_2`. + + .. group-tab:: Aggregation List + **Activate Aggregation List Prefixing in the Driver Script** + + To activate prefixing on a ``Model`` in the driver script, a user must use the function + ``Model.enable_key_prefixing``. This functionality ensures that the ``Model`` name + is prepended to each list name sent to the ``Orchestrator`` from within the ``Model``. + The source code example is available in the dropdown below for + convenient execution and customization. + + .. dropdown:: Example Driver Script Source Code + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py + + In the driver script snippet below, we take an initialized ``Model`` and activate list + prefixing through the ``enable_key_prefixing`` function: + + .. literalinclude:: tutorials/doc_examples/model_doc_examples/prefix_data.py + :language: python + :linenos: + :lines: 6-12 + + In the `model` application, a list named `dataset_list` is sent to a launched ``Orchestrator``. + The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "model_name.dataset_list" + + You will notice that the ``Model`` name `model_name` has been prefixed to the list name + and stored in the ``Orchestrator``. + + **Activate Aggregation List Prefixing in the Application** + + Users can further configure list prefixing in the application by using + the ``Client`` function ``use_list_ensemble_prefix``. 
By specifying a boolean + value to the function, users can turn prefixing on and off. + + .. note:: + To have access to ``Client.use_list_ensemble_prefix``, prefixing must be enabled + on the ``Model`` in the driver script via ``Model.enable_key_prefixing``. + + In the application snippet below, we demonstrate enabling and disabling list prefixing: + + .. code-block:: python + + # Disable key prefixing + client.use_list_ensemble_prefix(False) + # Place a Dataset in the Orchestrator + client.put_dataset(dataset_1) + # Place a list in the Orchestrator + client.append_to_list("list_1", dataset_1) + # Enable key prefixing + client.use_list_ensemble_prefix(True) + # Place a Dataset in the Orchestrator + client.put_dataset(dataset_2) + # Append Dataset to list in the Orchestrator + client.append_to_list("list_2", dataset_2) + + In the application, two lists named `list_1` and `list_2` are sent to the ``Orchestrator``. + The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "list_1" + 2) "model_name.{dataset_1}.meta" + 3) "model_name.{dataset_1}.dataset_tensor_1" + 4) "model_name.list_2" + 5) "model_name.{dataset_2}.meta" + 6) "model_name.{dataset_2}.dataset_tensor_2" + + You will notice that the ``Model`` name `model_name` is **not** prefixed to `list_1` since + we disabled list prefixing before sending the list to the ``Orchestrator``. However, + when we enabled list prefixing and sent the second list, the ``Model`` name was prefixed + to `list_2` as well as the list ``Dataset`` members. + + .. note:: + The ``Datasets`` sent to the ``Orchestrator`` are all prefixed. This is because + ``Model.enable_key_prefixing`` turns on prefixing for tensors, ``Datasets`` and lists. + + .. group-tab:: ML Model + **Activate ML Model Prefixing in the Application** + + Users can configure ML model prefixing in the application by using + the ``Client`` function ``use_model_ensemble_prefix``. 
By specifying a boolean + value to the function, users can turn prefixing on and off. + + .. note:: + To have access to ``Client.use_model_ensemble_prefix``, prefixing must be enabled + on the ``Model`` in the driver script via ``Model.enable_key_prefixing``. + + In the application snippet below, we demonstrate enabling and disabling ML model prefixing: + + .. code-block:: python + + # Disable ML model prefixing + client.use_model_ensemble_prefix(False) + # Send ML model to the Orchestrator + client.set_model( + "ml_model_1", serialized_model_1, "TF", device="CPU", inputs=inputs, outputs=outputs + ) + # Enable ML model prefixing + client.use_model_ensemble_prefix(True) + # Send prefixed ML model to the Orchestrator + client.set_model( + "ml_model_2", serialized_model_2, "TF", device="CPU", inputs=inputs, outputs=outputs + ) + + In the application, two ML models named `ml_model_1` and `ml_model_2` are sent + to a launched ``Orchestrator``. The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "ml_model_1" + 2) "model_name.ml_model_2" + + You will notice that the ``Model`` name `model_name` is **not** prefixed to `ml_model_1` since + we disabled ML model prefixing before sending the ML model to the ``Orchestrator``. However, + when we enabled ML model prefixing and sent the second ML model, the ``Model`` name was prefixed + to `ml_model_2`. + + .. group-tab:: Script + **Activate Script Prefixing in the Application** + + Users can configure script prefixing in the application by using + the ``Client`` function ``use_model_ensemble_prefix``. By specifying a boolean + value to the function, users can turn prefixing on and off. + + .. note:: + To have access to ``Client.use_model_ensemble_prefix``, prefixing must be enabled + on the ``Model`` in the driver script via ``Model.enable_key_prefixing``. + + In the application snippet below, we demonstrate enabling and disabling script prefixing: + + .. 
code-block:: python + + # Disable script prefixing + client.use_model_ensemble_prefix(False) + # Store a script in the Orchestrator + client.set_function("script_1", script_1) + # Enable script prefixing + client.use_model_ensemble_prefix(True) + # Store a prefixed script in the Orchestrator + client.set_function("script_2", script_2) + + In the application, two ML models named `script_1` and `script_2` are sent + to a launched ``Orchestrator``. The contents of the ``Orchestrator`` after ``Model`` completion are: + + .. code-block:: bash + + 1) "script_1" + 2) "model_name.script_2" + + You will notice that the ``Model`` name `model_name` is **not** prefixed to `script_1` since + we disabled script prefixing before sending the script to the ``Orchestrator``. However, + when we enabled script prefixing and sent the second script, the ``Model`` name was prefixed + to `script_2`. + +.. _get_prefix: + +Get Operations +============== +In the following sections, we walk through snippets of application code to demonstrate the retrieval +of prefixed tensors, ``Datasets``, lists, ML models, and scripts using SmartRedis put/get +semantics. The examples demonstrate retrieval within the same application where the data +structures were placed, as well as scenarios where data structures are placed by separate +applications. + +.. tabs:: + + .. group-tab:: Tensor + **Retrieve a Tensor Placed by the Same Application** + + SmartSim supports retrieving prefixed tensors sent to the ``Orchestrator`` from within the + same application where the tensor was placed. To achieve this, users must + provide the ``Model`` name that stored the tensor to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key searches. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name + in the driver script. 
+ + As an example, we placed a prefixed tensor on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.tensor_name" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate retrieving the tensor: + + .. code-block:: python + + # Set the name to prepend to key searches + client.set_data_source("model_1") + # Retrieve the prefixed tensor + tensor_data = client.get_tensor("tensor_name") + # Log the tensor data + client.log_data(LLInfo, f"The tensor value is: {tensor_data}") + + In the `model.out` file, the ``Client`` will log the message:: + Default@00-00-00:The tensor value is: [1 2 3 4] + + **Retrieve a Tensor Placed by an External Application** + + SmartSim supports retrieving prefixed tensors sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the tensor + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data in the + driver script. + + In the example, a ``Model`` named `model_1` has placed a tensor in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.tensor_name" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. 
note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + Here we retrieve the stored tensor named `tensor_name`: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Retrieve the prefixed tensor + tensor_data = client.get_tensor("tensor_name") + # Log the tensor data + client.log_data(LLInfo, f"The tensor value is: {tensor_data}") + + In the `model.out` file, the ``Client`` will log the message:: + Default@00-00-00:The tensor value is: [1 2 3 4] + + .. group-tab:: Dataset + **Retrieve a Dataset Placed by the Same Application** + + SmartSim supports retrieving prefixed ``Datasets`` sent to the ``Orchestrator`` from within the + same application where the ``Dataset`` was placed. To achieve this, users must + provide the ``Model`` name that stored the ``Dataset`` to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key searches. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed ``Dataset`` on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.{dataset_name}.dataset_tensor" + 2) "model_1.{dataset_name}.meta" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. 
By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate retrieving the ``Dataset``: + + .. code-block:: python + + # Set the name to prepend to key searches + client.set_data_source("model_1") + # Retrieve the prefixed Dataset + dataset_data = client.get_dataset("dataset_name") + # Log the Dataset data + client.log_data(LLInfo, f"The Dataset value is: {dataset_data}") + + In the `model.out` file, the ``Client`` will log the message: + + .. code-block:: bash + + Default@00-00-00:Default@00-00-00:The dataset value is: + + DataSet (dataset_name): + Tensors: + dataset_tensor: + type: 16 bit unsigned integer + dimensions: [4] + elements: 4 + Metadata: + none + + **Retrieve a Dataset Placed by an External Application** + + SmartSim supports retrieving prefixed ``Datasets`` sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the ``Dataset`` + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a ``Dataset`` in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.{dataset_name}.dataset_tensor" + 2) "model_1.{dataset_name}.meta" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. 
note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + Here we retrieve the stored ``Dataset`` named `dataset_name`: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Retrieve the prefixed Dataset + dataset_data = client.get_dataset("dataset_name") + # Log the Dataset data + client.log_data(LLInfo, f"The Dataset value is: {dataset_data}") + + In the `model.out` file, the ``Client`` will log the message: + + .. code-block:: bash + + Default@00-00-00:Default@00-00-00:The Dataset value is: + + DataSet (dataset_name): + Tensors: + dataset_tensor: + type: 16 bit unsigned integer + dimensions: [4] + elements: 4 + Metadata: + none + + .. group-tab:: Aggregation List + **Retrieve a Aggregation List Placed by the Same Application** + + SmartSim supports retrieving prefixed lists sent to the ``Orchestrator`` from within the + same application where the list was placed. To achieve this, users must + provide the ``Model`` name that stored the list to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key searches. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed list on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.dataset_list" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. 
By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate checking the length of the list: + + .. code-block:: python + + # Set the name to prepend to key searches + client.set_data_source("model_1") + # Retrieve the prefixed list + list_data = client.get_datasets_from_list("dataset_list") + # Log the list data + client.log_data(LLInfo, f"The length of the list is: {len(list_data)}") + + In the `model.out` file, the ``Client`` will log the message:: + The length of the list is: 1 + + **Retrieve a Aggregation List Placed by an External Application** + + SmartSim supports retrieving prefixed lists sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the list + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a list in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_name.dataset_list" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. 
+ + Here we check the length of the list named `dataset_list`: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Retrieve the prefixed list + list_data = client.get_datasets_from_list("dataset_list") + # Log the list data + client.log_data(LLInfo, f"The length of the list is: {len(list_data)}") + + In the `model.out` file, the ``Client`` will log the message:: + The length of the list is: 1 + + .. group-tab:: ML Model + **Retrieve a ML Model Placed by the Same Application** + + SmartSim supports retrieving prefixed ML models sent to the ``Orchestrator`` from within the + same application where the ML model was placed. To achieve this, users must + provide the ``Model`` name that stored the ML model to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key searches. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed ML model on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.mnist_cnn" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate retrieving the ML model: + + .. 
code-block:: python + + # Set the name to prepend to key searches + client.set_data_source("model_1") + # Retrieve the prefixed ML model + model_data = client.get_model("mnist_cnn") + + **Retrieve a ML Model Placed by an External Application** + + SmartSim supports retrieving prefixed ML model sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the ML model + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a ML model in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.mnist_cnn" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + Here we retrieve the stored ML model named `mnist_cnn`: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Retrieve the prefixed model + model_data = client.get_model("mnist_cnn") + + .. group-tab:: Script + **Retrieve a Script Placed by the Same Application** + + SmartSim supports retrieving prefixed scripts sent to the ``Orchestrator`` from within the + same application where the script was placed. 
To achieve this, users must + provide the ``Model`` name that stored the script to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key searches. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed script on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.normalizer" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate retrieving the script: + + .. code-block:: python + + # Set the name to prepend to key searches + client.set_data_source("model_1") + # Retrieve the prefixed script + script_data = client.get_script("normalizer") + # Log the script data + client.log_data(LLInfo, f"The script data is: {script_data}") + + In the `model.out` file, the ``Client`` will log the message: + + .. code-block:: bash + + The script data is: def normalize(X): + """Simple function to normalize a tensor""" + mean = X.mean + std = X.std + + return (X-mean)/std + + **Retrieve a Script Placed by an External Application** + + SmartSim supports retrieving prefixed scripts sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the script + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. 
For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a script in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.normalizer" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + Here we retrieve the stored script named `normalizer`: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Retrieve the prefixed script + script_data = client.get_script("model_1.normalizer") + # Log the script data + client.log_data(LLInfo, f"The script data is: {script_data}") + + In the `model.out` file, the ``Client`` will log the message: + + .. code-block:: bash + + The script data is: def normalize(X): + """Simple function to normalize a tensor""" + mean = X.mean + std = X.std + + return (X-mean)/std + +.. _run_prefix: + +Run Operations +============== +In the following sections, we walk through snippets of application code to demonstrate executing +prefixed ML models and scripts using SmartRedis run semantics. The examples demonstrate +executing within the same application where the ML Model and Script were placed, as well as scenarios +where ML Model and Script are placed by separate applications. + +.. tabs:: + + .. 
group-tab:: ML Model + **Access ML Models From within the Application** + + SmartSim supports executing prefixed ML models with prefixed tensors sent to the ``Orchestrator`` from within + the same application that the ML model was placed. To achieve this, users must + provide the ``Model`` name that stored the ML model and input tensors to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed ML model and tensor on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.mnist_cnn" + 2) "model_1.mnist_images" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate running the ML model: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Run the ML model + client.run_model(name="mnist_cnn", inputs=["mnist_images"], outputs=["Identity"]) + + The ``Orchestrator`` now contains prefixed output tensors: + + .. code-block:: bash + + 1) "model_1.Identity" + 2) "model_1.mnist_cnn" + 3) "model_1.mnist_images" + + .. note:: + The output tensors are prefixed because we executed ``model_1.enable_key_prefixing`` + in the driver script which enables and activates prefixing for tensors, ``Datasets`` + and lists. 
+ + **Access ML Models Loaded From an External Application** + + SmartSim supports executing prefixed ML models with prefixed tensors sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the ML model and tensor + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a ML model and tensor in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.mnist_cnn" + 2) "model_1.mnist_images" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate running the ML model: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Run the ML model + client.run_model(name="mnist_cnn", inputs=["mnist_images"], outputs=["Identity"]) + + The ``Orchestrator`` now contains prefixed output tensors: + + .. code-block:: bash + + 1) "model_2.Identity" + 2) "model_1.mnist_cnn" + 3) "model_1.mnist_images" + + .. 
note:: + The output tensors are prefixed because we executed ``model_2.enable_key_prefixing`` + in the driver script which enables and activates prefixing for tensors, ``Datasets`` + and lists. + + .. group-tab:: Script + + **Access Scripts From within the Application** + + SmartSim supports executing prefixed scripts with prefixed tensors sent to the ``Orchestrator`` from within + the same application that the script was placed. To achieve this, users must + provide the ``Model`` name that stored the script and input tensors to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed script and tensor on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.normalizer" + 2) "model_1.X_rand" + + To run the script, the prefixed script name `"model_name.normalizer"` and prefixed + input tensors `"model_name.X_rand"` must be provided, as demonstrated below: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Run the script + client.run_script("normalizer", "normalize", inputs=["X_rand"], outputs=["X_norm"]) + + The ``Orchestrator`` now contains prefixed output tensors: + + .. code-block:: bash + + 1) "model_1.normalizer" + 2) "model_1.X_rand" + 3) "model_1.X_norm" + + .. note:: + The output tensors are prefixed because we executed ``model_1.enable_key_prefixing`` + in the driver script which enables and activates prefixing for tensors, ``Datasets`` + and lists. + + **Access Scripts Loaded From an External Application** + + SmartSim supports executing prefixed scripts with prefixed tensors sent to the ``Orchestrator`` by separate + ``Model(s)``. 
To achieve this, users need to provide the ``Model`` name that stored the script and tensor + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a script and tensor in a standalone + ``Orchestrator`` with prefixing enabled on the ``Model``. The contents of the ``Orchestrator`` + are as follows: + + .. code-block:: bash + + 1) "model_1.normalizer" + 2) "model_1.X_rand" + + We create a separate ``Model``, named `model_2`, with the executable application code below. + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for use in ``Client.set_data_source``. + + In the application snippet below, we demonstrate running the script: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Run the script + client.run_script("normalizer", "normalize", inputs=["X_rand"], outputs=["X_norm"]) + + The ``Orchestrator`` now contains prefixed output tensors: + + .. code-block:: bash + + 1) "model_1.normalizer" + 2) "model_1.X_rand" + 3) "model_2.X_norm" + + .. note:: + The output tensors are prefixed because we executed ``model_2.enable_key_prefixing`` + in the driver script which enables and activates prefixing for tensors, ``Datasets`` + and lists. + +.. 
_copy_rename_del_prefix: + +Copy/Rename/Delete Operations +============================= +In the following sections, we walk through snippets of application code to demonstrate the copy, rename and delete +operations on prefixed tensors, ``Datasets``, lists, ML models, and scripts. The examples +demonstrate these operations within the same script where the data +structures were placed, as well as scenarios where data structures are placed by separate +scripts. + +.. tabs:: + + .. group-tab:: Tensor + **Copy/Rename/Delete Operations on Tensors in The Same Application** + + SmartSim supports copy/rename/delete operations on prefixed tensors sent to the ``Orchestrator`` from within + the same application that the tensor was placed. To achieve this, users must + provide the ``Model`` name that stored the tensor to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed tensor on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.tensor" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + To rename the tensor in the ``Orchestrator``, we provide self ``Model`` name + to ``Client.set_data_source`` then execute the function ``rename_tensor``: + + .. 
code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Rename the tensor + client.rename_tensor("tensor", "renamed_tensor") + + Because prefixing is enabled on the ``Model`` via ``enable_key_prefixing`` in the driver script, + SmartSim will keep the prefix on the tensor but replace the tensor name as shown in the ``Orchestrator``: + + .. code-block:: bash + + 1) "model_1.renamed_tensor" + + Next, we copy the prefixed tensor to a new destination: + + .. code-block:: python + + client.copy_tensor("renamed_tensor", "copied_tensor") + + Since tensor prefixing is enabled on the ``Client``, the `copied_tensor` is prefixed: + + .. code-block:: bash + + 1) "model_1.renamed_tensor" + 2) "model_1.copied_tensor" + + Next, delete `renamed_tensor`: + + .. code-block:: python + + client.delete_tensor("renamed_tensor") + + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_1.copied_tensor" + + **Copy/Rename/Delete Operations on Tensors Placed by an External Application** + + SmartSim supports copy/rename/delete operations on prefixed tensors sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the tensor + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a tensor in a standalone ``Orchestrator`` with prefixing enabled + on the ``Client``. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.tensor" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. 
By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + From within a separate ``Model`` named `model_2`, we perform basic copy/rename/delete operations. + To instruct the ``Client`` to prepend a ``Model`` name to all key searches, use the + ``Client.set_data_source`` function. Specify the ``Model`` name `model_1` + that placed the tensor in the ``Orchestrator``: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + + To rename the tensor in the ``Orchestrator``, we provide the tensor name: + + .. code-block:: python + + client.rename_tensor("tensor", "renamed_tensor") + + SmartSim will replace the prefix with the current ``Model`` name since prefixing is enabled + on the current ``Model``. The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_2.renamed_tensor" + + .. note:: + In the driver script, we also register `model_2` as an entity on itself via ``model_2.register_incoming_entity(model_2)``. + This way we can use ``Client.set_data_source`` to interact with prefixed data placed by `model_2`. + + Next, we copy the prefixed tensor to a new destination: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_2") + # Copy the tensor data + client.copy_tensor("renamed_tensor", "copied_tensor") + + The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_2.renamed_tensor" + 2) "model_2.copied_tensor" + + Next, delete `copied_tensor` by specifying the name: + + .. code-block:: python + + client.delete_tensor("copied_tensor") + + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_2.renamed_tensor" + + .. 
group-tab:: Dataset + **Copy/Rename/Delete Operations on a Dataset in The Same Application** + + SmartSim supports copy/rename/delete operations on prefixed ``Datasets`` sent to the ``Orchestrator`` from within + the same application that the ``Dataset`` was placed. To achieve this, users must + provide the ``Model`` name that stored the ``Dataset`` to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed ``Dataset`` on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.{dataset}.dataset_tensor" + 2) "model_1.{dataset}.meta" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + To rename the ``Dataset`` in the ``Orchestrator``, we provide self ``Model`` name + to ``Client.set_data_source`` then execute the function ``rename_dataset``: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Rename the Dataset + client.rename_dataset("dataset", "renamed_dataset") + + Because prefixing is enabled on the ``Model`` via ``enable_key_prefixing`` in the driver script, + SmartSim will keep the prefix on the ``Dataset`` but replace the ``Dataset`` name as shown in the ``Orchestrator``: + + .. code-block:: bash + + 1) "model_1.{renamed_dataset}.dataset_tensor" + 2) "model_1.{renamed_dataset}.meta" + + Next, we copy the prefixed ``Dataset`` to a new destination: + + ..
code-block:: python + + client.copy_dataset("renamed_dataset", "copied_dataset") + + Since ``Dataset`` prefixing is enabled on the ``Client``, the `copied_dataset` is prefixed: + + .. code-block:: bash + + 1) "model_1.{renamed_dataset}.dataset_tensor" + 2) "model_1.{renamed_dataset}.meta" + 3) "model_1.{copied_dataset}.dataset_tensor" + 4) "model_1.{copied_dataset}.meta" + + Next, delete `copied_dataset`: + + .. code-block:: python + + client.delete_dataset("copied_dataset") + + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_1.{renamed_dataset}.dataset_tensor" + 2) "model_1.{renamed_dataset}.meta" + + **Copy/Rename/Delete Operations on Datasets Placed by an External Application** + + SmartSim supports copy/rename/delete operations on prefixed ``Datasets`` sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the ``Dataset`` + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a ``Dataset`` in a standalone ``Orchestrator`` with prefixing enabled + on the ``Client``. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.{dataset}.dataset_tensor" + 2) "model_1.{dataset}.meta" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``.
+ + From within a separate ``Model`` named `model_2`, we perform basic copy/rename/delete operations. + To instruct the ``Client`` to prepend a ``Model`` name to all key searches, use the + ``Client.set_data_source`` function. Specify the ``Model`` name `model_1` + that placed the ``Dataset`` in the ``Orchestrator``: + + .. code-block:: python + + client.set_data_source("model_1") + + To rename the ``Dataset`` in the ``Orchestrator``, we provide the ``Dataset`` `name`: + + .. code-block:: python + + client.rename_dataset("dataset", "renamed_dataset") + + SmartSim will replace the prefix with the current ``Model`` name since prefixing is enabled + on the current ``Model`` via ``Model.enable_key_prefixing`` in the driver script. + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_2.{renamed_dataset}.dataset_tensor" + 2) "model_2.{renamed_dataset}.meta" + + .. note:: + In the driver script, we also register `model_2` as an entity on itself via ``model_2.register_incoming_entity(model_2)``. + This way we can use ``Client.set_data_source`` to interact with prefixed data placed by `model_2`. + + Next, we copy the prefixed ``Dataset`` to a new destination: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_2") + # Copy the Dataset + client.copy_dataset("renamed_dataset", "copied_dataset") + + The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_2.{renamed_dataset}.dataset_tensor" + 2) "model_2.{renamed_dataset}.meta" + 3) "model_2.{copied_dataset}.dataset_tensor" + 4) "model_2.{copied_dataset}.meta" + + Next, delete `copied_dataset` by specifying the name: + + .. code-block:: python + + client.delete_dataset("copied_dataset") + + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_2.{renamed_dataset}.dataset_tensor" + 2) "model_2.{renamed_dataset}.meta" + + ..
group-tab:: Aggregation List + **Copy/Rename/Delete Operations on an Aggregation List in The Same Application** + + SmartSim supports copy/rename/delete operations on prefixed lists sent to the ``Orchestrator`` from within + the same application that the list was placed. To achieve this, users must + provide the ``Model`` name that stored the list to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed list on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.list_of_datasets" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + To rename the list in the ``Orchestrator``, we provide self ``Model`` name + to ``Client.set_data_source`` then execute the function ``rename_list``: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Rename the list + client.rename_list("list_of_datasets", "renamed_list") + + Because prefixing is enabled on the ``Model`` via ``enable_key_prefixing`` in the driver script, + SmartSim will keep the prefix on the list but replace the list name as shown in the ``Orchestrator``: + + .. code-block:: bash + + 1) "model_1.renamed_list" + + Next, we copy the prefixed list to a new destination: + + ..
code-block:: python + + client.copy_list("renamed_list", "copied_list") + + Since list prefixing is enabled on the ``Client``, the `copied_list` is prefixed: + + .. code-block:: bash + + 1) "model_1.renamed_list" + 2) "model_1.copied_list" + + Next, delete `copied_list`: + + .. code-block:: python + + client.delete_list("copied_list") + + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_1.renamed_list" + + **Copy/Rename/Delete Operations on Aggregation Lists Placed by an External Application** + + SmartSim supports copy/rename/delete operations on prefixed lists sent to the ``Orchestrator`` by separate + ``Model(s)``. To achieve this, users need to provide the ``Model`` name that stored the list + to ``Client.set_data_source``. This action instructs the ``Client`` to prepend the ``Model`` + name to all key searches. For SmartSim to recognize the ``Model`` name as a data source, + users must execute the ``Model.register_incoming_entity`` function on the ``Model`` + responsible for the search and pass the ``Model`` instance that stored the data. + + In the example, a ``Model`` named `model_1` has placed a list in a standalone ``Orchestrator`` with prefixing enabled + on the ``Client``. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.list_of_datasets" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_2`, + we execute ``model_2.register_incoming_entity(model_1)``. By passing the producer ``Model`` + instance to the consumer ``Model``, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + From within a separate ``Model`` named `model_2`, we perform basic copy/rename/delete operations. + To instruct the ``Client`` to prepend a ``Model`` name to all key searches, use the + ``Client.set_data_source`` function. 
Specify the ``Model`` name `model_1` + that placed the list in the ``Orchestrator``: + + .. code-block:: python + + client.set_data_source("model_1") + + To rename the list in the ``Orchestrator``, we provide the list name: + + .. code-block:: python + + client.rename_list("list_of_datasets", "renamed_list") + + SmartSim will replace the prefix with the current ``Model`` name since prefixing is enabled + on the current ``Model``. The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_2.renamed_list" + + .. note:: + In the driver script, we also register `model_2` as an entity on itself via ``model_2.register_incoming_entity(model_2)``. + This way we can use ``Client.set_data_source`` to interact with prefixed data placed by `model_2`. + + Next, we copy the prefixed list to a new destination: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_2") + # Copy the list data + client.copy_list("renamed_list", "copied_list") + + The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_2.renamed_list" + 2) "model_2.copied_list" + + Next, delete `copied_list` by specifying the name: + + .. code-block:: python + + client.delete_list("copied_list") + + The contents of the ``Orchestrator`` are: + + .. code-block:: bash + + 1) "model_2.renamed_list" + + .. group-tab:: ML Model + **Delete ML Models From within the Application** + + SmartSim supports delete operations on prefixed ML models sent to the ``Orchestrator`` from within + the same application that the ML model was placed. To achieve this, users must + provide the ``Model`` name that stored the ML model to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name.
+ + As an example, we placed a prefixed ML model on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + .. code-block:: bash + + 1) "model_1.ml_model" + + To delete the ML model in the ``Orchestrator``, we provide self ``Model`` name + to ``Client.set_data_source`` then execute the function ``delete_model``: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Delete the ML model + client.delete_model("ml_model") + + **Delete an ML Model Placed by an External Application** + + SmartSim supports delete operations on prefixed ML models sent to the ``Orchestrator`` by separate ``Model(s)``. + To do so, users must provide the ``Model`` name that stored the ML model to ``Client.set_data_source``. + This will instruct the ``Client`` to prepend the ``Model`` name input to all key searches. + + In the example, a ``Model`` named `model_1` has placed an ML model in a standalone ``Orchestrator`` with prefixing enabled + on the ``Client``. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.ml_model" + + From within a separate ``Model`` named `model_2`, we perform a basic delete operation. + To instruct the ``Client`` to prepend a ``Model`` name to all key searches, use the + ``Client.set_data_source`` function. Specify the ``Model`` name `model_1` + that placed the ML model in the ``Orchestrator``: + + .. code-block:: python + + client.set_data_source("model_1") + + To delete the ML model in the ``Orchestrator``, we provide the ML model name: + + .. code-block:: python + + client.delete_model("ml_model") + + ..
group-tab:: Script + + **Delete Scripts From within the Application** + + SmartSim supports delete operations on prefixed scripts sent to the ``Orchestrator`` from within + the same application that the script was placed. To achieve this, users must + provide the ``Model`` name that stored the script to ``Client.set_data_source``. This action + instructs the ``Client`` to prepend the ``Model`` name to all key names. For SmartSim to + recognize the ``Model`` name as a data source, users must execute the + ``Model.register_incoming_entity`` function on the ``Model`` and pass the self ``Model`` name. + + As an example, we placed a prefixed script on the ``Orchestrator`` within a ``Model`` named + `model_1`. The ``Orchestrator`` contents are: + + .. code-block:: bash + + 1) "model_1.script" + + .. note:: + In the driver script, after initializing the ``Model`` instance named `model_1`, + we execute ``model_1.register_incoming_entity(model_1)``. By passing the ``Model`` + instance to itself, we instruct SmartSim to recognize the name of `model_1` as a valid data + source for subsequent use in ``Client.set_data_source``. + + To delete the script in the ``Orchestrator``, we provide the script name: + + .. code-block:: python + + # Set the Model source name + client.set_data_source("model_1") + # Delete the script + client.delete_script("script") + + **Delete a Script Placed by an External Application** + + SmartSim supports delete operations on prefixed scripts sent to the ``Orchestrator`` by separate ``Model(s)``. + To do so, users must provide the ``Model`` name that stored the script to ``Client.set_data_source``. + This will instruct the ``Client`` to prepend the ``Model`` name input to all key searches. + + In the example, a ``Model`` named `model_1` has placed a script in a standalone ``Orchestrator`` with prefixing enabled + on the ``Client``. The ``Orchestrator`` contents are: + + ..
code-block:: bash + + 1) "model_1.script" + + From within a separate ``Model`` named `model_2`, we perform a basic delete operation. + To instruct the ``Client`` to prepend a ``Model`` name to all key searches, use the + ``Client.set_data_source`` function. Specify the ``Model`` name `model_1` + that placed the script in the ``Orchestrator``: + + .. code-block:: python + + client.set_data_source("model_1") + + To delete the script in the ``Orchestrator``, we provide the script name: + + .. code-block:: python + + client.delete_script("script") \ No newline at end of file diff --git a/doc/orchestrator.rst b/doc/orchestrator.rst index 456d9a814..6ccc7c1e1 100644 --- a/doc/orchestrator.rst +++ b/doc/orchestrator.rst @@ -1,208 +1,688 @@ +.. _orch_docs: + ************ Orchestrator ************ +======== +Overview +======== +The ``Orchestrator`` is an in-memory database with features built for +AI-enabled workflows including online training, low-latency inference, cross-application data +exchange, online interactive visualization, online data analysis, computational steering, and more. + +An ``Orchestrator`` can be thought of as a general feature store +capable of storing numerical data (tensors and ``Datasets``), AI models (TF, TF-lite, PyTorch, or ONNX), +and scripts (TorchScripts). In addition to storing data, the ``Orchestrator`` is capable of +executing AI models and TorchScripts on the stored data using CPUs or GPUs. + +.. figure:: images/smartsim-arch.png + + Sample ``Experiment`` showing a user application leveraging + machine learning infrastructure launched by SmartSim and connected + to an online analysis and visualization simulation via the ``Orchestrator``. + +Users can establish a connection to the ``Orchestrator`` from within ``Model`` executable code, ``Ensemble`` +member executable code, or ``Experiment`` driver scripts by using the +:ref:`SmartRedis` ``Client`` library.
+ +SmartSim offers **two** types of ``Orchestrator`` deployments: + +- :ref:`Standalone Deployment` + A standalone ``Orchestrator`` is ideal for systems that have heterogeneous node types + (i.e. a mix of CPU-only and GPU-enabled compute nodes) where + ML model and TorchScript evaluation is more efficiently performed off-node. This + deployment is also ideal for workflows relying on data exchange between multiple + applications (e.g. online analysis, visualization, computational steering, or + producer/consumer application couplings). Standalone deployment is also optimal for + high data throughput scenarios where ``Orchestrators`` require large amounts of compute resources. + +- :ref:`Colocated Deployment` + A colocated ``Orchestrator`` is ideal when the data and hardware accelerator are located on the same compute node. + This setup helps reduce latency in ML inference and TorchScript evaluation by eliminating off-node communication. +.. warning:: + Colocated ``Orchestrators`` cannot share data across compute nodes. + Communication is only supported between a ``Model`` and colocated ``Orchestrator`` pair. + +SmartSim allows users to launch :ref:`multiple Orchestrators` of either type during +the course of an ``Experiment``. If a workflow requires a multiple ``Orchestrator`` environment, a +`db_identifier` argument must be specified during ``Orchestrator`` initialization. Users can connect to +``Orchestrators`` in a multiple ``Orchestrator`` workflow by specifying the respective `db_identifier` argument +within a :ref:`ConfigOptions` object that is passed into the SmartRedis ``Client`` constructor. + +.. _standalone_orch_doc: + +===================== +Standalone Deployment +===================== +-------- +Overview +-------- +During standalone ``Orchestrator`` deployment, a SmartSim ``Orchestrator`` (the database) runs on separate +compute node(s) from the SmartSim ``Model`` node(s). 
A standalone ``Orchestrator`` can be deployed on a single +node (single-sharded) or distributed (sharded) over multiple nodes. With a multi-node ``Orchestrator``, users can +scale the number of database nodes for inference and script evaluation, enabling +increased in-memory capacity for data storage in large-scale workflows. Single-node +``Orchestrators`` are effective for small-scale workflows and offer lower latency for ``Client`` API calls +that involve data appending or processing (e.g. ``Client.append_to_list``, ``Client.run_model``, etc). + +When connecting to a standalone ``Orchestrator`` from within a ``Model`` application, the user has +several options to connect a SmartRedis ``Client``: + +- In an ``Experiment`` with a single deployed ``Orchestrator``, users can rely on SmartRedis + to detect the ``Orchestrator`` address through runtime configuration of the SmartSim ``Model`` environment. + A default ``Client`` constructor, with no user-specified parameters, is sufficient to + connect to the ``Orchestrator``. The only exception is for the Python ``Client``, which requires + the `cluster` constructor parameter to differentiate between standalone deployment and colocated + deployment. +- In an ``Experiment`` with multiple ``Orchestrators``, users can connect to a specific ``Orchestrator`` by + first specifying the `db_identifier` in the ``ConfigOptions`` constructor within the executable application. + Subsequently, users should pass the ``ConfigOptions`` instance to the ``Client`` constructor. +- Users can specify or override automatically configured connection options by providing the + ``Orchestrator`` address in the ``ConfigOptions`` object. Subsequently, users should pass the ``ConfigOptions`` + instance to the ``Client`` constructor. + +If connecting to a standalone ``Orchestrator`` from a ``Experiment`` driver script, the user must specify +the address of the ``Orchestrator`` to the ``Client`` constructor. 
SmartSim does not automatically +configure the environment of the ``Experiment`` driver script to connect to an ``Orchestrator``. Users +can access an ``Orchestrators`` address through ``Orchestrator.get_address``. -The ``Orchestrator`` is an in-memory database that is launched prior to all other -entities within an ``Experiment``. The ``Orchestrator`` can be used to store and retrieve -data during the course of an experiment and across multiple entities. In order to -stream data into or receive data from the ``Orchestrator``, one of the SmartSim clients -(SmartRedis) has to be used within a Model. +.. note:: + In SmartSim ``Model`` applications, it is advisable to **avoid** specifying addresses directly to the ``Client`` constructor. + Utilizing the SmartSim environment configuration for SmartRedis ``Client`` connections + allows the SmartSim ``Model`` application code to remain unchanged even as ``Orchestrator`` deployment + options vary. -.. |orchestrator| image:: images/Orchestrator.png - :width: 700 - :alt: Alternative text +The following image illustrates +communication between a standalone ``Orchestrator`` and a +SmartSim ``Model``. In the diagram, the application is running on multiple compute nodes, +separate from the ``Orchestrator`` compute nodes. Communication is established between the +``Model`` application and the sharded ``Orchestrator`` using the :ref:`SmartRedis client`. -|orchestrator| +.. figure:: images/clustered_orchestrator-1.png -Combined with the SmartRedis clients, the ``Orchestrator`` is capable of hosting and executing -AI models written in Python on CPU or GPU. The ``Orchestrator`` supports models written with -TensorFlow, Pytorch, TensorFlow-Lite, or models saved in an ONNX format (e.g. sci-kit learn). + Sample Standalone ``Orchestrator`` Deployment +.. 
note:: + Users do not need to know how the data is stored in a standalone configuration and + can address the cluster with the SmartRedis ``Client`` like a single block of memory + using simple put/get semantics in SmartRedis. + +In scenarios where data needs to be shared amongst ``Experiment`` entities, +such as online analysis, training, and processing, a standalone ``Orchestrator`` +is optimal. The data produced by multiple processes in a ``Model`` is stored in the standalone +``Orchestrator`` and is available for consumption by other ``Model``'s. + +If a workflow requires an application to leverage multiple standalone deployments, +multiple ``Clients`` can be instantiated within an application, +with each ``Client`` connected to a unique ``Orchestrator``. This is accomplished through the use of the +`db-identifier` and :ref:`ConfigOptions` object specified at ``Orchestrator`` initialization time. +For more information on a multiple database ``Experiment``, visit the :ref:`Multiple Orchestrators` section on +this page. + +------- +Example +------- +In the following example, we demonstrate deploying a standalone ``Orchestrator`` on an HPC system. +Once the standalone ``Orchestrator`` is launched from the ``Experiment`` driver script, we walk through +connecting a SmartRedis ``Client`` to the ``Orchestrator`` from within the ``Model`` +application to transmit and poll for data. -Cluster Orchestrator -==================== +The example is comprised of two script files: + +- :ref:`Application Script` + The application script is a Python file that contains instructions to create a SmartRedis + ``Client`` connection to the standalone ``Orchestrator``. To demonstrate the ability of + workflow components to access data from other entities, we retrieve the tensors set by + the driver script using a SmartRedis ``Client`` in the application script. We then instruct + the ``Client`` to send and retrieve data from within the application script. 
The example source + code is available in the dropdown below for convenient execution and customization. + + .. dropdown:: Example Application Script source code + + .. literalinclude:: tutorials/doc_examples/orch_examples/std_app.py -The ``Orchestrator`` supports single node and distributed memory settings. This means -that a single compute host can be used for the database or multiple by specifying -``db_nodes`` to be greater than 1. +- :ref:`Experiment Driver Script` + The ``Experiment`` driver script is responsible for launching and managing SmartSim entities. Within this script, + we use the ``Experiment`` API to create and launch a standalone ``Orchestrator``. To demonstrate the capability of + a ``Model`` application to access ``Orchestrator`` data sent from other sources, we employ the SmartRedis ``Client`` in + the driver script to store a tensor in the ``Orchestrator``, which is later retrieved by the ``Model`` application. + To employ the application script, we initialize a ``Model`` object with the application script as the executable, + launch the ``Orchestrator``, and then launch the ``Model``. -.. |cluster-orc| image:: images/clustered-orc-diagram.png - :width: 700 - :alt: Alternative text + To further demonstrate the ability of workflow components to access data from + other entities, we retrieve the tensors stored by the completed ``Model`` using a SmartRedis ``Client`` in + the driver script. Lastly, we tear down the ``Orchestrator``. The example source code is available in the dropdown below for + convenient execution and customization. -|cluster-orc| + .. dropdown:: Example Experiment Driver Script Source Code + .. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py -With a clustered ``Orchestrator``, multiple compute hosts memory can be used together -to store data. As well, the CPU or GPU(s) where the ``Orchestrator`` is running can -be used to execute the AI models, and Torchscript code on data stored within it. +.. 
_standalone_orch_app_script: -Users do not need to know how the data is stored in a clustered configuration and -can address the cluster with the SmartRedis clients like a single block of memory -using simple put/get semantics in SmartRedis. SmartRedis will ensure that data -is evenly distributed amongst all nodes in the cluster. +Application Script +================== +To begin writing the application script, import the necessary SmartRedis packages: -The cluster deployment is optimal for high data throughput scenarios such as -online analysis, training and processing. +.. literalinclude:: tutorials/doc_examples/orch_examples/std_app.py + :language: python + :linenos: + :lines: 1-2 +Client Initialization +--------------------- +To establish a connection with the ``Orchestrator``, we need to initialize a new SmartRedis ``Client``. +Because the ``Orchestrator`` launched in the driver script is sharded, we specify the +constructor argument `cluster` as `True`. -Colocated Orchestrator -======================= +.. literalinclude:: tutorials/doc_examples/orch_examples/std_app.py + :language: python + :linenos: + :lines: 4-5 -A colocated Orchestrator is a special type of Orchestrator that is deployed on -the same compute hosts an a ``Model`` instance defined by the user. In this -deployment, the database is *not* connected together in a cluster and each -shard of the database is addressed individually by the processes running -on that compute host. +.. note:: + Note that the C/C++/Fortran SmartRedis ``Clients`` are capable of reading cluster configurations + from the SmartSim ``Model`` environment and the `cluster` constructor argument does not need to be specified + in those ``Client`` languages. -.. 
|colo-orc| image:: images/co-located-orc-diagram.png - :width: 700 - :alt: Alternative text +Since there is only one ``Orchestrator`` launched in the ``Experiment`` +(the standalone ``Orchestrator``), specifying an ``Orchestrator`` `db_identifier` +is **not** required when initializing the SmartRedis ``Client``. +SmartRedis will handle the connection configuration. +.. note:: + To create a SmartRedis ``Client`` connection to the standalone ``Orchestrator``, the ``Orchestrator`` must be launched + from within the driver script prior to the start of the ``Model``. -|colo-orc| +Data Retrieval +-------------- +To confirm a successful connection to the ``Orchestrator``, we retrieve the tensor set from the ``Experiment`` script. +Use the ``Client.get_tensor`` method to retrieve the tensor named `tensor_1` placed by the driver script: -This deployment is designed for highly performant online inference scenarios where -a distributed process (likely MPI processes) are performing inference with -data local to each process. +.. literalinclude:: tutorials/doc_examples/orch_examples/std_app.py + :language: python + :linenos: + :lines: 7-10 -This method is deemed ``locality based inference`` since data is local to each -process and the ``Orchestrator`` is deployed locally on each compute host where -the distributed application is running. +After the ``Model`` is launched by the driver script, the following output will appear in +`getting-started/model/model.out`:: + Default@17-11-48:The multi-sharded db tensor is: [1 2 3 4] -To create a colocated model, first, create a ``Model`` instance and then call -the ``Model.colocate_db_tcp`` or ``Model.colocate_db_uds`` function. +Data Storage +------------ +Next, create a NumPy tensor to send to the standalone ``Orchestrator`` using +``Client.put_tensor(name, data)``: -.. currentmodule:: smartsim.entity.model +.. literalinclude:: tutorials/doc_examples/orch_examples/std_app.py + :language: python + :linenos: + :lines: 12-15 -.. 
automethod:: Model.colocate_db_tcp - :noindex: +We retrieve `"tensor_2"` in the ``Experiment`` driver script. -.. automethod:: Model.colocate_db_uds - :noindex: +.. _standalone_orch_driver_script: -Here is an example of creating a simple model that is colocated with an -``Orchestrator`` deployment using Unix Domain Sockets +Experiment Driver Script +======================== +To run the previous application script, we define a ``Model`` and ``Orchestrator`` within the +``Experiment`` driver script. Configuring and launching workflow entities (``Model`` and ``Orchestrator``) requires the utilization of +``Experiment`` class methods. The ``Experiment`` object is intended to be instantiated +once and utilized throughout the workflow runtime. -.. code-block:: python +In this example, we instantiate an ``Experiment`` object with the name `getting-started` +and the `launcher` set to `auto`. When using `launcher=auto`, SmartSim attempts to find a launcher on the machine. +For example, if this script were run on a Slurm-based system, SmartSim will automatically set the launcher to `slurm`. +We also setup the SmartSim `logger` to output information from the ``Experiment`` at runtime: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 1-9 + +Orchestrator Initialization +--------------------------- +In the next stage of the ``Experiment``, we create a standalone ``Orchestrator``. + +To create a standalone ``Orchestrator``, utilize the ``Experiment.create_database`` function: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 11-12 + +Client Initialization +--------------------- +The SmartRedis ``Client`` object contains functions that manipulate, send, and retrieve +data on the ``Orchestrator``. Begin by initializing a SmartRedis ``Client`` object for the standalone ``Orchestrator``. 
+ +SmartRedis ``Clients`` in driver scripts do not have the ability to use a `db-identifier` or +rely on automatic configurations to connect to ``Orchestrators``. Therefore, when creating a SmartRedis ``Client`` +connection from within a driver script, specify the address of the ``Orchestrator`` you would like to connect to. +You can easily retrieve the ``Orchestrator`` address using the ``Orchestrator.get_address`` function: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 14-15 + +Data Storage +------------ +In the application script, we retrieved a NumPy tensor stored from within the driver script. +To support the application functionality, we create a +NumPy array in the ``Experiment`` driver script to send to the ``Orchestrator``. To +send a tensor to the ``Orchestrator``, use the function ``Client.put_tensor(name, data)``: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 17-20 + +Model Initialization +-------------------- +In the next stage of the ``Experiment``, we configure and create +a SmartSim ``Model`` and specify the executable path during ``Model`` creation: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 22-27 + +File Generation +--------------- +To create an isolated output directory for the ``Orchestrator`` and ``Model``, invoke ``Experiment.generate`` on the +``Experiment`` instance `exp` with `standalone_orchestrator` and `model` as input parameters: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 29-30 + +Invoking ``Experiment.generate(standalone_orchestrator, model)`` will create two directories: +`standalone_orchestrator/` and `model/`. Each of these directories will store +two output files: a `.out` file and a `.err` file. 
- from smartsim import Experiment - exp = Experiment("colo-test", launcher="auto") +.. note:: + It is important to invoke ``Experiment.generate`` with all ``Experiment`` entity instances + before launching. This will ensure that the output files are organized in the main ``experiment-name/`` + folder. In this example, the ``Experiment`` folder is named `getting-started/`. - colo_settings = exp.create_run_settings(exe="./some_mpi_app") +Entity Deployment +----------------- +In the next stage of the ``Experiment``, we launch the ``Orchestrator``, then launch the ``Model``. - colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.colocate_db_uds( - db_cpus=1, # cpus given to the database on each node - debug=False # include debug information (will be slower) - ifname=network_interface # specify network interface(s) to use (i.e. "ib0" or ["ib0", "lo"]) - ) - exp.start(colo_model) +Step 1: Start Orchestrator +'''''''''''''''''''''''''' +In the context of this ``Experiment``, it's essential to create and launch +the ``Orchestrator`` as a preliminary step before any other workflow entities. This is important +because the application requests and sends tensors to a launched ``Orchestrator``. +To launch the ``Orchestrator``, pass the ``Orchestrator`` instance to ``Experiment.start``. -By default, SmartSim will pin the database to the first _N_ CPUs according to ``db_cpus``. By -specifying the optional argument ``custom_pinning``, an alternative pinning can be specified -by sending in a list of CPU ids (e.g [0,2,range(5,8)]). For optimal performance, most users -will want to also modify the RunSettings for the model to pin their application to cores not -occupied by the database. +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 32-33 -.. warning:: +The ``Experiment.start`` function launches the ``Orchestrator`` for use within the workflow. 
+In other words, the function deploys the ``Orchestrator`` on the allocated compute resources. + +Step 2: Start Model +''''''''''''''''''' +Next, launch the `model` instance using the ``Experiment.start`` function: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 35-36 + +In the next subsection, we request tensors placed by the ``Model`` application. +We specify `block=True` to ``exp.start`` to require the ``Model`` to finish before +the ``Experiment`` continues. + +Data Polling +------------ +Next, check if the tensor exists in the standalone ``Orchestrator`` using ``Client.poll_tensor``. +This function queries for data in the ``Orchestrator``. The function requires the tensor name (`name`), +how many milliseconds to wait in between queries (`poll_frequency_ms`), +and the total number of times to query (`num_tries`). Check if the data exists in the ``Orchestrator`` by +polling every 100 milliseconds until 10 attempts have completed: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 38-41 + +When you execute the driver script, the output will be as follows:: + + 23:45:46 system.host.com SmartSim[87400] INFO The tensor exists: True + +Cleanup +------- +Finally, use the ``Experiment.stop`` function to stop the ``Orchestrator`` instance. Print the +workflow summary with ``Experiment.summary``: + +.. literalinclude:: tutorials/doc_examples/orch_examples/std_driver.py + :language: python + :linenos: + :lines: 43-46 + +When you run the ``Experiment``, the following output will appear:: + + | | Name | Entity-Type | JobID | RunID | Time | Status | Returncode | + |----|----------------|---------------|-------------|---------|---------|-----------|--------------| + | 0 | model | Model | 1658679.3 | 0 | 1.3342 | Completed | 0 | + | 1 | orchestrator_0 | DBNode | 1658679.2+2 | 0 | 42.8742 | Cancelled | 0 | + +.. 
_colocated_orch_doc: + +==================== +Colocated Deployment +==================== +-------- +Overview +-------- +During colocated ``Orchestrator`` deployment, a SmartSim ``Orchestrator`` (the database) runs on +the ``Model``'s compute node(s). Colocated ``Orchestrators`` can only be deployed as isolated instances +on each compute node and cannot be clustered over multiple nodes. The ``Orchestrator`` on each application node is +utilized by SmartRedis ``Clients`` on the same node. With a colocated ``Orchestrator``, all interactions +with the database occur on the same node, thus resulting in lower latency compared to the standard ``Orchestrator``. +A colocated ``Orchestrator`` is ideal when the data and hardware accelerator are located on the +same compute node. + +Communication between a colocated ``Orchestrator`` and ``Model`` is initiated in the application through a +SmartRedis ``Client``. Since a colocated ``Orchestrator`` is launched when the ``Model`` +is started by the ``Experiment``, connecting a SmartRedis ``Client`` to a colocated ``Orchestrator`` is only possible from within +the associated ``Model`` application. + +There are **three** methods for connecting the SmartRedis ``Client`` to the colocated ``Orchestrator``: + +- In an ``Experiment`` with a single deployed ``Orchestrator``, users can rely on SmartRedis + to detect the ``Orchestrator`` address through runtime configuration of the SmartSim ``Model`` environment. + A default ``Client`` constructor, with no user-specified parameters, is sufficient to + connect to the ``Orchestrator``. The only exception is for the Python ``Client``, which requires + the `cluster=False` constructor parameter for the colocated ``Orchestrator``. +- In an ``Experiment`` with multiple ``Orchestrators``, users can connect to a specific ``Orchestrator`` by + first specifying the `db_identifier` in the ``ConfigOptions`` constructor. 
Subsequently, users should pass the + ``ConfigOptions`` instance to the ``Client`` constructor. +- Users can specify or override automatically configured connection options by providing the + ``Orchestrator`` address in the ``ConfigOptions`` object. Subsequently, users should pass the ``ConfigOptions`` + instance to the ``Client`` constructor. + +Below is an image illustrating communication within a colocated ``Model`` spanning multiple compute nodes. +As demonstrated in the diagram, each process of the application creates its own SmartRedis ``Client`` +connection to the ``Orchestrator`` running on the same host. + +.. figure:: images/colocated_orchestrator-1.png + + Sample Colocated ``Orchestrator`` Deployment + +Colocated deployment is ideal for highly performant online inference scenarios where +a distributed application (likely an MPI application) is performing inference with +data local to each process. With colocated deployment, data does not need to travel +off-node to be used to evaluate a ML model, and the results of the ML model evaluation +are stored on-node. + +If a workflow requires an application to both leverage colocated +deployment and standalone deployment, multiple ``Clients`` can be instantiated within an application, +with each ``Client`` connected to a unique deployment. This is accomplished through the use of the +`db-identifier` specified at ``Orchestrator`` initialization time. + +------- +Example +------- +In the following example, we demonstrate deploying a colocated ``Orchestrator`` on an HPC system. +Once the ``Orchestrator`` is launched, we walk through connecting a SmartRedis ``Client`` +from within the application script to transmit and poll for data on the ``Orchestrator``. + +The example is comprised of two script files: + +- :ref:`Application Script` + The application script is a Python script that connects a SmartRedis + ``Client`` to the colocated ``Orchestrator``. 
From within the application script, + the ``Client`` is utilized to both send and retrieve data. The source code example + is available in the dropdown below for convenient execution and customization. + + .. dropdown:: Example Application Script Source Code + + .. literalinclude:: tutorials/doc_examples/orch_examples/colo_app.py - Pinning is not supported on MacOS X. Setting ``custom_pinning`` to anything - other than ``None`` will raise a warning and the input will be ignored. +- :ref:`Experiment Driver Script` + The ``Experiment`` driver script launches and manages + the example entities through the ``Experiment`` API. + In the driver script, we use the ``Experiment`` API + to create and launch a colocated ``Model``. The source code example is available + in the dropdown below for convenient execution and customization. + + .. dropdown:: Example Experiment Driver source code + + .. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + +.. _colocated_orch_app_script: + +Application Script +================== +To begin writing the application script, import the necessary SmartRedis packages: + +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_app.py + :language: python + :linenos: + :lines: 1-2 + +Client Initialization +--------------------- +To establish a connection with the colocated ``Orchestrator``, we need to initialize a +new SmartRedis ``Client`` and specify `cluster=False` since colocated deployments are never +clustered but only single-sharded. + +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_app.py + :language: python + :linenos: + :lines: 4-5 + +.. note:: + Note that the C/C++/Fortran SmartRedis ``Clients`` are capable of reading cluster configurations + from the ``Model`` environment and the `cluster` constructor argument does not need to be specified + in those ``Client`` languages. .. 
note::
+    Since there is only one ``Orchestrator`` launched in the ``Experiment``
+    (the colocated ``Orchestrator``), specifying an ``Orchestrator`` `db_identifier`
+    is not required when initializing the ``Client``. SmartRedis will handle the
+    connection configuration.
- Pinning _only_ affects the co-located deployment because both the application and the database
- are sharing the same compute node. For the clustered deployment, a shard occupies the entirety
- of the node.
+.. note::
+    To create a ``Client`` connection to the colocated ``Orchestrator``, the colocated ``Model`` must be launched
+    from within the driver script. You must execute the Python driver script, otherwise, there will
+    be no ``Orchestrator`` to connect the ``Client`` to.
+
+Data Storage
+------------
+Next, using the SmartRedis ``Client`` instance, we create and store a NumPy tensor through
+``Client.put_tensor(name, data)``:
+
+.. literalinclude:: tutorials/doc_examples/orch_examples/colo_app.py
+   :language: python
+   :linenos:
+   :lines: 7-10
+
+We will retrieve `"tensor_1"` in the following section.
+
+Data Retrieval
+--------------
+To confirm a successful connection to the ``Orchestrator``, we retrieve the tensor we stored.
+Use the ``Client.get_tensor`` method to retrieve the tensor by specifying the name
+`"tensor_1"`:
+
+.. literalinclude:: tutorials/doc_examples/orch_examples/colo_app.py
+   :language: python
+   :linenos:
+   :lines: 12-15
+
+When the ``Experiment`` completes, you can find the following log message in `colo_model.out`::
+
+    Default@21-48-01:The colocated db tensor is: [1 2 3 4]
+
+.. _colocated_orch_driver_script:
+
+Experiment Driver Script
+========================
+To run the previous application script, a ``Model`` object must be configured and launched within the
+``Experiment`` driver script. Configuring and launching workflow entities (``Model``)
+requires the utilization of ``Experiment`` class methods.
The ``Experiment`` object is intended to +be instantiated once and utilized throughout the workflow runtime. + +In this example, we instantiate an ``Experiment`` object with the name `getting-started` +and the `launcher` set to `auto`. When using `launcher=auto`, SmartSim attempts to find a launcher on the machine. +In this case, since we are running the example on a Slurm-based machine, +SmartSim will automatically set the launcher to `slurm`. We set up the SmartSim `logger` +to output information from the ``Experiment`` at runtime: + +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 1-9 + +Colocated Model Initialization +------------------------------ +In the next stage of the ``Experiment``, we create and launch a colocated ``Model`` that +runs the application script with a ``Orchestrator`` on the same compute node. -Redis -===== +Step 1: Configure +''''''''''''''''' +In this example ``Experiment``, the ``Model`` application is a Python script as defined in section: +:ref:`Application Script`. Before initializing the ``Model`` object, we must use +``Experiment.create_run_settings`` to create a ``RunSettings`` object that defines how to execute +the ``Model``. To launch the Python script in this example workflow, we specify the path to the application +file `application_script.py` as the `exe_args` parameter and the executable `exe_ex` (the Python +executable on this system) as `exe` parameter. The ``Experiment.create_run_settings`` function +will return a ``RunSettings`` object that can then be used to initialize the ``Model`` object. -.. _Redis: https://github.com/redis/redis -.. _RedisAI: https://github.com/RedisAI/RedisAI +.. note:: + Change the `exe_args` argument to the path of the application script + on your file system to run the example. -The ``Orchestrator`` is built on `Redis`_. 
Largely, the job of the ``Orchestrator`` is to
-create a Python reference to a Redis deployment so that users can launch, monitor
-and stop a Redis deployment on workstations and HPC systems.
+Use the ``RunSettings`` helper functions to
+configure the distribution of computational tasks (``RunSettings.set_nodes``). In this
+example, we specify to SmartSim that we intend the ``Model`` to run on a single compute node.
-Redis was chosen for the Orchestrator because it resides in-memory, can be distributed on-node
-as well as across nodes, and provides low latency data access to many clients in parallel. The
-Redis ecosystem was a primary driver as the Redis module system provides APIs for languages,
-libraries, and techniques used in Data Science. In particular, the ``Orchestrator``
-relies on `RedisAI`_ to provide access to Machine Learning runtimes.
+.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py
+   :language: python
+   :linenos:
+   :lines: 11-14
-At its core, Redis is a key-value store. This means that put/get semantics are used to send
-messages to and from the database. SmartRedis clients use a specific hashing algorithm, CRC16, to ensure
-that data is evenly distributed amongst all database nodes. Notably, a user is not required to
-know where (which database node) data or Datasets (see Dataset API) are stored as the
-SmartRedis clients will infer their location for the user.
+Step 2: Initialize
+''''''''''''''''''
+Next, create a ``Model`` instance using the ``Experiment.create_model`` factory method.
+Pass the ``model_settings`` object as input to the method and
+assign the returned ``Model`` instance to the variable `model`:
+
+.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py
+   :language: python
+   :linenos:
+   :lines: 16-17
+
+Step 3: Colocate
+''''''''''''''''
+To colocate an ``Orchestrator`` with a ``Model``, use the ``Model.colocate_db_uds`` function.
+This function will colocate an ``Orchestrator`` instance with this ``Model`` over +a Unix domain socket connection. +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 19-20 -KeyDB -===== +Step 4: Generate Files +'''''''''''''''''''''' +Next, generate the ``Experiment`` entity directories by passing the ``Model`` instance to +``Experiment.generate``: -.. _KeyDB: https://github.com/EQ-Alpha/KeyDB +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 22-23 -`KeyDB`_ is a multi-threaded fork of Redis that can be swapped in as the database for -the ``Orchestrator`` in SmartSim. KeyDB can be swapped in for Redis by setting the -``REDIS_PATH`` environment variable to point to the ``keydb-server`` binary. +Step 5: Start +''''''''''''' +Next, launch the colocated ``Model`` instance using the ``Experiment.start`` function. -A full example of configuring KeyDB to run in SmartSim is shown below +.. literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 25-26 -.. code-block:: bash +Cleanup +------- +.. note:: + Since the colocated ``Orchestrator`` is automatically torn down by SmartSim once the colocated ``Model`` + has finished, we do not need to `stop` the ``Orchestrator``. + +.. 
literalinclude:: tutorials/doc_examples/orch_examples/colo_driver.py + :language: python + :linenos: + :lines: 28-29 + +When you run the experiment, the following output will appear:: - # build KeyDB - # see https://github.com/EQ-Alpha/KeyDB + | | Name | Entity-Type | JobID | RunID | Time | Status | Returncode | + |----|--------|---------------|-----------|---------|---------|-----------|--------------| + | 0 | model | Model | 1592652.0 | 0 | 10.1039 | Completed | 0 | - # get KeyDB configuration file - wget https://github.com/CrayLabs/SmartSim/blob/d3d252b611c9ce9d9429ba6eeb71c15471a78f08/smartsim/_core/config/keydb.conf +.. _mutli_orch_doc: - export REDIS_PATH=/path/to/keydb-server - export REDIS_CONF=/path/to/keydb.conf +====================== +Multiple Orchestrators +====================== +SmartSim supports automating the deployment of multiple ``Orchestrators`` +from within an ``Experiment``. Communication with the ``Orchestrator`` via a SmartRedis ``Client`` is possible with the +`db_identifier` argument that is required when initializing an ``Orchestrator`` or +colocated ``Model`` during a multiple ``Orchestrator`` ``Experiment``. When initializing a SmartRedis +``Client`` during the ``Experiment``, create a ``ConfigOptions`` object to specify the `db_identifier` +argument used when creating the ``Orchestrator``. Pass the ``ConfigOptions`` object to +the ``Client`` init call. - # run smartsim workload +.. _mutli_orch: +----------------------------- Multiple Orchestrator Example -============================= +----------------------------- SmartSim offers functionality to automate the deployment of multiple databases, supporting workloads that require multiple ``Orchestrators`` for a ``Experiment``. For instance, a workload may consist of a simulation with high inference performance demands (necessitating a co-located deployment), -along with an analysis and -visualization workflow connected to the simulation (requiring a standard orchestrator). 
-In the following example, we simulate a simple version of this use case. +along with an analysis and visualization workflow connected to the simulation +(requiring a standalone ``Orchestrator``). In the following example, we simulate a +simple version of this use case. The example is comprised of two script files: -* The :ref:`Application Script` -* The :ref:`Experiment Driver Script` +* The Application Script +* The ``Experiment`` Driver Script **The Application Script Overview:** In this example, the application script is a python file that contains instructions to complete computational tasks. Applications are not limited to Python and can also be written in C, C++ and Fortran. -This script specifies creating a Python SmartRedis client for each -standard orchestrator and a colocated orchestrator. We use the -clients to request data from both standard databases, then -transfer the data to the colocated database. The application -file is launched by the experiment driver script +This script specifies creating a Python SmartRedis ``Client`` for each +standalone ``Orchestrator`` and a colocated ``Orchestrator``. We use the +``Clients`` to request data from both standalone ``Orchestrators``, then +transfer the data to the colocated ``Orchestrator``. The application +file is launched by the ``Experiment`` driver script through a ``Model`` stage. **The Application Script Contents:** -1. Connecting SmartRedis clients within the application to retrieve tensors - from the standard databases to store in a colocated database. Details in section: - :ref:`Initialize the Clients`. +1. Connecting SmartRedis ``Clients`` within the application to retrieve tensors + from the standalone ``Orchestrators`` to store in a colocated ``Orchestrator``. Details in section: + :ref:`Initialize the Clients`. 
**The Experiment Driver Script Overview:** -The experiment driver script holds the stages of the workflow +The ``Experiment`` driver script holds the stages of the workflow and manages their execution through the ``Experiment`` API. -We initialize an Experiment +We initialize an ``Experiment`` at the beginning of the Python file and use the ``Experiment`` to iteratively create, configure and launch computational kernels on the system through the `slurm` launcher. @@ -211,143 +691,146 @@ runs the application. **The Experiment Driver Script Contents:** -1. Launching two standard Orchestrators with unique identifiers. Details in section: - :ref:`Launch Multiple Orchestrators`. -2. Launching the application script with a co-located database. Details in section: - :ref:`Initialize a Colocated Model`. -3. Connecting SmartRedis clients within the driver script to send tensors to standard Orchestrators +1. Launching two standalone ``Orchestrators`` with unique identifiers. Details in section: + :ref:`Launch Multiple Orchestrators`. +2. Launching the application script with a colocated ``Orchestrator``. Details in section: + :ref:`Initialize a Colocated Model`. +3. Connecting SmartRedis ``Clients`` within the driver script to send tensors to standalone ``Orchestrators`` for retrieval within the application. Details in section: - :ref:`Create Client Connections to Orchestrators`. + :ref:`Create Client Connections to Orchestrators`. -Setup and run instructions can be found :ref:`here` +Setup and run instructions can be found :ref:`here` + +.. _app_script_multi_db: The Application Script ----------------------- -Applications interact with the databases -through a SmartRedis client. +====================== +Applications interact with the ``Orchestrators`` +through a SmartRedis ``Client``. In this section, we write an application script to demonstrate how to connect SmartRedis -clients in the context of multiple -launched databases. 
Using the clients, we retrieve tensors -from two databases launched in the driver script, then store -the tensors in the colocated database. +``Clients`` in the context of multiple +launched ``Orchestrators``. Using the ``Clients``, we retrieve tensors +from two ``Orchestrators`` launched in the driver script, then store +the tensors in the colocated ``Orchestrators``. .. note:: - The Experiment must be started to use the Orchestrators within the + The ``Experiment`` must be started to use the ``Orchestrators`` within the application script. Otherwise, it will fail to connect. - Find the instructions on how to launch :ref:`here` + Find the instructions on how to launch :ref:`here` To begin, import the necessary packages: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 1-3 +.. _init_model_client: + Initialize the Clients -^^^^^^^^^^^^^^^^^^^^^^ -To establish a connection with each database, -we need to initialize a new SmartRedis client for each -``Orchestrator``. +---------------------- +To establish a connection with each ``Orchestrators``, +we need to initialize a new SmartRedis ``Client`` for each. Step 1: Initialize ConfigOptions -"""""""""""""""""""""""""""""""" -Since we are launching multiple databases within the experiment, +'''''''''''''''''''''''''''''''' +Since we are launching multiple ``Orchestrators`` within the ``Experiment``, the SmartRedis ``ConfigOptions`` object is required when initializing -a client in the application. -We use the ``ConfigOptions.create_from_environment()`` +a ``Client`` in the application. +We use the ``ConfigOptions.create_from_environment`` function to create three instances of ``ConfigOptions``, with one instance associated with each launched ``Orchestrator``. 
-Most importantly, to associate each launched Orchestrator to a ConfigOptions object, -the ``create_from_environment()`` function requires specifying the unique database identifier +Most importantly, to associate each launched ``Orchestrator`` to a ``ConfigOptions`` object, +the ``create_from_environment`` function requires specifying the unique ``Orchestrator`` identifier argument named `db_identifier`. -For the single-sharded database: +For the single-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 5-6 -For the multi-sharded database: +For the multi-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 10-11 -For the colocated database: +For the colocated ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 15-16 Step 2: Initialize the Client Connections -""""""""""""""""""""""""""""""""""""""""" +''''''''''''''''''''''''''''''''''''''''' Now that we have three ``ConfigOptions`` objects, we have the -tools necessary to initialize three SmartRedis clients and -establish a connection with the three databases. -We use the SmartRedis ``Client`` API to create the client instances by passing in +tools necessary to initialize three SmartRedis ``Clients`` and +establish a connection with the three ``Orchestrators``. +We use the SmartRedis ``Client`` API to create the ``Client`` instances by passing in the ``ConfigOptions`` objects and assigning a `logger_name` argument. -Single-sharded database: +Single-sharded ``Orchestrator``: -.. 
literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 7-8 -Multi-sharded database: +Multi-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 12-13 -Colocated database: +Colocated ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 17-18 Retrieve Data and Store Using SmartRedis Client Objects -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -To confirm a successful connection to each database, we will retrieve the tensors +------------------------------------------------------- +To confirm a successful connection to each ``Orchestrator``, we will retrieve the tensors that we plan to store in the python driver script. After retrieving, we -store both tensors in the colocated database. -The ``Client.get_tensor()`` method allows +store both tensors in the colocated ``Orchestrator``. +The ``Client.get_tensor`` method allows retrieval of a tensor. It requires the `name` of the tensor assigned -when sent to the database via ``Client.put_tensor()``. +when sent to the ``Orchestrator`` via ``Client.put_tensor``. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. 
literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 20-26 -Later, when you run the experiment driver script the following output will appear in ``tutorial_model.out`` +Later, when you run the ``Experiment`` driver script the following output will appear in ``tutorial_model.out`` located in ``getting-started-multidb/tutorial_model/``:: Model: single shard logger@00-00-00:The single sharded db tensor is: [1 2 3 4] Model: multi shard logger@00-00-00:The multi sharded db tensor is: [5 6 7 8] -This output showcases that we have established a connection with multiple Orchestrators. +This output showcases that we have established a connection with multiple ``Orchestrators``. -Next, take the tensors retrieved from the standard deployment databases and -store them in the colocated database using ``Client.put_tensor(name, data)``. +Next, take the tensors retrieved from the standalone deployment ``Orchestrators`` and +store them in the colocated ``Orchestrator`` using ``Client.put_tensor(name, data)``. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 28-30 -Next, check if the tensors exist in the colocated database using ``Client.poll_tensor()``. -This function queries for data in the database. The function requires the tensor name (`name`), +Next, check if the tensors exist in the colocated ``Orchestrator`` using ``Client.poll_tensor``. +This function queries for data in the ``Orchestrator``. The function requires the tensor name (`name`), how many milliseconds to wait in between queries (`poll_frequency_ms`), and the total number of times to query (`num_tries`): -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +.. 
literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: :lines: 32-37 @@ -358,156 +841,162 @@ The output will be as follows:: Model: colo logger@00-00-00:The colocated db has tensor_2: True The Experiment Driver Script ----------------------------- +============================ To run the previous application, we must define workflow stages within a workload. Defining workflow stages requires the utilization of functions associated -with the ``Experiment`` object. The Experiment object is intended to be instantiated +with the ``Experiment`` object. The ``Experiment`` object is intended to be instantiated once and utilized throughout the workflow runtime. In this example, we instantiate an ``Experiment`` object with the name ``getting-started-multidb``. We setup the SmartSim ``logger`` to output information from the Experiment. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 1-10 +.. _launch_multiple_orch: + Launch Multiple Orchestrators -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +----------------------------- In the context of this ``Experiment``, it's essential to create and launch -the databases as a preliminary step before any other components since -the application script requests tensors from the launched databases. +the ``Orchestrators`` as a preliminary step before any other components since +the application script requests tensors from the launched ``Orchestrators``. -We aim to showcase the multi-database automation capabilities of SmartSim, so we -create two databases in the workflow: a single-sharded database and a -multi-sharded database. +We aim to showcase the multi-Orchestrator automation capabilities of SmartSim, so we +create two ``Orchestrators`` in the workflow: a single-sharded ``Orchestrator`` and a +multi-sharded ``Orchestrator``. 
Step 1: Initialize Orchestrators -"""""""""""""""""""""""""""""""" -To create an database, utilize the ``Experiment.create_database()`` function. +'''''''''''''''''''''''''''''''' +To create an ``Orchestrator``, utilize the ``Experiment.create_database`` function. The function requires specifying a unique -database identifier argument named `db_identifier` to launch multiple databases. -This step is necessary to connect to databases outside of the driver script. +``Orchestrator`` identifier argument named `db_identifier` to launch multiple ``Orchestrators``. +This step is necessary to connect to ``Orchestrators`` outside of the driver script. We will use the `db_identifier` names we specified in the application script. -For the single-sharded database: +For the single-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 12-14 -For the multi-sharded database: +For the multi-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 16-18 .. note:: - Calling ``exp.generate()`` will create two subfolders - (one for each Orchestrator created in the previous step) - whose names are based on the db_identifier of that Orchestrator. + Calling ``exp.generate`` will create two subfolders + (one for each ``Orchestrator`` created in the previous step) + whose names are based on the `db_identifier` of that ``Orchestrator``. In this example, the Experiment folder is - named ``getting-started-multidb/``. Within this folder, two Orchestrator subfolders will + named ``getting-started-multidb/``. Within this folder, two ``Orchestrator`` subfolders will be created, namely ``single_shard_db_identifier/`` and ``multi_shard_db_identifier/``. 
-Step 2: Start Databases -""""""""""""""""""""""" -Next, to launch the databases, -pass the database instances to ``Experiment.start()``. +Step 2: Start +''''''''''''' +Next, to launch the ``Orchestrators``, +pass the ``Orchestrator`` instances to ``Experiment.start``. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 20-21 -The ``Experiment.start()`` function launches the ``Orchestrators`` for use within the workflow. In other words, the function -deploys the databases on the allocated compute resources. +The ``Experiment.start`` function launches the ``Orchestrators`` for use within the workflow. In other words, the function +deploys the ``Orchestrators`` on the allocated compute resources. .. note:: By setting `summary=True`, SmartSim will print a summary of the - experiment before it is launched. After printing the experiment summary, - the experiment is paused for 10 seconds giving the user time to - briefly scan the summary contents. If we set `summary=False`, then the experiment + ``Experiment`` before it is launched. After printing the ``Experiment`` summary, + the ``Experiment`` is paused for 10 seconds giving the user time to + briefly scan the summary contents. If we set `summary=False`, then the ``Experiment`` would be launched immediately with no summary. +.. _client_connect_orch: + Create Client Connections to Orchestrators -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------------------------ The SmartRedis ``Client`` object contains functions that manipulate, send, and receive -data within the database. Each database has a single, dedicated SmartRedis ``Client``. -Begin by initializing a SmartRedis ``Client`` object per launched database. +data within the ``Orchestrator``. Each ``Orchestrator`` has a single, dedicated SmartRedis ``Client``. 
+Begin by initializing a SmartRedis ``Client`` object per launched ``Orchestrator``. To create a designated SmartRedis ``Client``, you need to specify the address of the target -running database. You can easily retrieve this address using the ``Orchestrator.get_address()`` function. +running ``Orchestrator``. You can easily retrieve this address using the ``Orchestrator.get_address`` function. -For the single-sharded database: +For the single-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 23-24 -For the multi-sharded database: +For the multi-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 25-26 Store Data Using Clients -^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------ In the application script, we retrieved two NumPy tensors. To support the apps functionality, we will create two -NumPy arrays in the python driver script and send them to the a database. To -accomplish this, we use the ``Client.put_tensor()`` function with the respective -database client instances. +NumPy arrays in the python driver script and send them to an ``Orchestrator``. To +accomplish this, we use the ``Client.put_tensor`` function with the respective +``Orchestrator`` `client` instances. -For the single-sharded database: +For the single-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 28-31 -For the multi-sharded database: +For the multi-sharded ``Orchestrator``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +..
literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 33-36 -Lets check to make sure the database tensors do not exist in the incorrect databases: +Let's check to make sure the ``Orchestrator`` tensors do not exist in the incorrect ``Orchestrators``: -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 38-42 -When you run the experiment, the following output will appear:: +When you run the ``Experiment``, the following output will appear:: 00:00:00 system.host.com SmartSim[#####] INFO The multi shard array key exists in the incorrect database: False 00:00:00 system.host.com SmartSim[#####] INFO The single shard array key exists in the incorrect database: False +.. _init_colocated_model: + Initialize a Colocated Model -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In the next stage of the experiment, we -launch the application script with a co-located database +---------------------------- +In the next stage of the ``Experiment``, we +launch the application script with a co-located ``Orchestrator`` by configuring and creating a SmartSim colocated ``Model``. Step 1: Configure -""""""""""""""""" -You can specify the run settings of a model. -In this experiment, we invoke the Python interpreter to run -the python script defined in section: :ref:`The Application Script`. -To configure this into a ``Model``, we use the ``Experiment.create_run_settings()`` function. +''''''''''''''''' +You can specify the run settings of a ``Model``. +In this ``Experiment``, we invoke the Python interpreter to run +the python script defined in section: :ref:`The Application Script`. +To configure this into a SmartSim ``Model``, we use the ``Experiment.create_run_settings`` function. The function returns a ``RunSettings`` object.
When initializing the RunSettings object, we specify the path to the application file, `application_script.py`, for ``exe_args``, and the run command for ``exe``. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 44-45 @@ -517,62 +1006,66 @@ we specify the path to the application file, on your machine to run the example. With the ``RunSettings`` instance, -configure the the distribution of computational tasks (``RunSettings.set_nodes()``) and the number of instances -the script is execute on each node (``RunSettings.set_tasks_per_node()``). In this +configure the distribution of computational tasks (``RunSettings.set_nodes``) and the number of instances +the script is executed on each node (``RunSettings.set_tasks_per_node``). In this example, we specify to SmartSim that we intend to execute the script once on a single node. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 46-48 Step 2: Initialize -"""""""""""""""""" -Next, create a ``Model`` instance using the ``Experiment.create_model()``. +'''''''''''''''''' +Next, create a ``Model`` instance using the ``Experiment.create_model`` function. Pass the ``model_settings`` object as an argument -to the ``create_model()`` function and assign to the variable ``model``. +to the ``create_model`` function and assign to the variable ``model``. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +..
literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 49-50 Step 2: Colocate -"""""""""""""""" -To colocate the model, use the ``Model.colocate_db_uds()`` function to -Colocate an Orchestrator instance with this Model over +'''''''''''''''' +To colocate the ``Model``, use the ``Model.colocate_db_uds`` function to +Colocate an ``Orchestrator`` instance with this ``Model`` over a Unix domain socket connection. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 51-52 This method will initialize settings which add an unsharded -database to this Model instance. Only this Model will be able -to communicate with this colocated database by using the loopback TCP interface. +``Orchestrator`` to this ``Model`` instance. Only this ``Model`` will be able +to communicate with this colocated ``Orchestrator`` by using the loopback TCP interface. Step 3: Start -""""""""""""" -Next, launch the colocated model instance using the ``Experiment.start()`` function. +''''''''''''' +Next, launch the colocated ``Model`` instance using the ``Experiment.start`` function. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 53-54 .. note:: We set `block=True`, - so that ``Experiment.start()`` waits until the last Model has finished + so that ``Experiment.start`` waits until the last ``Model`` has finished before returning: it will act like a job monitor, letting us know if processes run, complete, or fail. Cleanup Experiment -^^^^^^^^^^^^^^^^^^ -Finally, use the ``Experiment.stop()`` function to stop the database instances. Print the -workflow summary with ``Experiment.summary()``. 
+------------------ +Finally, use the ``Experiment.stop`` function to stop the standard ``Orchestrator`` instances. + +.. note:: + Co-located ``Orchestrators`` are stopped when their associated ``Models`` are stopped. -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +Print the workflow summary with ``Experiment.summary``. + +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: :lines: 56-59 @@ -586,16 +1079,18 @@ When you run the experiment, the following output will appear:: | 1 | single_shard_db_identifier_0 | DBNode | 1556529.3 | 0 | 68.8732 | Cancelled | 0 | | 2 | multi_shard_db_identifier_0 | DBNode | 1556529.4+2 | 0 | 45.5139 | Cancelled | 0 | +.. _run_ex_instruct: + How to Run the Example ----------------------- -Below are the steps to run the experiment. Find the -:ref:`experiment source code` -and :ref:`application source code` +====================== +Below are the steps to run the ``Experiment``. Find the +:ref:`experiment source code` +and :ref:`application source code` below in the respective subsections. .. note:: The example assumes that you have already installed and built - SmartSim and SmartRedis. Please refer to Section :ref:`Basic Installation` + SmartSim and SmartRedis. Please refer to Section :ref:`Basic Installation` for further details. For simplicity, we assume that you are running on a SLURM-based HPC-platform. Refer to the steps below for more details. @@ -609,7 +1104,7 @@ Step 1 : Setup your directory tree application_script.py experiment_script.py - You can find the application and experiment source code in subsections below. + You can find the application and ``Experiment`` source code in subsections below.
Step 2 : Install and Build SmartSim This example assumes you have installed SmartSim and SmartRedis in your @@ -619,21 +1114,25 @@ Step 2 : Install and Build SmartSim Step 3 : Change the `exe_args` file path When configuring the colocated model in `experiment_script.py`, we pass the file path of `application_script.py` to the `exe_args` argument - on line 33 in :ref:`experiment_script.py`. + on line 33 in :ref:`experiment_script.py`. Edit this argument to the file path of your `application_script.py` -Step 4 : Run the Experiment - Finally, run the experiment with ``python experiment_script.py``. +Step 4 : Run the ``Experiment`` + Finally, run the ``Experiment`` with ``python experiment_script.py``. + +.. _multi_app_source_code: Application Source Code -^^^^^^^^^^^^^^^^^^^^^^^ -.. literalinclude:: ../tutorials/getting_started/multi_db_example/application_script.py +----------------------- +.. literalinclude:: tutorials/getting_started/multi_db_example/application_script.py :language: python :linenos: +.. _multi_exp_source_code: + Experiment Source Code -^^^^^^^^^^^^^^^^^^^^^^ -.. literalinclude:: ../tutorials/getting_started/multi_db_example/multidb_driver.py +---------------------- +.. literalinclude:: tutorials/getting_started/multi_db_example/multidb_driver.py :language: python :linenos: \ No newline at end of file diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index e883a2805..696881bef 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -12,3 +12,7 @@ ipython jinja2==3.1.2 protobuf numpy +sphinx-design +pypandoc +sphinx-autodoc-typehints +myst_parser diff --git a/doc/run_settings.rst b/doc/run_settings.rst new file mode 100644 index 000000000..ed12df8cb --- /dev/null +++ b/doc/run_settings.rst @@ -0,0 +1,334 @@ +.. _run_settings_doc: + +************ +Run Settings +************ +======== +Overview +======== +``RunSettings`` are used in the SmartSim API to define how ``Model`` and ``Ensemble`` jobs +should be executed. 
+ +In general, ``RunSettings`` define: + +- the executable +- the arguments to pass to the executable +- necessary environment variables at runtime +- the required compute resources + +The base ``RunSettings`` class is utilized for local task launches, +while its derived child classes offer specialized functionality for HPC workload managers (WLMs). +Each SmartSim `launcher` interfaces with a specific ``RunSettings`` subclass tailored to an HPC job scheduler. + +- Navigate to the :ref:`Local` section to configure run settings locally +- Navigate to the :ref:`HPC Systems` section to configure run settings for HPC + +A ``RunSettings`` object is initialized through the ``Experiment.create_run_settings`` function. +This function accepts a `run_command` argument: the command to run the executable. + +If `run_command` is set to `"auto"`, SmartSim will attempt to match a run command on the +system with a ``RunSettings`` class. If found, the class corresponding to +that `run_command` will be created and returned. + +If the `run_command` is passed a recognized run command (e.g. `"srun"`) the ``RunSettings`` +instance will be a child class such as ``SrunSettings``. You may also specify `"mpirun"`, +`"mpiexec"`, `"aprun"`, `"jsrun"` or `"orterun"` to the `run_command` argument. +This will return the associated child class. + +If the run command is not supported by SmartSim, the base ``RunSettings`` class will be created and returned +with the specified `run_command` and `run_args` evaluated literally. + +After creating a ``RunSettings`` instance, users gain access to the attributes and methods +of the associated child class, providing them with the ability to further configure the run +settings for jobs. + +======== +Examples +======== +.. _run_settings_local_ex: + +Local +===== +When running SmartSim on laptops and single node workstations via the `"local"` +`launcher`, job execution is configured with the base ``RunSettings`` object. 
+For local launches, ``RunSettings`` accepts a `run_command` parameter to allow +the use of parallel launch binaries like `"mpirun"`, `"mpiexec"`, and others. + +If no `run_command` is specified and the ``Experiment`` `launcher` is set to `"local"`, +the executable is launched locally. When utilizing the `"local"` launcher and configuring +the `run_command` parameter to `"auto"` in the ``Experiment.create_run_settings`` factory +method, SmartSim defaults to omitting any run command prefix before the executable. + +Once the ``RunSettings`` object is initialized using the ``Experiment.create_run_settings`` factory +method, the :ref:`RunSettings API` can be used to further configure the +``RunSettings`` object prior to execution. + +.. note:: + The local `launcher` is the default `launcher` for all ``Experiment`` instances. + +When the user initializes the ``Experiment`` at the beginning of the Python driver script, +a `launcher` argument may be specified. SmartSim will register or detect the +`launcher` and return the supported class upon a call to ``Experiment.create_run_settings``. +Below we demonstrate creating and configuring the base ``RunSettings`` +object for local launches by specifying the `"local"` launcher during ``Experiment`` creation. +We also demonstrate specifying `run_command="mpirun"` locally. + +**Initialize and Configure a RunSettings Object with No Run Command Specified:** + +.. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher local + exp = Experiment("name-of-experiment", launcher="local") + + + # Initialize a RunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command=None) + +**Initialize and Configure a RunSettings Object with the `mpirun` Run Command Specified:** + +.. note:: + Please note that to run this example you need to have an MPI implementation + (e.g. OpenMPI or MPICH) installed. + +.. 
code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher local + exp = Experiment("name-of-experiment", launcher="local") + + # Initialize a RunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="mpirun") + +Users may replace `mpirun` with `mpiexec`. + +.. _run_settings_hpc_ex: + +HPC System +========== +To configure an entity for launch on an HPC system, SmartSim offers ``RunSettings`` child classes. +Each WLM `launcher` supports different ``RunSettings`` child classes. +When the user initializes the ``Experiment`` at the beginning of the Python driver script, +a `launcher` argument may be specified. The specified `launcher` will be used by SmartSim to +return the correct ``RunSettings`` child class that matches with the specified (or auto-detected) +`run_command` upon a call to ``Experiment.create_run_settings``. Below we demonstrate +creating and configuring the base ``RunSettings`` object for HPC launches +by specifying the launcher during ``Experiment`` creation. We show examples +for each job scheduler. + +.. tabs:: + + .. group-tab:: Slurm + + The Slurm `launcher` supports the :ref:`SrunSettings API ` as well as the :ref:`MpirunSettings API `, + :ref:`MpiexecSettings API ` and :ref:`OrterunSettings API ` that each can be used to run executables + with launch binaries like `"srun"`, `"mpirun"`, `"mpiexec"` and `"orterun"`. Below we step through initializing a ``SrunSettings`` and ``MpirunSettings`` + instance on a Slurm based machine using the associated `run_command`. + + **SrunSettings** + + Run a job with the `srun` command on a Slurm based system. Any arguments passed in + the `run_args` dict will be converted into `srun` arguments and prefixed with `"--"`. + Values of `None` can be provided for arguments that do not have values. + + .. 
code-block:: python + + from smartsim import Experiment + + # Initialize the Experiment and provide launcher Slurm + exp = Experiment("name-of-experiment", launcher="slurm") + + # Initialize a SrunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="srun") + # Set the number of nodes + run_settings.set_nodes(4) + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + **MpirunSettings** + + Run a job with the `mpirun` command (MPI-standard) on a Slurm based system. Any + arguments passed in the `run_args` dict will be converted into `mpirun` arguments + and prefixed with `"--"`. Values of `None` can be provided for arguments that do + not have values. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the Experiment and provide launcher Slurm + exp = Experiment("name-of-experiment", launcher="slurm") + + # Initialize a MpirunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="mpirun") + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + Users may replace `mpirun` with `mpiexec` or `orterun`. + + + .. note:: + SmartSim will look for an allocation by accessing the associated WLM job ID environment variable. If an allocation + is present, the entity will be launched on the reserved compute resources. A user may also specify the allocation ID + when initializing a run settings object via the `alloc` argument. If an allocation is specified, the entity receiving + these run parameters will launch on that allocation. + + .. 
group-tab:: PBS Pro + The PBS Pro `launcher` supports the :ref:`AprunSettings API ` as well as the :ref:`MpirunSettings API `, + :ref:`MpiexecSettings API ` and :ref:`OrterunSettings API ` that each can be used to run executables + with launch binaries like `"aprun"`, `"mpirun"`, `"mpiexec"` and `"orterun"`. Below we step through initializing a ``AprunSettings`` and ``MpirunSettings`` + instance on a PBS Pro based machine using the associated `run_command`. + + **AprunSettings** + + Run a job with `aprun` command on a PBS Pro based system. Any arguments passed in + the `run_args` dict will be converted into `aprun` arguments and prefixed with `--`. + Values of `None` can be provided for arguments that do not have values. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher PBS Pro + exp = Experiment("name-of-experiment", launcher="pbs") + + # Initialize a AprunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="aprun") + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + **MpirunSettings** + + Run a job with `mpirun` command on a PBS Pro based system. Any arguments passed + in the `run_args` dict will be converted into `mpirun` arguments and prefixed with `--`. + Values of `None` can be provided for arguments that do not have values. + + .. 
code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher PBS Pro + exp = Experiment("name-of-experiment", launcher="pbs") + + # Initialize a MpirunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="mpirun") + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + Users may replace `mpirun` with `mpiexec` or `orterun`. + + .. group-tab:: PALS + The PALS `launcher` supports the :ref:`MpiexecSettings API ` that can be used to run executables + with the `mpiexec` launch binary. Below we step through initializing a ``MpiexecSettings`` instance on a PALS + based machine using the associated `run_command`. + + **MpiexecSettings** + + Run a job with `mpiexec` command on a PALS based system. Any arguments passed in the `run_args` dict will be converted into `mpiexec` arguments and prefixed with `--`. + Values of `None` can be provided for arguments that do not have values. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher PALS + exp = Experiment("name-of-experiment", launcher="pals") + + # Initialize a MpiexecSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="mpiexec") + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + .. group-tab:: LSF + The LSF `launcher` supports the :ref:`JsrunSettings API ` as well as the :ref:`MpirunSettings API `, + :ref:`MpiexecSettings API ` and :ref:`OrterunSettings API ` that each can be used to run executables + with launch binaries like `"jsrun"`, `"mpirun"`, `"mpiexec"` and `"orterun"`. 
Below we step through initializing a ``JsrunSettings`` and ``MpirunSettings`` + instance on a LSF based machine using the associated `run_command`. + + **JsrunSettings** + + Run a job with `jsrun` command on a LSF based system. Any arguments passed in the + `run_args` dict will be converted into `jsrun` arguments and prefixed with `--`. + Values of `None` can be provided for arguments that do not have values. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher LSF + exp = Experiment("name-of-experiment", launcher="lsf") + + # Initialize a JsrunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="jsrun") + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + **MpirunSettings** + + Run a job with `mpirun` command on a LSF based system. Any arguments passed in the + `run_args` dict will be converted into `mpirun` arguments and prefixed with `--`. + Values of `None` can be provided for arguments that do not have values. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher LSF + exp = Experiment("name-of-experiment", launcher="lsf") + + # Initialize a MpirunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World", run_command="mpirun") + # Set the number of cpus to use per task + run_settings.set_cpus_per_task(2) + # Set the number of tasks for this job + run_settings.set_tasks(100) + # Set the number of tasks for this job + run_settings.set_tasks_per_node(25) + + Users may replace `mpirun` with `mpiexec` or `orterun`. + + .. group-tab:: Dragon + The Dragon `launcher` does not need any launch binary. 
Below we step through initializing a ``DragonRunSettings`` instance on a Slurm- + or PBS-based machine. + + **DragonRunSettings** + + Run a job with the `dragon` launcher. + + .. code-block:: python + + from smartsim import Experiment + + # Initialize the experiment and provide launcher dragon + exp = Experiment("name-of-experiment", launcher="dragon") + + # Initialize a DragonRunSettings object + run_settings = exp.create_run_settings(exe="echo", exe_args="Hello World") + # Set the number of nodes for this job + run_settings.set_nodes(4) + # Set the number of tasks per node for this job + run_settings.set_tasks_per_node(10) diff --git a/doc/sr_advanced_topics.rst b/doc/sr_advanced_topics.rst index 30da2c578..763a7fbe7 100644 --- a/doc/sr_advanced_topics.rst +++ b/doc/sr_advanced_topics.rst @@ -1,2 +1,2 @@ - +.. _config_options_explain: .. include:: ../smartredis/doc/advanced_topics.rst \ No newline at end of file diff --git a/doc/ss_logger.rst b/doc/ss_logger.rst new file mode 100644 index 000000000..186e28a89 --- /dev/null +++ b/doc/ss_logger.rst @@ -0,0 +1,221 @@ +****** +Logger +****** + +.. _ss_logger: + +======== +Overview +======== +SmartSim supports logging experiment activity through a logging API accessible via +the SmartSim `log` module. The SmartSim logger, backed by Python logging, enables +real-time logging of experiment activity **to stdout** and/or **to file**, with +multiple verbosity levels for categorizing log messages. + +Users may instruct SmartSim to log certain verbosity level log messages +and omit others through the `SMARTSIM_LOG_LEVEL` environment variable. The `SMARTSIM_LOG_LEVEL` +environment variable may be overridden when logging to file by specifying a log level to +the ``log_to_file`` function. Examples walking through logging :ref:`to stdout` +and :ref:`to file` are provided below. + +SmartSim offers **four** log functions to use within the Python driver script. 
The +below functions accept string messages: + +- ``logger.error`` +- ``logger.warning`` +- ``logger.info`` +- ``logger.debug`` + +The `SMARTSIM_LOG_LEVEL` environment variable accepts **four** log levels: `quiet`, +`info`, `debug` and `developer`. Setting the log level in the environment (or via the override function) +controls the log messages that are output at runtime. The log levels are listed below from +least verbose to most verbose: + +- level: `quiet` + - The `quiet` log level instructs SmartSim to print ``error`` and ``warning`` messages. +- level: `info` + - The `info` log level instructs SmartSim to print ``info``, ``error`` and ``warning`` messages. +- level: `debug` + - The `debug` log level instructs SmartSim to print ``debug``, ``info``, ``error`` and ``warning`` messages. +- level: `developer` + - The `developer` log level instructs SmartSim to print ``debug``, ``info``, ``error`` and ``warning`` messages. + +.. note:: + Levels `developer` and `debug` print the same log messages. The `developer` log level is intended for use + during code development and signifies highly detailed and verbose logging. + +.. note:: + `SMARTSIM_LOG_LEVEL` defaults to log level `info`. For SmartSim log API examples, continue to the :ref:`Examples` section. + +.. _log_ex: + +======== +Examples +======== +.. _log_to_stdout: + +------------- +Log to stdout +------------- +The ``get_logger`` function in SmartSim enables users to initialize a logger instance. +Once initialized, a user may use the instance to log a message using one of the four +logging functions. + +To use the SmartSim logger within a Python script, import the required `get_logger` +function from the `log` module: + +.. code-block:: python + + from smartsim.log import get_logger + +Next, initialize an instance of the logger and provide a logger `name`: + +.. 
code-block:: python + + logger = get_logger("SmartSim") + +To demonstrate full functionality of the SmartSim logger, we include all log +functions in the Python driver script with log messages: + +.. code-block:: python + + logger.info("This is a message") + logger.debug("This is a debug message") + logger.error("This is an error message") + logger.warning("This is a warning message") + +Execute the script *without* setting the `SMARTSIM_LOG_LEVEL`. Remember that `SMARTSIM_LOG_LEVEL` +defaults to `info`. When we execute the script, the following messages will print to stdout: + +.. code-block:: bash + + 11:15:00 system.host.com SmartSim[130033] INFO This is a message + 11:15:00 system.host.com SmartSim[130033] ERROR This is an error message + 11:15:00 system.host.com SmartSim[130033] WARNING This is a warning message + +Notice that the `debug` function message was filtered. This is because by using +a lower verbosity level (`info`), we instruct SmartSim to omit the higher verbosity level messages (`debug` and `developer`). + +Next, set `SMARTSIM_LOG_LEVEL` to `debug`: + +.. code-block:: bash + + export SMARTSIM_LOG_LEVEL=debug + +When we execute the script again, +the following messages will print to stdout: + +.. code-block:: bash + + 11:15:00 system.host.com SmartSim[65385] INFO This is a message + 11:15:00 system.host.com SmartSim[65385] DEBUG This is a debug message + 11:15:00 system.host.com SmartSim[65385] ERROR This is an error message + 11:15:00 system.host.com SmartSim[65385] WARNING This is a warning message + +Notice that all log messages print to stdout. By using a higher verbosity level (`debug`), +we instruct SmartSim to print all log functions at and above the level. + +Next, set `SMARTSIM_LOG_LEVEL` to `quiet` in terminal: + +.. code-block:: bash + + export SMARTSIM_LOG_LEVEL=quiet + +When we run the program once again, the following output is printed +to stdout: + +.. 
code-block:: bash + + 11:15:00 system.host.com SmartSim[65385] ERROR This is an error message + 11:15:00 system.host.com SmartSim[65385] WARNING This is a warning message + +Notice that the `info` and `debug` log functions were filtered. This is because by using +the least verbose level (`quiet`), we instruct SmartSim to omit messages at higher verbosity levels +(`info`, `debug` and `developer`). + +To finish the example, set `SMARTSIM_LOG_LEVEL` to `info` in terminal: + +.. code-block:: bash + + export SMARTSIM_LOG_LEVEL=info + +When we execute the script, the following messages will print +to stdout: + +.. code-block:: bash + + 11:15:00 system.host.com SmartSim[130033] INFO This is a message + 11:15:00 system.host.com SmartSim[130033] ERROR This is an error message + 11:15:00 system.host.com SmartSim[130033] WARNING This is a warning message + +Notice that the same messages were logged to stdout as when we ran the script with the default value `info`. +SmartSim omits messages at higher verbosity levels (`debug` and `developer`). + +.. _log_to_file: + +--------------- +Logging to File +--------------- +The ``log_to_file`` function in SmartSim allows users to log messages +to a specified file by providing a file name or relative file path. If the file name +passed in does not exist, SmartSim will create the file. If the program is re-executed with the same +file name, the file contents will be overwritten. + +To demonstrate, begin by importing the functions `get_logger` and `log_to_file` from the `log` module: + +.. code-block:: python + + from smartsim.log import get_logger, log_to_file + +Initialize a logger for use within the Python driver script: + +.. code-block:: python + + logger = get_logger("SmartSim") + +Invoke the ``log_to_file`` function to instruct SmartSim to create a file named `logger.out` +to write log messages to: + +.. code-block:: python + + log_to_file("logger.out") + +For the example, we add all log functions to the script: + +.. 
code-block:: python + + logger.info("This is a message") + logger.debug("This is a debug message") + logger.error("This is an error message") + logger.warning("This is a warning message") + +Remember that the default value for the `SMARTSIM_LOG_LEVEL` variable is `info`. +Therefore, we will not set the environment variable and instead rely on the +default. + +When we execute the Python script, a file named `logger.out` is created in our working +directory with the listed contents: + +.. code-block:: bash + + 11:15:00 system.host.com SmartSim[10950] INFO This is a message + 11:15:00 system.host.com SmartSim[10950] ERROR This is an error message + 11:15:00 system.host.com SmartSim[10950] WARNING This is a warning message + +Notice that the `debug` function message was filtered. This is because by using +a lower verbosity level (`info`), we instruct SmartSim to omit higher verbosity messages (`debug` and `developer`). + +In the same Python script, add a log level to the ``log_to_file`` as a input argument: + +.. code-block:: python + + log_to_file("logger.out", "quiet") + +When we execute the Python script once again, SmartSim will override the `SMARTSIM_LOG_LEVEL` +variable to output messages of log level `quiet`. SmartSim will overwrite the contents +of `logger.out` with: + +.. code-block:: bash + + 11:15:00 system.host.com SmartSim[10950] ERROR This is an error message + 11:15:00 system.host.com SmartSim[10950] WARNING This is a warning message \ No newline at end of file diff --git a/doc/testing.rst b/doc/testing.rst index ccb2db3c2..08cce5d36 100644 --- a/doc/testing.rst +++ b/doc/testing.rst @@ -66,20 +66,20 @@ of the tests located within the ``on_wlm`` directory. To run the ``on_wlm`` test suite, users will have to be on a system with one of the supported workload managers. Additionally, users will -need to obtain an allocation of **at least 4 nodes**. +need to obtain an allocation of **at least 8 nodes**. 
Examples of how to obtain allocations on systems with the launchers: .. code:: bash # for slurm (with srun) - salloc -N 4 -A account --exclusive -t 00:10:00 + salloc -N 8 -A account --exclusive -t 00:10:00 # for PBSPro (with aprun) - qsub -l select=4 -l place=scatter -l walltime=00:10:00 -q queue + qsub -l select=8 -l place=scatter -l walltime=00:10:00 -q queue # for LSF (with jsrun) - bsub -Is -W 00:30 -nnodes 4 -P project $SHELL + bsub -Is -W 00:30 -nnodes 8 -P project $SHELL Values for queue, account, or project should be substituted appropriately. @@ -119,7 +119,7 @@ A full example on an internal SLURM system .. code:: bash - salloc -N 4 -A account --exclusive -t 03:00:00 + salloc -N 8 -A account --exclusive -t 03:00:00 export SMARTSIM_TEST_LAUNCHER=slurm export SMARTSIM_TEST_INTERFACE=ipogif0 export SMARTSIM_TEST_DEVICE=gpu diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py b/doc/tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py new file mode 100644 index 000000000..57d720163 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/application_consumer_script.py @@ -0,0 +1,17 @@ +from smartredis import Client, LLInfo + +# Initialize a Client +client = Client(cluster=False) + +# Set the data source +client.set_data_source("producer_0") +# Check if the tensor exists +tensor_1 = client.poll_tensor("tensor", 100, 100) + +# Set the data source +client.set_data_source("producer_1") +# Check if the tensor exists +tensor_2 = client.poll_tensor("tensor", 100, 100) + +client.log_data(LLInfo, f"producer_0.tensor was found: {tensor_1}") +client.log_data(LLInfo, f"producer_1.tensor was found: {tensor_2}") \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py b/doc/tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py new file mode 100644 index 000000000..619a56e05 --- /dev/null +++ 
b/doc/tutorials/doc_examples/ensemble_doc_examples/application_producer_script.py @@ -0,0 +1,10 @@ +from smartredis import Client +import numpy as np + +# Initialize a Client +client = Client(cluster=False) + +# Create NumPy array +array = np.array([1, 2, 3, 4]) +# Use SmartRedis Client to place tensor in standalone Orchestrator +client.put_tensor("tensor", array) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_file.py b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_file.py new file mode 100644 index 000000000..a2fa206f5 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_file.py @@ -0,0 +1,40 @@ +from smartsim import Experiment +from tensorflow import keras +from tensorflow.keras.layers import Conv2D, Input + +class Net(keras.Model): + def __init__(self): + super(Net, self).__init__(name="cnn") + self.conv = Conv2D(1, 3, 1) + + def call(self, x): + y = self.conv(x) + return y + +def save_tf_cnn(path, file_name): + """Create a Keras CNN and save to file for example purposes""" + from smartsim.ml.tf import freeze_model + + n = Net() + input_shape = (3, 3, 1) + n.build(input_shape=(None, *input_shape)) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return freeze_model(model, path, file_name) + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +ensemble_instance = exp.create_ensemble("ensemble_name", ensemble_settings) + +# Serialize and save TF model to file +model_file, inputs, outputs = save_tf_cnn(ensemble_instance.path, "model.pb") + +# Attach ML model file to Ensemble +ensemble_instance.add_ml_model(name="cnn", backend="TF", model_path=model_file, device="GPU", 
devices_per_node=2, first_device=0, inputs=inputs, outputs=outputs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_mem.py b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_mem.py new file mode 100644 index 000000000..98974fdc2 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_ml_model_mem.py @@ -0,0 +1,40 @@ +from smartsim import Experiment +from tensorflow import keras +from tensorflow.keras.layers import Conv2D, Input + +class Net(keras.Model): + def __init__(self): + super(Net, self).__init__(name="cnn") + self.conv = Conv2D(1, 3, 1) + + def call(self, x): + y = self.conv(x) + return y + +def create_tf_cnn(): + """Create an in-memory Keras CNN for example purposes + + """ + from smartsim.ml.tf import serialize_model + n = Net() + input_shape = (3,3,1) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return serialize_model(model) + +# Serialize and save TF model +model, inputs, outputs = create_tf_cnn() + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +ensemble_instance = exp.create_ensemble("ensemble_name", ensemble_settings) + +# Attach the in-memory ML model to the SmartSim Ensemble +ensemble_instance.add_ml_model(name="cnn", backend="TF", model=model, device="GPU", devices_per_node=2, first_device=0, inputs=inputs, outputs=outputs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_file.py b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_file.py new file mode 100644 index 000000000..819ed814f --- /dev/null +++ 
b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_file.py @@ -0,0 +1,13 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +ensemble_instance = exp.create_ensemble("ensemble_name", ensemble_settings) + +# Attach TorchScript to Ensemble +ensemble_instance.add_script(name="example_script", script_path="path/to/torchscript.py", device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py new file mode 100644 index 000000000..3e68bfd5a --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_mem.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +def timestwo(x): + return 2*x + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Ensemble object +ensemble_instance = exp.create_ensemble("ensemble_name", ensemble_settings) + +# Attach TorchScript to Ensemble +ensemble_instance.add_function(name="example_func", function=timestwo, device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py new file mode 100644 index 000000000..b8f907e9a --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/ensemble_torchscript_string.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +# 
Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="path/to/executable/simulation") + +# Initialize a Model object +ensemble_instance = exp.create_ensemble("ensemble_name", ensemble_settings) + +# TorchScript string +torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + +# Attach TorchScript to Ensemble +ensemble_instance.add_script(name="example_script", script=torch_script_str, device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py b/doc/tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py new file mode 100644 index 000000000..1a1db58e4 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/experiment_driver.py @@ -0,0 +1,42 @@ +from smartsim import Experiment +from smartsim.log import get_logger + +logger = get_logger("Experiment Log") +# Initialize the Experiment +exp = Experiment("getting-started", launcher="auto") + +# Initialize a standalone Orchestrator +standalone_orch = exp.create_database(db_nodes=1) + +# Initialize a RunSettings object for Ensemble +ensemble_settings = exp.create_run_settings(exe="/path/to/executable_producer_simulation") + +# Initialize Ensemble +producer_ensemble = exp.create_ensemble("producer", run_settings=ensemble_settings, replicas=2) + +# Enable key prefixing for Ensemble members +producer_ensemble.enable_key_prefixing() + +# Initialize a RunSettings object for Model +model_settings = exp.create_run_settings(exe="/path/to/executable_consumer_simulation") +# Initialize Model +consumer_model = exp.create_model("consumer", model_settings) + +# Generate SmartSim entity folder tree +exp.generate(standalone_orch, producer_ensemble, consumer_model, overwrite=True) + +# Launch Orchestrator +exp.start(standalone_orch, summary=True) + +# Launch Ensemble 
+exp.start(producer_ensemble, block=True, summary=True) + +# Register Ensemble members on consumer Model +for model in producer_ensemble: + consumer_model.register_incoming_entity(model) + +# Launch consumer Model +exp.start(consumer_model, block=True, summary=True) + +# Clobber Orchestrator +exp.stop(standalone_orch) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/file_attach.py b/doc/tutorials/doc_examples/ensemble_doc_examples/file_attach.py new file mode 100644 index 000000000..68f233342 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/file_attach.py @@ -0,0 +1,20 @@ +from smartsim import Experiment + +# Initialize the Experiment +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +ensemble_settings = exp.create_run_settings(exe="python", exe_args="/path/to/application.py") + +# Initialize an Ensemble object via replicas strategy +example_ensemble = exp.create_ensemble("ensemble", ensemble_settings, replicas=2, params={"THERMO":1}) + +# Attach the file to the Ensemble instance +example_ensemble.attach_generator_files(to_configure="path/to/params_inputs.txt") + +# Generate the Ensemble directory +exp.generate(example_ensemble) + +# Launch the Ensemble +exp.start(example_ensemble) + diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py b/doc/tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py new file mode 100644 index 000000000..89c9ea27e --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/manual_append_ensemble.py @@ -0,0 +1,25 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize BatchSettings +bs = exp.create_batch_settings(nodes=10, + time="01:00:00") + +# Initialize Ensemble +ensemble = exp.create_ensemble("ensemble-append", batch_settings=bs) + +# Initialize RunSettings for 
Model 1 +srun_settings_1 = exp.create_run_settings(exe=exe, exe_args="path/to/application_script_1.py") +# Initialize RunSettings for Model 2 +srun_settings_2 = exp.create_run_settings(exe=exe, exe_args="path/to/application_script_2.py") +# Initialize Model 1 with RunSettings 1 +model_1 = exp.create_model(name="model_1", run_settings=srun_settings_1) +# Initialize Model 2 with RunSettings 2 +model_2 = exp.create_model(name="model_2", run_settings=srun_settings_2) + +# Add Model member to Ensemble +ensemble.add_model(model_1) +# Add Model member to Ensemble +ensemble.add_model(model_2) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py b/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py new file mode 100644 index 000000000..6ccbce397 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_1.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings +rs = exp.create_run_settings(exe="path/to/example_simulation_program") + +#Create the parameters to expand to the Ensemble members +params = { + "name": ["Ellie", "John"], + "parameter": [2, 11] + } + +# Initialize the Ensemble by specifying RunSettings, the params and "all_perm" +ensemble = exp.create_ensemble("model_member", run_settings=rs, params=params, perm_strategy="all_perm") diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py b/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py new file mode 100644 index 000000000..f6fb30967 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/param_expansion_2.py @@ -0,0 +1,21 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a BatchSettings +bs = 
exp.create_batch_settings(nodes=2, + time="10:00:00") + +# Initialize and configure RunSettings +rs = exp.create_run_settings(exe="python", exe_args="path/to/application_script.py") +rs.set_nodes(1) + +#Create the parameters to expand to the Ensemble members +params = { + "name": ["Ellie", "John"], + "parameter": [2, 11] + } + +# Initialize the Ensemble by specifying RunSettings, BatchSettings, the params and "step" +ensemble = exp.create_ensemble("ensemble", run_settings=rs, batch_settings=bs, params=params, perm_strategy="step") \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_1.py b/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_1.py new file mode 100644 index 000000000..0dd5d16f5 --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_1.py @@ -0,0 +1,10 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +rs = exp.create_run_settings(exe="python", exe_args="path/to/application_script.py") + +# Initialize the Ensemble by specifying the number of replicas and RunSettings +ensemble = exp.create_ensemble("ensemble-replica", replicas=4, run_settings=rs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_2.py b/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_2.py new file mode 100644 index 000000000..e2363a5be --- /dev/null +++ b/doc/tutorials/doc_examples/ensemble_doc_examples/replicas_2.py @@ -0,0 +1,15 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a BatchSettings object +bs = exp.create_batch_settings(nodes=4, + time="10:00:00") + +# Initialize and configure a RunSettings object +rs = exp.create_run_settings(exe="python", exe_args="path/to/application_script.py") 
+rs.set_nodes(4) + +# Initialize an Ensemble +ensemble = exp.create_ensemble("ensemble-replica", replicas=4, run_settings=rs, batch_settings=bs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/experiment_doc_examples/exp.py b/doc/tutorials/doc_examples/experiment_doc_examples/exp.py new file mode 100644 index 000000000..b5374e7bd --- /dev/null +++ b/doc/tutorials/doc_examples/experiment_doc_examples/exp.py @@ -0,0 +1,30 @@ +from smartsim import Experiment +from smartsim._core.control.previewrenderer import Verbosity +from smartsim.log import get_logger + +# Initialize an Experiment +exp = Experiment("example-experiment", launcher="auto") +# Initialize a SmartSim logger +smartsim_logger = get_logger("logger") + +# Initialize an Orchestrator +standalone_database = exp.create_database(db_nodes=3, port=6379, interface="ib0") + +# Initialize the Model RunSettings +settings = exp.create_run_settings("echo", exe_args="Hello World") +# Initialize the Model +model = exp.create_model("hello_world", settings) + +# Generate the output directory +exp.generate(standalone_database, model, overwrite=True) + +# Preview the experiment +exp.preview(standalone_database, model, verbosity_level=Verbosity.DEBUG) + +# Launch the Orchestrator then Model instance +exp.start(standalone_database, model) + +# Clobber the Orchestrator +exp.stop(standalone_database) +# Log the summary of the Experiment +smartsim_logger.info(exp.summary()) diff --git a/doc/tutorials/doc_examples/model_doc_examples/from_file_ml_model.py b/doc/tutorials/doc_examples/model_doc_examples/from_file_ml_model.py new file mode 100644 index 000000000..329d08edc --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/from_file_ml_model.py @@ -0,0 +1,40 @@ +from smartsim import Experiment +from tensorflow import keras +from tensorflow.keras.layers import Conv2D, Input + +class Net(keras.Model): + def __init__(self): + super(Net, self).__init__(name="cnn") + self.conv = Conv2D(1, 3, 1) + + def 
call(self, x): + y = self.conv(x) + return y + +def save_tf_cnn(path, file_name): + """Create a Keras CNN and save to file for example purposes""" + from smartsim.ml.tf import freeze_model + + n = Net() + input_shape = (3, 3, 1) + n.build(input_shape=(None, *input_shape)) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return freeze_model(model, path, file_name) + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings) + +# Get and save TF model +model_file, inputs, outputs = save_tf_cnn(model_instance.path, "model.pb") + +# Attach the from file ML model to the SmartSim Model +model_instance.add_ml_model(name="cnn", backend="TF", model_path=model_file, device="GPU", devices_per_node=2, first_device=0, inputs=inputs, outputs=outputs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/from_file_script.py b/doc/tutorials/doc_examples/model_doc_examples/from_file_script.py new file mode 100644 index 000000000..ca6dcaea1 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/from_file_script.py @@ -0,0 +1,14 @@ + +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings) + +# Attach TorchScript to Model +model_instance.add_script(name="example_script", script_path="path/to/torchscript.py", device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git 
a/doc/tutorials/doc_examples/model_doc_examples/in_mem_ml_model.py b/doc/tutorials/doc_examples/model_doc_examples/in_mem_ml_model.py new file mode 100644 index 000000000..a34cceb4a --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/in_mem_ml_model.py @@ -0,0 +1,40 @@ +from smartsim import Experiment +from tensorflow import keras +from tensorflow.keras.layers import Conv2D, Input + +class Net(keras.Model): + def __init__(self): + super(Net, self).__init__(name="cnn") + self.conv = Conv2D(1, 3, 1) + + def call(self, x): + y = self.conv(x) + return y + +def create_tf_cnn(): + """Create an in-memory Keras CNN for example purposes + + """ + from smartsim.ml.tf import serialize_model + n = Net() + input_shape = (3,3,1) + inputs = Input(input_shape) + outputs = n(inputs) + model = keras.Model(inputs=inputs, outputs=outputs, name=n.name) + + return serialize_model(model) + +# Serialize and save TF model +model, inputs, outputs = create_tf_cnn() + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings) + +# Attach the in-memory ML model to the SmartSim Model +model_instance.add_ml_model(name="cnn", backend="TF", model=model, device="GPU", devices_per_node=2, first_device=0, inputs=inputs, outputs=outputs) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/in_mem_script.py b/doc/tutorials/doc_examples/model_doc_examples/in_mem_script.py new file mode 100644 index 000000000..634746085 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/in_mem_script.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +def timestwo(x): + return 2*x + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", 
launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/example_simulation_program") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings) + +# Append TorchScript function to Model +model_instance.add_function(name="example_func", function=timestwo, device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/model_file.py b/doc/tutorials/doc_examples/model_doc_examples/model_file.py new file mode 100644 index 000000000..8961d50a8 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/model_file.py @@ -0,0 +1,19 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/executable/simulation") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings, params={"THERMO":1}) + +# Attach the file to the Model instance +model_instance.attach_generator_files(to_configure="path/to/params_inputs.txt") + +# Store model_instance outputs within the Experiment directory named getting-started +exp.generate(model_instance) + +# Launch the Model +exp.start(model_instance) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/model_init.py b/doc/tutorials/doc_examples/model_doc_examples/model_init.py new file mode 100644 index 000000000..b1bb090f4 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/model_init.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +# Init Experiment and specify to launch locally in this example +exp = Experiment(name="getting-started", launcher="local") + +# Initialize RunSettings +model_settings = exp.create_run_settings(exe="echo", exe_args="Hello World") + +# Initialize Model instance +model_instance = 
exp.create_model(name="example-model", run_settings=model_settings) + +# Generate Model directory +exp.generate(model_instance) + +# Launch Model +exp.start(model_instance) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/prefix_data.py b/doc/tutorials/doc_examples/model_doc_examples/prefix_data.py new file mode 100644 index 000000000..da4034d82 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/prefix_data.py @@ -0,0 +1,12 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Create the run settings for the Model +model_settings = exp.create_run_settings(exe="path/to/executable/simulation") + +# Create a Model instance named 'model' +model = exp.create_model("model_name", model_settings) +# Enable tensor, Dataset and list prefixing on the 'model' instance +model.enable_key_prefixing() \ No newline at end of file diff --git a/doc/tutorials/doc_examples/model_doc_examples/string_script.py b/doc/tutorials/doc_examples/model_doc_examples/string_script.py new file mode 100644 index 000000000..52495ab47 --- /dev/null +++ b/doc/tutorials/doc_examples/model_doc_examples/string_script.py @@ -0,0 +1,16 @@ +from smartsim import Experiment + +# Initialize the Experiment and set the launcher to auto +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/executable/simulation") + +# Initialize a Model object +model_instance = exp.create_model("model_name", model_settings) + +# TorchScript string +torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + +# Attach TorchScript to Model +model_instance.add_script(name="example_script", script=torch_script_str, device="GPU", devices_per_node=2, first_device=0) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/orch_examples/colo_app.py 
b/doc/tutorials/doc_examples/orch_examples/colo_app.py new file mode 100644 index 000000000..930789fab --- /dev/null +++ b/doc/tutorials/doc_examples/orch_examples/colo_app.py @@ -0,0 +1,15 @@ +from smartredis import Client, LLInfo +import numpy as np + +# Initialize a Client +colo_client = Client(cluster=False) + +# Create NumPy array +local_array = np.array([1, 2, 3, 4]) +# Store the NumPy tensor +colo_client.put_tensor("tensor_1", local_array) + +# Retrieve tensor from driver script +local_tensor = colo_client.get_tensor("tensor_1") +# Log tensor +colo_client.log_data(LLInfo, f"The colocated db tensor is: {local_tensor}") \ No newline at end of file diff --git a/doc/tutorials/doc_examples/orch_examples/colo_driver.py b/doc/tutorials/doc_examples/orch_examples/colo_driver.py new file mode 100644 index 000000000..fde06e9b7 --- /dev/null +++ b/doc/tutorials/doc_examples/orch_examples/colo_driver.py @@ -0,0 +1,29 @@ +import numpy as np +from smartredis import Client +from smartsim import Experiment +from smartsim.log import get_logger + +# Initialize a logger object +logger = get_logger("Example Experiment Log") +# Initialize the Experiment +exp = Experiment("getting-started", launcher="auto") + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="path/to/executable_simulation") +# Configure RunSettings object +model_settings.set_nodes(1) + +# Initialize a SmartSim Model +model = exp.create_model("colo_model", model_settings) + +# Colocate the Model +model.colocate_db_uds() + +# Generate output files +exp.generate(model) + +# Launch the colocated Model +exp.start(model, block=True, summary=True) + +# Log the Experiment summary +logger.info(exp.summary()) \ No newline at end of file diff --git a/doc/tutorials/doc_examples/orch_examples/std_app.py b/doc/tutorials/doc_examples/orch_examples/std_app.py new file mode 100644 index 000000000..67129fbf4 --- /dev/null +++ b/doc/tutorials/doc_examples/orch_examples/std_app.py @@ -0,0 +1,15 @@ 
+from smartredis import Client, LLInfo +import numpy as np + +# Initialize a SmartRedis Client +application_client = Client(cluster=True) + +# Retrieve the driver script tensor from Orchestrator +driver_script_tensor = application_client.get_tensor("tensor_1") +# Log the tensor +application_client.log_data(LLInfo, f"The multi-sharded db tensor is: {driver_script_tensor}") + +# Create a NumPy array +local_array = np.array([5, 6, 7, 8]) +# Use SmartRedis client to place tensor in multi-sharded db +application_client.put_tensor("tensor_2", local_array) diff --git a/doc/tutorials/doc_examples/orch_examples/std_driver.py b/doc/tutorials/doc_examples/orch_examples/std_driver.py new file mode 100644 index 000000000..cf425125b --- /dev/null +++ b/doc/tutorials/doc_examples/orch_examples/std_driver.py @@ -0,0 +1,46 @@ +import numpy as np +from smartredis import Client +from smartsim import Experiment +from smartsim.log import get_logger + +# Initialize the logger +logger = get_logger("Example Experiment Log") +# Initialize the Experiment +exp = Experiment("getting-started", launcher="auto") + +# Initialize a multi-sharded Orchestrator +standalone_orchestrator = exp.create_database(db_nodes=3) + +# Initialize a SmartRedis client for multi-sharded Orchestrator +driver_client = Client(cluster=True, address=standalone_orchestrator.get_address()[0]) + +# Create NumPy array +local_array = np.array([1, 2, 3, 4]) +# Use the SmartRedis client to place tensor in the standalone Orchestrator +driver_client.put_tensor("tensor_1", local_array) + +# Initialize a RunSettings object +model_settings = exp.create_run_settings(exe="/path/to/executable_simulation") +model_settings.set_nodes(1) + +# Initialize the Model +model = exp.create_model("model", model_settings) + +# Create the output directory +exp.generate(standalone_orchestrator, model) + +# Launch the multi-sharded Orchestrator +exp.start(standalone_orchestrator) + +# Launch the Model +exp.start(model, block=True, summary=True) + +# 
Poll the tensors placed by the Model +app_tensor = driver_client.poll_key("tensor_2", 100, 10) +# Validate that the tensor exists +logger.info(f"The tensor exists: {app_tensor}") + +# Cleanup the Orchestrator +exp.stop(standalone_orchestrator) +# Print the Experiment summary +logger.info(exp.summary()) \ No newline at end of file diff --git a/tutorials/getting_started/consumer.py b/doc/tutorials/getting_started/consumer.py similarity index 100% rename from tutorials/getting_started/consumer.py rename to doc/tutorials/getting_started/consumer.py diff --git a/tutorials/getting_started/getting_started.ipynb b/doc/tutorials/getting_started/getting_started.ipynb similarity index 100% rename from tutorials/getting_started/getting_started.ipynb rename to doc/tutorials/getting_started/getting_started.ipynb diff --git a/tutorials/getting_started/multi_db_example/application_script.py b/doc/tutorials/getting_started/multi_db_example/application_script.py similarity index 100% rename from tutorials/getting_started/multi_db_example/application_script.py rename to doc/tutorials/getting_started/multi_db_example/application_script.py diff --git a/tutorials/getting_started/multi_db_example/multidb_driver.py b/doc/tutorials/getting_started/multi_db_example/multidb_driver.py similarity index 100% rename from tutorials/getting_started/multi_db_example/multidb_driver.py rename to doc/tutorials/getting_started/multi_db_example/multidb_driver.py diff --git a/tutorials/getting_started/output_my_parameter.py b/doc/tutorials/getting_started/output_my_parameter.py similarity index 100% rename from tutorials/getting_started/output_my_parameter.py rename to doc/tutorials/getting_started/output_my_parameter.py diff --git a/tutorials/getting_started/output_my_parameter_new_tag.py b/doc/tutorials/getting_started/output_my_parameter_new_tag.py similarity index 100% rename from tutorials/getting_started/output_my_parameter_new_tag.py rename to 
doc/tutorials/getting_started/output_my_parameter_new_tag.py diff --git a/tutorials/getting_started/producer.py b/doc/tutorials/getting_started/producer.py similarity index 100% rename from tutorials/getting_started/producer.py rename to doc/tutorials/getting_started/producer.py diff --git a/tutorials/ml_inference/Inference-in-SmartSim.ipynb b/doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb similarity index 100% rename from tutorials/ml_inference/Inference-in-SmartSim.ipynb rename to doc/tutorials/ml_inference/Inference-in-SmartSim.ipynb diff --git a/tutorials/ml_inference/colo-db-torch-example.py b/doc/tutorials/ml_inference/colo-db-torch-example.py similarity index 100% rename from tutorials/ml_inference/colo-db-torch-example.py rename to doc/tutorials/ml_inference/colo-db-torch-example.py diff --git a/tutorials/ml_training/surrogate/LICENSE b/doc/tutorials/ml_training/surrogate/LICENSE similarity index 100% rename from tutorials/ml_training/surrogate/LICENSE rename to doc/tutorials/ml_training/surrogate/LICENSE diff --git a/tutorials/ml_training/surrogate/README.md b/doc/tutorials/ml_training/surrogate/README.md similarity index 100% rename from tutorials/ml_training/surrogate/README.md rename to doc/tutorials/ml_training/surrogate/README.md diff --git a/tutorials/ml_training/surrogate/fd_sim.py b/doc/tutorials/ml_training/surrogate/fd_sim.py similarity index 97% rename from tutorials/ml_training/surrogate/fd_sim.py rename to doc/tutorials/ml_training/surrogate/fd_sim.py index 8b128a319..db68b24b2 100644 --- a/tutorials/ml_training/surrogate/fd_sim.py +++ b/doc/tutorials/ml_training/surrogate/fd_sim.py @@ -18,12 +18,9 @@ def augment_batch(samples, targets): following NWHC ordering. 
:param samples: Samples to augment - :type samples: np.ndarray :param targets: Targets to augment - :type targets: np.ndarray :returns: Tuple of augmented samples and targets - :rtype: (np.ndarray, np.ndarray) """ batch_size = samples.shape[0] augmented_samples = np.empty((batch_size*8, *samples.shape[1:])) @@ -83,9 +80,7 @@ def simulate(steps, size): both as tensors and as augmented samples for training. :param steps: Number of simulations to run - :type steps: int :param size: lateral size of the discretized domain - :type size: int """ batch_size = 50 samples = np.zeros((batch_size,size,size,1)).astype(np.single) diff --git a/tutorials/ml_training/surrogate/steady_state.py b/doc/tutorials/ml_training/surrogate/steady_state.py similarity index 100% rename from tutorials/ml_training/surrogate/steady_state.py rename to doc/tutorials/ml_training/surrogate/steady_state.py diff --git a/tutorials/ml_training/surrogate/tf_model.py b/doc/tutorials/ml_training/surrogate/tf_model.py similarity index 100% rename from tutorials/ml_training/surrogate/tf_model.py rename to doc/tutorials/ml_training/surrogate/tf_model.py diff --git a/tutorials/ml_training/surrogate/tf_training.py b/doc/tutorials/ml_training/surrogate/tf_training.py similarity index 100% rename from tutorials/ml_training/surrogate/tf_training.py rename to doc/tutorials/ml_training/surrogate/tf_training.py diff --git a/tutorials/ml_training/surrogate/train_surrogate.ipynb b/doc/tutorials/ml_training/surrogate/train_surrogate.ipynb similarity index 100% rename from tutorials/ml_training/surrogate/train_surrogate.ipynb rename to doc/tutorials/ml_training/surrogate/train_surrogate.ipynb diff --git a/tutorials/ml_training/surrogate/vishelpers.py b/doc/tutorials/ml_training/surrogate/vishelpers.py similarity index 100% rename from tutorials/ml_training/surrogate/vishelpers.py rename to doc/tutorials/ml_training/surrogate/vishelpers.py diff --git a/tutorials/online_analysis/lattice/LICENSE 
b/doc/tutorials/online_analysis/lattice/LICENSE similarity index 100% rename from tutorials/online_analysis/lattice/LICENSE rename to doc/tutorials/online_analysis/lattice/LICENSE diff --git a/tutorials/online_analysis/lattice/README.md b/doc/tutorials/online_analysis/lattice/README.md similarity index 100% rename from tutorials/online_analysis/lattice/README.md rename to doc/tutorials/online_analysis/lattice/README.md diff --git a/tutorials/online_analysis/lattice/driver.py b/doc/tutorials/online_analysis/lattice/driver.py similarity index 100% rename from tutorials/online_analysis/lattice/driver.py rename to doc/tutorials/online_analysis/lattice/driver.py diff --git a/tutorials/online_analysis/lattice/fv_sim.py b/doc/tutorials/online_analysis/lattice/fv_sim.py similarity index 100% rename from tutorials/online_analysis/lattice/fv_sim.py rename to doc/tutorials/online_analysis/lattice/fv_sim.py diff --git a/tutorials/online_analysis/lattice/online_analysis.ipynb b/doc/tutorials/online_analysis/lattice/online_analysis.ipynb similarity index 100% rename from tutorials/online_analysis/lattice/online_analysis.ipynb rename to doc/tutorials/online_analysis/lattice/online_analysis.ipynb diff --git a/tutorials/online_analysis/lattice/probe.script b/doc/tutorials/online_analysis/lattice/probe.script similarity index 100% rename from tutorials/online_analysis/lattice/probe.script rename to doc/tutorials/online_analysis/lattice/probe.script diff --git a/tutorials/online_analysis/lattice/vishelpers.py b/doc/tutorials/online_analysis/lattice/vishelpers.py similarity index 100% rename from tutorials/online_analysis/lattice/vishelpers.py rename to doc/tutorials/online_analysis/lattice/vishelpers.py diff --git a/docker-compose.yml b/docker-compose.yml index f69743f14..f5be4e338 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,7 +18,7 @@ services: - "8888:8888" tutorials-prod: - image: smartsim-tutorials:v0.6.1 + image: smartsim-tutorials:v0.7.0 build: context: . 
dockerfile: ./docker/prod/Dockerfile diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile index c643787c3..3ab3a37f8 100644 --- a/docker/dev/Dockerfile +++ b/docker/dev/Dockerfile @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -FROM ubuntu:20.04 +FROM ubuntu:22.04 LABEL maintainer="Cray Labs" @@ -36,9 +36,9 @@ RUN useradd --system --create-home --shell /bin/bash -g root -G sudo craylabs && apt-get update \ && apt-get install --no-install-recommends -y build-essential \ git gcc make git-lfs wget libopenmpi-dev openmpi-bin unzip \ - python3-pip python3 python3-dev cmake \ + python3-pip python3.9 python3.9-dev cmake \ && rm -rf /var/lib/apt/lists/* \ - && ln -s /usr/bin/python3 /usr/bin/python + && ln -s /usr/bin/python3.9 /usr/bin/python WORKDIR /home/craylabs RUN git clone https://github.com/CrayLabs/SmartRedis.git --branch develop --depth=1 smartredis \ diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index eff99de36..e9db9c342 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -FROM ubuntu:20.04 +FROM ubuntu:22.04 LABEL maintainer="Cray Labs" @@ -58,9 +58,7 @@ RUN git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop -- RUN python -m pip install -r doc/requirements-doc.txt \ && NO_CHECKS=1 SMARTSIM_SUFFIX=dev python -m pip install . -RUN mkdir -p doc/tutorials/ \ - && cd doc/tutorials/ \ - && rm -rf * \ - && ln -s ../../tutorials/* . +# Note this is needed to ensure that the Sphinx builds. 
Can be removed with newer Tensorflow +RUN python -m pip install typing_extensions==4.6.1 RUN make docs diff --git a/docker/prod/Dockerfile b/docker/prod/Dockerfile index 769378aef..325ace923 100644 --- a/docker/prod/Dockerfile +++ b/docker/prod/Dockerfile @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -FROM ubuntu:20.04 +FROM ubuntu:22.04 LABEL maintainer="Cray Labs" LABEL org.opencontainers.image.source https://github.com/CrayLabs/SmartSim @@ -36,9 +36,9 @@ RUN useradd --system --create-home --shell /bin/bash -g root -G sudo craylabs && apt-get update \ && apt-get install --no-install-recommends -y build-essential \ git gcc make git-lfs wget libopenmpi-dev openmpi-bin unzip \ - python3-pip python3 python3-dev cmake \ + python3.9 python3.9-dev python3-pip cmake \ && rm -rf /var/lib/apt/lists/* \ - && ln -s /usr/bin/python3 /usr/bin/python + && ln -s /usr/bin/python3.9 /usr/bin/python WORKDIR /home/craylabs COPY --chown=craylabs:root ./tutorials/ /home/craylabs/tutorials/ @@ -46,7 +46,7 @@ COPY --chown=craylabs:root ./tutorials/ /home/craylabs/tutorials/ USER craylabs RUN export PATH=/home/craylabs/.local/bin:$PATH && \ echo "export PATH=/home/craylabs/.local/bin:$PATH" >> /home/craylabs/.bashrc && \ - python -m pip install smartsim[ml]==0.6.2 jupyter jupyterlab matplotlib && \ + python -m pip install smartsim[ml]==0.7.0 jupyter jupyterlab matplotlib && \ smart build --device cpu -v && \ chown craylabs:root -R /home/craylabs/.local && \ rm -rf ~/.cache/pip diff --git a/docker/testing/Dockerfile b/docker/testing/Dockerfile index 9c247c320..285a66023 100644 --- a/docker/testing/Dockerfile +++ b/docker/testing/Dockerfile @@ -26,7 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-FROM ubuntu:21.10 +FROM ubuntu:22.04 ENV DEBIAN_FRONTEND noninteractive RUN apt update && apt install -y python3 python3-pip python-is-python3 cmake git RUN pip install torch==1.9.1 diff --git a/pyproject.toml b/pyproject.toml index 4415c63ca..91164a68b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ build-backend = "setuptools.build_meta" [tool.black] line-length = 88 -target-version = ['py38', 'py39', 'py310'] +target-version = ['py39', 'py310', 'py311'] exclude = ''' ( | \.egg @@ -45,8 +45,15 @@ exclude = ''' | build | dist | setup.py + | .*\.py ) ''' +force-exclude = ''' +( + .*\.dragon/* +) +''' + [tool.pytest.ini_options] log_cli = true @@ -61,6 +68,7 @@ markers = [ # supress circular import warning profile = "black" skip = ["tests/test_configs/circular_config"] +skip_glob="smartsim/_core/.dragon/*" [tool.coverage.run] source = ["smartsim"] @@ -78,7 +86,7 @@ namespace_packages = true files = [ "smartsim" ] -plugins = [] +plugins = ["pydantic.mypy"] ignore_errors = false # Dynamic typing @@ -107,10 +115,12 @@ strict_equality = true # Additional Error Codes enable_error_code = [ # "redundant-expr", - # "possibly-undefined", + "possibly-undefined", # "unused-awaitable", # "ignore-without-code", # "mutable-override", + "truthy-bool", + "truthy-iterable", ] [[tool.mypy.overrides]] @@ -122,6 +132,7 @@ module = [ "torch", "smartsim.ml.torch.*", # must solve/ignore inheritance issues "watchdog", + "dragon.*", ] ignore_missing_imports = true ignore_errors = true diff --git a/setup.cfg b/setup.cfg index 5fdfa82ae..742386d2c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,7 +42,6 @@ contact_email = craylabs@hpe.com license = BSD 2-Clause License keywords = scientific, ai, workflow, hpc, analysis classifiers = - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 @@ -56,13 +55,14 @@ setup_requires = setuptools>=39.2 cmake>=3.13 include_package_data = 
True -python_requires = >=3.8,<3.12 +python_requires = >=3.9,<3.12 [options.packages.find] include = smartsim* exclude = .third-party + .dragon tests doc smartredis diff --git a/setup.py b/setup.py index bc7cf60d6..6e46ddef9 100644 --- a/setup.py +++ b/setup.py @@ -64,6 +64,7 @@ # # This future is needed to print Python2 EOL message from __future__ import print_function + import sys if sys.version_info < (3,): @@ -71,14 +72,14 @@ sys.exit(-1) -import os import importlib.util +import os from pathlib import Path from setuptools import setup -from setuptools.dist import Distribution -from setuptools.command.install import install from setuptools.command.build_py import build_py +from setuptools.command.install import install +from setuptools.dist import Distribution # Some necessary evils we have to do to be able to use # the _install tools in smartsim/smartsim/_core/_install @@ -107,8 +108,11 @@ # check for compatible python versions if not build_env.is_compatible_python(versions.PYTHON_MIN): - print("You are using Python {}. Python >={} is required.".format(build_env.python_version, - ".".join((versions.PYTHON_MIN)))) + print( + "You are using Python {}. 
Python >={} is required.".format( + build_env.python_version, ".".join((versions.PYTHON_MIN)) + ) + ) sys.exit(-1) if build_env.is_windows(): @@ -120,9 +124,11 @@ # __version__ in smartsim/__init__.py smartsim_version = versions.write_version(setup_path) + class BuildError(Exception): pass + # Hacky workaround for solving CI build "purelib" issue # see https://github.com/google/or-tools/issues/616 class InstallPlatlib(install): @@ -131,15 +137,14 @@ def finalize_options(self): if self.distribution.has_ext_modules(): self.install_lib = self.install_platlib -class SmartSimBuild(build_py): +class SmartSimBuild(build_py): def run(self): - database_builder = builder.DatabaseBuilder(build_env(), - build_env.MALLOC, - build_env.JOBS) + database_builder = builder.DatabaseBuilder( + build_env(), build_env.MALLOC, build_env.JOBS + ) if not database_builder.is_built: - database_builder.build_from_git(versions.REDIS_URL, - versions.REDIS) + database_builder.build_from_git(versions.REDIS_URL, versions.REDIS) database_builder.cleanup() @@ -151,9 +156,10 @@ def run(self): class BinaryDistribution(Distribution): """Distribution which always forces a binary package with platform name - We use this because we want to pre-package Redis for certain - platforms to use. + We use this because we want to pre-package Redis for certain + platforms to use. 
""" + def has_ext_modules(_placeholder): return True @@ -167,7 +173,11 @@ def has_ext_modules(_placeholder): "tqdm>=4.50.2", "filelock>=3.4.2", "protobuf~=3.20", - "watchdog>=3.0.0,<4.0.0", + "jinja2>=3.1.2", + "watchdog>=4.0.0", + "pydantic==1.10.14", + "pyzmq>=25.1.2", + "pygithub>=2.3.0", ] # Add SmartRedis at specific version @@ -181,6 +191,7 @@ def has_ext_modules(_placeholder): "pytest>=6.0.0", "pytest-cov>=2.10.1", "click==8.0.2", + "pytest-asyncio>=0.23.3", ], "mypy": [ "mypy>=1.3.0", @@ -193,7 +204,7 @@ def has_ext_modules(_placeholder): "typing_extensions>=4.1.0", ], # see smartsim/_core/_install/buildenv.py for more details - **versions.ml_extras_required() + **versions.ml_extras_required(), } @@ -212,5 +223,5 @@ def has_ext_modules(_placeholder): "console_scripts": [ "smart = smartsim._core._cli.__main__:main", ] - } + }, ) diff --git a/smartsim/__init__.py b/smartsim/__init__.py index 7c1fa2fe0..5e24097a5 100644 --- a/smartsim/__init__.py +++ b/smartsim/__init__.py @@ -30,8 +30,8 @@ # pylint: disable-next=useless-import-alias from .version import __version__ as __version__ -if sys.version_info < (3, 8): # pragma: no cover - sys.exit("Python 3.8 or greater must be used with SmartSim.") +if sys.version_info < (3, 9): # pragma: no cover + sys.exit("Python 3.9 or greater must be used with SmartSim.") # Main API module # pylint: disable=wrong-import-position diff --git a/smartsim/_core/__init__.py b/smartsim/_core/__init__.py index bbc108f48..490078770 100644 --- a/smartsim/_core/__init__.py +++ b/smartsim/_core/__init__.py @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from .control import Controller, Manifest +from .control import Controller, Manifest, previewrenderer from .generation import Generator -__all__ = ["Controller", "Manifest", "Generator"] +__all__ = ["Controller", "Manifest", "Generator", "previewrenderer"] diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index 443b916b7..951521f17 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -33,6 +33,7 @@ from tabulate import tabulate +from smartsim._core._cli.scripts.dragon_install import install_dragon from smartsim._core._cli.utils import SMART_LOGGER_FORMAT, color_bool, pip from smartsim._core._install import builder from smartsim._core._install.buildenv import ( @@ -43,7 +44,7 @@ VersionConflictError, Versioner, ) -from smartsim._core._install.builder import BuildError +from smartsim._core._install.builder import BuildError, Device from smartsim._core.config import CONFIG from smartsim._core.utils.helpers import installed_redisai_backends from smartsim.error import SSConfigError @@ -54,8 +55,6 @@ # NOTE: all smartsim modules need full paths as the smart cli # may be installed into a different directory. 
- -_TDeviceStr = t.Literal["cpu", "gpu"] _TPinningStr = t.Literal["==", "!=", ">=", ">", "<=", "<", "~="] @@ -134,16 +133,17 @@ def build_database( def build_redis_ai( build_env: BuildEnv, versions: Versioner, - device: _TDeviceStr, + device: Device, use_torch: bool = True, use_tf: bool = True, use_onnx: bool = False, torch_dir: t.Union[str, Path, None] = None, libtf_dir: t.Union[str, Path, None] = None, verbose: bool = False, + torch_with_mkl: bool = True, ) -> None: # make sure user isn't trying to do something silly on MacOS - if build_env.PLATFORM == "darwin" and device == "gpu": + if build_env.PLATFORM == "darwin" and device == Device.GPU: raise BuildError("SmartSim does not support GPU on MacOS") # decide which runtimes to build @@ -154,7 +154,7 @@ def build_redis_ai( ["ONNX", versions.ONNX, color_bool(use_onnx)], ] print(tabulate(backends_table, tablefmt="fancy_outline"), end="\n\n") - print(f"Building for GPU support: {color_bool(device == 'gpu')}\n") + print(f"Building for GPU support: {color_bool(device == Device.GPU)}\n") if not check_backends_install(): sys.exit(1) @@ -188,6 +188,7 @@ def build_redis_ai( build_tf=use_tf, build_onnx=use_onnx, verbose=verbose, + torch_with_mkl=torch_with_mkl, ) if rai_builder.is_built: @@ -195,7 +196,7 @@ def build_redis_ai( else: # get the build environment, update with CUDNN env vars # if present and building for GPU, otherwise warn the user - if device == "gpu": + if device == Device.GPU: gpu_env = build_env.get_cudnn_env() cudnn_env_vars = [ "CUDNN_LIBRARY", @@ -226,18 +227,16 @@ def build_redis_ai( logger.info("ML Backends and RedisAI build complete!") -def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu") -> None: +def check_py_torch_version(versions: Versioner, device: Device = Device.CPU) -> None: """Check Python environment for TensorFlow installation""" - - device = device_in.lower() if BuildEnv.is_macos(): - if device == "gpu": + if device == Device.GPU: raise BuildError("SmartSim does 
not support GPU on MacOS") device_suffix = "" else: # linux - if device == "cpu": + if device == Device.CPU: device_suffix = versions.TORCH_CPU_SUFFIX - elif device == "gpu": + elif device == Device.GPU: device_suffix = versions.TORCH_CUDA_SUFFIX else: raise BuildError("Unrecognized device requested") @@ -261,7 +260,9 @@ def check_py_torch_version(versions: Versioner, device_in: _TDeviceStr = "cpu") "Torch version not found in python environment. " "Attempting to install via `pip`" ) - wheel_device = device if device == "cpu" else device_suffix.replace("+", "") + wheel_device = ( + device.value if device == Device.CPU else device_suffix.replace("+", "") + ) pip( "install", "--extra-index-url", @@ -339,10 +340,10 @@ def _assess_python_env( def _format_incompatible_python_env_message( - missing: t.Iterable[str], conflicting: t.Iterable[str] + missing: t.Collection[str], conflicting: t.Collection[str] ) -> str: indent = "\n\t" - fmt_list: t.Callable[[str, t.Iterable[str]], str] = lambda n, l: ( + fmt_list: t.Callable[[str, t.Collection[str]], str] = lambda n, l: ( f"{n}:{indent}{indent.join(l)}" if l else "" ) missing_str = fmt_list("Missing", missing) @@ -358,13 +359,27 @@ def _format_incompatible_python_env_message( ) +def _configure_keydb_build(versions: Versioner) -> None: + """Configure the redis versions to be used during the build operation""" + versions.REDIS = Version_("6.2.0") + versions.REDIS_URL = "https://github.com/EQ-Alpha/KeyDB" + versions.REDIS_BRANCH = "v6.2.0" + + CONFIG.conf_path = Path(CONFIG.core_path, "config", "keydb.conf") + if not CONFIG.conf_path.resolve().is_file(): + raise SSConfigError( + "Database configuration file at REDIS_CONF could not be found" + ) + + +# pylint: disable-next=too-many-statements def execute( args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / ) -> int: verbose = args.v keydb = args.keydb - device: _TDeviceStr = args.device - + device = Device(args.device.lower()) + is_dragon_requested = 
args.dragon # torch and tf build by default pt = not args.no_pt # pylint: disable=invalid-name tf = not args.no_tf # pylint: disable=invalid-name @@ -376,7 +391,7 @@ def execute( logger.info("Checking requested versions...") versions = Versioner() - logger.info("Checking for build tools...") + logger.debug("Checking for build tools...") if verbose: logger.info("Build Environment:") @@ -385,14 +400,7 @@ def execute( print(tabulate(env, headers=env_vars, tablefmt="github"), "\n") if keydb: - versions.REDIS = Version_("6.2.0") - versions.REDIS_URL = "https://github.com/EQ-Alpha/KeyDB" - versions.REDIS_BRANCH = "v6.2.0" - CONFIG.conf_path = Path(CONFIG.core_path, "config", "keydb.conf") - if not CONFIG.conf_path.resolve().is_file(): - raise SSConfigError( - "Database configuration file at REDIS_CONF could not be found" - ) + _configure_keydb_build(versions) if verbose: db_name: DbEngine = "KEYDB" if keydb else "REDIS" @@ -401,6 +409,17 @@ def execute( version_names = list(vers.keys()) print(tabulate(vers, headers=version_names, tablefmt="github"), "\n") + if is_dragon_requested: + install_to = CONFIG.core_path / ".dragon" + return_code = install_dragon(install_to) + + if return_code == 0: + logger.info("Dragon installation complete") + elif return_code == 1: + logger.info("Dragon installation not supported on platform") + else: + logger.warning("Dragon installation failed") + try: if not args.only_python_packages: # REDIS/KeyDB @@ -417,6 +436,7 @@ def execute( args.torch_dir, args.libtensorflow_dir, verbose=verbose, + torch_with_mkl=args.torch_with_mkl, ) except (SetupError, BuildError) as e: logger.error(str(e)) @@ -453,10 +473,16 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--device", type=str.lower, - default="cpu", - choices=["cpu", "gpu"], + default=Device.CPU.value, + choices=[device.value for device in Device], help="Device to build ML runtimes for", ) + parser.add_argument( + "--dragon", + action="store_true", + 
default=False, + help="Install the dragon runtime", + ) parser.add_argument( "--only_python_packages", action="store_true", @@ -499,3 +525,9 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: default=False, help="Build KeyDB instead of Redis", ) + parser.add_argument( + "--no_torch_with_mkl", + dest="torch_with_mkl", + action="store_false", + help="Do not build Torch with Intel MKL", + ) diff --git a/smartsim/_core/_cli/cli.py b/smartsim/_core/_cli/cli.py index 3cad573d1..3d5c6e066 100644 --- a/smartsim/_core/_cli/cli.py +++ b/smartsim/_core/_cli/cli.py @@ -39,6 +39,8 @@ from smartsim._core._cli.info import execute as info_execute from smartsim._core._cli.plugin import plugins from smartsim._core._cli.site import execute as site_execute +from smartsim._core._cli.teardown import configure_parser as teardown_parser +from smartsim._core._cli.teardown import execute as teardown_execute from smartsim._core._cli.utils import MenuItemConfig from smartsim._core._cli.validate import configure_parser as validate_parser from smartsim._core._cli.validate import execute as validate_execute @@ -106,7 +108,7 @@ def default_cli() -> SmartCli: menu = [ MenuItemConfig( "build", - "Build SmartSim dependencies (Redis, RedisAI, ML runtimes)", + "Build SmartSim dependencies (Redis, RedisAI, Dragon, ML runtimes)", build_execute, build_parser, ), @@ -142,6 +144,12 @@ def default_cli() -> SmartCli: "Display information about the current SmartSim installation", info_execute, ), + MenuItemConfig( + "teardown", + "Clean up allocated resources after an experiment terminates", + teardown_execute, + teardown_parser, + ), ] return SmartCli(menu) diff --git a/smartsim/_core/_cli/scripts/dragon_install.py b/smartsim/_core/_cli/scripts/dragon_install.py new file mode 100644 index 000000000..466c390bd --- /dev/null +++ b/smartsim/_core/_cli/scripts/dragon_install.py @@ -0,0 +1,232 @@ +import os +import pathlib +import sys +import typing as t + +from github import Github +from 
github.GitReleaseAsset import GitReleaseAsset + +from smartsim._core._cli.utils import pip +from smartsim._core._install.builder import WebTGZ +from smartsim._core.config import CONFIG +from smartsim._core.utils.helpers import check_platform, is_crayex_platform +from smartsim.error.errors import SmartSimCLIActionCancelled +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +def create_dotenv(dragon_root_dir: pathlib.Path) -> None: + """Create a .env file with required environment variables for the Dragon runtime""" + dragon_root = str(dragon_root_dir) + dragon_inc_dir = str(dragon_root_dir / "include") + dragon_lib_dir = str(dragon_root_dir / "lib") + dragon_bin_dir = str(dragon_root_dir / "bin") + + dragon_vars = { + "DRAGON_BASE_DIR": dragon_root, + "DRAGON_ROOT_DIR": dragon_root, # note: same as base_dir + "DRAGON_INCLUDE_DIR": dragon_inc_dir, + "DRAGON_LIB_DIR": dragon_lib_dir, + "DRAGON_VERSION": dragon_pin(), + "PATH": dragon_bin_dir, + "LD_LIBRARY_PATH": dragon_lib_dir, + } + + lines = [f"{k}={v}\n" for k, v in dragon_vars.items()] + + if not CONFIG.dragon_dotenv.parent.exists(): + CONFIG.dragon_dotenv.parent.mkdir(parents=True) + + with CONFIG.dragon_dotenv.open("w", encoding="utf-8") as dotenv: + dotenv.writelines(lines) + + +def python_version() -> str: + """Return a formatted string used to filter release assets + for the current python version""" + return f"py{sys.version_info.major}.{sys.version_info.minor}" + + +def dragon_pin() -> str: + """Return a string indicating the pinned major/minor version of the dragon + package to install""" + return "0.9" + + +def _platform_filter(asset_name: str) -> bool: + """Return True if the asset name matches naming standard for current + platform (Cray or non-Cray). Otherwise, returns False. 
+ + :param asset_name: A value to inspect for keywords indicating a Cray EX asset + :returns: True if supplied value is correct for current platform""" + key = "crayex" + is_cray = key in asset_name.lower() + if is_crayex_platform(): + return is_cray + return not is_cray + + +def _version_filter(asset_name: str) -> bool: + """Return true if the supplied value contains a python version match + + :param asset_name: A value to inspect for keywords indicating a python version + :returns: True if supplied value is correct for current python version""" + return python_version() in asset_name + + +def _pin_filter(asset_name: str) -> bool: + """Return true if the supplied value contains a dragon version pin match + + :param asset_name: A value to inspect for keywords indicating a dragon version + :returns: True if supplied value is correct for current dragon version""" + return f"dragon-{dragon_pin()}" in asset_name + + +def _get_release_assets() -> t.Collection[GitReleaseAsset]: + """Retrieve a collection of available assets for all releases that satisfy + the dragon version pin + + :returns: A collection of release assets""" + git = Github() + + dragon_repo = git.get_repo("DragonHPC/dragon") + + if dragon_repo is None: + raise SmartSimCLIActionCancelled("Unable to locate dragon repo") + + # find any releases matching our pinned version requirement + tags = [tag for tag in dragon_repo.get_tags() if dragon_pin() in tag.name] + # repo.get_latest_release fails if only pre-release results are returned + pin_releases = list(dragon_repo.get_release(tag.name) for tag in tags) + releases = sorted(pin_releases, key=lambda r: r.published_at, reverse=True) + + # take the most recent release for the given pin + assets = releases[0].assets + + return assets + + +def filter_assets(assets: t.Collection[GitReleaseAsset]) -> t.Optional[GitReleaseAsset]: + """Filter the available release assets so that HSTA agents are used + when run on a Cray EX platform + + :param assets: The collection 
of dragon release assets to filter + :returns: An asset meeting platform & version filtering requirements""" + # Expect cray & non-cray assets that require a filter, e.g. + # 'dragon-0.8-py3.9.4.1-bafaa887f.tar.gz', + # 'dragon-0.8-py3.9.4.1-CRAYEX-ac132fe95.tar.gz' + asset = next( + ( + asset + for asset in assets + if _version_filter(asset.name) + and _platform_filter(asset.name) + and _pin_filter(asset.name) + ), + None, + ) + return asset + + +def retrieve_asset_info() -> GitReleaseAsset: + """Find a release asset that meets all necessary filtering criteria + + :param dragon_pin: identify the dragon version to install (e.g. dragon-0.8) + :returns: A GitHub release asset""" + assets = _get_release_assets() + asset = filter_assets(assets) + + platform_result = check_platform() + if not platform_result.is_cray: + logger.warning("Installing Dragon without HSTA support") + for msg in platform_result.failures: + logger.warning(msg) + + if asset is None: + raise SmartSimCLIActionCancelled("No dragon runtime asset available to install") + + logger.debug(f"Retrieved asset metadata: {asset}") + return asset + + +def retrieve_asset(working_dir: pathlib.Path, asset: GitReleaseAsset) -> pathlib.Path: + """Retrieve the physical file associated to a given GitHub release asset + + :param working_dir: location in file system where assets should be written + :param asset: GitHub release asset to retrieve + :returns: path to the downloaded asset""" + if working_dir.exists() and list(working_dir.rglob("*.whl")): + return working_dir + + archive = WebTGZ(asset.browser_download_url) + archive.extract(working_dir) + + logger.debug(f"Retrieved {asset.browser_download_url} to {working_dir}") + return working_dir + + +def install_package(asset_dir: pathlib.Path) -> int: + """Install the package found in `asset_dir` into the current python environment + + :param asset_dir: path to a decompressed archive contents for a release asset""" + wheels = asset_dir.rglob("*.whl") + wheel_path = 
next(wheels, None) + if not wheel_path: + logger.error(f"No wheel found for package in {asset_dir}") + return 1 + + create_dotenv(wheel_path.parent) + + while wheel_path is not None: + logger.info(f"Installing package: {wheel_path.absolute()}") + + try: + pip("install", "--force-reinstall", str(wheel_path)) + wheel_path = next(wheels, None) + except Exception: + logger.error(f"Unable to install from {asset_dir}") + return 1 + + return 0 + + +def cleanup( + archive_path: t.Optional[pathlib.Path] = None, +) -> None: + """Delete the downloaded asset and any files extracted during installation + + :param archive_path: path to a downloaded archive for a release asset""" + if archive_path: + archive_path.unlink(missing_ok=True) + logger.debug(f"Deleted archive: {archive_path}") + + +def install_dragon(extraction_dir: t.Union[str, os.PathLike[str]]) -> int: + """Retrieve a dragon runtime appropriate for the current platform + and install to the current python environment + :param extraction_dir: path for download and extraction of assets + :returns: Integer return code, 0 for success, non-zero on failures""" + if sys.platform == "darwin": + logger.debug(f"Dragon not supported on platform: {sys.platform}") + return 1 + + extraction_dir = pathlib.Path(extraction_dir) + filename: t.Optional[pathlib.Path] = None + asset_dir: t.Optional[pathlib.Path] = None + + try: + asset_info = retrieve_asset_info() + asset_dir = retrieve_asset(extraction_dir, asset_info) + + return install_package(asset_dir) + except Exception as ex: + logger.error("Unable to install dragon runtime", exc_info=ex) + finally: + cleanup(filename) + + return 2 + + +if __name__ == "__main__": + sys.exit(install_dragon(CONFIG.core_path / ".dragon")) diff --git a/smartsim/_core/_cli/teardown.py b/smartsim/_core/_cli/teardown.py new file mode 100644 index 000000000..a3f181145 --- /dev/null +++ b/smartsim/_core/_cli/teardown.py @@ -0,0 +1,74 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard 
Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import os +import subprocess +import typing as t + +from smartsim._core.config import CONFIG + + +def configure_parser(parser: argparse.ArgumentParser) -> None: + """Builds the parser for the command""" + parser.add_argument( + "--dragon", + action="store_true", + default=False, + help="Terminate Dragon environment resources if" + "any remain after experiment completion", + ) + + +def _do_dragon_teardown() -> int: + """Run dragon-cleanup script to destroy all remaining dragon resources""" + env = os.environ.copy() + dragon_cleanup = next(CONFIG.core_path.rglob("dragon-cleanup"), None) + if dragon_cleanup is None: + print("dragon-cleanup not found. Skipping cleanup") + return 0 + + # Use popen to avoid `dragon-cleanup` doing a kill on all + # python processes, terminating `smart teardown`, and the + # subprocess handling `dragon-cleanup`. Child processes using + # subprocess.run are killed and cleanup is interrupted + with subprocess.Popen( + [str(dragon_cleanup.absolute())], + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) as process: + process.wait() + return process.returncode + + +def execute( + args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / +) -> int: + if args.dragon: + return _do_dragon_teardown() + + return 0 diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index 8bf0984df..9c9b46cab 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -78,13 +78,17 @@ def clean(core_path: Path, _all: bool = False) -> int: """Remove pre existing installations of ML runtimes :param _all: Remove all non-python dependencies - :type _all: bool, optional """ build_temp = core_path / ".third-party" if build_temp.is_dir(): shutil.rmtree(build_temp, ignore_errors=True) + dragon_temp = core_path / ".dragon" + if dragon_temp.is_dir(): + shutil.rmtree(dragon_temp, ignore_errors=True) + logger.info("Successfully removed dragon installation") + lib_path = core_path / 
"lib" if lib_path.is_dir(): # remove RedisAI diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 8ea40ae00..96d46d6ee 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -30,7 +30,6 @@ import multiprocessing as mp import os import os.path -import socket import tempfile import typing as t from types import TracebackType @@ -40,7 +39,9 @@ from smartsim import Experiment from smartsim._core._cli.utils import SMART_LOGGER_FORMAT +from smartsim._core._install.builder import Device from smartsim._core.utils.helpers import installed_redisai_backends +from smartsim._core.utils.network import find_free_port from smartsim.log import get_logger logger = get_logger("Smart", fmt=SMART_LOGGER_FORMAT) @@ -61,9 +62,6 @@ _TemporaryDirectory = tempfile.TemporaryDirectory -_TCapitalDeviceStr = t.Literal["CPU", "GPU"] - - class _VerificationTempDir(_TemporaryDirectory): """A Temporary directory to be used as a context manager that will only clean itself up if no error is raised within its context @@ -88,7 +86,8 @@ def execute( simple experiment """ backends = installed_redisai_backends() - device: _TCapitalDeviceStr = args.device.upper() + temp_dir = "" + device = Device(args.device) try: with contextlib.ExitStack() as ctx: temp_dir = ctx.enter_context(_VerificationTempDir(dir=os.getcwd())) @@ -98,7 +97,7 @@ def execute( "SR_LOG_FILE", os.path.join(temp_dir, "smartredis.log") ), } - if device == "GPU": + if device == Device.GPU: validate_env["CUDA_VISIBLE_DEVICES"] = "0" ctx.enter_context(_env_vars_set_to(validate_env)) test_install( @@ -112,10 +111,11 @@ def execute( except Exception as e: logger.error( "SmartSim failed to run a simple experiment!\n" - f"Experiment failed due to the following exception:\n{e}\n\n" - f"Output files are available at `{temp_dir}`", + f"Experiment failed due to the following exception:\n{e}", exc_info=True, ) + if temp_dir: + logger.info(f"Output files are available at `{temp_dir}`") 
return os.EX_SOFTWARE return os.EX_OK @@ -136,8 +136,8 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--device", type=str.lower, - default="cpu", - choices=["cpu", "gpu"], + default=Device.CPU.value, + choices=[device.value for device in Device], help="Device to test the ML backends against", ) @@ -145,14 +145,15 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: def test_install( location: str, port: t.Optional[int], - device: _TCapitalDeviceStr, + device: Device, with_tf: bool, with_pt: bool, with_onnx: bool, ) -> None: exp = Experiment("ValidationExperiment", exp_path=location, launcher="local") - exp.disable_telemetry() - port = _find_free_port() if port is None else port + exp.telemetry.disable() + port = find_free_port() if port is None else port + with _make_managed_local_orc(exp, port) as client: logger.info("Verifying Tensor Transfer") client.put_tensor("plain-tensor", np.ones((1, 1, 3, 3))) @@ -205,15 +206,7 @@ def _make_managed_local_orc( exp.stop(orc) -def _find_free_port() -> int: - """A 'good enough' way to find an open port to bind to""" - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: - sock.bind(("0.0.0.0", 0)) - _, port = sock.getsockname() - return int(port) - - -def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) -> None: +def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None: recv_conn, send_conn = mp.Pipe(duplex=False) # Build the model in a subproc so that keras does not hog the gpu proc = mp.Process(target=_build_tf_frozen_model, args=(send_conn, tmp_dir)) @@ -235,7 +228,12 @@ def _test_tf_install(client: Client, tmp_dir: str, device: _TCapitalDeviceStr) - ) from e client.set_model_from_file( - "keras-fcn", model_path, "TF", device=device, inputs=inputs, outputs=outputs + "keras-fcn", + model_path, + "TF", + device=device.value.upper(), + inputs=inputs, + outputs=outputs, ) client.put_tensor("keras-input", 
np.random.rand(1, 28, 28).astype(np.float32)) client.run_model("keras-fcn", inputs=["keras-input"], outputs=["keras-output"]) @@ -263,7 +261,7 @@ def _build_tf_frozen_model(conn: "Connection", tmp_dir: str) -> None: conn.send((model_path, inputs, outputs)) -def _test_torch_install(client: Client, device: _TCapitalDeviceStr) -> None: +def _test_torch_install(client: Client, device: Device) -> None: import torch from torch import nn @@ -275,7 +273,7 @@ def __init__(self) -> None: def forward(self, x: torch.Tensor) -> torch.Tensor: return self.conv(x) - if device == "GPU": + if device == Device.GPU: device_ = torch.device("cuda") else: device_ = torch.device("cpu") @@ -291,13 +289,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: torch.jit.save(traced, buffer) # type: ignore[no-untyped-call] model = buffer.getvalue() - client.set_model("torch-nn", model, backend="TORCH", device=device) + client.set_model("torch-nn", model, backend="TORCH", device=device.value.upper()) client.put_tensor("torch-in", torch.rand(1, 1, 3, 3).numpy()) client.run_model("torch-nn", inputs=["torch-in"], outputs=["torch-out"]) client.get_tensor("torch-out") -def _test_onnx_install(client: Client, device: _TCapitalDeviceStr) -> None: +def _test_onnx_install(client: Client, device: Device) -> None: from skl2onnx import to_onnx from sklearn.cluster import KMeans @@ -310,7 +308,7 @@ def _test_onnx_install(client: Client, device: _TCapitalDeviceStr) -> None: sample = np.arange(20, dtype=np.float32).reshape(10, 2) client.put_tensor("onnx-input", sample) - client.set_model("onnx-kmeans", model, "ONNX", device=device) + client.set_model("onnx-kmeans", model, "ONNX", device=device.value.upper()) client.run_model( "onnx-kmeans", inputs=["onnx-input"], outputs=["onnx-labels", "onnx-transform"] ) diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index c100ac80e..e0cf5a522 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py 
@@ -267,15 +267,15 @@ class Versioner: """ # compatible Python version - PYTHON_MIN = Version_("3.8.0") + PYTHON_MIN = Version_("3.9.0") # Versions - SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.6.2")) - SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.5.2")) + SMARTSIM = Version_(get_env("SMARTSIM_VERSION", "0.7.0")) + SMARTREDIS = Version_(get_env("SMARTREDIS_VERSION", "0.5.3")) SMARTSIM_SUFFIX = get_env("SMARTSIM_SUFFIX", "") # Redis - REDIS = Version_(get_env("SMARTSIM_REDIS", "7.0.5")) + REDIS = Version_(get_env("SMARTSIM_REDIS", "7.2.4")) REDIS_URL = get_env("SMARTSIM_REDIS_URL", "https://github.com/redis/redis.git/") REDIS_BRANCH = get_env("SMARTSIM_REDIS_BRANCH", REDIS) diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index c098cfd01..fb8ec5b81 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -28,6 +28,7 @@ import concurrent.futures import enum +import fileinput import itertools import os import platform @@ -53,8 +54,7 @@ # TODO: check cmake version and use system if possible to avoid conflicts TRedisAIBackendStr = t.Literal["tensorflow", "torch", "onnxruntime", "tflite"] -TDeviceStr = t.Literal["cpu", "gpu"] - +_PathLike = t.Union[str, "os.PathLike[str]"] _T = t.TypeVar("_T") _U = t.TypeVar("_U") @@ -63,7 +63,6 @@ def expand_exe_path(exe: str) -> str: """Takes an executable and returns the full path to that executable :param exe: executable or file - :type exe: str :raises TypeError: if file is not an executable :raises FileNotFoundError: if executable cannot be found """ @@ -96,6 +95,11 @@ def from_str(cls, string: str, /) -> "Architecture": raise BuildError(f"Unrecognized or unsupported architecture: {string}") +class Device(enum.Enum): + CPU = "cpu" + GPU = "gpu" + + class OperatingSystem(enum.Enum): LINUX = ("linux", "linux2") DARWIN = ("darwin",) @@ -173,7 +177,7 @@ def is_built(self) -> bool: raise NotImplementedError def build_from_git( - self, git_url: str, 
branch: str, device: TDeviceStr = "cpu" + self, git_url: str, branch: str, device: Device = Device.CPU ) -> None: raise NotImplementedError @@ -274,13 +278,11 @@ def is_built(self) -> bool: return redis_files.issubset(bin_files) or keydb_files.issubset(bin_files) def build_from_git( - self, git_url: str, branch: str, device: TDeviceStr = "cpu" + self, git_url: str, branch: str, device: Device = Device.CPU ) -> None: """Build Redis from git :param git_url: url from which to retrieve Redis - :type git_url: str :param branch: branch to checkout - :type branch: str """ # pylint: disable=too-many-locals database_name = "keydb" if "KeyDB" in git_url else "redis" @@ -364,7 +366,7 @@ class _RAIBuildDependency(ABC): def __rai_dependency_name__(self) -> str: ... @abstractmethod - def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: ... + def __place_for_rai__(self, target: _PathLike) -> Path: ... @staticmethod @abstractmethod @@ -372,7 +374,7 @@ def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: def _place_rai_dep_at( - target: t.Union[str, "os.PathLike[str]"], verbose: bool + target: _PathLike, verbose: bool ) -> t.Callable[[_RAIBuildDependency], Path]: def _place(dep: _RAIBuildDependency) -> Path: if verbose: @@ -405,6 +407,7 @@ def __init__( build_onnx: bool = False, jobs: int = 1, verbose: bool = False, + torch_with_mkl: bool = True, ) -> None: super().__init__( build_env or {}, @@ -423,6 +426,9 @@ def __init__( self.libtf_dir = libtf_dir self.torch_dir = torch_dir + # extra configuration options + self.torch_with_mkl = torch_with_mkl + # Sanity checks self._validate_platform() @@ -480,7 +486,7 @@ def build_onnx(self) -> bool: def fetch_onnx(self) -> bool: return self.build_onnx - def get_deps_dir_path_for(self, device: TDeviceStr) -> Path: + def get_deps_dir_path_for(self, device: Device) -> Path: def fail_to_format(reason: str) -> BuildError: # pragma: no cover return BuildError(f"Failed to format RedisAI 
dependency path: {reason}") @@ -497,10 +503,10 @@ def fail_to_format(reason: str) -> BuildError: # pragma: no cover arch = "arm64v8" else: # pragma: no cover raise fail_to_format(f"Unknown architecture: {architecture}") - return self.rai_build_path / f"deps/{os_}-{arch}-{device}" + return self.rai_build_path / f"deps/{os_}-{arch}-{device.value}" def _get_deps_to_fetch_for( - self, device: TDeviceStr + self, device: Device ) -> t.Tuple[_RAIBuildDependency, ...]: os_, arch = self._platform # TODO: It would be nice if the backend version numbers were declared @@ -512,8 +518,8 @@ def _get_deps_to_fetch_for( # DLPack is always required fetchable_deps: t.List[_RAIBuildDependency] = [_DLPackRepository("v0.5_RAI")] if self.fetch_torch: - pt_dep = _choose_pt_variant(os_) - fetchable_deps.append(pt_dep(arch, device, "2.0.1")) + pt_dep = _choose_pt_variant(os_)(arch, device, "2.0.1", self.torch_with_mkl) + fetchable_deps.append(pt_dep) if self.fetch_tf: fetchable_deps.append(_TFArchive(os_, arch, device, "2.13.1")) if self.fetch_onnx: @@ -521,14 +527,13 @@ def _get_deps_to_fetch_for( return tuple(fetchable_deps) - def symlink_libtf(self, device: str) -> None: + def symlink_libtf(self, device: Device) -> None: """Add symbolic link to available libtensorflow in RedisAI deps. 
:param device: cpu or gpu - :type device: str """ rai_deps_path = sorted( - self.rai_build_path.glob(os.path.join("deps", f"*{device}*")) + self.rai_build_path.glob(os.path.join("deps", f"*{device.value}*")) ) if not rai_deps_path: raise FileNotFoundError("Could not find RedisAI 'deps' directory") @@ -577,16 +582,13 @@ def symlink_libtf(self, device: str) -> None: os.symlink(src_file, dst_file) def build_from_git( - self, git_url: str, branch: str, device: TDeviceStr = "cpu" + self, git_url: str, branch: str, device: Device = Device.CPU ) -> None: """Build RedisAI from git :param git_url: url from which to retrieve RedisAI - :type git_url: str :param branch: branch to checkout - :type branch: str :param device: cpu or gpu - :type device: str """ # delete previous build dir (should never be there) if self.rai_build_path.is_dir(): @@ -616,14 +618,14 @@ def build_from_git( self.run_command(clone_cmd, out=subprocess.DEVNULL, cwd=self.build_dir) self._fetch_deps_for(device) - if self.libtf_dir and device: + if self.libtf_dir and device.value: self.symlink_libtf(device) build_cmd = self._rai_build_env_prefix( with_pt=self.build_torch, with_tf=self.build_tf, with_ort=self.build_onnx, - extra_env={"GPU": "1" if device == "gpu" else "0"}, + extra_env={"GPU": "1" if device == Device.GPU else "0"}, ) if self.torch_dir: @@ -674,7 +676,7 @@ def _rai_build_env_prefix( *(f"{key}={val}" for key, val in extra_env.items()), ] - def _fetch_deps_for(self, device: TDeviceStr) -> None: + def _fetch_deps_for(self, device: Device) -> None: if not self.rai_build_path.is_dir(): raise BuildError("RedisAI build directory not found") @@ -693,13 +695,12 @@ def _fetch_deps_for(self, device: TDeviceStr) -> None: f"found {len(unique_placed_paths)}" ) - def _install_backends(self, device: str) -> None: + def _install_backends(self, device: Device) -> None: """Move backend libraries to smartsim/_core/lib/ :param device: cpu or cpu - :type device: str """ self.rai_install_path = 
self.rai_build_path.joinpath( - f"install-{device}" + f"install-{device.value}" ).resolve() rai_lib = self.rai_install_path / "redisai.so" rai_backends = self.rai_install_path / "backends" @@ -750,7 +751,7 @@ def url(self) -> str: ... class _WebGitRepository(_WebLocation): def clone( self, - target: t.Union[str, "os.PathLike[str]"], + target: _PathLike, depth: t.Optional[int] = None, branch: t.Optional[str] = None, ) -> None: @@ -780,7 +781,7 @@ def url(self) -> str: def __rai_dependency_name__(self) -> str: return f"dlpack@{self.url}" - def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + def __place_for_rai__(self, target: _PathLike) -> Path: target = Path(target) / "dlpack" self.clone(target, branch=self.version, depth=1) if not target.is_dir(): @@ -794,7 +795,7 @@ def name(self) -> str: _, name = self.url.rsplit("/", 1) return name - def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + def download(self, target: _PathLike) -> Path: target = Path(target) if target.is_dir(): target = target / self.name @@ -804,37 +805,41 @@ def download(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: class _ExtractableWebArchive(_WebArchive, ABC): @abstractmethod - def _extract_download( - self, download_path: Path, target: t.Union[str, "os.PathLike[str]"] - ) -> None: ... + def _extract_download(self, download_path: Path, target: _PathLike) -> None: ... 
- def extract(self, target: t.Union[str, "os.PathLike[str]"]) -> None: + def extract(self, target: _PathLike) -> None: with tempfile.TemporaryDirectory() as tmp_dir: arch_path = self.download(tmp_dir) self._extract_download(arch_path, target) class _WebTGZ(_ExtractableWebArchive): - def _extract_download( - self, download_path: Path, target: t.Union[str, "os.PathLike[str]"] - ) -> None: + def _extract_download(self, download_path: Path, target: _PathLike) -> None: with tarfile.open(download_path, "r") as tgz_file: tgz_file.extractall(target) class _WebZip(_ExtractableWebArchive): - def _extract_download( - self, download_path: Path, target: t.Union[str, "os.PathLike[str]"] - ) -> None: + def _extract_download(self, download_path: Path, target: _PathLike) -> None: with zipfile.ZipFile(download_path, "r") as zip_file: zip_file.extractall(target) +class WebTGZ(_WebTGZ): + def __init__(self, url: str) -> None: + self._url = url + + @property + def url(self) -> str: + return self._url + + @dataclass(frozen=True) class _PTArchive(_WebZip, _RAIBuildDependency): architecture: Architecture - device: TDeviceStr + device: Device version: str + with_mkl: bool @staticmethod def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: @@ -849,7 +854,20 @@ def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: def __rai_dependency_name__(self) -> str: return f"libtorch@{self.url}" - def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + @staticmethod + def _patch_out_mkl(libtorch_root: Path) -> None: + _modify_source_files( + libtorch_root / "share/cmake/Caffe2/public/mkl.cmake", + r"find_package\(MKL QUIET\)", + "# find_package(MKL QUIET)", + ) + + def extract(self, target: _PathLike) -> None: + super().extract(target) + if not self.with_mkl: + self._patch_out_mkl(Path(target)) + + def __place_for_rai__(self, target: _PathLike) -> Path: self.extract(target) target = Path(target) / "libtorch" if not 
target.is_dir(): @@ -865,10 +883,10 @@ def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: @property def url(self) -> str: - if self.device == "gpu": + if self.device == Device.GPU: pt_build = "cu117" else: - pt_build = "cpu" + pt_build = Device.CPU.value # pylint: disable-next=line-too-long libtorch_archive = ( f"libtorch-cxx11-abi-shared-without-deps-{self.version}%2B{pt_build}.zip" @@ -887,10 +905,10 @@ def supported_platforms() -> t.Sequence[t.Tuple[OperatingSystem, Architecture]]: @property def url(self) -> str: - if self.device == "gpu": + if self.device == Device.GPU: raise BuildError("RedisAI does not currently support GPU on Mac OSX") if self.architecture == Architecture.X64: - pt_build = "cpu" + pt_build = Device.CPU.value libtorch_archive = f"libtorch-macos-{self.version}.zip" root_url = "https://download.pytorch.org/libtorch" return f"{root_url}/{pt_build}/{libtorch_archive}" @@ -902,7 +920,7 @@ def url(self) -> str: ) return f"{root_url}/{libtorch_archive}" - raise BuildError("Unsupported architecture for Pytorch: {self.architecture}") + raise BuildError(f"Unsupported architecture for Pytorch: {self.architecture}") def _choose_pt_variant( @@ -921,7 +939,7 @@ def _choose_pt_variant( class _TFArchive(_WebTGZ, _RAIBuildDependency): os_: OperatingSystem architecture: Architecture - device: TDeviceStr + device: Device version: str @staticmethod @@ -937,7 +955,7 @@ def url(self) -> str: tf_arch = "x86_64" else: raise BuildError( - "Unexpected Architecture for TF Archive: {self.architecture}" + f"Unexpected Architecture for TF Archive: {self.architecture}" ) if self.os_ == OperatingSystem.LINUX: @@ -945,21 +963,21 @@ def url(self) -> str: tf_device = self.device elif self.os_ == OperatingSystem.DARWIN: tf_os = "darwin" - if self.device == "gpu": + if self.device == Device.GPU: raise BuildError("RedisAI does not currently support GPU on Macos") - tf_device = "cpu" + tf_device = Device.CPU else: - raise BuildError("Unexpected OS for 
TF Archive: {self.os_}") + raise BuildError(f"Unexpected OS for TF Archive: {self.os_}") return ( "https://storage.googleapis.com/tensorflow/libtensorflow/" - f"libtensorflow-{tf_device}-{tf_os}-{tf_arch}-{self.version}.tar.gz" + f"libtensorflow-{tf_device.value}-{tf_os}-{tf_arch}-{self.version}.tar.gz" ) @property def __rai_dependency_name__(self) -> str: return f"libtensorflow@{self.url}" - def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + def __place_for_rai__(self, target: _PathLike) -> Path: target = Path(target) / "libtensorflow" target.mkdir() self.extract(target) @@ -970,7 +988,7 @@ def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: @dataclass(frozen=True) class _ORTArchive(_WebTGZ, _RAIBuildDependency): os_: OperatingSystem - device: TDeviceStr + device: Device version: str @staticmethod @@ -989,15 +1007,15 @@ def url(self) -> str: if self.os_ == OperatingSystem.LINUX: ort_os = "linux" ort_arch = "x64" - ort_build = "-gpu" if self.device == "gpu" else "" + ort_build = "-gpu" if self.device == Device.GPU else "" elif self.os_ == OperatingSystem.DARWIN: ort_os = "osx" ort_arch = "x86_64" ort_build = "" - if self.device == "gpu": + if self.device == Device.GPU: raise BuildError("RedisAI does not currently support GPU on Macos") else: - raise BuildError("Unexpected OS for TF Archive: {self.os_}") + raise BuildError(f"Unexpected OS for TF Archive: {self.os_}") ort_archive = f"onnxruntime-{ort_os}-{ort_arch}{ort_build}-{self.version}.tgz" return f"{ort_url_base}/{ort_archive}" @@ -1005,7 +1023,7 @@ def url(self) -> str: def __rai_dependency_name__(self) -> str: return f"onnxruntime@{self.url}" - def __place_for_rai__(self, target: t.Union[str, "os.PathLike[str]"]) -> Path: + def __place_for_rai__(self, target: _PathLike) -> Path: target = Path(target).resolve() / "onnxruntime" self.extract(target) try: @@ -1046,3 +1064,13 @@ def config_git_command(plat: Platform, cmd: t.Sequence[str]) -> t.List[str]: + 
cmd[where:] ) return cmd + + +def _modify_source_files( + files: t.Union[_PathLike, t.Iterable[_PathLike]], regex: str, replacement: str +) -> None: + compiled_regex = re.compile(regex) + with fileinput.input(files=files, inplace=True) as handles: + for line in handles: + line = compiled_regex.sub(replacement, line) + print(line, end="") diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 42a548c42..9cf950b21 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -89,6 +89,7 @@ # - Default: None +# pylint: disable-next=too-many-public-methods class Config: def __init__(self) -> None: # SmartSim/smartsim/_core @@ -99,6 +100,7 @@ def __init__(self) -> None: self.lib_path = Path(dependency_path, "lib").resolve() self.bin_path = Path(dependency_path, "bin").resolve() self.conf_path = Path(dependency_path, "config", "redis.conf") + self.conf_dir = Path(self.core_path, "config") @property def redisai(self) -> str: @@ -152,6 +154,30 @@ def database_file_parse_trials(self) -> int: def database_file_parse_interval(self) -> int: return int(os.getenv("SMARTSIM_DB_FILE_PARSE_INTERVAL", "2")) + @property + def dragon_dotenv(self) -> Path: + """Returns the path to a .env file containing dragon environment variables""" + return self.conf_dir / "dragon" / ".env" + + @property + def dragon_server_path(self) -> t.Optional[str]: + return os.getenv( + "SMARTSIM_DRAGON_SERVER_PATH", + os.getenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", None), + ) + + @property + def dragon_server_timeout(self) -> int: + return int(os.getenv("SMARTSIM_DRAGON_TIMEOUT", "30000")) + + @property + def dragon_server_startup_timeout(self) -> int: + return int(os.getenv("SMARTSIM_DRAGON_STARTUP_TIMEOUT", "300000")) + + @property + def dragon_transport(self) -> str: + return os.getenv("SMARTSIM_DRAGON_TRANSPORT", "hsta") + @property def log_level(self) -> str: return os.environ.get("SMARTSIM_LOG_LEVEL", "info") @@ -177,8 +203,14 @@ def test_num_gpus(self) 
-> int: # pragma: no cover return int(os.environ.get("SMARTSIM_TEST_NUM_GPUS") or 1) @property - def test_port(self) -> int: # pragma: no cover - return int(os.environ.get("SMARTSIM_TEST_PORT", 6780)) + def test_ports(self) -> t.Sequence[int]: # pragma: no cover + min_required_ports = 25 + first_port = int(os.environ.get("SMARTSIM_TEST_PORT", 6780)) + num_ports = max( + int(os.environ.get("SMARTSIM_TEST_NUM_PORTS", min_required_ports)), + min_required_ports, + ) + return range(first_port, first_port + num_ports) @property def test_batch_resources(self) -> t.Dict[t.Any, t.Any]: # pragma: no cover @@ -219,6 +251,11 @@ def test_account(self) -> t.Optional[str]: # pragma: no cover # no account by default return os.environ.get("SMARTSIM_TEST_ACCOUNT", None) + @property + def test_mpi(self) -> bool: # pragma: no cover + # By default, test MPI app if it compiles + return int(os.environ.get("SMARTSIM_TEST_MPI", "1")) > 0 + @property def telemetry_frequency(self) -> int: return int(os.environ.get("SMARTSIM_TELEMETRY_FREQUENCY", 5)) @@ -235,6 +272,29 @@ def telemetry_cooldown(self) -> int: def telemetry_subdir(self) -> str: return ".smartsim/telemetry" + @property + def dragon_default_subdir(self) -> str: + return ".smartsim/dragon" + + @property + def dragon_log_filename(self) -> str: + return "dragon_config.log" + + @property + def smartsim_key_path(self) -> str: + """Path to a root directory used for persistence of key files. Default + value `$HOME/.smartsim/keys`. User-overrideable by setting the environment + variable `SMARTSIM_KEY_PATH`. + + :returns: The configured key path. 
+ """ + default_path = Path.home() / ".smartsim" / "keys" + return os.environ.get("SMARTSIM_KEY_PATH", str(default_path)) + + @property + def dragon_pin(self) -> str: + return "0.9" + @lru_cache(maxsize=128, typed=False) def get_config() -> Config: diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 3b673970a..43a218545 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -27,6 +27,7 @@ from __future__ import annotations import itertools +import os import os.path as osp import pathlib import pickle @@ -36,14 +37,17 @@ import threading import time import typing as t -from os import environ from smartredis import Client, ConfigOptions from smartsim._core.utils.network import get_ip_from_host from ..._core.launcher.step import Step -from ..._core.utils.helpers import unpack_colo_db_identifier, unpack_db_identifier +from ..._core.utils.helpers import ( + SignalInterceptionStack, + unpack_colo_db_identifier, + unpack_db_identifier, +) from ..._core.utils.redis import ( db_is_active, set_ml_model, @@ -51,7 +55,7 @@ shutdown_db_node, ) from ...database import Orchestrator -from ...entity import Ensemble, EntityList, EntitySequence, Model, SmartSimEntity +from ...entity import Ensemble, EntitySequence, Model, SmartSimEntity from ...error import ( LauncherError, SmartSimError, @@ -61,16 +65,25 @@ ) from ...log import get_logger from ...servertype import CLUSTERED, STANDALONE -from ...status import STATUS_CANCELLED, STATUS_RUNNING, TERMINAL_STATUSES +from ...status import TERMINAL_STATUSES, SmartSimStatus from ..config import CONFIG -from ..launcher import LocalLauncher, LSFLauncher, PBSLauncher, SlurmLauncher +from ..launcher import ( + DragonLauncher, + LocalLauncher, + LSFLauncher, + PBSLauncher, + SlurmLauncher, +) from ..launcher.launcher import Launcher from ..utils import check_cluster_status, create_cluster, serialize +from .controller_utils import _AnonymousBatchJob, 
_look_up_launched_data from .job import Job from .jobmanager import JobManager from .manifest import LaunchedManifest, LaunchedManifestBuilder, Manifest if t.TYPE_CHECKING: + from types import FrameType + from ..utils.serialize import TStepLaunchMetaData @@ -90,7 +103,6 @@ def __init__(self, launcher: str = "local") -> None: """Initialize a Controller :param launcher: the type of launcher being used - :type launcher: str """ self._jobs = JobManager(JM_LOCK) self.init_launcher(launcher) @@ -112,9 +124,16 @@ def start( The controller will start the job-manager thread upon execution of all jobs. """ + # launch a telemetry monitor to track job progress + if CONFIG.telemetry_enabled: + self._start_telemetry_monitor(exp_path) + self._jobs.kill_on_interrupt = kill_on_interrupt + # register custom signal handler for ^C (SIGINT) - signal.signal(signal.SIGINT, self._jobs.signal_interrupt) + SignalInterceptionStack.get(signal.SIGINT).push_unique( + self._jobs.signal_interrupt + ) launched = self._launch(exp_name, exp_path, manifest) # start the job manager thread if not already started @@ -125,16 +144,17 @@ def start( launched.map(_look_up_launched_data(self._launcher)) ) - # launch a telemetry monitor to track job progress - if CONFIG.telemetry_enabled: - self._start_telemetry_monitor(exp_path) - # block until all non-database jobs are complete if block: # poll handles its own keyboard interrupt as - # it may be called seperately + # it may be called separately self.poll(5, True, kill_on_interrupt=kill_on_interrupt) + @property + def active_orchestrator_jobs(self) -> t.Dict[str, Job]: + """Return active orchestrator jobs.""" + return {**self._jobs.db_jobs} + @property def orchestrator_active(self) -> bool: with JM_LOCK: @@ -148,11 +168,8 @@ def poll( """Poll running jobs and receive logging output of job status :param interval: number of seconds to wait before polling again - :type interval: int :param verbose: set verbosity - :type verbose: bool :param kill_on_interrupt: 
flag for killing jobs when SIGINT is received - :type kill_on_interrupt: bool, optional """ self._jobs.kill_on_interrupt = kill_on_interrupt to_monitor = self._jobs.jobs @@ -172,7 +189,6 @@ def finished( """Return a boolean indicating wether a job has finished or not :param entity: object launched by SmartSim. - :type entity: Entity | EntitySequence :returns: bool :raises ValueError: if entity has not been launched yet """ @@ -202,7 +218,6 @@ def stop_entity( the jobmanager so that the job appears as "cancelled". :param entity: entity to be stopped - :type entity: Entity | EntitySequence """ with JM_LOCK: job = self._jobs[entity.name] @@ -225,8 +240,8 @@ def stop_entity( def stop_db(self, db: Orchestrator) -> None: """Stop an orchestrator + :param db: orchestrator to be stopped - :type db: Orchestrator """ if db.batch: self.stop_entity(db) @@ -243,7 +258,13 @@ def stop_db(self, db: Orchestrator) -> None: continue job = self._jobs[node.name] - job.set_status(STATUS_CANCELLED, "", 0, output=None, error=None) + job.set_status( + SmartSimStatus.STATUS_CANCELLED, + "", + 0, + output=None, + error=None, + ) self._jobs.move_to_completed(job) db.reset_hosts() @@ -252,7 +273,6 @@ def stop_entity_list(self, entity_list: EntitySequence[SmartSimEntity]) -> None: """Stop an instance of an entity list :param entity_list: entity list to be stopped - :type entity_list: EntitySequence """ if entity_list.batch: @@ -271,14 +291,12 @@ def get_jobs(self) -> t.Dict[str, Job]: def get_entity_status( self, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] - ) -> str: + ) -> SmartSimStatus: """Get the status of an entity :param entity: entity to get status of - :type entity: SmartSimEntity | EntitySequence :raises TypeError: if not SmartSimEntity | EntitySequence :return: status of entity - :rtype: str """ if not isinstance(entity, (SmartSimEntity, EntitySequence)): raise TypeError( @@ -289,15 +307,13 @@ def get_entity_status( def get_entity_list_status( self, entity_list: 
EntitySequence[SmartSimEntity] - ) -> t.List[str]: + ) -> t.List[SmartSimStatus]: """Get the statuses of an entity list :param entity_list: entity list containing entities to get statuses of - :type entity_list: EntitySequence :raises TypeError: if not EntitySequence - :return: list of str statuses - :rtype: list + :return: list of SmartSimStatus statuses """ if not isinstance(entity_list, EntitySequence): raise TypeError( @@ -316,7 +332,6 @@ def init_launcher(self, launcher: str) -> None: and local launching :param launcher: which launcher to initialize - :type launcher: str :raises SSUnsupportedError: if a string is passed that is not a supported launcher :raises TypeError: if no launcher argument is provided. @@ -327,6 +342,7 @@ def init_launcher(self, launcher: str) -> None: "pals": PBSLauncher, "lsf": LSFLauncher, "local": LocalLauncher, + "dragon": DragonLauncher, } if launcher is not None: @@ -340,6 +356,37 @@ def init_launcher(self, launcher: str) -> None: else: raise TypeError("Must provide a 'launcher' argument") + @staticmethod + def symlink_output_files( + job_step: Step, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] + ) -> None: + """Create symlinks for entity output files that point to the output files + under the .smartsim directory + + :param job_step: Job step instance + :param entity: Entity instance + """ + historical_out, historical_err = map(pathlib.Path, job_step.get_output_files()) + entity_out = pathlib.Path(entity.path) / f"{entity.name}.out" + entity_err = pathlib.Path(entity.path) / f"{entity.name}.err" + + # check if there is already a link to a previous run + if entity_out.is_symlink() or entity_err.is_symlink(): + entity_out.unlink() + entity_err.unlink() + + historical_err.touch() + historical_out.touch() + + if historical_err.exists() and historical_out.exists(): + entity_out.symlink_to(historical_out) + entity_err.symlink_to(historical_err) + else: + raise FileNotFoundError( + f"Output files for {entity.name} could 
not be found. " + "Symlinking files failed." + ) + def _launch( self, exp_name: str, exp_path: str, manifest: Manifest ) -> LaunchedManifest[t.Tuple[str, Step]]: @@ -349,15 +396,14 @@ def _launch( address of the database can be given to following entities :param exp_name: The name of the launching experiment - :type exp_name: str :param exp_path: path to location of ``Experiment`` directory if generated - :type exp_path: str :param manifest: Manifest of deployables to launch - :type manifest: Manifest """ manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]]( - exp_name=exp_name, exp_path=exp_path, launcher_name=str(self._launcher) + exp_name=exp_name, + exp_path=exp_path, + launcher_name=str(self._launcher), ) # Loop over deployables to launch and launch multiple orchestrators for orchestrator in manifest.dbs: @@ -385,6 +431,11 @@ def _launch( steps: t.List[ t.Tuple[Step, t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]]] ] = [] + + symlink_substeps: t.List[ + t.Tuple[Step, t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]]] + ] = [] + for elist in manifest.ensembles: ens_telem_dir = manifest_builder.run_telemetry_subdirectory / "ensemble" if elist.batch: @@ -392,6 +443,11 @@ def _launch( manifest_builder.add_ensemble( elist, [(batch_step.name, step) for step in substeps] ) + + # symlink substeps to maintain directory structure + for substep, substep_entity in zip(substeps, elist.models): + symlink_substeps.append((substep, substep_entity)) + steps.append((batch_step, elist)) else: # if ensemble is to be run as separate job steps, aka not in a batch @@ -409,19 +465,26 @@ def _launch( model_telem_dir = manifest_builder.run_telemetry_subdirectory / "model" if model.batch_settings: anon_entity_list = _AnonymousBatchJob(model) - batch_step, _ = self._create_batch_job_step( + batch_step, substeps = self._create_batch_job_step( anon_entity_list, model_telem_dir ) manifest_builder.add_model(model, (batch_step.name, batch_step)) + + 
symlink_substeps.append((substeps[0], model)) steps.append((batch_step, model)) else: job_step = self._create_job_step(model, model_telem_dir) manifest_builder.add_model(model, (job_step.name, job_step)) steps.append((job_step, model)) - # launch steps + # launch and symlink steps for step, entity in steps: self._launch_step(step, entity) + self.symlink_output_files(step, entity) + + # symlink substeps to maintain directory structure + for substep, entity in symlink_substeps: + self.symlink_output_files(substep, entity) return manifest_builder.finalize() @@ -437,10 +500,8 @@ def _launch_orchestrator( set them in the JobManager :param orchestrator: orchestrator to launch - :type orchestrator: Orchestrator :param manifest_builder: An `LaunchedManifestBuilder` to record the names and `Step`s of the launched orchestrator - :type manifest_builder: LaunchedManifestBuilder[tuple[str, Step]] """ orchestrator.remove_stale_files() orc_telem_dir = manifest_builder.run_telemetry_subdirectory / "database" @@ -453,7 +514,13 @@ def _launch_orchestrator( manifest_builder.add_database( orchestrator, [(orc_batch_step.name, step) for step in substeps] ) + self._launch_step(orc_batch_step, orchestrator) + self.symlink_output_files(orc_batch_step, orchestrator) + + # symlink substeps to maintain directory structure + for substep, substep_entity in zip(substeps, orchestrator.entities): + self.symlink_output_files(substep, substep_entity) # if orchestrator was run on existing allocation, locally, or in allocation else: @@ -466,6 +533,7 @@ def _launch_orchestrator( ) for db_step in db_steps: self._launch_step(*db_step) + self.symlink_output_files(*db_step) # wait for orchestrator to spin up self._orchestrator_launch_wait(orchestrator) @@ -506,19 +574,43 @@ def _launch_step( """Use the launcher to launch a job step :param job_step: a job step instance - :type job_step: Step :param entity: entity instance - :type entity: SmartSimEntity :raises SmartSimError: if launch fails """ - try: - 
job_id = self._launcher.run(job_step) - except LauncherError as e: - msg = f"An error occurred when launching {entity.name} \n" - msg += "Check error and output files for details.\n" - msg += f"{entity}" - logger.error(msg) - raise SmartSimError(f"Job step {entity.name} failed to launch") from e + # attempt to retrieve entity name in JobManager.completed + completed_job = self._jobs.completed.get(entity.name, None) + + # if completed job DNE and is the entity name is not + # running in JobManager.jobs or JobManager.db_jobs, + # launch the job + if completed_job is None and ( + entity.name not in self._jobs.jobs and entity.name not in self._jobs.db_jobs + ): + try: + job_id = self._launcher.run(job_step) + except LauncherError as e: + msg = f"An error occurred when launching {entity.name} \n" + msg += "Check error and output files for details.\n" + msg += f"{entity}" + logger.error(msg) + raise SmartSimError(f"Job step {entity.name} failed to launch") from e + + # if the completed job does exist and the entity passed in is the same + # that has ran and completed, relaunch the entity. + elif completed_job is not None and completed_job.entity is entity: + try: + job_id = self._launcher.run(job_step) + except LauncherError as e: + msg = f"An error occurred when launching {entity.name} \n" + msg += "Check error and output files for details.\n" + msg += f"{entity}" + logger.error(msg) + raise SmartSimError(f"Job step {entity.name} failed to launch") from e + + # the entity is using a duplicate name of an existing entity in + # the experiment, throw an error + else: + raise SSUnsupportedError("SmartSim entities cannot have duplicate names.") # a job step is a task if it is not managed by a workload manager (i.e. 
Slurm) # but is rather started, monitored, and exited through the Popen interface @@ -540,13 +632,10 @@ def _create_batch_job_step( """Use launcher to create batch job step :param entity_list: EntityList to launch as batch - :type entity_list: EntityList :param telemetry_dir: Path to a directory in which the batch job step may write telemetry events - :type telemetry_dir: pathlib.Path :return: batch job step instance and a list of run steps to be executed within the batch job - :rtype: tuple[Step, list[Step]] """ if not entity_list.batch_settings: raise ValueError( @@ -558,7 +647,7 @@ def _create_batch_job_step( entity_list.name, entity_list.path, entity_list.batch_settings ) batch_step.meta["entity_type"] = str(type(entity_list).__name__).lower() - batch_step.meta["status_dir"] = str(telemetry_dir / entity_list.name) + batch_step.meta["status_dir"] = str(telemetry_dir) substeps = [] for entity in entity_list.entities: @@ -575,12 +664,9 @@ def _create_job_step( """Create job steps for all entities with the launcher :param entity: an entity to create a step for - :type entity: SmartSimEntity :param telemetry_dir: Path to a directory in which the job step may write telemetry events - :type telemetry_dir: pathlib.Path :return: the job step - :rtype: Step """ # get SSDB, SSIN, SSOUT and add to entity run settings if isinstance(entity, Model): @@ -597,7 +683,6 @@ def _prep_entity_client_env(self, entity: Model) -> None: """Retrieve all connections registered to this entity :param entity: The entity to retrieve connections from - :type entity: Model """ client_env: t.Dict[str, t.Union[str, int, float, bool]] = {} @@ -662,17 +747,28 @@ def _save_orchestrator(self, orchestrator: Orchestrator) -> None: to the orchestrator. 
:param orchestrator: Orchestrator configuration to be saved - :type orchestrator: Orchestrator """ - dat_file = "/".join((orchestrator.path, "smartsim_db.dat")) - db_jobs = self._jobs.db_jobs - orc_data = {"db": orchestrator, "db_jobs": db_jobs} - steps = [] - for db_job in db_jobs.values(): - steps.append(self._launcher.step_mapping[db_job.name]) - orc_data["steps"] = steps - with open(dat_file, "wb") as pickle_file: + if not orchestrator.is_active(): + raise Exception("Orchestrator is not running") + + # Extract only the db_jobs associated with this particular orchestrator + if orchestrator.batch: + job_names = [orchestrator.name] + else: + job_names = [dbnode.name for dbnode in orchestrator.entities] + db_jobs = { + name: job for name, job in self._jobs.db_jobs.items() if name in job_names + } + + # Extract the associated steps + steps = [ + self._launcher.step_mapping[db_job.name] for db_job in db_jobs.values() + ] + + orc_data = {"db": orchestrator, "db_jobs": db_jobs, "steps": steps} + + with open(orchestrator.checkpoint_file, "wb") as pickle_file: pickle.dump(orc_data, pickle_file) def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: @@ -684,7 +780,6 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: be launched with SSDB address :param orchestrator: orchestrator instance - :type orchestrator: Orchestrator :raises SmartSimError: if launch fails or manually stopped by user """ if orchestrator.batch: @@ -702,10 +797,9 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: # _jobs.get_status acquires JM lock for main thread, no need for locking statuses = self.get_entity_list_status(orchestrator) - if all(stat == STATUS_RUNNING for stat in statuses): + if all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses): ready = True - # TODO remove in favor of by node status check - time.sleep(CONFIG.jm_interval) + # TODO: Add a node status check elif any(stat in TERMINAL_STATUSES for stat in 
statuses): self.stop_db(orchestrator) msg = "Orchestrator failed during startup" @@ -723,14 +817,14 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: # launch explicitly raise - def reload_saved_db(self, checkpoint_file: str) -> Orchestrator: + def reload_saved_db( + self, checkpoint_file: t.Union[str, os.PathLike[str]] + ) -> Orchestrator: with JM_LOCK: - if self.orchestrator_active: - raise SmartSimError("Orchestrator exists and is active") if not osp.exists(checkpoint_file): raise FileNotFoundError( - f"The SmartSim database config file {checkpoint_file} " + f"The SmartSim database config file {os.fspath(checkpoint_file)} " "cannot be found." ) @@ -766,7 +860,7 @@ def reload_saved_db(self, checkpoint_file: str) -> Orchestrator: try: for db_job, step in job_steps: self._jobs.db_jobs[db_job.ename] = db_job - self._launcher.step_mapping[db_job.name] = step + self._launcher.add_step_to_mapping_table(db_job.name, step) if step.task_id: self._launcher.task_manager.add_existing(int(step.task_id)) except LauncherError as e: @@ -795,9 +889,9 @@ def _set_dbobjects(self, manifest: Manifest) -> None: if not db_is_active(hosts=hosts, ports=ports, num_shards=len(db_addresses)): raise SSInternalError("Cannot set DB Objects, DB is not running") - environ[f"SSDB{db_name}"] = db_addresses[0] + os.environ[f"SSDB{db_name}"] = db_addresses[0] - environ[f"SR_DB_TYPE{db_name}"] = ( + os.environ[f"SR_DB_TYPE{db_name}"] = ( CLUSTERED if len(db_addresses) > 1 else STANDALONE ) @@ -833,7 +927,6 @@ def _start_telemetry_monitor(self, exp_dir: str) -> None: of the processes launched through this controller. 
:param exp_dir: An experiment directory - :type exp_dir: str """ if ( self._telemetry_monitor is None @@ -859,43 +952,3 @@ def _start_telemetry_monitor(self, exp_dir: str) -> None: cwd=str(pathlib.Path(__file__).parent.parent.parent), shell=False, ) - logger.debug("Telemetry monitor started") - - -class _AnonymousBatchJob(EntityList[Model]): - @staticmethod - def _validate(model: Model) -> None: - if model.batch_settings is None: - msg = "Unable to create _AnonymousBatchJob without batch_settings" - raise SmartSimError(msg) - - def __init__(self, model: Model) -> None: - self._validate(model) - super().__init__(model.name, model.path) - self.entities = [model] - self.batch_settings = model.batch_settings - - def _initialize_entities(self, **kwargs: t.Any) -> None: ... - - -def _look_up_launched_data( - launcher: Launcher, -) -> t.Callable[[t.Tuple[str, Step]], "TStepLaunchMetaData"]: - def _unpack_launched_data(data: t.Tuple[str, Step]) -> "TStepLaunchMetaData": - # NOTE: we cannot assume that the name of the launched step - # ``launched_step_name`` is equal to the name of the step referring to - # the entity ``step.name`` as is the case when an entity list is - # launched as a batch job - launched_step_name, step = data - launched_step_map = launcher.step_mapping[launched_step_name] - out_file, err_file = step.get_output_files() - return ( - launched_step_map.step_id, - launched_step_map.task_id, - launched_step_map.managed, - out_file, - err_file, - pathlib.Path(step.meta.get("status_dir", step.cwd)), - ) - - return _unpack_launched_data diff --git a/smartsim/_core/control/controller_utils.py b/smartsim/_core/control/controller_utils.py new file mode 100644 index 000000000..37ae9aebf --- /dev/null +++ b/smartsim/_core/control/controller_utils.py @@ -0,0 +1,77 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from __future__ import annotations + +import pathlib +import typing as t + +from ..._core.launcher.step import Step +from ...entity import EntityList, Model +from ...error import SmartSimError +from ..launcher.launcher import Launcher + +if t.TYPE_CHECKING: + from ..utils.serialize import TStepLaunchMetaData + + +class _AnonymousBatchJob(EntityList[Model]): + @staticmethod + def _validate(model: Model) -> None: + if model.batch_settings is None: + msg = "Unable to create _AnonymousBatchJob without batch_settings" + raise SmartSimError(msg) + + def __init__(self, model: Model) -> None: + self._validate(model) + super().__init__(model.name, model.path) + self.entities = [model] + self.batch_settings = model.batch_settings + + def _initialize_entities(self, **kwargs: t.Any) -> None: ... + + +def _look_up_launched_data( + launcher: Launcher, +) -> t.Callable[[t.Tuple[str, Step]], "TStepLaunchMetaData"]: + def _unpack_launched_data(data: t.Tuple[str, Step]) -> "TStepLaunchMetaData": + # NOTE: we cannot assume that the name of the launched step + # ``launched_step_name`` is equal to the name of the step referring to + # the entity ``step.name`` as is the case when an entity list is + # launched as a batch job + launched_step_name, step = data + launched_step_map = launcher.step_mapping[launched_step_name] + out_file, err_file = step.get_output_files() + return ( + launched_step_map.step_id, + launched_step_map.task_id, + launched_step_map.managed, + out_file, + err_file, + pathlib.Path(step.meta.get("status_dir", step.cwd)), + ) + + return _unpack_launched_data diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index f3bd8cf3a..6941d7607 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -24,46 +24,170 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pathlib import time import typing as t from dataclasses import dataclass from ...entity import EntitySequence, SmartSimEntity -from ...status import STATUS_NEW +from ...status import SmartSimStatus @dataclass(frozen=True) class _JobKey: + """A helper class for creating unique lookup keys within the telemetry + monitor. These keys are not guaranteed to be unique across experiments, + only within an experiment (due to process ID re-use by the OS)""" + step_id: str + """The process id of an unmanaged task""" task_id: str + """The task id of a managed task""" class JobEntity: - """API required for a job processed in the JobManager with support for - telemetry monitoring + """An entity containing run-time SmartSimEntity metadata. The run-time metadata + is required to perform telemetry collection. The `JobEntity` satisfies the core + API necessary to use a `JobManager` to manage retrieval of managed step updates. """ def __init__(self) -> None: self.name: str = "" + """The entity name""" self.path: str = "" + """The root path for entity output files""" self.step_id: str = "" + """The process id of an unmanaged task""" self.task_id: str = "" + """The task id of a managed task""" self.type: str = "" + """The type of the associated `SmartSimEntity`""" self.timestamp: int = 0 + """The timestamp when the entity was created""" self.status_dir: str = "" + """The path configured by the experiment for the entities telemetry output""" + self.telemetry_on: bool = False + """"Flag indicating if optional telemetry is enabled for the entity""" + self.collectors: t.Dict[str, str] = {} + """Mapping of collectors enabled for the entity""" + self.config: t.Dict[str, str] = {} + """Telemetry configuration supplied by the experiment""" + self._is_complete: bool = False + """Flag indicating if the entity has completed execution""" @property def is_db(self) -> bool: + """Returns `True` if the entity represents a database or database shard""" return self.type in ["orchestrator", 
"dbnode"] @property def is_managed(self) -> bool: + """Returns `True` if the entity is managed by a workload manager""" return bool(self.step_id) @property def key(self) -> _JobKey: + """Return a `_JobKey` that identifies an entity. + NOTE: not guaranteed to be unique over time due to reused process IDs""" return _JobKey(self.step_id, self.task_id) + @property + def is_complete(self) -> bool: + """Returns `True` if the entity has completed execution""" + return self._is_complete + + def check_completion_status(self) -> None: + """Check for telemetry outputs indicating the entity has completed + TODO: determine correct location to avoid exposing telemetry + implementation details into `JobEntity` + """ + # avoid touching file-system if not necessary + if self._is_complete: + return + + # status telemetry is tracked in JSON files written to disk. look + # for a corresponding `stop` event in the entity status directory + state_file = pathlib.Path(self.status_dir) / "stop.json" + if state_file.exists(): + self._is_complete = True + + @staticmethod + def _map_db_metadata(entity_dict: t.Dict[str, t.Any], entity: "JobEntity") -> None: + """Map DB-specific properties from a runtime manifest onto a `JobEntity` + + :param entity_dict: The raw dictionary deserialized from manifest JSON + :param entity: The entity instance to modify + """ + if entity.is_db: + # add collectors if they're configured to be enabled in the manifest + entity.collectors = { + "client": entity_dict.get("client_file", ""), + "client_count": entity_dict.get("client_count_file", ""), + "memory": entity_dict.get("memory_file", ""), + } + + entity.telemetry_on = any(entity.collectors.values()) + entity.config["host"] = entity_dict.get("hostname", "") + entity.config["port"] = entity_dict.get("port", "") + + @staticmethod + def _map_standard_metadata( + entity_type: str, + entity_dict: t.Dict[str, t.Any], + entity: "JobEntity", + exp_dir: str, + raw_experiment: t.Dict[str, t.Any], + ) -> None: + """Map 
universal properties from a runtime manifest onto a `JobEntity` + + :param entity_type: The type of the associated `SmartSimEntity` + :param entity_dict: The raw dictionary deserialized from manifest JSON + :param entity: The entity instance to modify + :param exp_dir: The path to the experiment working directory + :param raw_experiment: The raw experiment dictionary deserialized from + manifest JSON + """ + metadata = entity_dict["telemetry_metadata"] + status_dir = pathlib.Path(metadata.get("status_dir")) + is_dragon = raw_experiment["launcher"].lower() == "dragon" + + # all entities contain shared properties that identify the task + entity.type = entity_type + entity.name = ( + entity_dict["name"] + if not is_dragon + else entity_dict["telemetry_metadata"]["step_id"] + ) + entity.step_id = str(metadata.get("step_id") or "") + entity.task_id = str(metadata.get("task_id") or "") + entity.timestamp = int(entity_dict.get("timestamp", "0")) + entity.path = str(exp_dir) + entity.status_dir = str(status_dir) + + @classmethod + def from_manifest( + cls, + entity_type: str, + entity_dict: t.Dict[str, t.Any], + exp_dir: str, + raw_experiment: t.Dict[str, t.Any], + ) -> "JobEntity": + """Instantiate a `JobEntity` from the dictionary deserialized from manifest JSON + + :param entity_type: The type of the associated `SmartSimEntity` + :param entity_dict: The raw dictionary deserialized from manifest JSON + :param exp_dir: The path to the experiment working directory + :param raw_experiment: raw experiment deserialized from manifest JSON + """ + entity = JobEntity() + + cls._map_standard_metadata( + entity_type, entity_dict, entity, exp_dir, raw_experiment + ) + cls._map_db_metadata(entity_dict, entity) + + return entity + class Job: """Keep track of various information for the controller. @@ -83,20 +207,15 @@ def __init__( """Initialize a Job. 
:param job_name: Name of the job step - :type job_name: str :param job_id: The id associated with the job - :type job_id: str :param entity: The SmartSim entity(list) associated with the job - :type entity: SmartSimEntity | EntitySequence | JobEntity :param launcher: Launcher job was started with - :type launcher: str :param is_task: process monitored by TaskManager (True) or the WLM (True) - :type is_task: bool """ self.name = job_name self.jid = job_id self.entity = entity - self.status = STATUS_NEW + self.status = SmartSimStatus.STATUS_NEW # status before smartsim status mapping is applied self.raw_status: t.Optional[str] = None self.returncode: t.Optional[int] = None @@ -116,7 +235,7 @@ def ename(self) -> str: def set_status( self, - new_status: str, + new_status: SmartSimStatus, raw_status: str, returncode: t.Optional[int], error: t.Optional[str] = None, @@ -125,9 +244,10 @@ def set_status( """Set the status of a job. :param new_status: The new status of the job - :type new_status: str + :param raw_status: The raw status of the launcher :param returncode: The return code for the job - :type return_code: str + :param error: Content produced by stderr + :param output: Content produced by stdout """ self.status = new_status self.raw_status = raw_status @@ -149,15 +269,12 @@ def reset( """Reset the job in order to be able to restart it. :param new_job_name: name of the new job step - :type new_job_name: str :param new_job_id: new job id to launch under - :type new_job_id: int :param is_task: process monitored by TaskManager (True) or the WLM (True) - :type is_task: bool """ self.name = new_job_name self.jid = new_job_id - self.status = STATUS_NEW + self.status = SmartSimStatus.STATUS_NEW self.returncode = None self.output = None self.error = None @@ -170,7 +287,6 @@ def error_report(self) -> str: """A descriptive error report based on job fields :return: error report for display in terminal - :rtype: str """ warning = f"{self.ename} failed. 
See below for details \n" if self.error: @@ -190,7 +306,6 @@ def __str__(self) -> str: """Return user-readable string of the Job :returns: A user-readable string of the Job - :rtype: str """ if self.jid: job = "{}({}): {}" @@ -208,19 +323,18 @@ class History: def __init__(self, runs: int = 0) -> None: """Init a history object for a job - :param runs: number of runs so far, defaults to 0 - :type runs: int, optional + :param runs: number of runs so far """ self.runs = runs self.jids: t.Dict[int, t.Optional[str]] = {} - self.statuses: t.Dict[int, str] = {} + self.statuses: t.Dict[int, SmartSimStatus] = {} self.returns: t.Dict[int, t.Optional[int]] = {} self.job_times: t.Dict[int, float] = {} def record( self, job_id: t.Optional[str], - status: str, + status: SmartSimStatus, returncode: t.Optional[int], job_time: float, ) -> None: diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index e482b9951..1bc24cf9a 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -35,7 +35,7 @@ from ...database import Orchestrator from ...entity import DBNode, EntitySequence, SmartSimEntity from ...log import ContextThread, get_logger -from ...status import STATUS_NEVER_STARTED, TERMINAL_STATUSES +from ...status import TERMINAL_STATUSES, SmartSimStatus from ..config import CONFIG from ..launcher import Launcher, LocalLauncher from ..utils.network import get_ip_from_host @@ -61,7 +61,6 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None: """Initialize a Jobmanager :param launcher: a Launcher object to manage jobs - :type: SmartSim.Launcher """ self.monitor: t.Optional[Thread] = None @@ -124,7 +123,6 @@ def move_to_completed(self, job: Job) -> None: actively monitored by the job manager :param job: job instance we are transitioning - :type job: Job """ with self._lock: self.completed[job.ename] = job @@ -141,9 +139,7 @@ def __getitem__(self, entity_name: str) -> Job: from which it 
was created. :param entity_name: The name of the entity of a job - :type entity_name: str :returns: the Job associated with the entity_name - :rtype: Job """ with self._lock: entities = ChainMap(self.db_jobs, self.jobs, self.completed) @@ -153,7 +149,6 @@ def __call__(self) -> t.Dict[str, Job]: """Returns dictionary all jobs for () operator :returns: Dictionary of all jobs - :rtype: dictionary """ all_jobs = {**self.jobs, **self.db_jobs} return all_jobs @@ -175,13 +170,9 @@ def add_job( """Add a job to the job manager which holds specific jobs by type. :param job_name: name of the job step - :type job_name: str :param job_id: job step id created by launcher - :type job_id: str :param entity: entity that was launched on job step - :type entity: SmartSimEntity | EntitySequence :param is_task: process monitored by TaskManager (True) or the WLM (True) - :type is_task: bool """ launcher = str(self._launcher) # all operations here should be atomic @@ -197,9 +188,7 @@ def is_finished(self, entity: SmartSimEntity) -> bool: """Detect if a job has completed :param entity: entity to check - :type entity: SmartSimEntity :return: True if finished - :rtype: bool """ with self._lock: job = self[entity.name] # locked operation @@ -239,12 +228,11 @@ def check_jobs(self) -> None: def get_status( self, entity: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], - ) -> str: + ) -> SmartSimStatus: """Return the status of a job. 
:param entity: SmartSimEntity or EntitySequence instance - :type entity: SmartSimEntity | EntitySequence - :returns: tuple of status + :returns: a SmartSimStatus status """ with self._lock: if entity.name in self.completed: @@ -254,13 +242,12 @@ def get_status( job: Job = self[entity.name] # locked return job.status - return STATUS_NEVER_STARTED + return SmartSimStatus.STATUS_NEVER_STARTED def set_launcher(self, launcher: Launcher) -> None: """Set the launcher of the job manager to a specific launcher instance :param launcher: child of Launcher - :type launcher: Launcher instance """ self._launcher = launcher @@ -268,9 +255,7 @@ def query_restart(self, entity_name: str) -> bool: """See if the job just started should be restarted or not. :param entity_name: name of entity to check for a job for - :type entity_name: str :return: if job should be restarted instead of started - :rtype: bool """ if entity_name in self.completed: return True @@ -287,13 +272,9 @@ def restart_job( ready to launch again. 
:param job_name: new job step name - :type job_name: str :param job_id: new job id - :type job_id: str :param entity_name: name of the entity of the job - :type entity_name: str :param is_task: process monitored by TaskManager (True) or the WLM (True) - :type is_task: bool """ with self._lock: @@ -311,7 +292,6 @@ def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: for corresponding database identifiers :return: dictionary of host ip addresses - :rtype: Dict[str, list] """ address_dict: t.Dict[str, t.List[str]] = {} @@ -333,7 +313,6 @@ def set_db_hosts(self, orchestrator: Orchestrator) -> None: """Set the DB hosts in db_jobs so future entities can query this :param orchestrator: orchestrator instance - :type orchestrator: Orchestrator """ # should only be called during launch in the controller @@ -349,9 +328,9 @@ def set_db_hosts(self, orchestrator: Orchestrator) -> None: self.db_jobs[dbnode.name].hosts = dbnode.hosts def signal_interrupt(self, signo: int, _frame: t.Optional[FrameType]) -> None: + """Custom handler for whenever SIGINT is received""" if not signo: logger.warning("Received SIGINT with no signal number") - """Custom handler for whenever SIGINT is received""" if self.actively_monitoring and len(self) > 0: if self.kill_on_interrupt: for _, job in self().items(): diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index 25037540c..fd5770f18 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -68,7 +68,6 @@ def dbs(self) -> t.List[Orchestrator]: :raises SmartSimError: if user added to databases to manifest :return: List of orchestrator instances - :rtype: list[Orchestrator] """ dbs = [item for item in self._deployables if isinstance(item, Orchestrator)] return dbs @@ -78,7 +77,6 @@ def models(self) -> t.List[Model]: """Return Model instances in Manifest :return: model instances - :rtype: List[Model] """ _models: t.List[Model] = [ item for item in self._deployables if 
isinstance(item, Model) @@ -90,7 +88,6 @@ def ensembles(self) -> t.List[Ensemble]: """Return Ensemble instances in Manifest :return: list of ensembles - :rtype: List[Ensemble] """ return [e for e in self._deployables if isinstance(e, Ensemble)] @@ -100,7 +97,6 @@ def all_entity_lists(self) -> t.List[EntitySequence[SmartSimEntity]]: exceptional ones like Orchestrator :return: list of entity lists - :rtype: List[EntitySequence[SmartSimEntity]] """ _all_entity_lists: t.List[EntitySequence[SmartSimEntity]] = list(self.ensembles) @@ -109,6 +105,14 @@ def all_entity_lists(self) -> t.List[EntitySequence[SmartSimEntity]]: return _all_entity_lists + @property + def has_deployable(self) -> bool: + """ + Return True if the manifest contains entities that + must be physically deployed + """ + return bool(self._deployables) + @staticmethod def _check_names(deployables: t.List[t.Any]) -> None: used = [] @@ -294,7 +298,10 @@ def _entities_to_data( def finalize(self) -> LaunchedManifest[_T]: return LaunchedManifest( metadata=_LaunchedManifestMetadata( - self.run_id, self.exp_name, self.exp_path, self.launcher_name + self.run_id, + self.exp_name, + self.exp_path, + self.launcher_name, ), models=tuple(self._models), ensembles=tuple(self._ensembles), diff --git a/smartsim/_core/control/previewrenderer.py b/smartsim/_core/control/previewrenderer.py new file mode 100644 index 000000000..857a70397 --- /dev/null +++ b/smartsim/_core/control/previewrenderer.py @@ -0,0 +1,192 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import pathlib
+import typing as t
+from enum import Enum
+
+import jinja2
+import jinja2.utils as u
+from jinja2 import pass_eval_context
+
+from ..._core.config import CONFIG
+from ..._core.control import Manifest
+from ...error.errors import PreviewFormatError
+from ...log import get_logger
+from .job import Job
+
+logger = get_logger(__name__)
+
+if t.TYPE_CHECKING:
+    from smartsim import Experiment
+
+
+class Format(str, Enum):
+    PLAINTEXT = "plain_text"
+
+
+class Verbosity(str, Enum):
+    INFO = "info"
+    DEBUG = "debug"
+    DEVELOPER = "developer"
+
+
+@pass_eval_context
+def as_toggle(_eval_ctx: u.F, value: bool) -> str:
+    """Return "On" if value returns True,
+    and "Off" if value returns False. 
+ """ + return "On" if value else "Off" + + +@pass_eval_context +def get_ifname(_eval_ctx: u.F, value: t.List[str]) -> str: + """Extract Network Interface from orchestrator run settings.""" + if value: + for val in value: + if "ifname=" in val: + output = val.split("=")[-1] + return output + return "" + + +@pass_eval_context +def get_dbtype(_eval_ctx: u.F, value: str) -> str: + """Extract data base type.""" + if value: + if "-cli" in value: + db_type, _ = value.split("/")[-1].split("-", 1) + return db_type + return "" + + +@pass_eval_context +def is_list(_eval_ctx: u.F, value: str) -> bool: + """Return True if item is of type list, and False + otherwise, to determine how Jinja template should + render an item. + """ + return isinstance(value, list) + + +def render_to_file(content: str, filename: str) -> None: + """Output preview to a file if an output filename + is specified. + + :param content: The rendered preview. + :param filename: The name of the file to write the preview to. + """ + filename = find_available_filename(filename) + + with open(filename, "w", encoding="utf-8") as prev_file: + prev_file.write(content) + + +def render( + exp: "Experiment", + manifest: t.Optional[Manifest] = None, + verbosity_level: Verbosity = Verbosity.INFO, + output_format: Format = Format.PLAINTEXT, + output_filename: t.Optional[str] = None, + active_dbjobs: t.Optional[t.Dict[str, Job]] = None, +) -> str: + """ + Render the template from the supplied entities. + :param experiment: the experiment to be previewed. + :param manifest: the manifest to be previewed. + :param verbosity_level: the verbosity level + :param output_format: the output format. 
+ """ + + verbosity_level = Verbosity(verbosity_level) + + _check_output_format(output_format) + + loader = jinja2.PackageLoader( + "smartsim.templates.templates.preview", output_format.value + ) + env = jinja2.Environment(loader=loader, autoescape=True) + + env.filters["as_toggle"] = as_toggle + env.filters["get_ifname"] = get_ifname + env.filters["get_dbtype"] = get_dbtype + env.filters["is_list"] = is_list + env.globals["Verbosity"] = Verbosity + + tpl_path = "base.template" + + tpl = env.get_template(tpl_path) + + if verbosity_level == Verbosity.INFO: + logger.warning( + "Only showing user set parameters. Some internal entity " + "fields are truncated. To view truncated fields: use verbosity_level " + "'developer' or 'debug.'" + ) + + rendered_preview = tpl.render( + exp_entity=exp, + active_dbjobs=active_dbjobs, + manifest=manifest, + config=CONFIG, + verbosity_level=verbosity_level, + ) + + if output_filename: + render_to_file( + rendered_preview, + output_filename, + ) + else: + logger.info(rendered_preview) + return rendered_preview + + +def find_available_filename(filename: str) -> str: + """Iterate through potentially unique names until one is found that does + not already exist. Return an unused name variation + + :param filename: The name of the file to write the preview to. + """ + + path = pathlib.Path(filename) + candidate_path = pathlib.Path(filename) + index = 1 + + while candidate_path.exists(): + candidate_path = path.with_name(f"{path.stem}_{index:02}.txt") + index += 1 + return str(candidate_path) + + +def _check_output_format(output_format: Format) -> None: + """ + Check that a valid file output format is given. 
+ """ + if not output_format == Format.PLAINTEXT: + raise PreviewFormatError(f"The only valid output format currently available \ +is {Format.PLAINTEXT.value}") diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 600ae2ff3..508251fe0 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -32,7 +32,7 @@ import tempfile import typing as t from pathlib import Path -from subprocess import PIPE, STDOUT +from subprocess import STDOUT from types import FrameType import filelock @@ -62,11 +62,8 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: """Parse options to launch model on local cluster :param client: SmartRedis client connected to local DB - :type client: Client :param db_model: List of arguments defining the model - :type db_model: List[str] :return: Name of model - :rtype: str """ parser = argparse.ArgumentParser("Set ML model on DB") parser.add_argument("--name", type=str) @@ -129,11 +126,8 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: """Parse options to launch script on local cluster :param client: SmartRedis client connected to local DB - :type client: Client :param db_model: List of arguments defining the script - :type db_model: List[str] :return: Name of model - :rtype: str """ parser = argparse.ArgumentParser("Set script on DB") parser.add_argument("--name", type=str) @@ -177,6 +171,7 @@ def main( db_scripts: t.List[t.List[str]], db_identifier: str, ) -> None: + # pylint: disable=too-many-statements global DBPID # pylint: disable=global-statement lo_address = current_ip("lo") @@ -201,8 +196,17 @@ def main( # we generally want to catch all exceptions here as # if this process dies, the application will most likely fail try: - process = psutil.Popen(cmd, stdout=PIPE, stderr=STDOUT) - DBPID = process.pid + hostname = socket.gethostname() + filename = ( + f"colo_orc_{hostname}.log" + if os.getenv("SMARTSIM_LOG_LEVEL") == 
"debug" + else os.devnull + ) + with open(filename, "w", encoding="utf-8") as file: + process = psutil.Popen(cmd, stdout=file.fileno(), stderr=STDOUT) + DBPID = process.pid + # printing to stdout shell file for extraction + print(f"__PID__{DBPID}__PID__", flush=True) except Exception as e: cleanup() @@ -245,12 +249,8 @@ def launch_db_scripts(client: Client, db_scripts: t.List[t.List[str]]) -> None: raise SSInternalError( "Failed to set model or script, could not connect to database" ) from ex - finally: - # Make sure we don't keep this around - del client - - for line in iter(process.stdout.readline, b""): - print(line.decode("utf-8").rstrip(), flush=True) + # Make sure we don't keep this around + del client except Exception as e: cleanup() diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py new file mode 100644 index 000000000..92ebd735f --- /dev/null +++ b/smartsim/_core/entrypoints/dragon.py @@ -0,0 +1,351 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterpris +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import dataclasses +import json +import os +import signal +import socket +import sys +import textwrap +import time +import typing as t +from types import FrameType + +import zmq +import zmq.auth.thread + +from smartsim._core.config import get_config +from smartsim._core.launcher.dragon import dragonSockets +from smartsim._core.launcher.dragon.dragonBackend import DragonBackend +from smartsim._core.schemas import ( + DragonBootstrapRequest, + DragonBootstrapResponse, + DragonShutdownRequest, +) +from smartsim._core.utils.network import get_best_interface_and_address +from smartsim.log import ContextThread, get_logger + +""" +Dragon server entrypoint script +""" + +logger = get_logger("Dragon Server") + +# kill is not catchable +SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] + +SHUTDOWN_INITIATED = False + + +@dataclasses.dataclass +class DragonEntrypointArgs: + launching_address: str + interface: str + + +def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: + if not signo: + logger.info("Received signal with no signo") + else: + logger.info(f"Received signal {signo}") + cleanup() + + +def get_log_path() -> str: + config = get_config() + return config.dragon_log_filename + + +def print_summary(network_interface: str, ip_address: str) -> None: + zmq_config = {"interface": network_interface, "address": ip_address} + + log_path = get_log_path() + with open(log_path, "w", 
encoding="utf-8") as dragon_config_log: + dragon_config_log.write( + textwrap.dedent(f"""\ + -------- Dragon Configuration -------- + IPADDRESS: {ip_address} + NETWORK: {network_interface} + HOSTNAME: {socket.gethostname()} + DRAGON_SERVER_CONFIG: {json.dumps(zmq_config)} + -------------------------------------- + """), + ) + + +def start_updater( + backend: DragonBackend, updater: t.Optional[ContextThread] +) -> ContextThread: + """Start the ``DragonBackend`` updater thread. + + If ``updater`` is not None, then it is first checked and if it + alive, no other thread is started. + + :param backend: The dragon backend for which the thread will be started + :param updater: An existing updater thread that might have to be replaced + :return: Running updater thread + """ + # If the updater was started, check if it completed or died + if updater is not None: + updater.join(0.1) + # If it's alive, there is nothing to do + if updater.is_alive(): + return updater + updater = ContextThread(name="DragonBackend", daemon=True, target=backend.update) + updater.start() + return updater + + +def is_updater_healthy(backend: DragonBackend) -> bool: + """Check if the backend has been updated recently. + + The acceptable delay is defined as the server timeout plus the backend's cooldown + period. If the server timeout is set to `-1`, then the acceptable delay is set to + one minute plus the cooldown period. + + :param backend: The backend for which the updater's health is checked + :return: Whether the backend was updated recently + """ + server_timeout = get_config().dragon_server_timeout / 1000 + acceptable_delay = backend.cooldown_period + ( + 60.0 if server_timeout == -1 else server_timeout + ) + + heartbeat_delay = backend.current_time - backend.last_heartbeat + if heartbeat_delay > acceptable_delay: + logger.debug( + f"Updater inactive for {heartbeat_delay:.2f} seconds, will request restart." 
+ ) + return False + return True + + +def updater_fallback(backend: DragonBackend, updater: ContextThread) -> ContextThread: + """Check if updater has updated the backend recently, if not, check its status + and start a new one if it is not alive. + :param backend: The dragon backend for which the udpater's health must be checked + :param updater: The updater thread which has to be checked and (possibly) replaced + :return: Running updater thread + """ + if is_updater_healthy(backend): + return updater + return start_updater(backend, updater) + + +# pylint: disable-next=too-many-statements +def run( + zmq_context: "zmq.Context[t.Any]", + dragon_head_address: str, + dragon_pid: int, +) -> None: + logger.debug(f"Opening socket {dragon_head_address}") + dragon_head_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REP, True) + dragon_head_socket.bind(dragon_head_address) + dragon_backend = DragonBackend(pid=dragon_pid) + + backend_updater = start_updater(dragon_backend, None) + server = dragonSockets.as_server(dragon_head_socket) + + logger.debug(f"Listening to {dragon_head_address}") + + while not dragon_backend.should_shutdown: + try: + req = server.recv() + logger.debug(f"Received {type(req).__name__} {req}") + except zmq.Again: + backend_updater = updater_fallback(dragon_backend, backend_updater) + continue + + resp = dragon_backend.process_request(req) + + logger.debug(f"Sending {type(resp).__name__} {resp}") + try: + server.send(resp) + except zmq.Again: + logger.error("Could not send response back to launcher.") + backend_updater = updater_fallback(dragon_backend, backend_updater) + + # We can only check the heartbeat if the backend has not shut down + if not dragon_backend.should_shutdown: + logger.debug(f"Listening to {dragon_head_address}") + backend_updater = updater_fallback(dragon_backend, backend_updater) + + if SHUTDOWN_INITIATED: + dragon_backend.process_request(DragonShutdownRequest()) + + logger.info("Backend shutdown has been requested") + + 
if backend_updater.is_alive(): + backend_updater.join(1) + + if not dragon_backend.frontend_shutdown: + logger.info("Frontend will have to be shut down externally") + while True: + logger.info("Waiting for external shutdown") + time.sleep(5) + + +def execute_entrypoint(args: DragonEntrypointArgs) -> int: + if_config = get_best_interface_and_address() + interface = if_config.interface + address = if_config.address + if not interface: + raise ValueError("Net interface could not be determined") + dragon_head_address = f"tcp://{address}" + + smartsim_config = get_config() + if args.launching_address: + zmq_context = zmq.Context() + zmq_context.setsockopt( + zmq.SNDTIMEO, value=smartsim_config.dragon_server_timeout + ) + zmq_context.setsockopt( + zmq.RCVTIMEO, value=smartsim_config.dragon_server_timeout + ) + zmq_context.setsockopt(zmq.REQ_CORRELATE, 1) + zmq_context.setsockopt(zmq.REQ_RELAXED, 1) + + if str(args.launching_address).split(":", maxsplit=1)[0] == dragon_head_address: + address = "localhost" + dragon_head_address = "tcp://localhost:5555" + else: + dragon_head_address += ":5555" + + zmq_authenticator = dragonSockets.get_authenticator(zmq_context, timeout=-1) + + logger.debug("Getting launcher socket") + launcher_socket = dragonSockets.get_secure_socket(zmq_context, zmq.REQ, False) + + logger.debug(f"Connecting launcher socket to: {args.launching_address}") + launcher_socket.connect(args.launching_address) + client = dragonSockets.as_client(launcher_socket) + + logger.debug( + f"Sending bootstrap request to launcher_socket with {dragon_head_address}" + ) + client.send(DragonBootstrapRequest(address=dragon_head_address)) + response = client.recv() + + logger.debug(f"Received bootstrap response: {response}") + if not isinstance(response, DragonBootstrapResponse): + raise ValueError( + "Could not receive connection confirmation from launcher. Aborting." 
+ ) + + print_summary(interface, dragon_head_address) + + try: + logger.debug("Executing event loop") + run( + zmq_context=zmq_context, + dragon_head_address=dragon_head_address, + dragon_pid=response.dragon_pid, + ) + except Exception as e: + logger.error(f"Dragon server failed with {e}", exc_info=True) + return os.EX_SOFTWARE + finally: + if zmq_authenticator is not None and zmq_authenticator.is_alive(): + zmq_authenticator.stop() + + logger.info("Shutting down! Bye bye!") + + return 0 + + +def remove_config_log() -> None: + """Remove the Dragon `config_log` file from the file system. Used to + clean up after a dragon environment is shutdown to eliminate an + unnecessary attempt to connect to a stopped ZMQ server.""" + log_path = get_log_path() + if os.path.exists(log_path): + os.remove(log_path) + + +def cleanup() -> None: + global SHUTDOWN_INITIATED # pylint: disable=global-statement + logger.debug("Cleaning up") + remove_config_log() + SHUTDOWN_INITIATED = True + + +def register_signal_handlers() -> None: + # make sure to register the cleanup before the start + # the process so our signaller will be able to stop + # the database process. 
+ for sig in SIGNALS: + signal.signal(sig, handle_signal) + + +def parse_arguments(args: t.List[str]) -> DragonEntrypointArgs: + parser = argparse.ArgumentParser( + prefix_chars="+", description="SmartSim Dragon Head Process" + ) + parser.add_argument( + "+launching_address", + type=str, + help="Address of launching process if a ZMQ connection can be established", + required=True, + ) + parser.add_argument( + "+interface", + type=str, + help="Network Interface name", + required=False, + ) + args_ = parser.parse_args(args) + + if not args_.launching_address: + raise ValueError("Empty launching address supplied.") + + return DragonEntrypointArgs(args_.launching_address, args_.interface) + + +def main(args_: t.List[str]) -> int: + """Execute the dragon entrypoint as a module""" + os.environ["PYTHONUNBUFFERED"] = "1" + logger.info("Dragon server started") + + args = parse_arguments(args_) + register_signal_handlers() + + try: + return_code = execute_entrypoint(args) + return return_code + except Exception: + logger.error( + "An unexpected error occurred in the Dragon entrypoint.", exc_info=True + ) + finally: + cleanup() + + return -1 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/smartsim/_core/entrypoints/dragon_client.py b/smartsim/_core/entrypoints/dragon_client.py new file mode 100644 index 000000000..e998ddce1 --- /dev/null +++ b/smartsim/_core/entrypoints/dragon_client.py @@ -0,0 +1,203 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterpris +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import dataclasses +import json +import os +import signal +import sys +import time +import typing as t +from pathlib import Path +from types import FrameType + +import zmq + +from smartsim._core.launcher.dragon.dragonConnector import DragonConnector +from smartsim._core.schemas import ( + DragonHandshakeRequest, + DragonRequest, + DragonShutdownRequest, + request_registry, +) +from smartsim.log import get_logger + +""" +Dragon client entrypoint script, used to start a server, send requests to it +and then shut it down. 
+""" + +logger = get_logger("Dragon Client") + +SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] + + +@dataclasses.dataclass +class DragonClientEntrypointArgs: + submit: Path + + +def cleanup() -> None: + """Cleanup resources""" + logger.debug("Cleaning up") + + +def parse_requests(request_filepath: Path) -> t.List[DragonRequest]: + """Parse serialized requests from file + + :param request_filepath: Path to file with serialized requests + :return: Deserialized requests + """ + requests: t.List[DragonRequest] = [] + try: + with open(request_filepath, "r", encoding="utf-8") as request_file: + req_strings = json.load(fp=request_file) + except FileNotFoundError as e: + logger.error( + "Could not find file with run requests," + f"please check whether {request_filepath} exists." + ) + raise e from None + except json.JSONDecodeError as e: + logger.error(f"Could not decode request file {request_filepath}.") + raise e from None + + requests = [request_registry.from_string(req_str) for req_str in req_strings] + + return requests + + +def parse_arguments(args: t.List[str]) -> DragonClientEntrypointArgs: + """Parse arguments used to run entrypoint script + + :param args: Arguments without name of executable + :raises ValueError: If the request file is not specified + :return: Parsed arguments + """ + parser = argparse.ArgumentParser( + prefix_chars="+", + description="SmartSim Dragon Client Process, to be used in batch scripts", + ) + parser.add_argument("+submit", type=str, help="Path to request file", required=True) + args_ = parser.parse_args(args) + + if not args_.submit: + raise ValueError("Request file not provided.") + + return DragonClientEntrypointArgs(submit=Path(args_.submit)) + + +def handle_signal(signo: int, _frame: t.Optional[FrameType] = None) -> None: + """Handle signals sent to this process + + :param signo: Signal number + :param _frame: Frame, defaults to None + """ + if not signo: + logger.info("Received signal with no signo") + 
else: + logger.info(f"Received signal {signo}") + cleanup() + + +def register_signal_handlers() -> None: + """Register signal handlers prior to execution""" + # make sure to register the cleanup before the start + # the process so our signaller will be able to stop + # the server process. + for sig in SIGNALS: + signal.signal(sig, handle_signal) + + +def execute_entrypoint(args: DragonClientEntrypointArgs) -> int: + """Execute the entrypoint with specified arguments + + :param args: Parsed arguments + :return: Return code + """ + + try: + requests = parse_requests(args.submit) + except Exception: + logger.error("Dragon client failed to parse request file", exc_info=True) + return os.EX_OSFILE + + requests.append(DragonShutdownRequest(immediate=False, frontend_shutdown=True)) + + connector = DragonConnector() + + for request in requests: + response = connector.send_request(request) + if response.error_message is not None: + logger.error(response.error_message) + + logger.info("Terminated sending requests, waiting for Dragon Server to complete") + + if not connector.can_monitor: + logger.error( + "Could not get Dragon Server PID and will not be able to monitor it." 
+ ) + return os.EX_IOERR + + while True: + try: + time.sleep(5) + connector.send_request(DragonHandshakeRequest()) + except zmq.error.Again: + logger.debug("Could not reach server, assuming backend has shut down") + break + + logger.info("Client has finished.") + + return os.EX_OK + + +def main(args_: t.List[str]) -> int: + """Execute the dragon client entrypoint as a module""" + + os.environ["PYTHONUNBUFFERED"] = "1" + logger.info("Dragon client started") + + args = parse_arguments(args_) + register_signal_handlers() + + try: + return execute_entrypoint(args) + except Exception: + logger.error( + "An unexpected error occurred in the Dragon client entrypoint", + exc_info=True, + ) + finally: + cleanup() + + return os.EX_SOFTWARE + + +if __name__ == "__main__": + + sys.exit(main(sys.argv[1:])) diff --git a/smartsim/_core/entrypoints/indirect.py b/smartsim/_core/entrypoints/indirect.py index f94ad6e61..1f445ac4a 100644 --- a/smartsim/_core/entrypoints/indirect.py +++ b/smartsim/_core/entrypoints/indirect.py @@ -37,8 +37,8 @@ import psutil import smartsim.log -from smartsim._core.entrypoints.telemetrymonitor import track_event -from smartsim._core.utils.helpers import decode_cmd, get_ts +from smartsim._core.utils.helpers import decode_cmd, get_ts_ms +from smartsim._core.utils.telemetry.telemetry import write_event STEP_PID: t.Optional[int] = None logger = smartsim.log.get_logger(__name__) @@ -49,15 +49,21 @@ def main( cmd: str, - etype: str, + entity_type: str, cwd: str, status_dir: str, ) -> int: - """The main function of the entrypoint. This function takes an encoded step - command and runs it in a subprocess. In the background, this entrypoint - will then monitor the subprocess and write out status events such as when - the subprocess has started or stopped and write these events to a status - directory. + """This function receives an encoded step command from a SmartSim Experiment + and runs it in a subprocess. 
The entrypoint integrates with the telemetry + monitor by writing status update events. It is useful for wrapping + unmanaged tasks - a workload manager can be queried for a managed task + to achieve the same result. + + :param cmd: a base64 encoded cmd to execute + :param entity_type: `SmartSimEntity` entity class. Valid values + include: orchestrator, dbnode, ensemble, model + :param cwd: working directory to execute the cmd from + :param status_dir: path to the output directory for status updates """ global STEP_PID # pylint: disable=global-statement proxy_pid = os.getpid() @@ -94,34 +100,37 @@ def main( cleanup() return 1 finally: - track_event( - get_ts(), + write_event( + get_ts_ms(), proxy_pid, "", # step_id for unmanaged task is always empty - etype, + entity_type, "start", status_path, - logger, detail=start_detail, return_code=start_rc, ) logger.info(f"Waiting for child process {STEP_PID} to complete") - ret_code = process.wait() + + try: + ret_code = process.wait() + except Exception: + logger.error("Failed to complete process", exc_info=True) + ret_code = -1 logger.info( f"Indirect proxy {proxy_pid} child process {STEP_PID} complete." 
f" return code: {ret_code}" ) msg = f"Process {STEP_PID} finished with return code: {ret_code}" - track_event( - get_ts(), + write_event( + get_ts_ms(), proxy_pid, "", # step_id for unmanaged task is always empty - etype, + entity_type, "stop", status_path, - logger, detail=msg, return_code=ret_code, ) @@ -132,11 +141,12 @@ def main( def cleanup() -> None: """Perform cleanup required for clean termination""" - logger.info("Performing cleanup") global STEP_PID # pylint: disable=global-statement if STEP_PID is None: return + logger.info("Performing cleanup") + try: # attempt to stop the subprocess performing step-execution if psutil.pid_exists(STEP_PID): @@ -228,7 +238,7 @@ def get_parser() -> argparse.ArgumentParser: rc = main( cmd=parsed_args.command, - etype=parsed_args.entity_type, + entity_type=parsed_args.entity_type, cwd=parsed_args.working_dir, status_dir=parsed_args.telemetry_dir, ) diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index 018fc26fd..c4d8cbbd6 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -37,7 +37,6 @@ from smartsim._core.utils.network import current_ip from smartsim.entity.dbnode import LaunchedShardData -from smartsim.error import SSInternalError from smartsim.log import get_logger logger = get_logger(__name__) @@ -111,6 +110,7 @@ def main(args: argparse.Namespace) -> int: *build_cluster_args(shard_data), *build_bind_args(src_addr, *bind_addrs), ] + print_summary(cmd, args.ifname, shard_data) try: @@ -119,9 +119,10 @@ def main(args: argparse.Namespace) -> int: for line in iter(process.stdout.readline, b""): print(line.decode("utf-8").rstrip(), flush=True) - except Exception as e: + except Exception: cleanup() - raise SSInternalError("Database process starter raised an exception") from e + logger.error("Database process starter raised an exception", exc_info=True) + return 1 return 0 @@ -179,6 +180,7 @@ def cleanup() -> None: action="store_true", 
help="Specify if this orchestrator shard is part of a cluster", ) + args_ = parser.parse_args() # make sure to register the cleanup before the start diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py index 115528bf4..5ed1a0c91 100644 --- a/smartsim/_core/entrypoints/telemetrymonitor.py +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -23,667 +23,149 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - import argparse -import json +import asyncio import logging import os +import os.path import pathlib import signal import sys -import threading -import time import typing as t -from dataclasses import dataclass, field from types import FrameType -from watchdog.events import ( - FileCreatedEvent, - FileModifiedEvent, - LoggingEventHandler, - PatternMatchingEventHandler, +import smartsim._core.config as cfg +from smartsim._core.utils.telemetry.telemetry import ( + TelemetryMonitor, + TelemetryMonitorArgs, ) -from watchdog.observers import Observer -from watchdog.observers.api import BaseObserver - -from smartsim._core.config import CONFIG -from smartsim._core.control.job import JobEntity, _JobKey -from smartsim._core.control.jobmanager import JobManager -from smartsim._core.launcher.launcher import Launcher -from smartsim._core.launcher.local.local import LocalLauncher -from smartsim._core.launcher.lsf.lsfLauncher import LSFLauncher -from smartsim._core.launcher.pbs.pbsLauncher import PBSLauncher -from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher -from smartsim._core.launcher.stepInfo import StepInfo -from smartsim._core.utils.helpers import get_ts -from smartsim._core.utils.serialize import MANIFEST_FILENAME -from smartsim.error.errors import SmartSimError -from smartsim.status import STATUS_COMPLETED, 
TERMINAL_STATUSES - -"""Telemetry Monitor entrypoint""" - -# kill is not catchable -SIGNALS = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] -_EventClass = t.Literal["start", "stop", "timestep"] -_MAX_MANIFEST_LOAD_ATTEMPTS: t.Final[int] = 6 - - -@dataclass -class Run: - """Model containing entities of an individual start call for an experiment""" - - timestamp: int - models: t.List[JobEntity] - orchestrators: t.List[JobEntity] - ensembles: t.List[JobEntity] - - def flatten( - self, filter_fn: t.Optional[t.Callable[[JobEntity], bool]] = None - ) -> t.List[JobEntity]: - """Flatten runs into a list of SmartSimEntity run events""" - entities = self.models + self.orchestrators + self.ensembles - if filter_fn: - entities = [entity for entity in entities if filter_fn(entity)] - return entities - - -@dataclass -class RuntimeManifest: - """The runtime manifest holds meta information about the experiment entities created - at runtime to satisfy the experiment requirements. - """ - - name: str - path: pathlib.Path - launcher: str - runs: t.List[Run] = field(default_factory=list) - - -def _hydrate_persistable( - persistable_entity: t.Dict[str, t.Any], - entity_type: str, - exp_dir: str, -) -> JobEntity: - """Populate JobEntity instance with supplied metdata and instance details""" - entity = JobEntity() - - metadata = persistable_entity["telemetry_metadata"] - status_dir = pathlib.Path(metadata.get("status_dir")) - - entity.type = entity_type - entity.name = persistable_entity["name"] - entity.step_id = str(metadata.get("step_id") or "") - entity.task_id = str(metadata.get("task_id") or "") - entity.timestamp = int(persistable_entity.get("timestamp", "0")) - entity.path = str(exp_dir) - entity.status_dir = str(status_dir) - - return entity - - -def hydrate_persistable( - entity_type: str, - persistable_entity: t.Dict[str, t.Any], - exp_dir: pathlib.Path, -) -> t.List[JobEntity]: - """Map entity data persisted in a manifest file to an object""" - entities = [] 
- - # an entity w/parent key creates persistables for entities it contains - parent_keys = {"shards", "models"} - parent_keys = parent_keys.intersection(persistable_entity.keys()) - if parent_keys: - container = "shards" if "shards" in parent_keys else "models" - child_type = "orchestrator" if container == "shards" else "model" - for child_entity in persistable_entity[container]: - entity = _hydrate_persistable(child_entity, child_type, str(exp_dir)) - entities.append(entity) - - return entities - - entity = _hydrate_persistable(persistable_entity, entity_type, str(exp_dir)) - entities.append(entity) - return entities - - -def hydrate_persistables( - entity_type: str, - run: t.Dict[str, t.Any], - exp_dir: pathlib.Path, -) -> t.Dict[str, t.List[JobEntity]]: - """Map a collection of entity data persisted in a manifest file to an object""" - persisted: t.Dict[str, t.List[JobEntity]] = { - "model": [], - "orchestrator": [], - } - for item in run[entity_type]: - entities = hydrate_persistable(entity_type, item, exp_dir) - for new_entity in entities: - persisted[new_entity.type].append(new_entity) - - return persisted - - -def hydrate_runs( - persisted_runs: t.List[t.Dict[str, t.Any]], exp_dir: pathlib.Path -) -> t.List[Run]: - """Map run data persisted in a manifest file to an object""" - the_runs: t.List[Run] = [] - for run_instance in persisted_runs: - run_entities: t.Dict[str, t.List[JobEntity]] = { - "model": [], - "orchestrator": [], - "ensemble": [], - } - - for key in run_entities: - _entities = hydrate_persistables(key, run_instance, exp_dir) - for entity_type, new_entities in _entities.items(): - if new_entities: - run_entities[entity_type].extend(new_entities) - - run = Run( - run_instance["timestamp"], - run_entities["model"], - run_entities["orchestrator"], - run_entities["ensemble"], - ) - the_runs.append(run) - - return the_runs - - -def load_manifest(file_path: str) -> t.Optional[RuntimeManifest]: - """Load a persisted manifest and return the content""" - 
manifest_dict: t.Optional[t.Dict[str, t.Any]] = None - try_count = 1 - - while manifest_dict is None and try_count < _MAX_MANIFEST_LOAD_ATTEMPTS: - source = pathlib.Path(file_path) - source = source.resolve() - - try: - if text := source.read_text(encoding="utf-8").strip(): - manifest_dict = json.loads(text) - except json.JSONDecodeError as ex: - print(f"Error loading manifest: {ex}") - # hack/fix: handle issues reading file before it is fully written - time.sleep(0.5 * try_count) - finally: - try_count += 1 - - if not manifest_dict: - return None - - exp = manifest_dict.get("experiment", None) - if not exp: - raise ValueError("Manifest missing required experiment") - - runs = manifest_dict.get("runs", None) - if runs is None: - raise ValueError("Manifest missing required runs") - - exp_dir = pathlib.Path(exp["path"]) - runs = hydrate_runs(runs, exp_dir) - - manifest = RuntimeManifest( - name=exp["name"], - path=exp_dir, - launcher=exp["launcher"], - runs=runs, - ) - return manifest - - -def track_event( - timestamp: int, - task_id: t.Union[int, str], - step_id: str, - etype: str, - action: _EventClass, - status_dir: pathlib.Path, - logger: logging.Logger, - detail: str = "", - return_code: t.Optional[int] = None, -) -> None: - """Persist a tracking event for an entity""" - tgt_path = status_dir / f"{action}.json" - tgt_path.parent.mkdir(parents=True, exist_ok=True) - - try: - task_id = int(task_id) - except ValueError: - pass - - entity_dict = { - "timestamp": timestamp, - "job_id": task_id, - "step_id": step_id, - "type": etype, - "action": action, - } - - if detail is not None: - entity_dict["detail"] = detail - - if return_code is not None: - entity_dict["return_code"] = return_code - - try: - if not tgt_path.exists(): - # Don't overwrite existing tracking files - bytes_written = tgt_path.write_text(json.dumps(entity_dict, indent=2)) - if bytes_written < 1: - logger.warning("event tracking failed to write tracking file.") - except Exception: - 
logger.error("Unable to write tracking file.", exc_info=True) - - -def faux_return_code(step_info: StepInfo) -> t.Optional[int]: - """Create a faux return code for a task run by the WLM. Must not be - called with non-terminal statuses or results may be confusing - """ - if step_info.status not in TERMINAL_STATUSES: - return None - - if step_info.status == STATUS_COMPLETED: - return os.EX_OK - - return 1 - - -class ManifestEventHandler(PatternMatchingEventHandler): - """The ManifestEventHandler monitors an experiment for changes and updates - a telemetry datastore as needed. - - It contains event handlers that are triggered by changes to a runtime experiment - manifest. The runtime manifest differs from a standard manifest. A runtime manifest - may contain multiple experiment executions in a `runs` collection. - - It also contains a long-polling loop that checks experiment entities for updates - at each timestep. - """ - - def __init__( - self, - pattern: str, - logger: logging.Logger, - ignore_patterns: t.Any = None, - ignore_directories: bool = True, - case_sensitive: bool = False, - ) -> None: - super().__init__( - [pattern], ignore_patterns, ignore_directories, case_sensitive - ) # type: ignore - self._logger = logger - self._tracked_runs: t.Dict[int, Run] = {} - self._tracked_jobs: t.Dict[_JobKey, JobEntity] = {} - self._completed_jobs: t.Dict[_JobKey, JobEntity] = {} - self._launcher: t.Optional[Launcher] = None - self.job_manager: JobManager = JobManager(threading.RLock()) - self._launcher_map: t.Dict[str, t.Type[Launcher]] = { - "slurm": SlurmLauncher, - "pbs": PBSLauncher, - "lsf": LSFLauncher, - "local": LocalLauncher, - } - - def init_launcher(self, launcher: str) -> Launcher: - """Initialize the controller with a specific type of launcher. 
- SmartSim currently supports slurm, pbs(pro), lsf, - and local launching - - :param launcher: which launcher to initialize - :type launcher: str - :raises SSUnsupportedError: if a string is passed that is not - a supported launcher - :raises TypeError: if no launcher argument is provided. - """ - if not launcher: - raise TypeError("Must provide a 'launcher' argument") - - if launcher_type := self._launcher_map.get(launcher.lower(), None): - return launcher_type() - - raise ValueError("Launcher type not supported: " + launcher) - - def set_launcher(self, launcher_type: str) -> None: - """Set the launcher for the experiment""" - self._launcher = self.init_launcher(launcher_type) - self.job_manager.set_launcher(self._launcher) - self.job_manager.start() - - def process_manifest(self, manifest_path: str) -> None: - """Read the runtime manifest for the experiment and track new entities - - :param manifest_path: The full path to the manifest file - :type manifest_path: str - """ - try: - manifest = load_manifest(manifest_path) - if not manifest: - return - except json.JSONDecodeError: - self._logger.error(f"Malformed manifest encountered: {manifest_path}") - return - except ValueError: - self._logger.error("Manifest content error", exc_info=True) - return - - if self._launcher is None: - self.set_launcher(manifest.launcher) - - if not self._launcher: - raise SmartSimError(f"Unable to set launcher from {manifest_path}") - - runs = [run for run in manifest.runs if run.timestamp not in self._tracked_runs] - - exp_dir = pathlib.Path(manifest_path).parent.parent.parent - - for run in runs: - for entity in run.flatten( - filter_fn=lambda e: e.key not in self._tracked_jobs and e.is_managed - ): - entity.path = str(exp_dir) - - self._tracked_jobs[entity.key] = entity - track_event( - run.timestamp, - entity.task_id, - entity.step_id, - entity.type, - "start", - pathlib.Path(entity.status_dir), - self._logger, - ) +from smartsim.log import DEFAULT_LOG_FORMAT, HostnameFilter - if 
entity.is_managed: - self.job_manager.add_job( - entity.name, - entity.task_id, - entity, - False, - ) - self._launcher.step_mapping.add( - entity.name, entity.step_id, entity.task_id, True - ) - self._tracked_runs[run.timestamp] = run +"""Telemetry Monitor entrypoint +Starts a long-running, standalone process that hosts a `TelemetryMonitor`""" - def on_modified(self, event: FileModifiedEvent) -> None: - """Event handler for when a file or directory is modified. - :param event: Event representing file/directory modification. - :type event: FileModifiedEvent - """ - super().on_modified(event) # type: ignore - self._logger.info(f"processing manifest modified @ {event.src_path}") - self.process_manifest(event.src_path) +logger = logging.getLogger("TelemetryMonitor") - def on_created(self, event: FileCreatedEvent) -> None: - """Event handler for when a file or directory is created. - :param event: Event representing file/directory creation. - :type event: FileCreatedEvent - """ - super().on_created(event) # type: ignore - self._logger.info(f"processing manifest created @ {event.src_path}") - self.process_manifest(event.src_path) - - def _to_completed( - self, - timestamp: int, - entity: JobEntity, - step_info: StepInfo, - ) -> None: - """Move a monitored entity from the active to completed collection to - stop monitoring for updates during timesteps. 
- - :param timestamp: the current timestamp for event logging - :type timestamp: int - :param entity: the running SmartSim Job - :type entity: JobEntity - :param experiment_dir: the experiement directory to monitor for changes - :type experiment_dir: pathlib.Path - :param entity: the StepInfo received when requesting a Job status update - :type entity: StepInfo - """ - inactive_entity = self._tracked_jobs.pop(entity.key) - if entity.key not in self._completed_jobs: - self._completed_jobs[entity.key] = inactive_entity - - job = self.job_manager[entity.name] - self.job_manager.move_to_completed(job) - - status_clause = f"status: {step_info.status}" - error_clause = f", error: {step_info.error}" if step_info.error else "" - detail = f"{status_clause}{error_clause}" - - if hasattr(job.entity, "status_dir"): - write_path = pathlib.Path(job.entity.status_dir) - - track_event( - timestamp, - entity.task_id, - entity.step_id, - entity.type, - "stop", - write_path, - self._logger, - detail=detail, - return_code=faux_return_code(step_info), - ) - - def on_timestep(self, timestamp: int) -> None: - """Called at polling frequency to request status updates on - monitored entities - - :param timestamp: the current timestamp for event logging - :type timestamp: int - :param experiment_dir: the experiement directory to monitor for changes - :type experiment_dir: pathlib.Path - """ - entity_map = self._tracked_jobs - - if not self._launcher: - return - - # consider not using name to avoid collisions - names = {entity.name: entity for entity in entity_map.values()} - - if names: - step_updates = self._launcher.get_step_update(list(names.keys())) - - for step_name, step_info in step_updates: - if step_info and step_info.status in TERMINAL_STATUSES: - completed_entity = names[step_name] - self._to_completed(timestamp, completed_entity, step_info) - - -def can_shutdown(action_handler: ManifestEventHandler, logger: logging.Logger) -> bool: - jobs = action_handler.job_manager.jobs - 
db_jobs = action_handler.job_manager.db_jobs - - has_jobs = bool(jobs) - has_dbs = bool(db_jobs) - has_running_jobs = has_jobs or has_dbs - - if has_jobs: - logger.debug(f"telemetry monitor is monitoring {len(jobs)} jobs") - if has_dbs: - logger.debug(f"telemetry monitor is monitoring {len(db_jobs)} dbs") - - return not has_running_jobs - - -def event_loop( - observer: BaseObserver, - action_handler: ManifestEventHandler, - frequency: t.Union[int, float], - logger: logging.Logger, - cooldown_duration: int, +def register_signal_handlers( + handle_signal: t.Callable[[int, t.Optional[FrameType]], None] ) -> None: - """Executes all attached timestep handlers every seconds + """Register a signal handling function for all termination events - :param observer: (optional) a preconfigured watchdog Observer to inject - :type observer: t.Optional[BaseObserver] - :param action_handler: The manifest event processor instance - :type action_handler: ManifestEventHandler - :param frequency: frequency (in seconds) of update loop - :type frequency: t.Union[int, float] - :param logger: a preconfigured Logger instance - :type logger: logging.Logger - :param cooldown_duration: number of seconds the telemetry monitor should - poll for new jobs before attempting to shutdown - :type cooldown_duration: int + :param handle_signal: the function to execute when a term signal is received """ - elapsed: int = 0 - last_ts: int = get_ts() - - while observer.is_alive(): - timestamp = get_ts() - logger.debug(f"Telemetry timestep: {timestamp}") - action_handler.on_timestep(timestamp) - - elapsed += timestamp - last_ts - last_ts = timestamp - - if can_shutdown(action_handler, logger): - if elapsed >= cooldown_duration: - logger.info("beginning telemetry manager shutdown") - observer.stop() # type: ignore - else: - # reset cooldown any time there are still jobs running - elapsed = 0 - - time.sleep(frequency) + # NOTE: omitting kill because it is not catchable + term_signals = [signal.SIGINT, 
signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] + for signal_num in term_signals: + signal.signal(signal_num, handle_signal) -def main( - frequency: t.Union[int, float], - experiment_dir: pathlib.Path, - logger: logging.Logger, - observer: t.Optional[BaseObserver] = None, - cooldown_duration: t.Optional[int] = 0, -) -> int: - """Setup the monitoring entities and start the timer-based loop that - will poll for telemetry data +def get_parser() -> argparse.ArgumentParser: + """Instantiate a parser to process command line arguments - :param frequency: frequency (in seconds) of update loop - :type frequency: t.Union[int, float] - :param experiment_dir: the experiement directory to monitor for changes - :type experiment_dir: pathlib.Path - :param logger: a preconfigured Logger instance - :type logger: logging.Logger - :param observer: (optional) a preconfigured Observer to inject - :type observer: t.Optional[BaseObserver] - :param cooldown_duration: number of seconds the telemetry monitor should - poll for new jobs before attempting to shutdown - :type cooldown_duration: int + :returns: An argument parser ready to accept required telemetry monitor parameters """ - manifest_relpath = pathlib.Path(CONFIG.telemetry_subdir) / MANIFEST_FILENAME - manifest_path = experiment_dir / manifest_relpath - monitor_pattern = str(manifest_relpath) - - logger.info( - f"Executing telemetry monitor with frequency: {frequency}s" - f", on target directory: {experiment_dir}" - f" matching pattern: {monitor_pattern}" - ) - - cooldown_duration = cooldown_duration or CONFIG.telemetry_cooldown - log_handler = LoggingEventHandler(logger) # type: ignore - action_handler = ManifestEventHandler(monitor_pattern, logger) - - if observer is None: - observer = Observer() - - try: - if manifest_path.exists(): - # a manifest may not exist depending on startup timing - action_handler.process_manifest(str(manifest_path)) - - observer.schedule(log_handler, experiment_dir, recursive=True) # type:ignore - 
observer.schedule(action_handler, experiment_dir, recursive=True) # type:ignore - observer.start() # type: ignore - - event_loop(observer, action_handler, frequency, logger, cooldown_duration) - return os.EX_OK - except Exception as ex: - logger.error(ex) - finally: - if observer.is_alive(): - observer.stop() # type: ignore - observer.join() - - return os.EX_SOFTWARE - - -def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None: - """Helper function to ensure clean process termination""" - if not signo: - logger = logging.getLogger() - logger.warning("Received signal with no signo") - - -def register_signal_handlers() -> None: - """Register a signal handling function for all termination events""" - for sig in SIGNALS: - signal.signal(sig, handle_signal) - - -def get_parser() -> argparse.ArgumentParser: - """Instantiate a parser to process command line arguments""" arg_parser = argparse.ArgumentParser(description="SmartSim Telemetry Monitor") - arg_parser.add_argument( - "-frequency", - type=int, - help="Frequency of telemetry updates (in seconds))", - required=True, - ) arg_parser.add_argument( "-exp_dir", type=str, help="Experiment root directory", required=True, ) + arg_parser.add_argument( + "-frequency", + type=float, + help="Frequency of telemetry updates (in seconds))", + required=True, + ) arg_parser.add_argument( "-cooldown", type=int, help="Default lifetime of telemetry monitor (in seconds) before auto-shutdown", - default=CONFIG.telemetry_cooldown, + default=cfg.CONFIG.telemetry_cooldown, + ) + arg_parser.add_argument( + "-loglevel", + type=int, + help="Logging level", + default=logging.INFO, ) return arg_parser -if __name__ == "__main__": - os.environ["PYTHONUNBUFFERED"] = "1" +def parse_arguments() -> TelemetryMonitorArgs: + """Parse the command line arguments and return an instance + of TelemetryMonitorArgs populated with the CLI inputs + :returns: `TelemetryMonitorArgs` instance populated with command line arguments + """ parser = 
get_parser() - args = parser.parse_args() + parsed_args = parser.parse_args() + return TelemetryMonitorArgs( + parsed_args.exp_dir, + parsed_args.frequency, + parsed_args.cooldown, + parsed_args.loglevel, + ) - log = logging.getLogger(f"{__name__}.TelemetryMonitor") - log.setLevel(logging.DEBUG) - log.propagate = False - log_path = os.path.join( - args.exp_dir, CONFIG.telemetry_subdir, "telemetrymonitor.log" - ) - fh = logging.FileHandler(log_path, "a") - log.addHandler(fh) +def configure_logger(logger_: logging.Logger, log_level_: int, exp_dir: str) -> None: + """Configure the telemetry monitor logger to write logs to the + target output file path passed as an argument to the entrypoint + + :param logger_: logger to configure + :param log_level_: log level to apply to the python logging system + :param exp_dir: root path to experiment outputs + """ + logger_.setLevel(log_level_) + logger_.propagate = False + + # use a standard subdirectory of the experiment output path for logs + telemetry_dir = pathlib.Path(exp_dir) / cfg.CONFIG.telemetry_subdir + + # all telemetry monitor logs are written to file in addition to stdout + log_path = telemetry_dir / "logs/telemetrymonitor.out" + log_path.parent.mkdir(parents=True, exist_ok=True) + file_handler = logging.FileHandler(log_path, "a") + + # HostnameFilter is required to enrich log context to use DEFAULT_LOG_FORMAT + file_handler.addFilter(HostnameFilter()) + + formatter = logging.Formatter(DEFAULT_LOG_FORMAT) + file_handler.setFormatter(formatter) + logger_.addHandler(file_handler) + + +if __name__ == "__main__": + """Prepare the telemetry monitor process using command line arguments. 
+ + Sample usage: + python -m smartsim._core.entrypoints.telemetrymonitor -exp_dir + -frequency 30 -cooldown 90 -loglevel INFO + The experiment id is generated during experiment startup + and can be found in the manifest.json in /.smartsim/telemetry + """ + os.environ["PYTHONUNBUFFERED"] = "1" + + args = parse_arguments() + configure_logger(logger, args.log_level, args.exp_dir) + + telemetry_monitor = TelemetryMonitor(args) # Must register cleanup before the main loop is running - register_signal_handlers() + def cleanup_telemetry_monitor(_signo: int, _frame: t.Optional[FrameType]) -> None: + """Create an enclosure on `manifest_observer` to avoid global variables""" + logger.info("Shutdown signal received by telemetry monitor entrypoint") + telemetry_monitor.cleanup() + + register_signal_handlers(cleanup_telemetry_monitor) try: - main( - int(args.frequency), - pathlib.Path(args.exp_dir), - log, - cooldown_duration=args.cooldown, - ) + asyncio.run(telemetry_monitor.run()) sys.exit(0) except Exception: - log.exception( + logger.exception( "Shutting down telemetry monitor due to unexpected error", exc_info=True ) diff --git a/smartsim/_core/generation/generator.py b/smartsim/_core/generation/generator.py index 502753df7..8706cf568 100644 --- a/smartsim/_core/generation/generator.py +++ b/smartsim/_core/generation/generator.py @@ -64,11 +64,8 @@ def __init__( collision between entities. :param gen_path: Path in which files need to be generated - :type gen_path: str - :param overwrite: toggle entity replacement, defaults to False - :type overwrite: bool, optional + :param overwrite: toggle entity replacement :param verbose: Whether generation information should be logged to std out - :type verbose: bool, optional """ self._writer = ModelWriter() self.gen_path = gen_path @@ -82,7 +79,6 @@ def log_file(self) -> str: of all generated entities. 
:returns: path to file with parameter settings - :rtype: str """ return join(self.gen_path, "smartsim_params.txt") @@ -129,11 +125,7 @@ def set_tag(self, tag: str, regex: t.Optional[str] = None) -> None: :param tag: A string of characters that signify the string to be changed. Defaults to ``;`` - :type tag: str - - :param regex: full regex for the modelwriter to search for, - defaults to None - :type regex: str | None + :param regex: full regex for the modelwriter to search for """ self._writer.set_tag(tag, regex) @@ -148,7 +140,7 @@ def _gen_exp_dir(self) -> None: ) if not path.isdir(self.gen_path): # keep exists ok for race conditions on NFS - pathlib.Path(self.gen_path).mkdir(exist_ok=True) + pathlib.Path(self.gen_path).mkdir(exist_ok=True, parents=True) else: logger.log( level=self.log_level, msg="Working in previously created experiment" @@ -167,7 +159,6 @@ def _gen_orc_dir(self, orchestrator_list: t.List[Orchestrator]) -> None: configuration files for the orchestrator. :param orchestrator: Orchestrator instance - :type orchestrator: Orchestrator | None """ # Loop through orchestrators for orchestrator in orchestrator_list: @@ -177,13 +168,12 @@ def _gen_orc_dir(self, orchestrator_list: t.List[Orchestrator]) -> None: # Always remove orchestrator files if present. 
if path.isdir(orc_path): shutil.rmtree(orc_path, ignore_errors=True) - pathlib.Path(orc_path).mkdir(exist_ok=self.overwrite) + pathlib.Path(orc_path).mkdir(exist_ok=self.overwrite, parents=True) def _gen_entity_list_dir(self, entity_lists: t.List[Ensemble]) -> None: """Generate directories for Ensemble instances :param entity_lists: list of Ensemble instances - :type entity_lists: list """ if not entity_lists: @@ -209,9 +199,7 @@ def _gen_entity_dirs( """Generate directories for Entity instances :param entities: list of Model instances - :type entities: list[Model] - :param entity_list: Ensemble instance, defaults to None - :type entity_list: Ensemble | None + :param entity_list: Ensemble instance :raises EntityExistsError: if a directory already exists for an entity by that name """ @@ -247,7 +235,6 @@ def _write_tagged_entity_files(self, entity: Model) -> None: an Ensemble. :param entity: a Model instance - :type entity: Model """ if entity.files: to_write = [] @@ -258,7 +245,6 @@ def _build_tagged_files(tagged: TaggedFilesHierarchy) -> None: :param tagged: a TaggedFileHierarchy to be built as a directory structure - :type tagged: TaggedFilesHierarchy """ for file in tagged.files: dst_path = path.join(entity.path, tagged.base, path.basename(file)) @@ -291,9 +277,7 @@ def _log_params( and what values were set to the parameters :param entity: the model being generated - :type entity: Model :param files_to_params: a dict connecting each file to its parameter settings - :type files_to_params: t.Dict[str, t.Dict[str, str]] """ used_params: t.Dict[str, str] = {} file_to_tables: t.Dict[str, str] = {} @@ -333,7 +317,6 @@ def _copy_entity_files(entity: Model) -> None: """Copy the entity files and directories attached to this entity. :param entity: Model - :type entity: Model """ if entity.files: for to_copy in entity.files.copy: @@ -348,7 +331,6 @@ def _link_entity_files(entity: Model) -> None: """Symlink the entity files attached to this entity. 
:param entity: Model - :type entity: Model """ if entity.files: for to_link in entity.files.link: diff --git a/smartsim/_core/generation/modelwriter.py b/smartsim/_core/generation/modelwriter.py index 3062ea1db..2998d4e35 100644 --- a/smartsim/_core/generation/modelwriter.py +++ b/smartsim/_core/generation/modelwriter.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import collections import re import typing as t @@ -47,10 +48,8 @@ def set_tag(self, tag: str, regex: t.Optional[str] = None) -> None: :param tag: tag for the modelwriter to search for, defaults to semi-colon e.g. ";" - :type tag: str :param regex: full regex for the modelwriter to search for, defaults to "(;.+;)" - :type regex: str, optional """ if regex: self.regex = regex @@ -68,13 +67,9 @@ def configure_tagged_model_files( instance. :param tagged_files: list of paths to tagged files - :type model: list[str] :param params: model parameters - :type params: dict[str, str] :param make_missing_tags_fatal: raise an error if a tag is missing - :type make_missing_tags_fatal: bool :returns: A dict connecting each file to its parameter settings - :rtype: dict[str,dict[str,str]] """ files_to_tags: t.Dict[str, t.Dict[str, str]] = {} for tagged_file in tagged_files: @@ -89,7 +84,6 @@ def _set_lines(self, file_path: str) -> None: """Set the lines for the modelwrtter to iterate over :param file_path: path to the newly created and tagged file - :type file_path: str :raises ParameterWriterError: if the newly created file cannot be read """ try: @@ -117,43 +111,31 @@ def _replace_tags( model. 
The tag defaults to ";" :param model: The model instance - :type model: Model :param make_fatal: (Optional) Set to True to force a fatal error if a tag is not matched - :type make_fatal: bool :returns: A dict of parameter names and values set for the file - :rtype: dict[str,str] """ edited = [] - unused_tags: t.Dict[str, t.List[int]] = {} + unused_tags: t.DefaultDict[str, t.List[int]] = collections.defaultdict(list) used_params: t.Dict[str, str] = {} - for i, line in enumerate(self.lines): - search = re.search(self.regex, line) - if search: - while search: - tagged_line = search.group(0) - previous_value = self._get_prev_value(tagged_line) - if self._is_ensemble_spec(tagged_line, params): - new_val = str(params[previous_value]) - new_line = re.sub(self.regex, new_val, line, 1) - search = re.search(self.regex, new_line) - used_params[previous_value] = new_val - if not search: - edited.append(new_line) - else: - line = new_line - - # if a tag is found but is not in this model's configurations - # put in placeholder value - else: - tag = tagged_line.split(self.tag)[1] - if tag not in unused_tags: - unused_tags[tag] = [] - unused_tags[tag].append(i + 1) - edited.append(re.sub(self.regex, previous_value, line)) - search = None # Move on to the next tag - else: - edited.append(line) + for i, line in enumerate(self.lines, 1): + while search := re.search(self.regex, line): + tagged_line = search.group(0) + previous_value = self._get_prev_value(tagged_line) + if self._is_ensemble_spec(tagged_line, params): + new_val = str(params[previous_value]) + line = re.sub(self.regex, new_val, line, 1) + used_params[previous_value] = new_val + + # if a tag is found but is not in this model's configurations + # put in placeholder value + else: + tag = tagged_line.split(self.tag)[1] + unused_tags[tag].append(i) + line = re.sub(self.regex, previous_value, line) + break + edited.append(line) + for tag, value in unused_tags.items(): missing_tag_message = f"Unused tag {tag} on line(s): 
{str(value)}" if make_fatal: diff --git a/smartsim/_core/launcher/__init__.py b/smartsim/_core/launcher/__init__.py index 0c4001cd4..d78909641 100644 --- a/smartsim/_core/launcher/__init__.py +++ b/smartsim/_core/launcher/__init__.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from .dragon.dragonLauncher import DragonLauncher from .launcher import Launcher from .local.local import LocalLauncher from .lsf.lsfLauncher import LSFLauncher @@ -32,6 +33,7 @@ __all__ = [ "Launcher", + "DragonLauncher", "LocalLauncher", "LSFLauncher", "PBSLauncher", diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index 11d26b141..c69a9cef1 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -42,11 +42,8 @@ def write_colocated_launch_script( is created for this entity. :param file_name: name of the script to write - :type file_name: str :param db_log: log file for the db - :type db_log: str :param colocated_settings: db settings from entity run_settings - :type colocated_settings: dict[str, Any] """ colocated_cmd = _build_colocated_wrapper_cmd(db_log, **colocated_settings) @@ -67,9 +64,14 @@ def write_colocated_launch_script( # STDOUT of the job if colocated_settings["debug"]: script_file.write("export SMARTSIM_LOG_LEVEL=debug\n") - - script_file.write(f"{colocated_cmd}\n") - script_file.write("DBPID=$!\n\n") + script_file.write(f"db_stdout=$({colocated_cmd})\n") + # extract and set DBPID within the shell script that is + # enclosed between __PID__ and sent to stdout by the colocated + # entrypoints file + script_file.write( + "DBPID=$(echo $db_stdout | sed -n " + "'s/.*__PID__\\([0-9]*\\)__PID__.*/\\1/p')\n" + ) # Write the actual launch command for the app script_file.write("$@\n\n") @@ -88,21 +90,13 @@ def _build_colocated_wrapper_cmd( """Build the command use to run a colocated 
DB application :param db_log: log file for the db - :type db_log: str - :param cpus: db cpus, defaults to 1 - :type cpus: int, optional - :param rai_args: redisai args, defaults to None - :type rai_args: dict[str, str], optional - :param extra_db_args: extra redis args, defaults to None - :type extra_db_args: dict[str, str], optional + :param cpus: db cpus + :param rai_args: redisai args + :param extra_db_args: extra redis args :param port: port to bind DB to - :type port: int :param ifname: network interface(s) to bind DB to - :type ifname: str | list[str], optional :param db_cpu_list: The list of CPUs that the database should be limited to - :type db_cpu_list: str, optional :return: the command to run - :rtype: str """ # pylint: disable=too-many-locals @@ -190,10 +184,8 @@ def _build_colocated_wrapper_cmd( db_script_cmd = _build_db_script_cmd(db_scripts) db_cmd.extend(db_script_cmd) - # run colocated db in the background - db_cmd.append("&") - cmd.extend(db_cmd) + return " ".join(cmd) @@ -235,7 +227,8 @@ def _build_db_script_cmd(db_scripts: t.List[DBScript]) -> t.List[str]: if db_script.func: # Notice that here db_script.func is guaranteed to be a str # because we don't allow the user to pass a serialized function - sanitized_func = db_script.func.replace("\n", "\\n") + func = db_script.func + sanitized_func = func.replace("\n", "\\n") if not ( sanitized_func.startswith("'") and sanitized_func.endswith("'") diff --git a/smartsim/_core/launcher/dragon/__init__.py b/smartsim/_core/launcher/dragon/__init__.py new file mode 100644 index 000000000..efe03908e --- /dev/null +++ b/smartsim/_core/launcher/dragon/__init__.py @@ -0,0 +1,25 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/smartsim/_core/launcher/dragon/dragonBackend.py b/smartsim/_core/launcher/dragon/dragonBackend.py new file mode 100644 index 000000000..245660662 --- /dev/null +++ b/smartsim/_core/launcher/dragon/dragonBackend.py @@ -0,0 +1,734 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import collections +import functools +import itertools +import time +import typing as t +from dataclasses import dataclass, field +from enum import Enum +from threading import RLock + +from tabulate import tabulate + +# pylint: disable=import-error +# isort: off +import dragon.infrastructure.connection as dragon_connection +import dragon.infrastructure.policy as dragon_policy +import dragon.native.group_state as dragon_group_state +import dragon.native.process as dragon_process +import dragon.native.process_group as dragon_process_group +import dragon.native.machine as dragon_machine + +# pylint: enable=import-error +# isort: on +from ...._core.config import get_config +from ...._core.schemas import ( + DragonHandshakeRequest, + DragonHandshakeResponse, + DragonRequest, + DragonResponse, + DragonRunRequest, + DragonRunResponse, + DragonShutdownRequest, + DragonShutdownResponse, + DragonStopRequest, + DragonStopResponse, + DragonUpdateStatusRequest, + DragonUpdateStatusResponse, +) +from ...._core.utils.helpers import create_short_id_str +from ....log import get_logger +from ....status import TERMINAL_STATUSES, SmartSimStatus + +logger = get_logger(__name__) + + +class DragonStatus(str, 
Enum): + ERROR = str(dragon_group_state.Error()) + RUNNING = str(dragon_group_state.Running()) + + def __str__(self) -> str: + return self.value + + +@dataclass +class ProcessGroupInfo: + status: SmartSimStatus + """Status of step""" + process_group: t.Optional[dragon_process_group.ProcessGroup] = None + """Internal Process Group object, None for finished or not started steps""" + puids: t.Optional[t.List[t.Optional[int]]] = None # puids can be None + """List of Process UIDS belonging to the ProcessGroup""" + return_codes: t.Optional[t.List[int]] = None + """List of return codes of completed processes""" + hosts: t.List[str] = field(default_factory=list) + """List of hosts on which the Process Group """ + redir_workers: t.Optional[dragon_process_group.ProcessGroup] = None + """Workers used to redirect stdout and stderr to file""" + + @property + def smartsim_info(self) -> t.Tuple[SmartSimStatus, t.Optional[t.List[int]]]: + """Information needed by SmartSim Launcher and Job Manager""" + return (self.status, self.return_codes) + + def __str__(self) -> str: + if self.process_group is not None and self.redir_workers is not None: + msg = [f"Active Group ({self.status})"] + if self.puids is not None: + msg.append(f"Number processes: {len(self.puids)}") + else: + msg = [f"Inactive Group ({self.status})"] + + if self.hosts is not None: + msg.append(f"Hosts: {','.join(self.hosts)}") + if self.return_codes is not None: + msg.append(f"{self.return_codes}") + + return ", ".join(msg) + + +# Thanks to Colin Wahl from HPE HPC Dragon Team +def redir_worker(io_conn: dragon_connection.Connection, file_path: str) -> None: + """Read stdout/stderr from the Dragon connection. 
+ + :param io_conn: Dragon connection to stdout or stderr + :param file_path: path to file to write to + """ + while io_conn is None or not io_conn.readable: + time.sleep(0.1) + try: + with open(file_path, "a", encoding="utf-8") as file_to_write: + while True: + output = io_conn.recv() + print(output, flush=True, file=file_to_write, end="") + except EOFError: + pass + except Exception as e: + print(e) + finally: + try: + io_conn.close() + except Exception as e: + print(e) + + +class DragonBackend: + """The DragonBackend class is the main interface between + SmartSim and Dragon. It is not intended to be user-facing, + and will only be called by the Dragon entry-point script or + by threads spawned by it. + """ + + def __init__(self, pid: int) -> None: + self._pid = pid + """PID of dragon executable which launched this server""" + self._group_infos: t.Dict[str, ProcessGroupInfo] = {} + """ProcessGroup execution state information""" + self._queue_lock = RLock() + """Lock that needs to be acquired to access internal queues""" + self._step_ids = (f"{create_short_id_str()}-{id}" for id in itertools.count()) + """Incremental ID to assign to new steps prior to execution""" + + self._initialize_hosts() + self._queued_steps: "collections.OrderedDict[str, DragonRunRequest]" = ( + collections.OrderedDict() + ) + """Steps waiting for execution""" + self._stop_requests: t.Deque[DragonStopRequest] = collections.deque() + """Stop requests which have not been processed yet""" + self._running_steps: t.List[str] = [] + """List of currently running steps""" + self._completed_steps: t.List[str] = [] + """List of completed steps""" + self._last_beat: float = 0.0 + """Time at which the last heartbeat was set""" + self._heartbeat() + self._last_update_time = self._last_beat + """Time at which the status update was printed the last time""" + self._shutdown_requested = False + """Whether the shutdown was requested to this server""" + self._can_shutdown = False + """Whether the server can 
shut down""" + self._frontend_shutdown: bool = False + """Whether the server frontend should shut down when the backend does""" + self._shutdown_initiation_time: t.Optional[float] = None + """The time at which the server initiated shutdown""" + smartsim_config = get_config() + self._cooldown_period = ( + smartsim_config.telemetry_frequency * 2 + 5 + if smartsim_config.telemetry_enabled + else 5 + ) + """Time in seconds needed to server to complete shutdown""" + + self._view = DragonBackendView(self) + logger.debug(self._view.host_desc) + + @property + def hosts(self) -> list[str]: + with self._queue_lock: + return self._hosts + + @property + def allocated_hosts(self) -> dict[str, str]: + with self._queue_lock: + return self._allocated_hosts + + @property + def free_hosts(self) -> t.Deque[str]: + with self._queue_lock: + return self._free_hosts + + @property + def group_infos(self) -> dict[str, ProcessGroupInfo]: + with self._queue_lock: + return self._group_infos + + def _initialize_hosts(self) -> None: + with self._queue_lock: + self._hosts: t.List[str] = sorted( + dragon_machine.Node(node).hostname + for node in dragon_machine.System().nodes + ) + """List of hosts available in allocation""" + self._free_hosts: t.Deque[str] = collections.deque(self._hosts) + """List of hosts on which steps can be launched""" + self._allocated_hosts: t.Dict[str, str] = {} + """Mapping of hosts on which a step is already running to step ID""" + + def __str__(self) -> str: + return self.status_message + + @property + def status_message(self) -> str: + """Message with status of available nodes and history of launched jobs. 
+ + :returns: Status message + """ + return ( + "Dragon server backend update\n" + f"{self._view.host_table}\n{self._view.step_table}" + ) + + def _heartbeat(self) -> None: + self._last_beat = self.current_time + + @property + def cooldown_period(self) -> int: + """Time (in seconds) the server will wait before shutting down + + when exit conditions are met (see ``should_shutdown()`` for further details). + """ + return self._cooldown_period + + @property + def _has_cooled_down(self) -> bool: + if self._shutdown_initiation_time is None: + logger.debug(f"Starting cooldown period of {self._cooldown_period} seconds") + self._shutdown_initiation_time = self.current_time + return ( + self.current_time - self._shutdown_initiation_time > self._cooldown_period + ) + + @property + def frontend_shutdown(self) -> bool: + """Whether the frontend will have to shutdown once the backend does + + If False, the frontend will wait for an external signal to stop. + """ + return self._frontend_shutdown + + @property + def last_heartbeat(self) -> float: + """Time (in seconds) at which the last heartbeat was set""" + return self._last_beat + + @property + def should_shutdown(self) -> bool: + """Whether the server should shut down + + A server should shut down if a DragonShutdownRequest was received + and it requested immediate shutdown, or if it did not request immediate + shutdown, but all jobs have been executed. + In both cases, a cooldown period may need to be waited before shutdown. + """ + if self._shutdown_requested and self._can_shutdown: + return self._has_cooled_down + return False + + @property + def current_time(self) -> float: + """Current time for DragonBackend object, in seconds since the Epoch""" + return time.time() + + def _can_honor(self, request: DragonRunRequest) -> t.Tuple[bool, t.Optional[str]]: + """Check if request can be honored with resources available in the allocation. 
+ + Currently only checks for total number of nodes, + in the future it will also look at other constraints + such as memory, accelerators, and so on. + """ + if request.nodes > len(self._hosts): + message = f"Cannot satisfy request. Requested {request.nodes} nodes, " + message += f"but only {len(self._hosts)} nodes are available." + return False, message + if self._shutdown_requested: + message = "Cannot satisfy request, server is shutting down." + return False, message + return True, None + + def _allocate_step( + self, step_id: str, request: DragonRunRequest + ) -> t.Optional[t.List[str]]: + + num_hosts: int = request.nodes + with self._queue_lock: + if num_hosts <= 0 or num_hosts > len(self._free_hosts): + return None + to_allocate = [] + for _ in range(num_hosts): + host = self._free_hosts.popleft() + self._allocated_hosts[host] = step_id + to_allocate.append(host) + return to_allocate + + @staticmethod + def _create_redirect_workers( + global_policy: dragon_policy.Policy, + policies: t.List[dragon_policy.Policy], + puids: t.List[int], + out_file: t.Optional[str], + err_file: t.Optional[str], + ) -> dragon_process_group.ProcessGroup: + grp_redir = dragon_process_group.ProcessGroup( + restart=False, policy=global_policy, pmi_enabled=False + ) + for pol, puid in zip(policies, puids): + proc = dragon_process.Process(None, ident=puid) + if out_file: + grp_redir.add_process( + nproc=1, + template=dragon_process.ProcessTemplate( + target=redir_worker, + args=(proc.stdout_conn, out_file), + stdout=dragon_process.Popen.DEVNULL, + policy=pol, + ), + ) + if err_file: + grp_redir.add_process( + nproc=1, + template=dragon_process.ProcessTemplate( + target=redir_worker, + args=(proc.stderr_conn, err_file), + stdout=dragon_process.Popen.DEVNULL, + policy=pol, + ), + ) + + return grp_redir + + def _stop_steps(self) -> None: + self._heartbeat() + with self._queue_lock: + while len(self._stop_requests) > 0: + request = self._stop_requests.popleft() + step_id = request.step_id 
+ if step_id not in self._group_infos: + logger.error(f"Requested to stop non-existing step {step_id}") + continue + + logger.debug(f"Stopping step {step_id}") + if request.step_id in self._queued_steps: + self._queued_steps.pop(step_id) + else: + # Technically we could just terminate, but what if + # the application intercepts that and ignores it? + proc_group = self._group_infos[step_id].process_group + if ( + proc_group is not None + and proc_group.status == DragonStatus.RUNNING + ): + try: + proc_group.kill() + except dragon_process_group.DragonProcessGroupError: + try: + proc_group.stop() + except dragon_process_group.DragonProcessGroupError: + logger.error("Process group already stopped") + redir_group = self._group_infos[step_id].redir_workers + if redir_group is not None: + try: + redir_group.join(0.1) + redir_group = None + except Exception as e: + logger.error(e) + + self._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + self._group_infos[step_id].return_codes = [-9] + + def _start_steps(self) -> None: + self._heartbeat() + with self._queue_lock: + started = [] + for step_id, request in self._queued_steps.items(): + hosts = self._allocate_step(step_id, self._queued_steps[step_id]) + if not hosts: + continue + + logger.debug(f"Step id {step_id} allocated on {hosts}") + + global_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=hosts[0], + ) + grp = dragon_process_group.ProcessGroup( + restart=False, pmi_enabled=request.pmi_enabled, policy=global_policy + ) + + policies = [] + for node_name in hosts: + local_policy = dragon_policy.Policy( + placement=dragon_policy.Policy.Placement.HOST_NAME, + host_name=node_name, + ) + policies.extend([local_policy] * request.tasks_per_node) + tmp_proc = dragon_process.ProcessTemplate( + target=request.exe, + args=request.exe_args, + cwd=request.path, + env={**request.current_env, **request.env}, + stdout=dragon_process.Popen.PIPE, + 
stderr=dragon_process.Popen.PIPE, + policy=local_policy, + ) + grp.add_process(nproc=request.tasks_per_node, template=tmp_proc) + + try: + grp.init() + grp.start() + grp_status = SmartSimStatus.STATUS_RUNNING + except Exception as e: + logger.error(e) + grp_status = SmartSimStatus.STATUS_FAILED + + puids = None + try: + puids = list( + set(grp.puids + [puid for puid, retcode in grp.inactive_puids]) + ) + self._group_infos[step_id] = ProcessGroupInfo( + process_group=grp, + puids=puids, + return_codes=[], + status=grp_status, + hosts=hosts, + ) + self._running_steps.append(step_id) + started.append(step_id) + except Exception as e: + logger.error(e) + + if ( + puids is not None + and len(puids) == len(policies) + and grp_status == SmartSimStatus.STATUS_RUNNING + ): + redir_grp = DragonBackend._create_redirect_workers( + global_policy, + policies, + puids, + request.output_file, + request.error_file, + ) + try: + redir_grp.init() + redir_grp.start() + except Exception as e: + raise IOError( + f"Could not redirect stdout and stderr for PUIDS {puids}" + ) from e + self._group_infos[step_id].redir_workers = redir_grp + elif puids is not None and grp_status == SmartSimStatus.STATUS_RUNNING: + logger.error("Cannot redirect workers: some PUIDS are missing") + + if started: + logger.debug(f"{started=}") + + for step_id in started: + try: + self._queued_steps.pop(step_id) + except KeyError: + logger.error( + f"Tried to allocate the same step twice, step id {step_id}" + ) + except Exception as e: + logger.error(e) + + def _refresh_statuses(self) -> None: + self._heartbeat() + with self._queue_lock: + terminated = [] + for step_id in self._running_steps: + group_info = self._group_infos[step_id] + grp = group_info.process_group + if grp is None: + group_info.status = SmartSimStatus.STATUS_FAILED + group_info.return_codes = [-1] + elif group_info.status not in TERMINAL_STATUSES: + if grp.status == str(DragonStatus.RUNNING): + group_info.status = SmartSimStatus.STATUS_RUNNING + 
else: + puids = group_info.puids + if puids is not None and all( + puid is not None for puid in puids + ): + try: + group_info.return_codes = [ + dragon_process.Process(None, ident=puid).returncode + for puid in puids + ] + except (ValueError, TypeError) as e: + logger.error(e) + group_info.return_codes = [-1 for _ in puids] + else: + group_info.return_codes = [0] + if not group_info.status == SmartSimStatus.STATUS_CANCELLED: + group_info.status = ( + SmartSimStatus.STATUS_FAILED + if any(group_info.return_codes) + or grp.status == DragonStatus.ERROR + else SmartSimStatus.STATUS_COMPLETED + ) + + if group_info.status in TERMINAL_STATUSES: + terminated.append(step_id) + + if terminated: + logger.debug(f"{terminated=}") + + for step_id in terminated: + self._running_steps.remove(step_id) + self._completed_steps.append(step_id) + group_info = self._group_infos[step_id] + if group_info is not None: + for host in group_info.hosts: + logger.debug(f"Releasing host {host}") + try: + self._allocated_hosts.pop(host) + except KeyError: + logger.error(f"Tried to free a non-allocated host: {host}") + self._free_hosts.append(host) + group_info.process_group = None + group_info.redir_workers = None + + def _update_shutdown_status(self) -> None: + self._heartbeat() + with self._queue_lock: + self._can_shutdown |= ( + all( + grp_info.status in TERMINAL_STATUSES + and grp_info.process_group is None + and grp_info.redir_workers is None + for grp_info in self._group_infos.values() + ) + and self._shutdown_requested + ) + + def _should_print_status(self) -> bool: + if self.current_time - self._last_update_time > 10: + self._last_update_time = self.current_time + return True + return False + + def _update(self) -> None: + self._stop_steps() + self._start_steps() + self._refresh_statuses() + self._update_shutdown_status() + + def _kill_all_running_jobs(self) -> None: + with self._queue_lock: + for step_id, group_info in self._group_infos.items(): + if group_info.status not in 
TERMINAL_STATUSES: + self._stop_requests.append(DragonStopRequest(step_id=step_id)) + + def update(self) -> None: + """Update internal data structures, queues, and job statuses""" + logger.debug("Dragon Backend update thread started") + while not self.should_shutdown: + try: + self._update() + time.sleep(0.1) + except Exception as e: + logger.error(e) + if self._should_print_status(): + try: + logger.debug(str(self)) + except ValueError as e: + logger.error(e) + + logger.debug("Dragon Backend update thread stopping") + + @functools.singledispatchmethod + # Deliberately suppressing errors so that overloads have the same signature + # pylint: disable-next=no-self-use + def process_request(self, request: DragonRequest) -> DragonResponse: + """Process an incoming DragonRequest""" + raise TypeError(f"Unsure how to process a `{type(request)}` request") + + @process_request.register + def _(self, request: DragonRunRequest) -> DragonRunResponse: + step_id = next(self._step_ids) + with self._queue_lock: + honorable, err = self._can_honor(request) + if not honorable: + self._group_infos[step_id] = ProcessGroupInfo( + status=SmartSimStatus.STATUS_FAILED, return_codes=[-1] + ) + else: + self._queued_steps[step_id] = request + self._group_infos[step_id] = ProcessGroupInfo( + status=SmartSimStatus.STATUS_NEVER_STARTED + ) + return DragonRunResponse(step_id=step_id, error_message=err) + + @process_request.register + def _(self, request: DragonUpdateStatusRequest) -> DragonUpdateStatusResponse: + with self._queue_lock: + return DragonUpdateStatusResponse( + statuses={ + step_id: self._group_infos[step_id].smartsim_info + for step_id in request.step_ids + if step_id in self._group_infos + } + ) + + @process_request.register + def _(self, request: DragonStopRequest) -> DragonStopResponse: + with self._queue_lock: + self._stop_requests.append(request) + return DragonStopResponse() + + @process_request.register + # Deliberately suppressing errors so that overloads have the same 
signature + # pylint: disable-next=no-self-use,unused-argument + def _(self, request: DragonHandshakeRequest) -> DragonHandshakeResponse: + return DragonHandshakeResponse(dragon_pid=self._pid) + + @process_request.register + # Deliberately suppressing errors so that overloads have the same signature + # pylint: disable-next=no-self-use,unused-argument + def _(self, request: DragonShutdownRequest) -> DragonShutdownResponse: + self._shutdown_requested = True + self._update_shutdown_status() + if request.immediate: + self._kill_all_running_jobs() + self._frontend_shutdown = request.frontend_shutdown + return DragonShutdownResponse() + + +class DragonBackendView: + def __init__(self, backend: DragonBackend): + self._backend = backend + + @property + def host_desc(self) -> str: + hosts = self._backend.hosts + num_hosts = len(hosts) + host_string = str(num_hosts) + (" hosts" if num_hosts != 1 else " host") + return f"{host_string} available for execution: {hosts}" + + @staticmethod + def _proc_group_info_table_line( + step_id: str, proc_group_info: ProcessGroupInfo + ) -> t.List[str]: + table_line = [step_id, f"{proc_group_info.status.value}"] + + if proc_group_info.hosts is not None: + table_line.append(f"{','.join(proc_group_info.hosts)}") + else: + table_line.append("") + + if proc_group_info.return_codes is not None: + table_line.append( + f"{','.join(str(ret) for ret in proc_group_info.return_codes)}" + ) + else: + table_line.append("") + + if proc_group_info.puids is not None: + table_line.append(f"{len(proc_group_info.puids)}") + else: + table_line.append("") + + return table_line + + @property + def step_table(self) -> str: + """Table representation of all jobs which have been started on the server.""" + headers = ["Step", "Status", "Hosts", "Return codes", "Num procs"] + + group_infos = self._backend.group_infos + + colalign = ( + ["left", "left", "left", "center", "center"] + if len(group_infos) > 0 + else None + ) + values = [ + 
self._proc_group_info_table_line(step, group_info) + for step, group_info in group_infos.items() + ] + + return tabulate( + values, + headers, + disable_numparse=True, + tablefmt="github", + colalign=colalign, + ) + + @property + def host_table(self) -> str: + """Table representation of current state of nodes available + + in the allocation. + """ + headers = ["Host", "Status"] + hosts = self._backend.hosts + free_hosts = self._backend.free_hosts + + def _host_table_line(host: str) -> list[str]: + return [host, "Free" if host in free_hosts else "Busy"] + + colalign = ["left", "center"] if len(hosts) > 0 else None + values = [_host_table_line(host) for host in hosts] + + return tabulate( + values, headers, disable_numparse=True, tablefmt="github", colalign=colalign + ) diff --git a/smartsim/_core/launcher/dragon/dragonConnector.py b/smartsim/_core/launcher/dragon/dragonConnector.py new file mode 100644 index 000000000..0cd68c24e --- /dev/null +++ b/smartsim/_core/launcher/dragon/dragonConnector.py @@ -0,0 +1,532 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +import atexit +import fileinput +import itertools +import json +import os +import subprocess +import sys +import typing as t +from collections import defaultdict +from pathlib import Path +from threading import RLock + +import psutil +import zmq +import zmq.auth.thread + +from ...._core.launcher.dragon import dragonSockets +from ....error.errors import SmartSimError +from ....log import get_logger +from ...config import get_config +from ...schemas import ( + DragonBootstrapRequest, + DragonBootstrapResponse, + DragonHandshakeRequest, + DragonHandshakeResponse, + DragonRequest, + DragonResponse, + DragonShutdownRequest, +) +from ...utils.network import find_free_port, get_best_interface_and_address + +logger = get_logger(__name__) + +_SchemaT = t.TypeVar("_SchemaT", bound=t.Union[DragonRequest, DragonResponse]) + +DRG_LOCK = RLock() + + +class DragonConnector: + """This class encapsulates the functionality needed + to start a Dragon server and communicate with it. 
+ """ + + def __init__(self) -> None: + self._context: zmq.Context[t.Any] = zmq.Context.instance() + self._context.setsockopt(zmq.REQ_CORRELATE, 1) + self._context.setsockopt(zmq.REQ_RELAXED, 1) + self._authenticator: t.Optional[zmq.auth.thread.ThreadAuthenticator] = None + config = get_config() + self._reset_timeout(config.dragon_server_timeout) + self._dragon_head_socket: t.Optional[zmq.Socket[t.Any]] = None + self._dragon_head_process: t.Optional[subprocess.Popen[bytes]] = None + # Returned by dragon head, useful if shutdown is to be requested + # but process was started by another connector + self._dragon_head_pid: t.Optional[int] = None + self._dragon_server_path = config.dragon_server_path + logger.debug(f"Dragon Server path was set to {self._dragon_server_path}") + self._env_vars: t.Dict[str, str] = {} + if self._dragon_server_path is None: + raise SmartSimError( + "DragonConnector could not find the dragon server path. " + "This should not happen if the Connector was started by an " + "experiment.\nIf the DragonConnector was started manually, " + "then the environment variable SMARTSIM_DRAGON_SERVER_PATH " + "should be set to an existing directory." 
+ ) + + @property + def is_connected(self) -> bool: + """Whether the Connector established a connection to the server + + :return: True if connected + """ + return self._dragon_head_socket is not None + + @property + def can_monitor(self) -> bool: + """Whether the Connector knows the PID of the dragon server head process + and can monitor its status + + :return: True if the server can be monitored""" + return self._dragon_head_pid is not None + + def _handshake(self, address: str) -> None: + self._dragon_head_socket = dragonSockets.get_secure_socket( + self._context, zmq.REQ, False + ) + self._dragon_head_socket.connect(address) + try: + dragon_handshake = _assert_schema_type( + self.send_request(DragonHandshakeRequest()), DragonHandshakeResponse + ) + self._dragon_head_pid = dragon_handshake.dragon_pid + logger.debug( + f"Successful handshake with Dragon server at address {address}" + ) + except (zmq.ZMQError, zmq.Again) as e: + logger.debug(e) + self._dragon_head_socket.close() + self._dragon_head_socket = None + + raise SmartSimError( + f"Unsuccessful handshake with Dragon server at address {address}" + ) from e + + def _reset_timeout(self, timeout: int = get_config().dragon_server_timeout) -> None: + self._context.setsockopt(zmq.SNDTIMEO, value=timeout) + self._context.setsockopt(zmq.RCVTIMEO, value=timeout) + if self._authenticator is not None and self._authenticator.thread is not None: + try: + self._authenticator.thread.authenticator.zap_socket.setsockopt( + zmq.SNDTIMEO, timeout + ) + self._authenticator.thread.authenticator.zap_socket.setsockopt( + zmq.RCVTIMEO, timeout + ) + except zmq.ZMQError: + pass + + def ensure_connected(self) -> None: + """Ensure that the Connector established a connection to the server + + If the Connector is not connected, attempt to connect and raise an error + on failure. 
+ + :raises SmartSimError: if connection cannot be established + """ + if not self.is_connected: + self.connect_to_dragon() + if not self.is_connected: + raise SmartSimError("Could not connect to Dragon server") + + def _get_new_authenticator( + self, timeout: int = get_config().dragon_server_timeout + ) -> None: + if self._authenticator is not None: + if self._authenticator.thread is not None: + try: + logger.debug("Closing ZAP socket") + self._authenticator.thread.authenticator.zap_socket.close() + except Exception as e: + logger.debug(f"Could not close ZAP socket, {e}") + try: + self._authenticator.stop() + except zmq.Again: + logger.debug("Could not stop authenticator") + try: + self._authenticator = dragonSockets.get_authenticator( + self._context, timeout + ) + return + except RuntimeError as e: + logger.error("Could not get authenticator") + raise e from None + + @staticmethod + def _get_dragon_log_level() -> str: + smartsim_to_dragon = defaultdict(lambda: "NONE") + smartsim_to_dragon["developer"] = "INFO" + return smartsim_to_dragon.get(get_config().log_level, "NONE") + + def _connect_to_existing_server(self, path: Path) -> None: + config = get_config() + dragon_config_log = path / config.dragon_log_filename + + if not dragon_config_log.is_file(): + return + + dragon_confs = self._parse_launched_dragon_server_info_from_files( + [dragon_config_log] + ) + logger.debug(dragon_confs) + + for dragon_conf in dragon_confs: + logger.debug( + "Found dragon server config file. Checking if the server" + f" is still up at address {dragon_conf['address']}." 
+ ) + try: + self._reset_timeout() + self._get_new_authenticator(-1) + self._handshake(dragon_conf["address"]) + except SmartSimError as e: + logger.error(e) + finally: + self._reset_timeout(config.dragon_server_timeout) + if self.is_connected: + logger.debug("Connected to existing Dragon server") + return + + def _start_connector_socket(self, socket_addr: str) -> zmq.Socket[t.Any]: + config = get_config() + connector_socket: t.Optional[zmq.Socket[t.Any]] = None + self._reset_timeout(config.dragon_server_startup_timeout) + self._get_new_authenticator(-1) + connector_socket = dragonSockets.get_secure_socket(self._context, zmq.REP, True) + logger.debug(f"Binding connector to {socket_addr}") + connector_socket.bind(socket_addr) + if connector_socket is None: + raise SmartSimError("Socket failed to initialize") + + return connector_socket + + def load_persisted_env(self) -> t.Dict[str, str]: + """Load key-value pairs from a .env file created during dragon installation + + :return: Key-value pairs stored in .env file""" + if self._env_vars: + # use previously loaded env vars. 
+ return self._env_vars + + config = get_config() + + if not config.dragon_dotenv.exists(): + self._env_vars = {} + return self._env_vars + + with open(config.dragon_dotenv, encoding="utf-8") as dot_env: + for kvp in dot_env.readlines(): + split = kvp.strip().split("=", maxsplit=1) + key, value = split[0], split[-1] + self._env_vars[key] = value + + return self._env_vars + + def merge_persisted_env(self, current_env: t.Dict[str, str]) -> t.Dict[str, str]: + """Combine the current environment variable set with the dragon .env by adding + Dragon-specific values and prepending any new values to existing keys + + :param current_env: Environment which has to be merged with .env variables + :return: Merged environment + """ + # ensure we start w/a complete env from current env state + merged_env: t.Dict[str, str] = {**current_env} + + # copy all the values for dragon straight into merged_env + merged_env.update( + {k: v for k, v in self._env_vars.items() if k.startswith("DRAGON")} + ) + + # prepend dragon env updates into existing env vars + for key, value in self._env_vars.items(): + if not key.startswith("DRAGON"): + if current_value := current_env.get(key, None): + # when a key is not dragon specific, don't overwrite the current + # value. instead, prepend the value dragon needs to/current env + value = f"{value}:{current_value}" + merged_env[key] = value + return merged_env + + def connect_to_dragon(self) -> None: + """Connect to Dragon server + + :raises SmartSimError: If connection cannot be established + """ + config = get_config() + with DRG_LOCK: + # TODO use manager instead + if self.is_connected: + return + if self._dragon_server_path is None: + raise SmartSimError("Path to Dragon server not set.") + + logger.info( + "Establishing connection with Dragon server or starting a new one..." 
+ ) + + path = _resolve_dragon_path(self._dragon_server_path) + + self._connect_to_existing_server(path) + if self.is_connected: + return + + path.mkdir(parents=True, exist_ok=True) + + local_address = get_best_interface_and_address().address + if local_address is None: + # TODO parse output file + raise SmartSimError( + "Could not determine SmartSim's local address, " + "the Dragon server could not be started." + ) + # find first available port >= 5995 + port = find_free_port(start=5995) + socket_addr = f"tcp://{local_address}:{port}" + connector_socket = self._start_connector_socket(socket_addr) + + cmd = [ + "dragon", + "-t", + config.dragon_transport, + "-l", + DragonConnector._get_dragon_log_level(), + sys.executable, + "-m", + "smartsim._core.entrypoints.dragon", + "+launching_address", + socket_addr, + ] + + dragon_out_file = path / "dragon_head.out" + dragon_err_file = path / "dragon_head.err" + + self.load_persisted_env() + merged_env = self.merge_persisted_env(os.environ.copy()) + merged_env.update({"PYTHONUNBUFFERED": "1"}) + + with ( + open(dragon_out_file, "w", encoding="utf-8") as dragon_out, + open(dragon_err_file, "w", encoding="utf-8") as dragon_err, + ): + logger.debug(f"Starting Dragon environment: {' '.join(cmd)}") + + # pylint: disable-next=consider-using-with + self._dragon_head_process = subprocess.Popen( + args=cmd, + bufsize=0, + stderr=dragon_err.fileno(), + stdout=dragon_out.fileno(), + cwd=path, + shell=False, + env=merged_env, + start_new_session=True, + ) + + server = dragonSockets.as_server(connector_socket) + logger.debug(f"Listening to {socket_addr}") + request = _assert_schema_type(server.recv(), DragonBootstrapRequest) + server.send( + DragonBootstrapResponse(dragon_pid=self._dragon_head_process.pid) + ) + connector_socket.close() + logger.debug(f"Connecting to {request.address}") + self._reset_timeout(config.dragon_server_timeout) + self._handshake(request.address) + + # Only the Connector which started the server is + # 
responsible of it, that's why we register the + # cleanup in this code branch. + # The cleanup function should not have references + # to this object to avoid Garbage Collector lockup + server_socket = self._dragon_head_socket + server_process_pid = self._dragon_head_process.pid + + if server_socket is not None and self._dragon_head_process is not None: + atexit.register( + _dragon_cleanup, + server_socket=server_socket, + server_process_pid=server_process_pid, + server_authenticator=self._authenticator, + ) + elif self._dragon_head_process is not None: + self._dragon_head_process.wait(1.0) + if self._dragon_head_process.stdout: + for line in iter(self._dragon_head_process.stdout.readline, b""): + logger.info(line.decode("utf-8").rstrip()) + if self._dragon_head_process.stderr: + for line in iter(self._dragon_head_process.stderr.readline, b""): + logger.warning(line.decode("utf-8").rstrip()) + logger.warning(self._dragon_head_process.returncode) + else: + logger.warning("Could not start Dragon server as subprocess") + + def cleanup(self) -> None: + """Shut down Dragon server and authenticator thread""" + if self._dragon_head_socket is not None and self._dragon_head_pid is not None: + _dragon_cleanup( + server_socket=self._dragon_head_socket, + server_process_pid=self._dragon_head_pid, + server_authenticator=self._authenticator, + ) + self._dragon_head_socket = None + self._dragon_head_pid = None + self._authenticator = None + + def send_request(self, request: DragonRequest, flags: int = 0) -> DragonResponse: + """Send a request to the Dragon server using a secure socket + + :param request: The request to send + :param flags: 0MQ flags, defaults to 0 + :raises SmartSimError: If not connected to Dragon server + :return: Response from server + """ + self.ensure_connected() + if (socket := self._dragon_head_socket) is None: + raise SmartSimError("Not connected to Dragon") + return self._send_req_with_socket(socket, request, flags) + + @staticmethod + def 
_parse_launched_dragon_server_info_from_iterable( + stream: t.Iterable[str], num_dragon_envs: t.Optional[int] = None + ) -> t.List[t.Dict[str, str]]: + lines = (line.strip() for line in stream) + lines = (line for line in lines if line) + tokenized = (line.split(maxsplit=1) for line in lines) + tokenized = (tokens for tokens in tokenized if len(tokens) > 1) + dragon_env_jsons = ( + config_dict + for first, config_dict in tokenized + if "DRAGON_SERVER_CONFIG" in first + ) + dragon_envs = (json.loads(config_dict) for config_dict in dragon_env_jsons) + + dragon_envs = ( + dragon_env for dragon_env in dragon_envs if "address" in dragon_env + ) + + if num_dragon_envs: + sliced_dragon_envs = itertools.islice(dragon_envs, num_dragon_envs) + return list(sliced_dragon_envs) + return list(dragon_envs) + + @classmethod + def _parse_launched_dragon_server_info_from_files( + cls, + file_paths: t.List[t.Union[str, "os.PathLike[str]"]], + num_dragon_envs: t.Optional[int] = None, + ) -> t.List[t.Dict[str, str]]: + with fileinput.FileInput(file_paths) as ifstream: + dragon_envs = cls._parse_launched_dragon_server_info_from_iterable( + ifstream, num_dragon_envs + ) + + return dragon_envs + + @staticmethod + def _send_req_with_socket( + socket: zmq.Socket[t.Any], + request: DragonRequest, + send_flags: int = 0, + recv_flags: int = 0, + ) -> DragonResponse: + client = dragonSockets.as_client(socket) + with DRG_LOCK: + logger.debug(f"Sending {type(request).__name__}: {request}") + client.send(request, send_flags) + response = client.recv(flags=recv_flags) + + logger.debug(f"Received {type(response).__name__}: {response}") + return response + + +def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: + if not isinstance(obj, typ): + raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}") + return obj + + +def _dragon_cleanup( + server_socket: t.Optional[zmq.Socket[t.Any]] = None, + server_process_pid: t.Optional[int] = 0, + server_authenticator: 
t.Optional[zmq.auth.thread.ThreadAuthenticator] = None, +) -> None: + """Clean up resources used by the launcher. + :param server_socket: (optional) Socket used to connect to dragon environment + :param server_process_pid: (optional) Process ID of the dragon entrypoint + :param server_authenticator: (optional) Authenticator used to secure sockets + """ + try: + if server_socket is not None: + print("Sending shutdown request to dragon environment") + # pylint: disable-next=protected-access + DragonConnector._send_req_with_socket( + server_socket, DragonShutdownRequest(), recv_flags=zmq.NOBLOCK + ) + except zmq.error.ZMQError as e: + # Can't use the logger as I/O file may be closed + if not isinstance(e, zmq.Again): + print("Could not send shutdown request to dragon server") + print(f"ZMQ error: {e}", flush=True) + finally: + print("Sending shutdown request is complete") + + if server_process_pid and psutil.pid_exists(server_process_pid): + try: + _, retcode = os.waitpid(server_process_pid, 0) + print( + f"Dragon server process shutdown is complete, return code {retcode}", + flush=True, + ) + except Exception as e: + logger.debug(e) + + try: + if server_authenticator is not None and server_authenticator.is_alive(): + print("Shutting down ZMQ authenticator") + server_authenticator.stop() + except Exception: + print("Authenticator shutdown error") + else: + print("Authenticator shutdown is complete") + + +def _resolve_dragon_path(fallback: t.Union[str, "os.PathLike[str]"]) -> Path: + dragon_server_path = get_config().dragon_server_path or os.path.join( + fallback, ".smartsim", "dragon" + ) + dragon_server_paths = dragon_server_path.split(":") + if len(dragon_server_paths) > 1: + logger.warning( + "Multiple dragon servers not supported, " + "will connect to (or start) first server in list." 
+ ) + return Path(dragon_server_paths[0]) diff --git a/smartsim/_core/launcher/dragon/dragonLauncher.py b/smartsim/_core/launcher/dragon/dragonLauncher.py new file mode 100644 index 000000000..17b47e309 --- /dev/null +++ b/smartsim/_core/launcher/dragon/dragonLauncher.py @@ -0,0 +1,321 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from __future__ import annotations + +import os +import typing as t + +from ...._core.launcher.stepMapping import StepMap +from ....error import LauncherError, SmartSimError +from ....log import get_logger +from ....settings import ( + DragonRunSettings, + QsubBatchSettings, + RunSettings, + SbatchSettings, + SettingsBase, +) +from ....status import SmartSimStatus +from ...schemas import ( + DragonRunRequest, + DragonRunResponse, + DragonStopRequest, + DragonStopResponse, + DragonUpdateStatusRequest, + DragonUpdateStatusResponse, +) +from ..launcher import WLMLauncher +from ..pbs.pbsLauncher import PBSLauncher +from ..slurm.slurmLauncher import SlurmLauncher +from ..step import DragonBatchStep, DragonStep, LocalStep, Step +from ..stepInfo import StepInfo +from .dragonConnector import DragonConnector, _SchemaT + +logger = get_logger(__name__) + + +class DragonLauncher(WLMLauncher): + """This class encapsulates the functionality needed + to launch jobs on systems that use Dragon on top of a workload manager. + + All WLM launchers are capable of launching managed and unmanaged + jobs. Managed jobs are queried through interaction with with WLM, + in this case the Dragon server. Unmanaged jobs are held in the TaskManager + and are managed through references to their launching process ID + i.e. a psutil.Popen object. + Batch Jobs are routed to either Slurm or PBS and their step ID + is stored, prefixed with the name of the scheduler, to allow + the Job Manager to interact with it. 
+ """ + + def __init__(self) -> None: + super().__init__() + self._connector = DragonConnector() + """Connector used to start and interact with the Dragon server""" + self._slurm_launcher = SlurmLauncher() + """Slurm sub-launcher, used only for batch jobs""" + self._pbs_launcher = PBSLauncher() + """PBS sub-launcher, used only for batch jobs""" + + @property + def is_connected(self) -> bool: + return self._connector.is_connected + + def cleanup(self) -> None: + self._connector.cleanup() + + # RunSettings types supported by this launcher + @property + def supported_rs(self) -> t.Dict[t.Type[SettingsBase], t.Type[Step]]: + # RunSettings types supported by this launcher + return { + DragonRunSettings: DragonStep, + SbatchSettings: DragonBatchStep, + QsubBatchSettings: DragonBatchStep, + RunSettings: LocalStep, + } + + def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: + super().add_step_to_mapping_table(name, step_map) + + if step_map.step_id is None: + return + sublauncher: t.Optional[t.Union[SlurmLauncher, PBSLauncher]] = None + if step_map.step_id.startswith("SLURM-"): + sublauncher = self._slurm_launcher + elif step_map.step_id.startswith("PBS-"): + sublauncher = self._pbs_launcher + else: + return + + sublauncher_step_map = StepMap( + step_id=DragonLauncher._unprefix_step_id(step_map.step_id), + task_id=step_map.task_id, + managed=step_map.managed, + ) + sublauncher.add_step_to_mapping_table(name, sublauncher_step_map) + + def run(self, step: Step) -> t.Optional[str]: + """Run a job step through Slurm + + :param step: a job step instance + :raises LauncherError: if launch fails + :return: job step id if job is managed + """ + + if not self.task_manager.actively_monitoring: + self.task_manager.start() + + step_id = None + task_id = None + + cmd = step.get_launch_cmd() + out, err = step.get_output_files() + + if isinstance(step, DragonBatchStep): + # wait for batch step to submit successfully + sublauncher_step_id: t.Optional[str] = None + 
return_code, out, err = self.task_manager.start_and_wait(cmd, step.cwd) + if return_code != 0: + raise LauncherError(f"Sbatch submission failed\n {out}\n {err}") + if out: + sublauncher_step_id = out.strip() + logger.debug( + f"Gleaned batch job id: {sublauncher_step_id} for {step.name}" + ) + + if sublauncher_step_id is None: + raise SmartSimError("Could not get step id for batch step") + + if isinstance(step.batch_settings, SbatchSettings): + self._slurm_launcher.step_mapping.add( + step.name, sublauncher_step_id, task_id, step.managed + ) + step_id = "SLURM-" + sublauncher_step_id + elif isinstance(step.batch_settings, QsubBatchSettings): + self._pbs_launcher.step_mapping.add( + step.name, sublauncher_step_id, task_id, step.managed + ) + step_id = "PBS-" + sublauncher_step_id + elif isinstance(step, DragonStep): + run_args = step.run_settings.run_args + req_env = step.run_settings.env_vars + self._connector.load_persisted_env() + merged_env = self._connector.merge_persisted_env(os.environ.copy()) + nodes = int(run_args.get("nodes", None) or 1) + tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + response = _assert_schema_type( + self._connector.send_request( + DragonRunRequest( + exe=cmd[0], + exe_args=cmd[1:], + path=step.cwd, + name=step.name, + nodes=nodes, + tasks_per_node=tasks_per_node, + env=req_env, + current_env=merged_env, + output_file=out, + error_file=err, + ) + ), + DragonRunResponse, + ) + step_id = str(response.step_id) + else: + # pylint: disable-next=consider-using-with + out_strm = open(out, "w+", encoding="utf-8") + # pylint: disable-next=consider-using-with + err_strm = open(err, "w+", encoding="utf-8") + task_id = self.task_manager.start_task( + cmd, step.cwd, step.env, out=out_strm.fileno(), err=err_strm.fileno() + ) + step.managed = False + + self.step_mapping.add(step.name, step_id, task_id, step.managed) + + return step_id + + def stop(self, step_name: str) -> StepInfo: + """Step a job step + + :param step_name: name of 
the job to stop + :return: update for job due to cancel + """ + + stepmap = self.step_mapping[step_name] + step_id = str(stepmap.step_id) + + if step_id.startswith("SLURM-"): + return self._slurm_launcher.stop(step_name) + + if step_id.startswith("PBS-"): + return self._pbs_launcher.stop(step_name) + + _assert_schema_type( + self._connector.send_request(DragonStopRequest(step_id=step_id)), + DragonStopResponse, + ) + + _, step_info = self.get_step_update([step_name])[0] + if not step_info: + raise LauncherError(f"Could not get step_info for job step {step_name}") + + step_info.status = ( + SmartSimStatus.STATUS_CANCELLED # set status to cancelled instead of failed + ) + step_info.launcher_status = str(SmartSimStatus.STATUS_CANCELLED) + return step_info + + @staticmethod + def _unprefix_step_id(step_id: str) -> str: + return step_id.split("-", maxsplit=1)[1] + + def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: + """Get step updates for Dragon-managed jobs + + :param step_ids: list of job step ids + :return: list of updates for managed jobs + """ + + step_id_updates: dict[str, StepInfo] = {} + + dragon_step_ids: t.List[str] = [] + slurm_step_ids: t.List[str] = [] + pbs_step_ids: t.List[str] = [] + for step_id in step_ids: + if step_id.startswith("SLURM-"): + slurm_step_ids.append(step_id) + elif step_id.startswith("PBS-"): + pbs_step_ids.append(step_id) + else: + dragon_step_ids.append(step_id) + + if slurm_step_ids: + # pylint: disable-next=protected-access + slurm_updates = self._slurm_launcher._get_managed_step_update( + [ + DragonLauncher._unprefix_step_id(step_id) + for step_id in slurm_step_ids + ] + ) + step_id_updates.update(dict(zip(slurm_step_ids, slurm_updates))) + + if pbs_step_ids: + # pylint: disable-next=protected-access + pbs_updates = self._pbs_launcher._get_managed_step_update( + [DragonLauncher._unprefix_step_id(step_id) for step_id in pbs_step_ids] + ) + step_id_updates.update(dict(zip(pbs_step_ids, pbs_updates))) + + 
if dragon_step_ids: + response = _assert_schema_type( + self._connector.send_request( + DragonUpdateStatusRequest(step_ids=dragon_step_ids) + ), + DragonUpdateStatusResponse, + ) + + for step_id in step_ids: + if step_id not in response.statuses: + msg = "Missing step id update from Dragon launcher." + if response.error_message is not None: + msg += "\nDragon backend reported following error: " + msg += response.error_message + logger.error(msg) + info = StepInfo( + SmartSimStatus.STATUS_FAILED, + SmartSimStatus.STATUS_FAILED.value, + -1, + ) + else: + status, ret_codes = response.statuses[step_id] + if ret_codes: + grp_ret_code = min(ret_codes) + if any(ret_codes): + _err_msg = ( + f"One or more processes failed for job {step_id} " + f"Return codes were: {ret_codes}" + ) + logger.error(_err_msg) + else: + grp_ret_code = None + info = StepInfo(status, status.value, grp_ret_code) + + step_id_updates[step_id] = info + + # Order matters as we return an ordered list of StepInfo objects + return [step_id_updates[step_id] for step_id in step_ids] + + def __str__(self) -> str: + return "Dragon" + + +def _assert_schema_type(obj: object, typ: t.Type[_SchemaT], /) -> _SchemaT: + if not isinstance(obj, typ): + raise TypeError(f"Expected schema of type `{typ}`, but got {type(obj)}") + return obj diff --git a/smartsim/_core/launcher/dragon/dragonSockets.py b/smartsim/_core/launcher/dragon/dragonSockets.py new file mode 100644 index 000000000..80acd61a2 --- /dev/null +++ b/smartsim/_core/launcher/dragon/dragonSockets.py @@ -0,0 +1,158 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +import zmq +import zmq.auth.thread + +from smartsim._core.config.config import get_config +from smartsim._core.schemas import dragonRequests as _dragonRequests +from smartsim._core.schemas import dragonResponses as _dragonResponses +from smartsim._core.schemas import utils as _utils +from smartsim._core.utils.security import KeyManager +from smartsim.log import get_logger + +if t.TYPE_CHECKING: + from zmq import Context + from zmq.sugar.socket import Socket + +logger = get_logger(__name__) + +AUTHENTICATOR: t.Optional["zmq.auth.thread.ThreadAuthenticator"] = None + + +def as_server( + socket: "Socket[t.Any]", +) -> _utils.SocketSchemaTranslator[ + _dragonResponses.DragonResponse, + _dragonRequests.DragonRequest, +]: + return _utils.SocketSchemaTranslator( + socket, _dragonResponses.response_registry, _dragonRequests.request_registry + ) + + +def as_client( + socket: "Socket[t.Any]", +) -> _utils.SocketSchemaTranslator[ + _dragonRequests.DragonRequest, + 
_dragonResponses.DragonResponse, +]: + return _utils.SocketSchemaTranslator( + socket, _dragonRequests.request_registry, _dragonResponses.response_registry + ) + + +def get_secure_socket( + context: "zmq.Context[t.Any]", + socket_type: int, + is_server: bool, +) -> "Socket[t.Any]": + """Create secured socket that consumes & produces encrypted messages + + :param context: ZMQ context object + :param socket_type: Type of ZMQ socket to create + :param is_server: Pass `True` to secure the socket as server. Pass `False` + to secure the socket as a client. + :returns: the secured socket prepared for sending encrypted messages + """ + config = get_config() + socket: "Socket[t.Any]" = context.socket(socket_type) + + key_manager = KeyManager(config, as_server=is_server, as_client=not is_server) + server_keys, client_keys = key_manager.get_keys() + logger.debug(f"Applying keys to socket: {server_keys}, {client_keys}") + + if is_server: + logger.debug("Configuring socket as server") + + # configure the server keys on the socket + socket.curve_secretkey = server_keys.private + socket.curve_publickey = server_keys.public + + socket.curve_server = True + else: + # configure client keys on the socket to encrypt outgoing messages + socket.curve_secretkey = client_keys.private + socket.curve_publickey = client_keys.public + + # set the server public key for decrypting incoming messages + socket.curve_serverkey = server_keys.public + return socket + + +def get_authenticator( + context: "zmq.Context[t.Any]", timeout: int = get_config().dragon_server_timeout +) -> "zmq.auth.thread.ThreadAuthenticator": + """Create an authenticator to handle encryption of ZMQ communications + + :param context: ZMQ context object + :returns: the activated `Authenticator` + """ + # pylint: disable-next=global-statement + global AUTHENTICATOR + + if AUTHENTICATOR is not None: + if AUTHENTICATOR.is_alive(): + return AUTHENTICATOR + try: + logger.debug("Stopping authenticator") + 
AUTHENTICATOR.thread.authenticator.zap_socket.close() + AUTHENTICATOR.thread.join(0.1) + AUTHENTICATOR = None + except Exception as e: + logger.debug(e) + finally: + logger.debug("Stopped authenticator") + + config = get_config() + + key_manager = KeyManager(config, as_client=True) + server_keys, client_keys = key_manager.get_keys() + logger.debug(f"Applying keys to authenticator: {server_keys}, {client_keys}") + + AUTHENTICATOR = zmq.auth.thread.ThreadAuthenticator(context, log=logger) + + ctx_sndtimeo = context.getsockopt(zmq.SNDTIMEO) + ctx_rcvtimeo = context.getsockopt(zmq.RCVTIMEO) + + AUTHENTICATOR.context.setsockopt(zmq.SNDTIMEO, timeout) + AUTHENTICATOR.context.setsockopt(zmq.RCVTIMEO, timeout) + AUTHENTICATOR.context.setsockopt(zmq.REQ_CORRELATE, 1) + AUTHENTICATOR.context.setsockopt(zmq.REQ_RELAXED, 1) + + # allow all keys in the client key directory to connect + logger.debug(f"Securing with client keys in {key_manager.client_keys_dir}") + AUTHENTICATOR.configure_curve(domain="*", location=key_manager.client_keys_dir) + + logger.debug("Starting authenticator") + AUTHENTICATOR.start() + + context.setsockopt(zmq.SNDTIMEO, ctx_sndtimeo) + context.setsockopt(zmq.RCVTIMEO, ctx_rcvtimeo) + + return AUTHENTICATOR diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index 80000c22f..1bf768065 100644 --- a/smartsim/_core/launcher/launcher.py +++ b/smartsim/_core/launcher/launcher.py @@ -27,6 +27,7 @@ import abc import typing as t +from ..._core.launcher.stepMapping import StepMap from ...error import AllocationError, LauncherError, SSUnsupportedError from ...settings import SettingsBase from .step import Step @@ -69,6 +70,15 @@ def run(self, step: Step) -> t.Optional[str]: def stop(self, step_name: str) -> StepInfo: raise NotImplementedError + def add_step_to_mapping_table(self, name: str, step_map: StepMap) -> None: + """Add a StepMap to the Launcher step mapping table + making it monitor the step. 
+ + :param name: name of step to be added + :param step_map: step map of added step + """ + self.step_mapping[name] = step_map + class WLMLauncher(Launcher): # cov-wlm """The base class for any Launcher that utilizes workload @@ -94,15 +104,11 @@ def create_step( """Create a WLM job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param step_settings: batch or run settings for entity - :type step_settings: BatchSettings | RunSettings :raises SSUnsupportedError: if batch or run settings type isnt supported :raises LauncherError: if step creation fails :return: step instance - :rtype: Step """ try: step_class = self.supported_rs[type(step_settings)] @@ -129,9 +135,7 @@ def get_step_update( """Get update for a list of job steps :param step_names: list of job steps to get updates for - :type step_names: list[str] :return: list of name, job update tuples - :rtype: list[(str, StepInfo)] """ updates: t.List[t.Tuple[str, t.Union[StepInfo, None]]] = [] @@ -162,9 +166,7 @@ def _get_unmanaged_step_update( """Get step updates for Popen managed jobs :param task_ids: task id to check - :type task_ids: list[str] :return: list of step updates - :rtype: list[StepInfo] """ updates = [] for task_id in task_ids: diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index 96778ec0d..ffcb84f23 100644 --- a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -59,9 +59,7 @@ def get_step_update( """Get status updates of each job step name provided :param step_names: list of step_names - :type step_names: list[str] :return: list of tuples for update - :rtype: list[tuple[str, StepInfo | None]] """ # step ids are process ids of the tasks # as there is no WLM intermediary @@ -78,9 +76,7 @@ def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: """Return the address of nodes assigned to the step :param step_names: list of step_names - 
:type step_names: list[str] :return: list of node addresses - :rtype: list[list[str]] TODO: Use socket to find the actual Lo address? """ @@ -92,9 +88,7 @@ def run(self, step: Step) -> str: files will be written to the entity path. :param step: LocalStep instance to run - :type step: LocalStep :return: task_id of the newly created step - :rtype: str """ if not self.task_manager.actively_monitoring: self.task_manager.start() @@ -118,9 +112,7 @@ def stop(self, step_name: str) -> UnmanagedStepInfo: """Stop a job step :param step_name: name of the step to be stopped - :type step_name: str :return: a UnmanagedStepInfo instance - :rtype: UnmanagedStepInfo """ # step_id is task_id for local. Naming for consistency step_id = self.step_mapping[step_name].task_id diff --git a/smartsim/_core/launcher/lsf/lsfCommands.py b/smartsim/_core/launcher/lsf/lsfCommands.py index d6d0ee031..cb92587c1 100644 --- a/smartsim/_core/launcher/lsf/lsfCommands.py +++ b/smartsim/_core/launcher/lsf/lsfCommands.py @@ -33,7 +33,6 @@ def bjobs(args: t.List[str]) -> t.Tuple[str, str]: """Calls LSF bjobs with args :param args: List of command arguments - :type args: List of str :returns: Output and error of bjobs """ cmd = ["bjobs"] + args @@ -47,9 +46,7 @@ def bkill(args: t.List[str]) -> t.Tuple[int, str, str]: returncode is also supplied in this function. :param args: list of command arguments - :type args: list of str :return: returncode, output and error - :rtype: (int, str, str) """ cmd = ["bkill"] + args returncode, out, error = execute_cmd(cmd) @@ -62,9 +59,7 @@ def jskill(args: t.List[str]) -> t.Tuple[int, str, str]: returncode is also supplied in this function. 
:param args: list of command arguments - :type args: list of str :return: returncode, output and error - :rtype: (int, str, str) """ cmd = ["jskill"] + args @@ -76,9 +71,7 @@ def jslist(args: t.List[str]) -> t.Tuple[str, str]: """Calls LSF jslist with args :param args: List of command arguments - :type args: List of str :returns: Output and error of jslist - :rtype: (str, str) """ cmd = ["jslist"] + args _, out, err = execute_cmd(cmd) diff --git a/smartsim/_core/launcher/lsf/lsfLauncher.py b/smartsim/_core/launcher/lsf/lsfLauncher.py index a8b6fafdb..e0ad808ed 100644 --- a/smartsim/_core/launcher/lsf/lsfLauncher.py +++ b/smartsim/_core/launcher/lsf/lsfLauncher.py @@ -38,7 +38,7 @@ RunSettings, SettingsBase, ) -from ....status import STATUS_CANCELLED, STATUS_COMPLETED +from ....status import SmartSimStatus from ...config import CONFIG from ..launcher import WLMLauncher from ..step import ( @@ -91,10 +91,8 @@ def run(self, step: Step) -> t.Optional[str]: """Run a job step through LSF :param step: a job step instance - :type step: Step :raises LauncherError: if launch fails :return: job step id if job is managed - :rtype: str """ if not self.task_manager.actively_monitoring: self.task_manager.start() @@ -134,9 +132,7 @@ def stop(self, step_name: str) -> StepInfo: """Stop/cancel a job step :param step_name: name of the job to stop - :type step_name: str :return: update for job due to cancel - :rtype: StepInfo """ stepmap = self.step_mapping[step_name] if stepmap.managed: @@ -155,7 +151,9 @@ def stop(self, step_name: str) -> StepInfo: if not step_info: raise LauncherError(f"Could not get step_info for job step {step_name}") - step_info.status = STATUS_CANCELLED # set status to cancelled instead of failed + step_info.status = ( + SmartSimStatus.STATUS_CANCELLED + ) # set status to cancelled instead of failed return step_info @staticmethod @@ -183,9 +181,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: """Get step updates for WLM managed 
jobs :param step_ids: list of job step ids - :type step_ids: list[str] :return: list of updates for managed jobs - :rtype: list[StepInfo] """ updates: t.List[StepInfo] = [] @@ -207,7 +203,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: # create LSFBatchStepInfo objects to return batch_info = LSFBatchStepInfo(stat, None) # account for case where job history is not logged by LSF - if batch_info.status == STATUS_COMPLETED: + if batch_info.status == SmartSimStatus.STATUS_COMPLETED: batch_info.returncode = 0 updates.append(batch_info) return updates diff --git a/smartsim/_core/launcher/lsf/lsfParser.py b/smartsim/_core/launcher/lsf/lsfParser.py index 33837d2bd..c3272fa99 100644 --- a/smartsim/_core/launcher/lsf/lsfParser.py +++ b/smartsim/_core/launcher/lsf/lsfParser.py @@ -31,9 +31,7 @@ def parse_bsub(output: str) -> str: """Parse bsub output and return job id. :param output: stdout of bsub command - :type output: str :returns: job id - :rtype: str """ for line in output.split("\n"): if line.startswith("Job"): @@ -45,9 +43,7 @@ def parse_bsub_error(output: str) -> str: """Parse and return error output of a failed bsub command. :param output: stderr of qsub command - :type output: str :returns: error message - :rtype: str """ # Search for first non-empty line error_lines = [] @@ -77,11 +73,8 @@ def parse_jslist_stepid(output: str, step_id: str) -> t.Tuple[str, t.Optional[st options to obtain step status :param output: output of the bjobs command - :type output: str :param step_id: allocation id or job step id - :type step_id: str :return: status and return code - :rtype: (str, str) """ result: t.Tuple[str, t.Optional[str]] = ("NOTFOUND", None) @@ -101,11 +94,8 @@ def parse_bjobs_jobid(output: str, job_id: str) -> str: to obtain job status. 
:param output: output of the bjobs command - :type output: str :param job_id: allocation id or job step id - :type job_id: str :return: status - :rtype: str """ result = "NOTFOUND" for line in output.split("\n"): @@ -126,9 +116,7 @@ def parse_bjobs_nodes(output: str) -> t.List[str]: a job in a list with the duplicates removed. :param output: output of the `bjobs -w` command - :type output: str :return: compute nodes of the allocation or job - :rtype: list of str """ nodes = [] @@ -146,11 +134,8 @@ def parse_max_step_id_from_jslist(output: str) -> t.Optional[str]: properly returned :param output: output bjobs - :type output: str :param step_name: the name of the step to query - :type step_name: str :return: the step_id - :rtype: str """ max_step_id = None diff --git a/smartsim/_core/launcher/pbs/pbsCommands.py b/smartsim/_core/launcher/pbs/pbsCommands.py index f738ef1f8..989af93be 100644 --- a/smartsim/_core/launcher/pbs/pbsCommands.py +++ b/smartsim/_core/launcher/pbs/pbsCommands.py @@ -33,7 +33,6 @@ def qstat(args: t.List[str]) -> t.Tuple[str, str]: """Calls PBS qstat with args :param args: List of command arguments - :type args: List of str :returns: Output and error of qstat """ cmd = ["qstat"] + args @@ -45,7 +44,6 @@ def qsub(args: t.List[str]) -> t.Tuple[str, str]: """Calls PBS qsub with args :param args: List of command arguments - :type args: List of str :returns: Output and error of salloc """ cmd = ["qsub"] + args @@ -59,9 +57,7 @@ def qdel(args: t.List[str]) -> t.Tuple[int, str, str]: returncode is also supplied in this function. 
:param args: list of command arguments - :type args: list of str :return: output and error - :rtype: str """ cmd = ["qdel"] + args returncode, out, error = execute_cmd(cmd) diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index 0b2f85e95..8c2099a8b 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -39,7 +39,7 @@ RunSettings, SettingsBase, ) -from ....status import STATUS_CANCELLED, STATUS_COMPLETED +from ....status import SmartSimStatus from ...config import CONFIG from ..launcher import WLMLauncher from ..step import ( @@ -53,7 +53,11 @@ ) from ..stepInfo import PBSStepInfo, StepInfo from .pbsCommands import qdel, qstat -from .pbsParser import parse_qstat_jobid, parse_step_id_from_qstat +from .pbsParser import ( + parse_qstat_jobid, + parse_qstat_jobid_json, + parse_step_id_from_qstat, +) logger = get_logger(__name__) @@ -88,10 +92,8 @@ def run(self, step: Step) -> t.Optional[str]: """Run a job step through PBSPro :param step: a job step instance - :type step: Step :raises LauncherError: if launch fails :return: job step id if job is managed - :rtype: str """ if not self.task_manager.actively_monitoring: self.task_manager.start() @@ -131,9 +133,7 @@ def stop(self, step_name: str) -> StepInfo: """Stop/cancel a job step :param step_name: name of the job to stop - :type step_name: str :return: update for job due to cancel - :rtype: StepInfo """ stepmap = self.step_mapping[step_name] if stepmap.managed: @@ -149,7 +149,9 @@ def stop(self, step_name: str) -> StepInfo: if not step_info: raise LauncherError(f"Could not get step_info for job step {step_name}") - step_info.status = STATUS_CANCELLED # set status to cancelled instead of failed + step_info.status = ( + SmartSimStatus.STATUS_CANCELLED + ) # set status to cancelled instead of failed return step_info @staticmethod @@ -178,20 +180,29 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> 
t.List[StepInfo]: """Get step updates for WLM managed jobs :param step_ids: list of job step ids - :type step_ids: list[str] :return: list of updates for managed jobs - :rtype: list[StepInfo] """ updates: t.List[StepInfo] = [] qstat_out, _ = qstat(step_ids) stats = [parse_qstat_jobid(qstat_out, str(step_id)) for step_id in step_ids] + + # Fallback: if all jobs result as NOTFOUND, it might be an issue + # with truncated names, we resort to json format which does not truncate + # information + if all(stat is None for stat in stats): + qstat_out_json, _ = qstat(["-f", "-F", "json"] + step_ids) + stats = [ + parse_qstat_jobid_json(qstat_out_json, str(step_id)) + for step_id in step_ids + ] + # create PBSStepInfo objects to return for stat, _ in zip(stats, step_ids): - info = PBSStepInfo(stat, None) + info = PBSStepInfo(stat or "NOTFOUND", None) # account for case where job history is not logged by PBS - if info.status == STATUS_COMPLETED: + if info.status == SmartSimStatus.STATUS_COMPLETED: info.returncode = 0 updates.append(info) diff --git a/smartsim/_core/launcher/pbs/pbsParser.py b/smartsim/_core/launcher/pbs/pbsParser.py index 362577595..6f8384b11 100644 --- a/smartsim/_core/launcher/pbs/pbsParser.py +++ b/smartsim/_core/launcher/pbs/pbsParser.py @@ -33,9 +33,7 @@ def parse_qsub(output: str) -> str: output is the job id itself. :param output: stdout of qsub command - :type output: str :returns: job id - :rtype: str """ return output @@ -44,9 +42,7 @@ def parse_qsub_error(output: str) -> str: """Parse and return error output of a failed qsub command. 
:param output: stderr of qsub command - :type output: str :returns: error message - :rtype: str """ # look for error first for line in output.split("\n"): @@ -61,18 +57,15 @@ def parse_qsub_error(output: str) -> str: return base_err -def parse_qstat_jobid(output: str, job_id: str) -> str: +def parse_qstat_jobid(output: str, job_id: str) -> t.Optional[str]: """Parse and return output of the qstat command run with options to obtain job status. :param output: output of the qstat command - :type output: str :param job_id: allocation id or job step id - :type job_id: str :return: status - :rtype: str """ - result = "NOTFOUND" + result = None for line in output.split("\n"): fields = line.split() if len(fields) >= 5: @@ -83,6 +76,25 @@ def parse_qstat_jobid(output: str, job_id: str) -> str: return result +def parse_qstat_jobid_json(output: str, job_id: str) -> t.Optional[str]: + """Parse and return output of the qstat command run with JSON options + to obtain job status. + + :param output: output of the qstat command in JSON format + :param job_id: allocation id or job step id + :return: status + """ + out_json = load_and_clean_json(output) + + if "Jobs" not in out_json: + return None + jobs: dict[str, t.Any] = out_json["Jobs"] + job: t.Optional[dict[str, t.Any]] = jobs.get(job_id, None) + if job is None: + return None + return str(job.get("job_state", None)) + + def parse_qstat_nodes(output: str) -> t.List[str]: """Parse and return the qstat command run with options to obtain node list. @@ -93,9 +105,7 @@ def parse_qstat_nodes(output: str) -> t.List[str]: The `output` parameter must be in JSON format. 
:param output: output of the qstat command in JSON format - :type output: str :return: compute nodes of the allocation or job - :rtype: list of str """ nodes: t.List[str] = [] out_json = load_and_clean_json(output) @@ -116,11 +126,8 @@ def parse_step_id_from_qstat(output: str, step_name: str) -> t.Optional[str]: """Parse and return the step id from a qstat command :param output: output qstat - :type output: str :param step_name: the name of the step to query - :type step_name: str :return: the step_id - :rtype: str """ step_id: t.Optional[str] = None out_json = load_and_clean_json(output) diff --git a/smartsim/_core/launcher/slurm/slurmCommands.py b/smartsim/_core/launcher/slurm/slurmCommands.py index 2e37f1d79..839826297 100644 --- a/smartsim/_core/launcher/slurm/slurmCommands.py +++ b/smartsim/_core/launcher/slurm/slurmCommands.py @@ -38,7 +38,6 @@ def sstat(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str] """Calls sstat with args :param args: List of command arguments - :type args: List of str :returns: Output and error of sstat """ _, out, err = _execute_slurm_cmd("sstat", args, raise_on_err=raise_on_err) @@ -49,7 +48,6 @@ def sacct(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str] """Calls sacct with args :param args: List of command arguments - :type args: List of str :returns: Output and error of sacct """ _, out, err = _execute_slurm_cmd("sacct", args, raise_on_err=raise_on_err) @@ -60,7 +58,6 @@ def salloc(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str """Calls slurm salloc with args :param args: List of command arguments - :type args: List of str :returns: Output and error of salloc """ _, out, err = _execute_slurm_cmd("salloc", args, raise_on_err=raise_on_err) @@ -71,7 +68,6 @@ def sinfo(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str] """Calls slurm sinfo with args :param args: List of command arguments - :type args: List of str :returns: Output and error of sinfo 
""" _, out, err = _execute_slurm_cmd("sinfo", args, raise_on_err=raise_on_err) @@ -82,7 +78,6 @@ def scontrol(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, s """Calls slurm scontrol with args :param args: List of command arguments - :type args: List of str :returns: Output and error of sinfo """ _, out, err = _execute_slurm_cmd("scontrol", args, raise_on_err=raise_on_err) @@ -95,9 +90,7 @@ def scancel(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[int, st returncode is also supplied in this function. :param args: list of command arguments - :type args: list of str :return: output and error - :rtype: str """ return _execute_slurm_cmd("scancel", args, raise_on_err=raise_on_err) diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index e939a63db..2e4102391 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -40,7 +40,7 @@ SettingsBase, SrunSettings, ) -from ....status import STATUS_CANCELLED +from ....status import SmartSimStatus from ...config import CONFIG from ..launcher import WLMLauncher from ..step import ( @@ -100,10 +100,8 @@ def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: would return nid00034 :param step_names: list of job step names - :type step_names: list[str] :raises LauncherError: if nodelist aquisition fails :return: list of hostnames - :rtype: list[str] """ _, step_ids = self.step_mapping.get_ids(step_names, managed=True) step_str = _create_step_id_str([val for val in step_ids if val is not None]) @@ -122,10 +120,8 @@ def run(self, step: Step) -> t.Optional[str]: """Run a job step through Slurm :param step: a job step instance - :type step: Step :raises LauncherError: if launch fails :return: job step id if job is managed - :rtype: str """ self.check_for_slurm() if not self.task_manager.actively_monitoring: @@ -175,9 +171,7 @@ def stop(self, step_name: str) -> StepInfo: 
"""Step a job step :param step_name: name of the job to stop - :type step_name: str :return: update for job due to cancel - :rtype: StepInfo """ stepmap = self.step_mapping[step_name] if stepmap.managed: @@ -218,7 +212,9 @@ def stop(self, step_name: str) -> StepInfo: if not step_info: raise LauncherError(f"Could not get step_info for job step {step_name}") - step_info.status = STATUS_CANCELLED # set status to cancelled instead of failed + step_info.status = ( + SmartSimStatus.STATUS_CANCELLED + ) # set status to cancelled instead of failed return step_info @staticmethod @@ -255,9 +251,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: """Get step updates for WLM managed jobs :param step_ids: list of job step ids - :type step_ids: list[str] :return: list of updates for managed jobs - :rtype: list[StepInfo] """ step_str = _create_step_id_str(step_ids) sacct_out, _ = sacct( diff --git a/smartsim/_core/launcher/slurm/slurmParser.py b/smartsim/_core/launcher/slurm/slurmParser.py index ede687eb6..4ec187f19 100644 --- a/smartsim/_core/launcher/slurm/slurmParser.py +++ b/smartsim/_core/launcher/slurm/slurmParser.py @@ -43,9 +43,7 @@ def parse_salloc_error(output: str) -> t.Optional[str]: """Parse and return error output of a failed salloc command :param output: stderr output of salloc command - :type output: str :return: error message - :rtype: str """ salloc = which("salloc") # look for error first @@ -73,10 +71,9 @@ def jobid_exact_match(parsed_id: str, job_id: str) -> bool: the prefix of another job_id, like 1 and 11 or 1.1 and 1.10. Works with job id or step id (i.e. with or without a '.' in the id) + :param parsed_id: the id read from the line - :type paserd_id: str :param job_id: the id to check for equality - :type job_id: str """ if "." 
in job_id: return parsed_id == job_id @@ -88,11 +85,8 @@ def parse_sacct(output: str, job_id: str) -> t.Tuple[str, t.Optional[str]]: """Parse and return output of the sacct command :param output: output of the sacct command - :type output: str :param job_id: allocation id or job step id - :type job_id: str :return: status and returncode - :rtype: tuple """ result: t.Tuple[str, t.Optional[str]] = ("PENDING", None) for line in output.split("\n"): @@ -113,9 +107,7 @@ def parse_sstat_nodes(output: str, job_id: str) -> t.List[str]: a job in a list with the duplicates removed. :param output: output of the sstat command - :type output: str :return: compute nodes of the allocation or job - :rtype: list of str """ nodes = [] for line in output.split("\n"): @@ -134,11 +126,8 @@ def parse_step_id_from_sacct(output: str, step_name: str) -> t.Optional[str]: :param output: output of sacct --noheader -p --format=jobname,jobid --job - :type output: str :param step_name: the name of the step to query - :type step_name: str :return: the step_id - :rtype: str """ step_id = None for line in output.split("\n"): diff --git a/smartsim/_core/launcher/step/__init__.py b/smartsim/_core/launcher/step/__init__.py index 663edb682..c492f3e97 100644 --- a/smartsim/_core/launcher/step/__init__.py +++ b/smartsim/_core/launcher/step/__init__.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from .alpsStep import AprunStep +from .dragonStep import DragonBatchStep, DragonStep from .localStep import LocalStep from .lsfStep import BsubBatchStep, JsrunStep from .mpiStep import MpiexecStep, MpirunStep, OrterunStep diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index 61ca5eee8..eb7903af9 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -42,11 +42,8 @@ def __init__(self, name: str, cwd: str, run_settings: AprunSettings) -> None: """Initialize a ALPS aprun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: AprunSettings """ super().__init__(name, cwd, run_settings) self.alloc: t.Optional[str] = None @@ -65,7 +62,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step :return: launch command - :rtype: list[str] """ aprun = self.run_settings.run_command if not aprun: @@ -122,7 +118,6 @@ def _build_exe(self) -> t.List[str]: """Build the executable for this step :return: executable list - :rtype: list[str] """ if self._get_mpmd(): return self._make_mpmd() diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py new file mode 100644 index 000000000..036a9e565 --- /dev/null +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -0,0 +1,248 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +import os +import shutil +import sys +import typing as t + +from ...._core.schemas.dragonRequests import DragonRunRequest, request_registry +from ....error.errors import SSUnsupportedError +from ....log import get_logger +from ....settings import ( + DragonRunSettings, + QsubBatchSettings, + SbatchSettings, + Singularity, +) +from .step import Step + +logger = get_logger(__name__) + + +class DragonStep(Step): + def __init__(self, name: str, cwd: str, run_settings: DragonRunSettings) -> None: + """Initialize a srun job step + + :param name: name of the entity to be launched + :param cwd: path to launch dir + :param run_settings: run settings for entity + """ + super().__init__(name, cwd, run_settings) + self.managed = True + + @property + def run_settings(self) -> DragonRunSettings: + return t.cast(DragonRunSettings, self.step_settings) + + def get_launch_cmd(self) -> t.List[str]: + """Get stringified version of request + needed to launch this step + + :return: launch 
command + """ + run_settings = self.run_settings + exe_cmd = [] + + if run_settings.colocated_db_settings: + # Replace the command with the entrypoint wrapper script + bash = shutil.which("bash") + if not bash: + raise RuntimeError("Could not find bash in PATH") + launch_script_path = self.get_colocated_launch_script() + exe_cmd += [bash, launch_script_path] + + if isinstance(run_settings.container, Singularity): + # pylint: disable-next=protected-access + exe_cmd += run_settings.container._container_cmds(self.cwd) + + exe_cmd += run_settings.exe + + exe_args = self._get_exe_args_list(run_settings) + + exe_cmd_and_args = exe_cmd + exe_args + + return exe_cmd_and_args + + @staticmethod + def _get_exe_args_list(run_setting: DragonRunSettings) -> t.List[str]: + """Convenience function to encapsulate checking the + runsettings.exe_args type to always return a list + """ + exe_args = run_setting.exe_args + args: t.List[str] = exe_args if isinstance(exe_args, list) else [exe_args] + return args + + +class DragonBatchStep(Step): + def __init__( + self, + name: str, + cwd: str, + batch_settings: t.Union[SbatchSettings, QsubBatchSettings], + ) -> None: + """Initialize a Slurm Sbatch step + + :param name: name of the entity to launch + :param cwd: path to launch dir + :param batch_settings: batch settings for entity + """ + super().__init__(name, cwd, batch_settings) + self.steps: t.List[Step] = [] + self.managed = True + self.batch_settings = batch_settings + self._request_file_name = "requests.json" + + def get_launch_cmd(self) -> t.List[str]: + """Get the launch command for the batch + + :return: launch command for the batch + """ + if isinstance(self.batch_settings, SbatchSettings): + script = self._write_sbatch_script() + return [self.batch_settings.batch_cmd, "--parsable", script] + if isinstance(self.batch_settings, QsubBatchSettings): + script = self._write_qsub_script() + return [self.batch_settings.batch_cmd, script] + + raise SSUnsupportedError( + "DragonBatchStep 
only support SbatchSettings and QsubBatchSettings" + ) + + def add_to_batch(self, step: Step) -> None: + """Add a job step to this batch + + :param step: a job step instance e.g. DragonStep + """ + self.steps.append(step) + logger.debug(f"Added step command to batch for {step.name}") + + @staticmethod + def _dragon_entrypoint_cmd(request_file: str) -> str: + """Return command needed to run the Dragon entrypoint""" + cmd = [ + sys.executable, + "-m", + "smartsim._core.entrypoints.dragon_client", + "+submit", + request_file, + ] + return " ".join(cmd) + + def _write_request_file(self) -> str: + """Write json file with requests to submit to Dragon server""" + request_file = self.get_step_file( + ending="json", script_name=self._request_file_name + ) + requests = [] + for step in self.steps: + run_settings = t.cast(DragonRunSettings, step.step_settings) + run_args = run_settings.run_args + env = run_settings.env_vars + nodes = int(run_args.get("nodes", None) or 1) + tasks_per_node = int(run_args.get("tasks-per-node", None) or 1) + + cmd = step.get_launch_cmd() + out, err = step.get_output_files() + request = DragonRunRequest( + exe=cmd[0], + exe_args=cmd[1:], + path=step.cwd, + name=step.name, + nodes=nodes, + tasks_per_node=tasks_per_node, + env=env, + current_env=os.environ, + output_file=out, + error_file=err, + ) + requests.append(request_registry.to_string(request)) + with open(request_file, "w", encoding="utf-8") as script_file: + script_file.write(json.dumps(requests)) + + return request_file + + def _write_sbatch_script(self) -> str: + """Write the PBS batch script + + :return: batch script path after writing + """ + batch_script = self.get_step_file(ending=".sh") + output, error = self.get_output_files() + request_file = self._write_request_file() + with open(batch_script, "w", encoding="utf-8") as script_file: + script_file.write("#!/bin/bash\n\n") + script_file.write(f"#SBATCH --output={output}\n") + script_file.write(f"#SBATCH --error={error}\n") + 
script_file.write(f"#SBATCH --job-name={self.name}\n") + + # add additional sbatch options + for opt in self.batch_settings.format_batch_args(): + script_file.write(f"#SBATCH {opt}\n") + + script_file.write( + f"#SBATCH --export=ALL,SMARTSIM_DRAGON_SERVER_PATH={self.cwd}," + "PYTHONUNBUFFERED=1\n" + ) + + for cmd in self.batch_settings.preamble: + script_file.write(f"{cmd}\n") + + script_file.write( + DragonBatchStep._dragon_entrypoint_cmd(request_file) + "\n" + ) + return batch_script + + def _write_qsub_script(self) -> str: + """Write the Slurm batch script + + :return: batch script path after writing + """ + batch_script = self.get_step_file(ending=".sh") + output, error = self.get_output_files() + request_file = self._write_request_file() + with open(batch_script, "w", encoding="utf-8") as script_file: + script_file.write("#!/bin/bash\n\n") + script_file.write(f"#PBS -o {output}\n") + script_file.write(f"#PBS -e {error}\n") + script_file.write(f"#PBS -N {self.name}\n") + script_file.write("#PBS -V \n") + + # add additional sbatch options + for opt in self.batch_settings.format_batch_args(): + script_file.write(f"#PBS {opt}\n") + + script_file.write(f"#PBS -v SMARTSIM_DRAGON_SERVER_PATH={self.cwd}\n") + + for cmd in self.batch_settings.preamble: + script_file.write(f"{cmd}\n") + + script_file.write( + DragonBatchStep._dragon_entrypoint_cmd(request_file) + "\n" + ) + + return batch_script diff --git a/smartsim/_core/launcher/step/lsfStep.py b/smartsim/_core/launcher/step/lsfStep.py index 1c88dadb8..0cb921e19 100644 --- a/smartsim/_core/launcher/step/lsfStep.py +++ b/smartsim/_core/launcher/step/lsfStep.py @@ -42,11 +42,8 @@ def __init__(self, name: str, cwd: str, batch_settings: BsubBatchSettings) -> No """Initialize a LSF bsub step :param name: name of the entity to launch - :type name: str :param cwd: path to launch dir - :type cwd: str :param batch_settings: batch settings for entity - :type batch_settings: BsubBatchSettings """ super().__init__(name, cwd, 
batch_settings) self.step_cmds: t.List[t.List[str]] = [] @@ -57,7 +54,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the launch command for the batch :return: launch command for the batch - :rtype: list[str] """ script = self._write_script() return [self.batch_settings.batch_cmd, script] @@ -66,7 +62,6 @@ def add_to_batch(self, step: Step) -> None: """Add a job step to this batch :param step: a job step instance e.g. SrunStep - :type step: Step """ launch_cmd = step.get_launch_cmd() self.step_cmds.append(launch_cmd) @@ -76,7 +71,6 @@ def _write_script(self) -> str: """Write the batch script :return: batch script path after writing - :rtype: str """ batch_script = self.get_step_file(ending=".sh") output, error = self.get_output_files() @@ -113,11 +107,8 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings): """Initialize a LSF jsrun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: RunSettings """ super().__init__(name, cwd, run_settings) self.alloc: t.Optional[str] = None @@ -155,7 +146,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step :return: launch command - :rtype: list[str] """ jsrun = self.run_settings.run_command if not jsrun: @@ -223,7 +213,6 @@ def _build_exe(self) -> t.List[str]: """Build the executable for this step :return: executable list - :rtype: list[str] """ exe = self.run_settings.exe args = self.run_settings._exe_args # pylint: disable=protected-access diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 785d55e92..767486462 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -43,11 +43,8 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings) -> None: """Initialize a job step conforming to the MPI standard :param name: name of the entity to be 
launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: RunSettings """ super().__init__(name, cwd, run_settings) @@ -64,7 +61,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step :return: launch command - :rtype: list[str] """ run_cmd = self.run_settings.run_command if not run_cmd: @@ -130,7 +126,6 @@ def _build_exe(self) -> t.List[str]: """Build the executable for this step :return: executable list - :rtype: list[str] """ if self._get_mpmd(): return self._make_mpmd() @@ -161,14 +156,10 @@ def __init__(self, name: str, cwd: str, run_settings: MpiexecSettings) -> None: """Initialize an mpiexec job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: MpiexecSettings :param default_run_command: The default command to launch an MPI application - :type default_run_command: str, optional """ super().__init__(name, cwd, run_settings) @@ -179,14 +170,10 @@ def __init__(self, name: str, cwd: str, run_settings: MpirunSettings) -> None: """Initialize an mpirun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: MpirunSettings :param default_run_command: The default command to launch an MPI application - :type default_run_command: str, optional """ super().__init__(name, cwd, run_settings) @@ -197,14 +184,10 @@ def __init__(self, name: str, cwd: str, run_settings: OrterunSettings) -> None: """Initialize an orterun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: OrterunSettings :param default_run_command: The default command to launch an MPI application - 
:type default_run_command: str, optional """ super().__init__(name, cwd, run_settings) diff --git a/smartsim/_core/launcher/step/pbsStep.py b/smartsim/_core/launcher/step/pbsStep.py index 65dac3225..82a91aaa4 100644 --- a/smartsim/_core/launcher/step/pbsStep.py +++ b/smartsim/_core/launcher/step/pbsStep.py @@ -38,11 +38,8 @@ def __init__(self, name: str, cwd: str, batch_settings: QsubBatchSettings) -> No """Initialize a PBSpro qsub step :param name: name of the entity to launch - :type name: str :param cwd: path to launch dir - :type cwd: str :param batch_settings: batch settings for entity - :type batch_settings: QsubBatchSettings """ super().__init__(name, cwd, batch_settings) self.step_cmds: t.List[t.List[str]] = [] @@ -53,7 +50,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the launch command for the batch :return: launch command for the batch - :rtype: list[str] """ script = self._write_script() return [self.batch_settings.batch_cmd, script] @@ -62,7 +58,6 @@ def add_to_batch(self, step: Step) -> None: """Add a job step to this batch :param step: a job step instance e.g. 
SrunStep - :type step: Step """ launch_cmd = step.get_launch_cmd() self.step_cmds.append(launch_cmd) @@ -72,7 +67,6 @@ def _write_script(self) -> str: """Write the batch script :return: batch script path after writing - :rtype: str """ batch_script = self.get_step_file(ending=".sh") output, error = self.get_output_files() diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 7baab891b..83f39cf09 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -42,11 +42,8 @@ def __init__(self, name: str, cwd: str, batch_settings: SbatchSettings) -> None: """Initialize a Slurm Sbatch step :param name: name of the entity to launch - :type name: str :param cwd: path to launch dir - :type cwd: str :param batch_settings: batch settings for entity - :type batch_settings: SbatchSettings """ super().__init__(name, cwd, batch_settings) self.step_cmds: t.List[t.List[str]] = [] @@ -57,7 +54,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the launch command for the batch :return: launch command for the batch - :rtype: list[str] """ script = self._write_script() return [self.batch_settings.batch_cmd, "--parsable", script] @@ -66,7 +62,6 @@ def add_to_batch(self, step: Step) -> None: """Add a job step to this batch :param step: a job step instance e.g. 
SrunStep - :type step: Step """ launch_cmd = ["cd", step.cwd, ";"] launch_cmd += step.get_launch_cmd() @@ -77,7 +72,6 @@ def _write_script(self) -> str: """Write the batch script :return: batch script path after writing - :rtype: str """ batch_script = self.get_step_file(ending=".sh") output, error = self.get_output_files() @@ -108,11 +102,8 @@ def __init__(self, name: str, cwd: str, run_settings: SrunSettings) -> None: """Initialize a srun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: SrunSettings """ super().__init__(name, cwd, run_settings) self.alloc: t.Optional[str] = None @@ -125,7 +116,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step :return: launch command - :rtype: list[str] """ srun = self.run_settings.run_command if not srun: @@ -206,7 +196,6 @@ def _build_exe(self) -> t.List[str]: """Build the executable for this step :return: executable list - :rtype: list[str] """ if self._get_mpmd(): return self._make_mpmd() diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index ddb95a850..2cce6e610 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -28,6 +28,7 @@ import functools import os.path as osp +import pathlib import sys import time import typing as t @@ -66,10 +67,21 @@ def _create_unique_name(entity_name: str) -> str: step_name = entity_name + "-" + get_base_36_repr(time.time_ns()) return step_name + @staticmethod + def _ensure_output_directory_exists(output_dir: str) -> None: + """Create the directory for the step output if it doesn't exist already""" + if not osp.exists(output_dir): + pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True) + def get_output_files(self) -> t.Tuple[str, str]: - """Return two paths to error and output files based on cwd""" - output = 
self.get_step_file(ending=".out") - error = self.get_step_file(ending=".err") + """Return two paths to error and output files based on metadata directory""" + try: + output_dir = self.meta["status_dir"] + except KeyError as exc: + raise KeyError("Status directory for this step has not been set.") from exc + self._ensure_output_directory_exists(output_dir) + output = osp.join(output_dir, f"{self.entity_name}.out") + error = osp.join(output_dir, f"{self.entity_name}.err") return output, error def get_step_file( @@ -114,7 +126,6 @@ def add_to_batch(self, step: Step) -> None: """Add a job step to this batch :param step: a job step instance e.g. SrunStep - :type step: Step """ raise SmartSimError("add_to_batch not implemented for this step type") @@ -127,6 +138,14 @@ def proxyable_launch_cmd( ) -> t.Callable[[_StepT], t.List[str]]: @functools.wraps(fn) def _get_launch_cmd(self: _StepT) -> t.List[str]: + """ + Generate a launch command that executes the `JobStep` with the + indirect launching entrypoint instead of directly. The original + command is passed to the proxy as a base64 encoded string. 
+ + Steps implementing `get_launch_cmd` and decorated with + `proxyable_launch_cmd` will generate status updates that can be consumed + by the telemetry monitor and dashboard""" original_cmd_list = fn(self) if not CONFIG.telemetry_enabled: @@ -134,18 +153,22 @@ def _get_launch_cmd(self: _StepT) -> t.List[str]: if self.managed: raise UnproxyableStepError( - f"Attempting to proxy managed step of type {type(self)}" + f"Attempting to proxy managed step of type {type(self)} " "through the unmanaged step proxy entry point" ) proxy_module = "smartsim._core.entrypoints.indirect" - etype = self.meta["entity_type"] + entity_type = self.meta["entity_type"] status_dir = self.meta["status_dir"] + + logger.debug(f"Encoding command{' '.join(original_cmd_list)}") + + # encode the original cmd to avoid potential collisions and escaping + # errors when passing it using CLI arguments to the indirect entrypoint encoded_cmd = encode_cmd(original_cmd_list) - # NOTE: this is NOT safe. should either 1) sign cmd and verify OR 2) - # serialize step and let the indirect entrypoint rebuild the - # cmd... for now, test away... 
+ # return a new command that executes the proxy and passes + # the original command as an argument return [ sys.executable, "-m", @@ -155,7 +178,7 @@ def _get_launch_cmd(self: _StepT) -> t.List[str]: "+command", encoded_cmd, "+entity_type", - etype, + entity_type, "+telemetry_dir", status_dir, "+working_dir", diff --git a/smartsim/_core/launcher/stepInfo.py b/smartsim/_core/launcher/stepInfo.py index 56b5218fc..875eb0322 100644 --- a/smartsim/_core/launcher/stepInfo.py +++ b/smartsim/_core/launcher/stepInfo.py @@ -28,20 +28,13 @@ import psutil -from ...status import ( - SMARTSIM_STATUS, - STATUS_CANCELLED, - STATUS_COMPLETED, - STATUS_FAILED, - STATUS_PAUSED, - STATUS_RUNNING, -) +from ...status import SmartSimStatus class StepInfo: def __init__( self, - status: str = "", + status: SmartSimStatus, launcher_status: str = "", returncode: t.Optional[int] = None, output: t.Optional[str] = None, @@ -54,48 +47,50 @@ def __init__( self.error = error def __str__(self) -> str: - info_str = f"Status: {self.status}" + info_str = f"Status: {self.status.value}" info_str += f" | Launcher Status {self.launcher_status}" info_str += f" | Returncode {str(self.returncode)}" return info_str @property - def mapping(self) -> t.Dict[str, str]: + def mapping(self) -> t.Dict[str, SmartSimStatus]: raise NotImplementedError def _get_smartsim_status( self, status: str, returncode: t.Optional[int] = None - ) -> str: + ) -> SmartSimStatus: """ Map the status of the WLM step to a smartsim-specific status """ - if status in SMARTSIM_STATUS: - return SMARTSIM_STATUS[status] + if any(ss_status.value == status for ss_status in SmartSimStatus): + return SmartSimStatus(status) if status in self.mapping and returncode in [None, 0]: return self.mapping[status] - return STATUS_FAILED + return SmartSimStatus.STATUS_FAILED class UnmanagedStepInfo(StepInfo): @property - def mapping(self) -> t.Dict[str, str]: + def mapping(self) -> t.Dict[str, SmartSimStatus]: # see 
https://github.com/giampaolo/psutil/blob/master/psutil/_pslinux.py # see https://github.com/giampaolo/psutil/blob/master/psutil/_common.py return { - psutil.STATUS_RUNNING: STATUS_RUNNING, - psutil.STATUS_SLEEPING: STATUS_RUNNING, # sleeping thread is still alive - psutil.STATUS_WAKING: STATUS_RUNNING, - psutil.STATUS_DISK_SLEEP: STATUS_RUNNING, - psutil.STATUS_DEAD: STATUS_FAILED, - psutil.STATUS_TRACING_STOP: STATUS_PAUSED, - psutil.STATUS_WAITING: STATUS_PAUSED, - psutil.STATUS_STOPPED: STATUS_PAUSED, - psutil.STATUS_LOCKED: STATUS_PAUSED, - psutil.STATUS_PARKED: STATUS_PAUSED, - psutil.STATUS_IDLE: STATUS_PAUSED, - psutil.STATUS_ZOMBIE: STATUS_COMPLETED, + psutil.STATUS_RUNNING: SmartSimStatus.STATUS_RUNNING, + psutil.STATUS_SLEEPING: ( + SmartSimStatus.STATUS_RUNNING + ), # sleeping thread is still alive + psutil.STATUS_WAKING: SmartSimStatus.STATUS_RUNNING, + psutil.STATUS_DISK_SLEEP: SmartSimStatus.STATUS_RUNNING, + psutil.STATUS_DEAD: SmartSimStatus.STATUS_FAILED, + psutil.STATUS_TRACING_STOP: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_WAITING: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_STOPPED: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_LOCKED: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_PARKED: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_IDLE: SmartSimStatus.STATUS_PAUSED, + psutil.STATUS_ZOMBIE: SmartSimStatus.STATUS_COMPLETED, } def __init__( @@ -114,30 +109,30 @@ def __init__( class SlurmStepInfo(StepInfo): # cov-slurm # see https://slurm.schedmd.com/squeue.html#lbAG mapping = { - "RUNNING": STATUS_RUNNING, - "CONFIGURING": STATUS_RUNNING, - "STAGE_OUT": STATUS_RUNNING, - "COMPLETED": STATUS_COMPLETED, - "DEADLINE": STATUS_COMPLETED, - "TIMEOUT": STATUS_COMPLETED, - "BOOT_FAIL": STATUS_FAILED, - "FAILED": STATUS_FAILED, - "NODE_FAIL": STATUS_FAILED, - "OUT_OF_MEMORY": STATUS_FAILED, - "CANCELLED": STATUS_CANCELLED, - "CANCELLED+": STATUS_CANCELLED, - "REVOKED": STATUS_CANCELLED, - "PENDING": STATUS_PAUSED, - "PREEMPTED": STATUS_PAUSED, - 
"RESV_DEL_HOLD": STATUS_PAUSED, - "REQUEUE_FED": STATUS_PAUSED, - "REQUEUE_HOLD": STATUS_PAUSED, - "REQUEUED": STATUS_PAUSED, - "RESIZING": STATUS_PAUSED, - "SIGNALING": STATUS_PAUSED, - "SPECIAL_EXIT": STATUS_PAUSED, - "STOPPED": STATUS_PAUSED, - "SUSPENDED": STATUS_PAUSED, + "RUNNING": SmartSimStatus.STATUS_RUNNING, + "CONFIGURING": SmartSimStatus.STATUS_RUNNING, + "STAGE_OUT": SmartSimStatus.STATUS_RUNNING, + "COMPLETED": SmartSimStatus.STATUS_COMPLETED, + "DEADLINE": SmartSimStatus.STATUS_COMPLETED, + "TIMEOUT": SmartSimStatus.STATUS_COMPLETED, + "BOOT_FAIL": SmartSimStatus.STATUS_FAILED, + "FAILED": SmartSimStatus.STATUS_FAILED, + "NODE_FAIL": SmartSimStatus.STATUS_FAILED, + "OUT_OF_MEMORY": SmartSimStatus.STATUS_FAILED, + "CANCELLED": SmartSimStatus.STATUS_CANCELLED, + "CANCELLED+": SmartSimStatus.STATUS_CANCELLED, + "REVOKED": SmartSimStatus.STATUS_CANCELLED, + "PENDING": SmartSimStatus.STATUS_PAUSED, + "PREEMPTED": SmartSimStatus.STATUS_PAUSED, + "RESV_DEL_HOLD": SmartSimStatus.STATUS_PAUSED, + "REQUEUE_FED": SmartSimStatus.STATUS_PAUSED, + "REQUEUE_HOLD": SmartSimStatus.STATUS_PAUSED, + "REQUEUED": SmartSimStatus.STATUS_PAUSED, + "RESIZING": SmartSimStatus.STATUS_PAUSED, + "SIGNALING": SmartSimStatus.STATUS_PAUSED, + "SPECIAL_EXIT": SmartSimStatus.STATUS_PAUSED, + "STOPPED": SmartSimStatus.STATUS_PAUSED, + "SUSPENDED": SmartSimStatus.STATUS_PAUSED, } def __init__( @@ -155,23 +150,27 @@ def __init__( class PBSStepInfo(StepInfo): # cov-pbs @property - def mapping(self) -> t.Dict[str, str]: + def mapping(self) -> t.Dict[str, SmartSimStatus]: # pylint: disable=line-too-long # see http://nusc.nsu.ru/wiki/lib/exe/fetch.php/doc/pbs/PBSReferenceGuide19.2.1.pdf#M11.9.90788.PBSHeading1.81.Job.States return { - "R": STATUS_RUNNING, - "B": STATUS_RUNNING, - "H": STATUS_PAUSED, - "M": STATUS_PAUSED, # Actually means that it was moved to another server, + "R": SmartSimStatus.STATUS_RUNNING, + "B": SmartSimStatus.STATUS_RUNNING, + "H": SmartSimStatus.STATUS_PAUSED, + 
"M": ( + SmartSimStatus.STATUS_PAUSED + ), # Actually means that it was moved to another server, # TODO: understand what this implies - "Q": STATUS_PAUSED, - "S": STATUS_PAUSED, - "T": STATUS_PAUSED, # This means in transition, see above for comment - "U": STATUS_PAUSED, - "W": STATUS_PAUSED, - "E": STATUS_COMPLETED, - "F": STATUS_COMPLETED, - "X": STATUS_COMPLETED, + "Q": SmartSimStatus.STATUS_PAUSED, + "S": SmartSimStatus.STATUS_PAUSED, + "T": ( + SmartSimStatus.STATUS_PAUSED + ), # This means in transition, see above for comment + "U": SmartSimStatus.STATUS_PAUSED, + "W": SmartSimStatus.STATUS_PAUSED, + "E": SmartSimStatus.STATUS_COMPLETED, + "F": SmartSimStatus.STATUS_COMPLETED, + "X": SmartSimStatus.STATUS_COMPLETED, } def __init__( @@ -183,10 +182,14 @@ def __init__( ) -> None: if status == "NOTFOUND": if returncode is not None: - smartsim_status = "Completed" if returncode == 0 else "Failed" + smartsim_status = ( + SmartSimStatus.STATUS_COMPLETED + if returncode == 0 + else SmartSimStatus.STATUS_FAILED + ) else: # if PBS job history isnt available, and job isnt in queue - smartsim_status = "Completed" + smartsim_status = SmartSimStatus.STATUS_COMPLETED returncode = 0 else: smartsim_status = self._get_smartsim_status(status) @@ -197,16 +200,16 @@ def __init__( class LSFBatchStepInfo(StepInfo): # cov-lsf @property - def mapping(self) -> t.Dict[str, str]: + def mapping(self) -> t.Dict[str, SmartSimStatus]: # pylint: disable=line-too-long # see https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=execution-about-job-states return { - "RUN": STATUS_RUNNING, - "PSUSP": STATUS_PAUSED, - "USUSP": STATUS_PAUSED, - "SSUSP": STATUS_PAUSED, - "PEND": STATUS_PAUSED, - "DONE": STATUS_COMPLETED, + "RUN": SmartSimStatus.STATUS_RUNNING, + "PSUSP": SmartSimStatus.STATUS_PAUSED, + "USUSP": SmartSimStatus.STATUS_PAUSED, + "SSUSP": SmartSimStatus.STATUS_PAUSED, + "PEND": SmartSimStatus.STATUS_PAUSED, + "DONE": SmartSimStatus.STATUS_COMPLETED, } def __init__( @@ -218,9 +221,13 
@@ def __init__( ) -> None: if status == "NOTFOUND": if returncode is not None: - smartsim_status = "Completed" if returncode == 0 else "Failed" + smartsim_status = ( + SmartSimStatus.STATUS_COMPLETED + if returncode == 0 + else SmartSimStatus.STATUS_FAILED + ) else: - smartsim_status = "Completed" + smartsim_status = SmartSimStatus.STATUS_COMPLETED returncode = 0 else: smartsim_status = self._get_smartsim_status(status) @@ -231,14 +238,14 @@ def __init__( class LSFJsrunStepInfo(StepInfo): # cov-lsf @property - def mapping(self) -> t.Dict[str, str]: + def mapping(self) -> t.Dict[str, SmartSimStatus]: # pylint: disable=line-too-long # see https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=execution-about-job-states return { - "Killed": STATUS_COMPLETED, - "Running": STATUS_RUNNING, - "Queued": STATUS_PAUSED, - "Complete": STATUS_COMPLETED, + "Killed": SmartSimStatus.STATUS_COMPLETED, + "Running": SmartSimStatus.STATUS_RUNNING, + "Queued": SmartSimStatus.STATUS_PAUSED, + "Complete": SmartSimStatus.STATUS_COMPLETED, } def __init__( @@ -250,9 +257,13 @@ def __init__( ) -> None: if status == "NOTFOUND": if returncode is not None: - smartsim_status = "Completed" if returncode == 0 else "Failed" + smartsim_status = ( + SmartSimStatus.STATUS_COMPLETED + if returncode == 0 + else SmartSimStatus.STATUS_FAILED + ) else: - smartsim_status = "Completed" + smartsim_status = SmartSimStatus.STATUS_COMPLETED returncode = 0 else: smartsim_status = self._get_smartsim_status(status, returncode) diff --git a/smartsim/_core/launcher/taskManager.py b/smartsim/_core/launcher/taskManager.py index 84123944e..60f097da6 100644 --- a/smartsim/_core/launcher/taskManager.py +++ b/smartsim/_core/launcher/taskManager.py @@ -114,17 +114,11 @@ def start_task( by a workload manager :param cmd_list: command to run - :type cmd_list: list[str] :param cwd: current working directory - :type cwd: str :param env: environment to launch with - :type env: dict[str, str], optional. 
If None, calling environment is inherited - :param out: output file, defaults to PIPE - :type out: file, optional - :param err: error file, defaults to PIPE - :type err: file, optional + :param out: output file + :param err: error file :return: task id - :rtype: int """ with self._lock: proc = execute_async_cmd(cmd_list, cwd, env=env, out=out, err=err) @@ -150,15 +144,10 @@ def start_and_wait( This is primarily used for batch job launches :param cmd_list: command to run - :type cmd_list: list[str] :param cwd: current working directory - :type cwd: str :param env: environment to launch with - :type env: dict[str, str], optional - :param timeout: time to wait, defaults to None - :type timeout: int, optional + :param timeout: time to wait :return: returncode, output, and err - :rtype: int, str, str """ returncode, out, err = execute_cmd(cmd_list, cwd=cwd, env=env, timeout=timeout) if VERBOSE_TM: @@ -169,7 +158,6 @@ def add_existing(self, task_id: int) -> None: """Add existing task to be managed by the TaskManager :param task_id: task id of existing task - :type task_id: str :raises LauncherError: If task cannot be found """ with self._lock: @@ -186,7 +174,6 @@ def remove_task(self, task_id: str) -> None: """Remove a task from the TaskManager :param task_id: id of the task to remove - :type task_id: str """ with self._lock: if VERBOSE_TM: @@ -210,9 +197,7 @@ def get_task_update( """Get the update of a task :param task_id: task id - :type task_id: str :return: status, returncode, output, error - :rtype: str, int, str, str """ with self._lock: try: @@ -251,13 +236,9 @@ def add_task_history( Add a task to record its future returncode, output and error :param task_id: id of the task - :type task_id: str :param returncode: returncode - :type returncode: int, defaults to None - :param out: output, defaults to None - :type out: str, optional - :param err: output, defaults to None - :type err: str, optional + :param out: output + :param err: output """ 
self.task_history[task_id] = (returncode, out, err) @@ -278,7 +259,6 @@ def __init__(self, process: psutil.Process) -> None: """Initialize a task :param process: Popen object - :type process: psutil.Process """ self.process = process self.pid = str(self.process.pid) @@ -287,7 +267,6 @@ def check_status(self) -> t.Optional[int]: """Ping the job and return the returncode if finished :return: returncode if finished otherwise None - :rtype: int """ if self.owned and isinstance(self.process, psutil.Popen): poll_result = self.process.poll() @@ -302,7 +281,6 @@ def get_io(self) -> t.Tuple[t.Optional[str], t.Optional[str]]: """Get the IO from the subprocess :return: output and error from the Popen - :rtype: str, str """ # Process class does not implement communicate if not self.owned or not isinstance(self.process, psutil.Popen): @@ -335,8 +313,7 @@ def kill_callback(proc: psutil.Process) -> None: def terminate(self, timeout: int = 10) -> None: """Terminate a this process and all children. - :param timeout: time to wait for task death, defaults to 10 - :type timeout: int, optional + :param timeout: time to wait for task death """ def terminate_callback(proc: psutil.Process) -> None: diff --git a/smartsim/_core/launcher/util/launcherUtil.py b/smartsim/_core/launcher/util/launcherUtil.py index a24d69e49..1a6ec5d83 100644 --- a/smartsim/_core/launcher/util/launcherUtil.py +++ b/smartsim/_core/launcher/util/launcherUtil.py @@ -38,9 +38,7 @@ def __init__( """Initialize a ComputeNode :param node_name: the name of the node - :type node_name: str :param node_ppn: the number of ppn - :type node_ppn: int """ self.name: t.Optional[str] = node_name self.ppn: t.Optional[int] = node_ppn @@ -52,7 +50,6 @@ def _is_valid_node(self) -> bool: and ppn being not None. 
:returns: True if valid, false otherwise - :rtype: bool """ if self.name is None: return False @@ -80,7 +77,6 @@ def _is_valid_partition(self) -> bool: and each ComputeNode being valid :returns: True if valid, false otherwise - :rtype: bool """ if self.name is None: return False diff --git a/smartsim/_core/launcher/util/shell.py b/smartsim/_core/launcher/util/shell.py index c747bacbc..a2b5bc76b 100644 --- a/smartsim/_core/launcher/util/shell.py +++ b/smartsim/_core/launcher/util/shell.py @@ -49,22 +49,14 @@ def execute_cmd( """Execute a command locally :param cmd_list: list of command with arguments - :type cmd_list: list of str - :param shell: run in system shell, defaults to False - :type shell: bool, optional - :param cwd: current working directory, defaults to None - :type cwd: str, optional - :param env: environment to launcher process with, - defaults to None (current env) - :type env: dict[str, str], optional - :param proc_input: input to the process, defaults to "" - :type proc_input: str, optional - :param timeout: timeout of the process, defaults to None - :type timeout: int, optional + :param shell: run in system shell + :param cwd: current working directory + :param env: environment to launcher process with + :param proc_input: input to the process + :param timeout: timeout of the process :raises ShellError: if timeout of process was exceeded :raises ShellError: if child process raises an error :return: returncode, output, and error of the process - :rtype: tuple of (int, str, str) """ if VERBOSE_SHELL: source = "shell" if shell else "Popen" @@ -106,13 +98,9 @@ def execute_async_cmd( popen subprocess object wrapped with psutil. 
:param cmd_list: list of command with arguments - :type cmd_list: list of str :param cwd: current working directory - :type cwd: str :param env: environment variables to set - :type env: dict[str, str] :return: the subprocess object - :rtype: psutil.Popen """ if VERBOSE_SHELL: logger.debug(f"Executing command: {' '.join(cmd_list)}") diff --git a/smartsim/settings/mpirunSettings.py b/smartsim/_core/schemas/__init__.py similarity index 78% rename from smartsim/settings/mpirunSettings.py rename to smartsim/_core/schemas/__init__.py index 994d62bba..d7ee9d83d 100644 --- a/smartsim/settings/mpirunSettings.py +++ b/smartsim/_core/schemas/__init__.py @@ -24,18 +24,18 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from warnings import simplefilter, warn +from .dragonRequests import * +from .dragonResponses import * -from ..log import get_logger - -# pylint: disable-next=unused-import -from .mpiSettings import MpiexecSettings, MpirunSettings, OrterunSettings - -logger = get_logger(__name__) - -simplefilter("once", DeprecationWarning) -warn( - "mpirunSettings will be deprecated; use mpiSettings instead.", - DeprecationWarning, - stacklevel=2, -) +__all__ = [ + "DragonRequest", + "DragonRunRequest", + "DragonHandshakeRequest", + "DragonUpdateStatusRequest", + "DragonStopRequest", + "DragonResponse", + "DragonRunResponse", + "DragonHandshakeResponse", + "DragonUpdateStatusResponse", + "DragonStopResponse", +] diff --git a/smartsim/_core/schemas/dragonRequests.py b/smartsim/_core/schemas/dragonRequests.py new file mode 100644 index 000000000..3e384f746 --- /dev/null +++ b/smartsim/_core/schemas/dragonRequests.py @@ -0,0 +1,90 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +from pydantic import BaseModel, Field, PositiveInt + +import smartsim._core.schemas.utils as _utils + +# Black and Pylint disagree about where to put the `...` +# pylint: disable=multiple-statements + +request_registry = _utils.SchemaRegistry["DragonRequest"]() + + +class DragonRequest(BaseModel): ... 
+ + +class DragonRunRequestView(DragonRequest): + exe: t.Annotated[str, Field(min_length=1)] + exe_args: t.List[t.Annotated[str, Field(min_length=1)]] = [] + path: t.Annotated[str, Field(min_length=1)] + nodes: PositiveInt = 1 + tasks: PositiveInt = 1 + tasks_per_node: PositiveInt = 1 + hostlist: t.Optional[t.Annotated[str, Field(min_length=1)]] = None + output_file: t.Optional[t.Annotated[str, Field(min_length=1)]] = None + error_file: t.Optional[t.Annotated[str, Field(min_length=1)]] = None + env: t.Dict[str, t.Optional[str]] = {} + name: t.Optional[t.Annotated[str, Field(min_length=1)]] = None + pmi_enabled: bool = True + + +@request_registry.register("run") +class DragonRunRequest(DragonRunRequestView): + current_env: t.Dict[str, t.Optional[str]] = {} + + def __str__(self) -> str: + return str(DragonRunRequestView.parse_obj(self.dict(exclude={"current_env"}))) + + +@request_registry.register("update_status") +class DragonUpdateStatusRequest(DragonRequest): + step_ids: t.List[t.Annotated[str, Field(min_length=1)]] + + +@request_registry.register("stop") +class DragonStopRequest(DragonRequest): + step_id: t.Annotated[str, Field(min_length=1)] + + +@request_registry.register("handshake") +class DragonHandshakeRequest(DragonRequest): ... 
+ + +@request_registry.register("bootstrap") +class DragonBootstrapRequest(DragonRequest): + address: t.Annotated[str, Field(min_length=1)] + + +@request_registry.register("shutdown") +class DragonShutdownRequest(DragonRequest): + immediate: bool = True + """Whether the server should shut down immediately, setting this to False means + that the server will shut down when all jobs are terminated.""" + frontend_shutdown: bool = True + """Whether the frontend will have to shut down or wait for external termination""" diff --git a/smartsim/_core/schemas/dragonResponses.py b/smartsim/_core/schemas/dragonResponses.py new file mode 100644 index 000000000..3c5c30a10 --- /dev/null +++ b/smartsim/_core/schemas/dragonResponses.py @@ -0,0 +1,73 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import typing as t + +from pydantic import BaseModel, Field + +import smartsim._core.schemas.utils as _utils +from smartsim.status import SmartSimStatus + +# Black and Pylint disagree about where to put the `...` +# pylint: disable=multiple-statements + +response_registry = _utils.SchemaRegistry["DragonResponse"]() + + +class DragonResponse(BaseModel): + error_message: t.Optional[str] = None + + +@response_registry.register("run") +class DragonRunResponse(DragonResponse): + step_id: t.Annotated[str, Field(min_length=1)] + + +@response_registry.register("status_update") +class DragonUpdateStatusResponse(DragonResponse): + # status is a dict: {step_id: (is_alive, returncode)} + statuses: t.Mapping[ + t.Annotated[str, Field(min_length=1)], + t.Tuple[SmartSimStatus, t.Optional[t.List[int]]], + ] = {} + + +@response_registry.register("stop") +class DragonStopResponse(DragonResponse): ... + + +@response_registry.register("handshake") +class DragonHandshakeResponse(DragonResponse): + dragon_pid: int + + +@response_registry.register("bootstrap") +class DragonBootstrapResponse(DragonResponse): + dragon_pid: int + + +@response_registry.register("shutdown") +class DragonShutdownResponse(DragonResponse): ... 
diff --git a/smartsim/_core/schemas/utils.py b/smartsim/_core/schemas/utils.py new file mode 100644 index 000000000..9cb36bcf5 --- /dev/null +++ b/smartsim/_core/schemas/utils.py @@ -0,0 +1,124 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import dataclasses +import typing as t + +import pydantic +import pydantic.dataclasses + +if t.TYPE_CHECKING: + from zmq.sugar.socket import Socket + +_SchemaT = t.TypeVar("_SchemaT", bound=pydantic.BaseModel) +_SendT = t.TypeVar("_SendT", bound=pydantic.BaseModel) +_RecvT = t.TypeVar("_RecvT", bound=pydantic.BaseModel) + +_DEFAULT_MSG_DELIM: t.Final[str] = "|" + + +@t.final +@pydantic.dataclasses.dataclass(frozen=True) +class _Message(t.Generic[_SchemaT]): + payload: _SchemaT + header: str = pydantic.Field(min_length=1) + delimiter: str = pydantic.Field(min_length=1, default=_DEFAULT_MSG_DELIM) + + def __str__(self) -> str: + return self.delimiter.join((self.header, self.payload.json())) + + @classmethod + def from_str( + cls, + str_: str, + payload_type: t.Type[_SchemaT], + delimiter: str = _DEFAULT_MSG_DELIM, + ) -> "_Message[_SchemaT]": + header, payload = str_.split(delimiter, 1) + return cls(payload_type.parse_raw(payload), header, delimiter) + + +class SchemaRegistry(t.Generic[_SchemaT]): + def __init__( + self, init_map: t.Optional[t.Mapping[str, t.Type[_SchemaT]]] = None + ) -> None: + self._map = dict(init_map) if init_map else {} + + def register(self, key: str) -> t.Callable[[t.Type[_SchemaT]], t.Type[_SchemaT]]: + if _DEFAULT_MSG_DELIM in key: + _msg = f"Registry key cannot contain delimiter `{_DEFAULT_MSG_DELIM}`" + raise ValueError(_msg) + if not key: + raise KeyError("Key cannot be the empty string") + if key in self._map: + raise KeyError(f"Key `{key}` has already been registered for this parser") + + def _register(cls: t.Type[_SchemaT]) -> t.Type[_SchemaT]: + self._map[key] = cls + return cls + + return _register + + def to_string(self, schema: _SchemaT) -> str: + return str(self._to_message(schema)) + + def _to_message(self, schema: _SchemaT) -> _Message[_SchemaT]: + reverse_map = dict((v, k) for k, v in self._map.items()) + try: + val = reverse_map[type(schema)] + except KeyError: + raise TypeError(f"Unregistered schema type: {type(schema)}") 
from None + return _Message(schema, val, _DEFAULT_MSG_DELIM) + + def from_string(self, str_: str) -> _SchemaT: + try: + type_, _ = str_.split(_DEFAULT_MSG_DELIM, 1) + except ValueError: + _msg = f"Failed to determine schema type of the string {repr(str_)}" + raise ValueError(_msg) from None + try: + cls = self._map[type_] + except KeyError: + raise ValueError(f"No type of value `{type_}` is registered") from None + msg = _Message.from_str(str_, cls, _DEFAULT_MSG_DELIM) + return self._from_message(msg) + + @staticmethod + def _from_message(msg: _Message[_SchemaT]) -> _SchemaT: + return msg.payload + + +@dataclasses.dataclass(frozen=True) +class SocketSchemaTranslator(t.Generic[_SendT, _RecvT]): + socket: "Socket[t.Any]" + _send_registry: SchemaRegistry[_SendT] + _recv_registry: SchemaRegistry[_RecvT] + + def send(self, schema: _SendT, flags: int = 0) -> None: + self.socket.send_string(self._send_registry.to_string(schema), flags) + + def recv(self, flags: int = 0) -> _RecvT: + return self._recv_registry.from_string(self.socket.recv_string(flags)) diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py index cb9395881..3ea928797 100644 --- a/smartsim/_core/utils/__init__.py +++ b/smartsim/_core/utils/__init__.py @@ -24,5 +24,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from .helpers import colorize, delete_elements, init_default, installed_redisai_backends +from .helpers import ( + check_for_utility, + colorize, + delete_elements, + execute_platform_cmd, + installed_redisai_backends, + is_crayex_platform, +) from .redis import check_cluster_status, create_cluster, db_is_active diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index 27059e320..df2c016a1 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -28,7 +28,10 @@ A file of helper functions for SmartSim """ import base64 +import collections.abc import os +import signal +import subprocess import typing as t import uuid from datetime import datetime @@ -38,16 +41,19 @@ from smartsim._core._install.builder import TRedisAIBackendStr as _TRedisAIBackendStr +if t.TYPE_CHECKING: + from types import FrameType + + +_TSignalHandlerFn = t.Callable[[int, t.Optional["FrameType"]], object] + def unpack_db_identifier(db_id: str, token: str) -> t.Tuple[str, str]: """Unpack the unformatted database identifier and format for env variable suffix using the token :param db_id: the unformatted database identifier eg. identifier_1 - :type db_id: str :param token: character to use to construct the db suffix - :type token: str :return: db id suffix and formatted db_id e.g. 
("_identifier_1", "identifier_1") - :rtype: (str, str) """ if db_id == "orchestrator": @@ -58,10 +64,9 @@ def unpack_db_identifier(db_id: str, token: str) -> t.Tuple[str, str]: def unpack_colo_db_identifier(db_id: str) -> str: """Create database identifier suffix for colocated database + :param db_id: the unformatted database identifier - :type db_id: str :return: db suffix - :rtype: str """ return "_" + db_id if db_id else "" @@ -92,10 +97,9 @@ def fmt_dict(value: t.Dict[str, t.Any]) -> str: def get_base_36_repr(positive_int: int) -> str: """Converts a positive integer to its base 36 representation + :param positive_int: the positive integer to convert - :type positive_int: int :return: base 36 representation of the given positive int - :rtype: str """ digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" result = [] @@ -108,23 +112,10 @@ def get_base_36_repr(positive_int: int) -> str: return "".join(reversed(result)) -def init_default( - default: t.Any, - init_value: t.Any, - expected_type: t.Union[t.Type[t.Any], t.Tuple[t.Type[t.Any], ...], None] = None, -) -> t.Any: - if init_value is None: - return default - if expected_type is not None and not isinstance(init_value, expected_type): - raise TypeError(f"Argument was of type {type(init_value)}, not {expected_type}") - return init_value - - def expand_exe_path(exe: str) -> str: """Takes an executable and returns the full path to that executable :param exe: executable or file - :type exe: str :raises TypeError: if file is not an executable :raises FileNotFoundError: if executable cannot be found """ @@ -186,9 +177,7 @@ def colorize( def delete_elements(dictionary: t.Dict[str, t.Any], key_list: t.List[str]) -> None: """Delete elements from a dictionary. :param dictionary: the dictionary from which the elements must be deleted. - :type dictionary: dict :param key_list: the list of keys to delete from the dictionary. 
- :type key: any """ for key in key_list: if key in dictionary: @@ -213,9 +202,7 @@ def cat_arg_and_value(arg_name: str, value: str) -> str: `-arg_name=value` (i.e., `-a val`) :param arg_name: the command line argument name - :type arg_name: str :param value: the command line argument value - :type value: str """ if arg_name.startswith("--"): @@ -259,10 +246,8 @@ def installed_redisai_backends( the backend directories (`redisai_tensorflow`, `redisai_torch`, `redisai_onnxruntime`, or `redisai_tflite`). - :param backends_path: path containing backends, defaults to None - :type backends_path: str, optional + :param backends_path: path containing backends :return: list of installed RedisAI backends - :rtype: set[str] """ # import here to avoid circular import base_path = redis_install_base(backends_path) @@ -276,12 +261,12 @@ def installed_redisai_backends( return {backend for backend in backends if _installed(base_path, backend)} -def get_ts() -> int: - """Return the current timestamp (accurate to seconds) cast to an integer""" - return int(datetime.timestamp(datetime.now())) +def get_ts_ms() -> int: + """Return the current timestamp (accurate to milliseconds) cast to an integer""" + return int(datetime.now().timestamp() * 1000) -def encode_cmd(cmd: t.List[str]) -> str: +def encode_cmd(cmd: t.Sequence[str]) -> str: """Transform a standard command list into an encoded string safe for providing as an argument to a proxy entrypoint """ @@ -302,3 +287,205 @@ def decode_cmd(encoded_cmd: str) -> t.List[str]: cleaned_cmd = decoded_cmd.decode("ascii").split("|") return cleaned_cmd + + +def check_for_utility(util_name: str) -> str: + """Check for existence of the provided CLI utility. + + :param util_name: CLI utility to locate + :returns: Full path to executable if found. Otherwise, empty string""" + utility = "" + + try: + utility = expand_exe_path(util_name) + except FileNotFoundError: + ... 
+ + return utility + + +def execute_platform_cmd(cmd: str) -> t.Tuple[str, int]: + """Execute the platform check command as a subprocess + + :param cmd: the command to execute + :returns: True if platform is cray ex, False otherwise""" + process = subprocess.run( + cmd.split(), + capture_output=True, + check=False, + ) + return process.stdout.decode("utf-8"), process.returncode + + +class CrayExPlatformResult: + locate_msg = "Unable to locate `{0}`." + + def __init__(self, ldconfig: t.Optional[str], fi_info: t.Optional[str]) -> None: + self.ldconfig: t.Optional[str] = ldconfig + self.fi_info: t.Optional[str] = fi_info + self.has_pmi: bool = False + self.has_pmi2: bool = False + self.has_cxi: bool = False + + @property + def has_ldconfig(self) -> bool: + return bool(self.ldconfig) + + @property + def has_fi_info(self) -> bool: + return bool(self.fi_info) + + @property + def is_cray(self) -> bool: + return all( + ( + self.has_ldconfig, + self.has_fi_info, + self.has_pmi, + self.has_pmi2, + self.has_cxi, + ) + ) + + @property + def failures(self) -> t.List[str]: + """Return a list of messages describing all failed validations""" + failure_messages = [] + + if not self.has_ldconfig: + failure_messages.append(self.locate_msg.format("ldconfig")) + + if not self.has_fi_info: + failure_messages.append(self.locate_msg.format("fi_info")) + + if self.has_ldconfig and self.has_fi_info: + if not self.has_pmi: + failure_messages.append(self.locate_msg.format("pmi.so")) + if not self.has_pmi2: + failure_messages.append(self.locate_msg.format("pmi2.so")) + if not self.has_cxi: + failure_messages.append(self.locate_msg.format("cxi.so")) + + return failure_messages + + +def check_platform() -> CrayExPlatformResult: + """Returns True if the current platform is identified as Cray EX and + HSTA-aware dragon package can be installed, False otherwise. 
+ + :returns: True if current platform is Cray EX, False otherwise""" + + # ldconfig -p | grep cray | grep pmi.so && + # ldconfig -p | grep cray | grep pmi2.so && + # fi_info | grep cxi + + ldconfig = check_for_utility("ldconfig") + fi_info = check_for_utility("fi_info") + + result = CrayExPlatformResult(ldconfig, fi_info) + if not all((result.has_ldconfig, result.has_fi_info)): + return result + + ldconfig1 = f"{ldconfig} -p" + ldc_out1, _ = execute_platform_cmd(ldconfig1) + candidates = [x for x in ldc_out1.split("\n") if "cray" in x] + result.has_pmi = any(x for x in candidates if "pmi.so" in x) + + ldconfig2 = f"{ldconfig} -p" + ldc_out2, _ = execute_platform_cmd(ldconfig2) + candidates = [x for x in ldc_out2.split("\n") if "cray" in x] + result.has_pmi2 = any(x for x in candidates if "pmi2.so" in x) + + fi_info_out, _ = execute_platform_cmd(fi_info) + result.has_cxi = any(x for x in fi_info_out.split("\n") if "cxi" in x) + + return result + + +def is_crayex_platform() -> bool: + """Returns True if the current platform is identified as Cray EX and + HSTA-aware dragon package can be installed, False otherwise. + + :returns: True if current platform is Cray EX, False otherwise""" + result = check_platform() + return result.is_cray + + +@t.final +class SignalInterceptionStack(collections.abc.Collection[_TSignalHandlerFn]): + """Registers a stack of callables to be called when a signal is + received before calling the original signal handler. + """ + + def __init__( + self, + signalnum: int, + callbacks: t.Optional[t.Iterable[_TSignalHandlerFn]] = None, + ) -> None: + """Set up a ``SignalInterceptionStack`` for particular signal number. + + .. note:: + This class typically should not be instanced directly as it will + change the registered signal handler regardless of if a signal + interception stack is already present. Instead, it is generally + best to create or get a signal interception stack for a particular + signal number via the `get` factory method. 
+ + :param signalnum: The signal number to intercept + :param callbacks: A iterable of functions to call upon receiving the signal + """ + self._callbacks = list(callbacks) if callbacks else [] + self._original = signal.signal(signalnum, self) + + def __call__(self, signalnum: int, frame: t.Optional["FrameType"]) -> None: + """Handle the signal on which the interception stack was registered. + End by calling the originally registered signal hander (if present). + + :param frame: The current stack frame + """ + for fn in self: + fn(signalnum, frame) + if callable(self._original): + self._original(signalnum, frame) + + def __contains__(self, obj: object) -> bool: + return obj in self._callbacks + + def __iter__(self) -> t.Iterator[_TSignalHandlerFn]: + return reversed(self._callbacks) + + def __len__(self) -> int: + return len(self._callbacks) + + @classmethod + def get(cls, signalnum: int) -> "SignalInterceptionStack": + """Fetch an existing ``SignalInterceptionStack`` or create a new one + for a particular signal number. + + :param signalnum: The singal number of the signal interception stack + should be registered + :returns: The existing or created signal interception stack + """ + handler = signal.getsignal(signalnum) + if isinstance(handler, cls): + return handler + return cls(signalnum, []) + + def push(self, fn: _TSignalHandlerFn) -> None: + """Add a callback to the signal interception stack. + + :param fn: A callable to add to the unique signal stack + """ + self._callbacks.append(fn) + + def push_unique(self, fn: _TSignalHandlerFn) -> bool: + """Add a callback to the signal interception stack if and only if the + callback is not already present. 
+ + :param fn: A callable to add to the unique signal stack + :returns: True if the callback was added, False if the callback was + already present + """ + if did_push := fn not in self: + self.push(fn) + return did_push diff --git a/smartsim/_core/utils/network.py b/smartsim/_core/utils/network.py index 69eeb3e1b..aaceb7fc6 100644 --- a/smartsim/_core/utils/network.py +++ b/smartsim/_core/utils/network.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import socket +import typing as t import psutil @@ -33,13 +34,16 @@ """ +class IFConfig(t.NamedTuple): + interface: t.Optional[str] + address: t.Optional[str] + + def get_ip_from_host(host: str) -> str: """Return the IP address for the interconnect. :param host: hostname of the compute node e.g. nid00004 - :type host: str :returns: ip of host - :rtype: str """ ip_address = socket.gethostbyname(host) return ip_address @@ -50,11 +54,9 @@ def get_ip_from_interface(interface: str) -> str: # pragma: no cover """Get IPV4 address of a network interface :param interface: interface name - :type interface: str :raises ValueError: if the interface does not exist :raises ValueError: if interface does not have an IPV4 address :return: ip address of interface - :rtype: str """ net_if_addrs = psutil.net_if_addrs() if interface not in net_if_addrs: @@ -86,3 +88,32 @@ def current_ip(interface: str = "lo") -> str: # pragma: no cover return get_ip_from_interface(loopback) return get_ip_from_interface(interface) + + +def get_best_interface_and_address() -> IFConfig: + available_ifs = psutil.net_if_addrs() + # TODO make this a CONFIG-time parameter + known_ifs = ["hsn", "ipogif", "ib"] + for interface in available_ifs: + if any(interface.startswith(if_prefix) for if_prefix in known_ifs): + return IFConfig(interface, get_ip_from_interface(interface)) + return IFConfig(None, None) + + +def find_free_port(start: int = 0) -> int: + """A 'good enough' way to find an open port to bind to + + :param 
start: The first port number to consider + :returns: The first open port found + """ + port_num = -1 + while port_num < 0: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + try: + sock.bind(("0.0.0.0", start)) + _, port = sock.getsockname() + port_num = int(port) + except Exception: + # swallow connection exception; test if the next port is open + start += 1 + return port_num diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 3bcf1c1f2..7fa59ad83 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -53,9 +53,7 @@ def create_cluster(hosts: t.List[str], ports: t.List[int]) -> None: # cov-wlm needs to occur manually which is not often. :param hosts: List of hostnames to connect to - :type hosts: List[str] :param ports: List of ports for each hostname - :type ports: List[int] :raises SmartSimError: if cluster creation fails """ ip_list = [] @@ -69,7 +67,7 @@ def create_cluster(hosts: t.List[str], ports: t.List[int]) -> None: # cov-wlm redis_cli = CONFIG.database_cli cmd = [redis_cli, "--cluster", "create"] cmd += ip_list - cmd += ["--cluster-replicas", "0"] + cmd += ["--cluster-replicas", "0", "--cluster-yes"] returncode, out, err = execute_cmd(cmd, proc_input="yes", shell=False) if returncode != 0: @@ -85,11 +83,8 @@ def check_cluster_status( """Check that a Redis/KeyDB cluster is up and running :param hosts: List of hostnames to connect to - :type hosts: List[str] :param ports: List of ports for each hostname - :type ports: List[int] :param trials: number of attempts to verify cluster status - :type trials: int, optional :raises SmartSimError: If cluster status cannot be verified """ @@ -129,13 +124,9 @@ def db_is_active(hosts: t.List[str], ports: t.List[int], num_shards: int) -> boo just ping DB. 
:param hosts: list of hosts - :type hosts: list[str] :param ports: list of ports - :type ports: list[int] :param num_shards: Number of DB shards - :type num_shards: int :return: Whether DB is running - :rtype: bool """ # if single shard if num_shards < 2: @@ -210,7 +201,7 @@ def set_script(db_script: DBScript, client: Client) -> None: client.set_script( name=db_script.name, script=db_script.script, device=device ) - else: + elif callable(db_script.script): client.set_function( name=db_script.name, function=db_script.script, device=device ) @@ -229,11 +220,8 @@ def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov- will take care of this automatically. :param host_ip: IP of host to connect to - :type hosts: str :param ports: Port to which node is listening - :type ports: int :return: returncode, output, and error of the process - :rtype: tuple of (int, str, str) """ redis_cli = CONFIG.database_cli cmd = [redis_cli, "-h", host_ip, "-p", str(port), "shutdown"] @@ -241,7 +229,9 @@ def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov- if returncode != 0: logger.error(out) - logger.error(err) + err_msg = "Error while shutting down DB node. " + err_msg += f"Return code: {returncode}, err: {err}" + logger.error(err_msg) elif out: logger.debug(out) diff --git a/smartsim/_core/utils/security.py b/smartsim/_core/utils/security.py new file mode 100644 index 000000000..e6f84c81a --- /dev/null +++ b/smartsim/_core/utils/security.py @@ -0,0 +1,302 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import dataclasses +import pathlib +import stat +import typing as t +from enum import IntEnum + +import zmq +import zmq.auth + +from smartsim._core.config.config import Config +from smartsim.log import get_logger + +logger = get_logger(__name__) + + +class _KeyPermissions(IntEnum): + """Permissions used by KeyManager""" + + PRIVATE_KEY = stat.S_IRUSR | stat.S_IWUSR + """Permissions only allowing an owner to read and write the file""" + PUBLIC_KEY = stat.S_IRUSR | stat.S_IWUSR | stat.S_IROTH | stat.S_IRGRP + """Permissions allowing an owner, others, and the group to read a file""" + + PRIVATE_DIR = ( + stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IXOTH | stat.S_IXGRP + ) + """Permissions allowing only owners to read, write and traverse a directory""" + PUBLIC_DIR = ( + stat.S_IRUSR + | stat.S_IWUSR + | stat.S_IXUSR + | stat.S_IROTH + | stat.S_IXOTH + | stat.S_IRGRP + | stat.S_IXGRP + ) + """Permissions allowing non-owners to traverse a directory""" + + 
+@dataclasses.dataclass(frozen=True) +class KeyPair: + """A public and private key pair""" + + public: bytes = dataclasses.field(default=b"") + """The public key""" + + private: bytes = dataclasses.field(default=b"", repr=False) + """The private key""" + + @property + def empty(self) -> bool: + """Return `True` if the KeyPair has no key values set. Useful + for faking the null object pattern""" + return self.public == self.private and len(self.public) == 0 + + +class _KeyLocator: + """Determines the paths to use when persisting a `KeyPair` to disk""" + + def __init__( + self, + root_dir: pathlib.Path, + filename: str, + category: str, + ) -> None: + """Initiailize a `KeyLocator` + + :param root_dir: root path where keys are persisted to disk + :param filename: the stem name of the key file + :param category: the category or use-case for the key (e.g. server) + :param separate_keys: flag indicating if public and private keys should + be persisted in separate, corresponding directories + """ + + # constants for standardized paths. + self._public_subdir = "pub" + """The category subdirectory to use when persisting a public key""" + + self._private_subdir = "priv" + """The category subdirectory to use when persisting a private key""" + + self._public_extension = "key" + """The extension found on public keys""" + + self._private_extension = "key_secret" + """The extension found on private keys""" + + self._key_root_dir = root_dir + """Path to the root directory containing key files""" + + self._filename = filename + """Base name for key files""" + + self._category = category + """Category name used to further separate key locations""" + + @property + def public_dir(self) -> pathlib.Path: + """Target directory for the public key""" + return self.public.parent + + @property + def private_dir(self) -> pathlib.Path: + """Target directory for the private key""" + return self.private.parent + + @property + def public_filename(self) -> str: + """Filename (.) 
of the public key file""" + return f"{self._filename}.{self._public_extension}" + + @property + def private_filename(self) -> str: + """Filename (.) of the private key file""" + return f"{self._filename}.{self._private_extension}" + + @property + def public(self) -> pathlib.Path: + """Full target path of the public key file""" + # combine the root and key type (e.g. /foo/bar + /server) + # then combine the pub/priv key subdir (e.g. /foo/bar/server + /pub) + path = self._key_root_dir / self._category / self._public_subdir + return path / self.public_filename + + @property + def private(self) -> pathlib.Path: + """Full target path of the private key file""" + # combine the root and key type (e.g. /foo/bar + /server) + # then combine the pub/priv key subdir (e.g. /foo/bar/server + /pub) + path = self._key_root_dir / self._category / self._private_subdir + # combine the pub/priv key subdir if necessary (e.g. /foo/bar + /priv) + + return path / self.private_filename + + +class KeyManager: + def __init__( + self, config: Config, as_server: bool = False, as_client: bool = False + ) -> None: + """Initialize a KeyManager instance. 
+ :param config: SmartSim configuration + :param as_server: flag to indicate when executing in the server context; + set to `True` to avoid loading client secret key + :param as_client: flag to indicate when executing in the client context; + set to `True` to avoid loading server secret key + """ + + self._as_server = as_server + """Set to `True` to return keys appropriate for the server context""" + + self._as_client = as_client + """Set to `True` to return keys appropriate for the client context""" + + key_dir = pathlib.Path(config.smartsim_key_path).resolve() + + # Results in key path such as /server/pub/smartsim.key + self._server_locator = _KeyLocator(key_dir, "smartsim", "server") + """The locator for producing the paths to store server key files""" + + # Results in key path such as /client/pub/smartsim.key + self._client_locator = _KeyLocator(key_dir, "smartsim", "client") + """The locator for producing the paths to store client key files""" + + def create_directories(self) -> None: + """Create the subdirectory structure necessary to hold + the public and private key pairs for servers & clients""" + for locator in [self._server_locator, self._client_locator]: + if not locator.public_dir.exists(): + permission = _KeyPermissions.PUBLIC_DIR + logger.debug(f"Creating key dir: {locator.public_dir}, {permission}") + locator.public_dir.mkdir(parents=True, mode=permission) + + if not locator.private_dir.exists(): + permission = _KeyPermissions.PRIVATE_DIR + logger.debug(f"Creating key dir: {locator.private_dir}, {permission}") + locator.private_dir.mkdir(parents=True, mode=permission) + + @classmethod + def _load_keypair(cls, locator: _KeyLocator, in_context: bool) -> KeyPair: + """Load a specific `KeyPair` from disk + + :param locator: a `KeyLocator` that specifies the path to an existing key + :param in_context: Boolean flag indicating if the keypair is the active + context; ensures the public and private keys are both loaded when `True`. 
+ Only the public key is loaded when `False` + :returns: a KeyPair containing the loaded public/private key + """ + # private keys contain public & private key parts + key_path = locator.private if in_context else locator.public + + pub_key: bytes = b"" + priv_key: t.Optional[bytes] = b"" + + if key_path.exists(): + logger.debug(f"Existing key files located at {key_path}") + pub_key, priv_key = zmq.auth.load_certificate(key_path) + else: + logger.debug(f"No key files found at {key_path}") + + # avoid a `None` value in the private key when it isn't loaded + return KeyPair(pub_key, priv_key or b"") + + def _load_keys(self) -> t.Tuple[KeyPair, KeyPair]: + """Use ZMQ auth to load public/private key pairs for the server and client + components from the standard key paths for the associated experiment + + :returns: 2-tuple of `KeyPair` (server_keypair, client_keypair) + ]""" + try: + server_keys = self._load_keypair(self._server_locator, self._as_server) + client_keys = self._load_keypair(self._client_locator, self._as_client) + + return server_keys, client_keys + except (ValueError, OSError): + # expected if no keys could be loaded from disk + logger.warning("Loading key pairs failed.", exc_info=True) + + return KeyPair(), KeyPair() + + @classmethod + def _move_public_key(cls, locator: _KeyLocator) -> None: + """The public and private key pair are created in the same directory. Move + the public key out of the private subdir and into the public subdir + + :param locator: `KeyLocator` that determines the path to the + key pair persisted in the same directory. + """ + new_path = locator.private.with_suffix(locator.public.suffix) + if new_path != locator.public: + logger.debug(f"Moving key file from {locator.public} to {new_path}") + new_path.rename(locator.public) + + def _create_keys(self) -> None: + """Create and persist key files to disk""" + for locator in [self._server_locator, self._client_locator]: + # create keys in the private directory... 
+ zmq.auth.create_certificates(locator.private_dir, locator.private.stem) + + # ...but move the public key out of the private subdirectory + self._move_public_key(locator) + + # and ensure correct r/w/x permissions on each file. + locator.private.chmod(_KeyPermissions.PRIVATE_KEY) + locator.public.chmod(_KeyPermissions.PUBLIC_KEY) + + def get_keys(self, create: bool = True) -> t.Tuple[KeyPair, KeyPair]: + """Use ZMQ auth to generate a public/private key pair for the server + and client components. + + :param no_create: pass `no_create=True` to ensure keys are not + created and only pre-existing keys can be loaded + :returns: 2-tuple of `KeyPair` (server_keypair, client_keypair) + """ + logger.debug(f"Loading keys, creation {'is' if create else 'not'} allowed") + server_keys, client_keys = self._load_keys() + + # check if we received "empty keys" + if not server_keys.empty or not client_keys.empty: + return server_keys, client_keys + + if not create: + # if directed not to create new keys, return "empty keys" + logger.debug("Returning empty key pairs") + return KeyPair(), KeyPair() + + self.create_directories() + self._create_keys() + + # load keys to ensure they were persisted + return self._load_keys() + + @property + def client_keys_dir(self) -> pathlib.Path: + "Return the path to the client public keys directory" + return self._client_locator.public_dir diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py index 69840b838..d4ec66eaf 100644 --- a/smartsim/_core/utils/serialize.py +++ b/smartsim/_core/utils/serialize.py @@ -33,10 +33,8 @@ import smartsim._core._cli.utils as _utils import smartsim.log -from smartsim._core.config import CONFIG if t.TYPE_CHECKING: - from smartsim import Experiment from smartsim._core.control.manifest import LaunchedManifest as _Manifest from smartsim.database.orchestrator import Orchestrator from smartsim.entity import DBNode, Ensemble, Model @@ -54,9 +52,6 @@ def save_launch_manifest(manifest: 
_Manifest[TStepLaunchMetaData]) -> None: - if not CONFIG.telemetry_enabled: - return - manifest.metadata.run_telemetry_subdirectory.mkdir(parents=True, exist_ok=True) exp_out, exp_err = smartsim.log.get_exp_log_paths() @@ -82,7 +77,7 @@ def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: manifest_dict = { "schema info": { "schema_name": "entity manifest", - "version": "0.0.3", + "version": "0.0.4", }, "experiment": { "name": manifest.metadata.exp_name, @@ -228,6 +223,7 @@ def _dictify_db( db_type, _ = db_path.name.split("-", 1) else: db_type = "Unknown" + return { "name": db.name, "type": db_type, @@ -238,6 +234,17 @@ def _dictify_db( "conf_file": shard.cluster_conf_file, "out_file": out_file, "err_file": err_file, + "memory_file": ( + str(status_dir / "memory.csv") if db.telemetry.is_enabled else "" + ), + "client_file": ( + str(status_dir / "client.csv") if db.telemetry.is_enabled else "" + ), + "client_count_file": ( + str(status_dir / "client_count.csv") + if db.telemetry.is_enabled + else "" + ), "telemetry_metadata": { "status_dir": str(status_dir), "step_id": step_id, diff --git a/smartsim/_core/utils/telemetry/__init__.py b/smartsim/_core/utils/telemetry/__init__.py new file mode 100644 index 000000000..efe03908e --- /dev/null +++ b/smartsim/_core/utils/telemetry/__init__.py @@ -0,0 +1,25 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/smartsim/_core/utils/telemetry/collector.py b/smartsim/_core/utils/telemetry/collector.py new file mode 100644 index 000000000..178126dec --- /dev/null +++ b/smartsim/_core/utils/telemetry/collector.py @@ -0,0 +1,482 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024 Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import abc +import asyncio +import collections +import itertools +import logging +import typing as t + +import redis.asyncio as redisa +import redis.exceptions as redisex + +from smartsim._core.control.job import JobEntity +from smartsim._core.utils.helpers import get_ts_ms +from smartsim._core.utils.telemetry.sink import FileSink, Sink + +logger = logging.getLogger("TelemetryMonitor") + + +class Collector(abc.ABC): + """Base class for telemetry collectors. + + A Collector is used to retrieve runtime metrics about an entity.""" + + def __init__(self, entity: JobEntity, sink: Sink) -> None: + """Initialize the collector + + :param entity: entity to collect metrics on + :param sink: destination to write collected information + """ + self._entity = entity + self._sink = sink + self._enabled = True + + @property + def enabled(self) -> bool: + """Boolean indicating if the collector should perform data collection""" + return self._entity.telemetry_on + + @enabled.setter + def enabled(self, value: bool) -> None: + self._entity.telemetry_on = value + + @property + def entity(self) -> JobEntity: + """The `JobEntity` for which data is collected + :return: the entity""" + return self._entity + + @property + def sink(self) -> Sink: + """The sink where collected data is written + :return: the sink + """ + return self._sink + + @abc.abstractmethod + async def prepare(self) -> None: + """Initialization logic for the collector""" + + @abc.abstractmethod + 
async def collect(self) -> None: + """Execute metric collection""" + + @abc.abstractmethod + async def shutdown(self) -> None: + """Execute cleanup of resources for the collector""" + + +class _DBAddress: + """Helper class to hold and pretty-print connection details""" + + def __init__(self, host: str, port: int) -> None: + """Initialize the instance + :param host: host address for database connections + :param port: port number for database connections + """ + self.host = host.strip() if host else "" + self.port = port + self._check() + + def _check(self) -> None: + """Validate input arguments""" + if not self.host: + raise ValueError(f"{type(self).__name__} requires host") + if not self.port: + raise ValueError(f"{type(self).__name__} requires port") + + def __str__(self) -> str: + """Pretty-print the instance""" + return f"{self.host}:{self.port}" + + +class DBCollector(Collector): + """A base class for collectors that retrieve statistics from an orchestrator""" + + def __init__(self, entity: JobEntity, sink: Sink) -> None: + """Initialize the `DBCollector` + + :param entity: entity with metadata about the resource to monitor + :param sink: destination to write collected information + """ + super().__init__(entity, sink) + self._client: t.Optional[redisa.Redis[bytes]] = None + self._address = _DBAddress( + self._entity.config.get("host", ""), + int(self._entity.config.get("port", 0)), + ) + + async def _configure_client(self) -> None: + """Configure the client connection to the target database""" + try: + if not self._client: + self._client = redisa.Redis( + host=self._address.host, port=self._address.port + ) + except Exception as e: + logger.exception(e) + finally: + if not self._client: + logger.error( + f"{type(self).__name__} failed to connect to {self._address}" + ) + + async def prepare(self) -> None: + """Initialization logic for the DB collector. 
Creates a database + connection then executes the `post_prepare` callback function.""" + if self._client: + return + + await self._configure_client() + await self._post_prepare() + + @abc.abstractmethod + async def _post_prepare(self) -> None: + """Hook function to enable subclasses to perform actions + after a db client is ready""" + + @abc.abstractmethod + async def _perform_collection( + self, + ) -> t.Sequence[t.Tuple[t.Union[int, float, str], ...]]: + """Hook function for subclasses to execute custom metric retrieval. + NOTE: all implementations return an iterable of metrics to avoid + adding extraneous base class code to differentiate the results + + :return: an iterable containing individual metric collection results + """ + + async def collect(self) -> None: + """Execute database metric collection if the collector is enabled. Writes + the resulting metrics to the associated output sink. Calling `collect` + when `self.enabled` is `False` performs no actions.""" + if not self.enabled: + # collectors may be disabled by monitoring changes to the + # manifest. 
Leave the collector but do NOT collect + logger.debug(f"{type(self).__name__} is not enabled") + return + + await self.prepare() + if not self._client: + logger.warning(f"{type(self).__name__} cannot collect") + return + + try: + # if we can't communicate w/the db, exit + if not await self._check_db(): + return + + all_metrics = await self._perform_collection() + for metrics in all_metrics: + await self._sink.save(*metrics) + except Exception as ex: + logger.warning(f"Collect failed for {type(self).__name__}", exc_info=ex) + + async def shutdown(self) -> None: + """Execute cleanup of database client connections""" + try: + if self._client: + logger.info( + f"Shutting down {self._entity.name}::{self.__class__.__name__}" + ) + await self._client.close() + self._client = None + except Exception as ex: + logger.error( + f"An error occurred during {type(self).__name__} shutdown", exc_info=ex + ) + + async def _check_db(self) -> bool: + """Check if the target database is reachable. + + :return: `True` if connection succeeds, `False` otherwise. 
+ """ + try: + if self._client: + return await self._client.ping() + except redisex.ConnectionError: + logger.warning(f"Cannot ping db {self._address}") + + return False + + +class DBMemoryCollector(DBCollector): + """A `DBCollector` that collects memory consumption metrics""" + + def __init__(self, entity: JobEntity, sink: Sink) -> None: + super().__init__(entity, sink) + self._columns = ["used_memory", "used_memory_peak", "total_system_memory"] + + async def _post_prepare(self) -> None: + """Write column headers for a CSV formatted output sink after + the database connection is established""" + await self._sink.save("timestamp", *self._columns) + + async def _perform_collection( + self, + ) -> t.Sequence[t.Tuple[int, float, float, float]]: + """Perform memory metric collection and return the results + + :return: an iterable containing individual metric collection results + in the format `(timestamp,used_memory,used_memory_peak,total_system_memory)` + """ + if self._client is None: + return [] + + db_info = await self._client.info("memory") + + used = float(db_info["used_memory"]) + peak = float(db_info["used_memory_peak"]) + total = float(db_info["total_system_memory"]) + + value = (get_ts_ms(), used, peak, total) + + # return a list containing a single record to simplify the parent + # class code to save multiple records from a single collection + return [value] + + +class DBConnectionCollector(DBCollector): + """A `DBCollector` that collects database client-connection metrics""" + + def __init__(self, entity: JobEntity, sink: Sink) -> None: + super().__init__(entity, sink) + self._columns = ["client_id", "address"] + + async def _post_prepare(self) -> None: + """Write column headers for a CSV formatted output sink after + the database connection is established""" + await self._sink.save("timestamp", *self._columns) + + async def _perform_collection( + self, + ) -> t.Sequence[t.Tuple[t.Union[int, str, str], ...]]: + """Perform connection metric collection and 
return the results + + :return: an iterable containing individual metric collection results + in the format `(timestamp,client_id,address)` + """ + if self._client is None: + return [] + + now_ts = get_ts_ms() + clients = await self._client.client_list() + + values: t.List[t.Tuple[int, str, str]] = [] + + # content-filter the metrics and return them all together + for client in clients: + # all records for the request will have the same timestamp + value = now_ts, client["id"], client["addr"] + values.append(value) + + return values + + +class DBConnectionCountCollector(DBCollector): + """A DBCollector that collects aggregated client-connection count metrics""" + + def __init__(self, entity: JobEntity, sink: Sink) -> None: + super().__init__(entity, sink) + self._columns = ["num_clients"] + + async def _post_prepare(self) -> None: + """Write column headers for a CSV formatted output sink after + the database connection is established""" + await self._sink.save("timestamp", *self._columns) + + async def _perform_collection( + self, + ) -> t.Sequence[t.Tuple[int, int]]: + """Perform connection-count metric collection and return the results + + :return: an iterable containing individual metric collection results + in the format `(timestamp,num_clients)` + """ + if self._client is None: + return [] + + client_list = await self._client.client_list() + + addresses = {item["addr"] for item in client_list} + + # return a list containing a single record to simplify the parent + # class code to save multiple records from a single collection + value = (get_ts_ms(), len(addresses)) + return [value] + + +class CollectorManager: + """The `CollectorManager` manages the set of all collectors required to retrieve + metrics for an experiment. It provides the ability to add and remove collectors + with unique configuration per entity. The `CollectorManager` is primarily used + to perform bulk actions on 1-to-many collectors (e.g. 
prepare all collectors, + request metrics for all collectors, close all collector connections)""" + + def __init__(self, timeout_ms: int = 1000) -> None: + """Initialize the `CollectorManager` without collectors + :param timeout_ms: maximum time (in ms) allowed for `Collector.collect` + """ + # A lookup table to hold a list of registered collectors per entity + self._collectors: t.Dict[str, t.List[Collector]] = collections.defaultdict(list) + # Max time to allow a collector to work before cancelling requests + self._timeout_ms = timeout_ms + + def clear(self) -> None: + """Remove all collectors from the monitored set""" + self._collectors = collections.defaultdict(list) + + def add(self, collector: Collector) -> None: + """Add a collector to the monitored set + + :param collector: `Collector` instance to monitor + """ + entity_name = collector.entity.name + + registered_collectors = self._collectors[entity_name] + + # Exit if the collector is already registered to the entity + if any(c for c in registered_collectors if type(c) is type(collector)): + return + + logger.debug(f"Adding collector: {entity_name}::{type(collector).__name__}") + registered_collectors.append(collector) + + def add_all(self, collectors: t.Sequence[Collector]) -> None: + """Add multiple collectors to the monitored set + + :param collectors: a collection of `Collectors` to monitor + """ + for collector in collectors: + self.add(collector) + + async def remove_all(self, entities: t.Sequence[JobEntity]) -> None: + """Remove all collectors registered to the supplied entities + + :param entities: a collection of `JobEntity` instances that will + no longer have registered collectors + """ + if not entities: + return + + tasks = (self.remove(entity) for entity in entities) + await asyncio.gather(*tasks) + + async def remove(self, entity: JobEntity) -> None: + """Remove all collectors registered to the supplied entity + + :param entities: `JobEntity` that will no longer have registered collectors + 
""" + registered = self._collectors.pop(entity.name, []) + if not registered: + return + + logger.debug(f"Removing collectors registered for {entity.name}") + asyncio.gather(*(collector.shutdown() for collector in registered)) + + async def prepare(self) -> None: + """Prepare registered collectors to perform collection""" + tasks = (collector.prepare() for collector in self.all_collectors) + # use gather so all collectors are prepared before collection + await asyncio.gather(*tasks) + + async def collect(self) -> None: + """Perform collection for all registered collectors""" + if collectors := self.all_collectors: + tasks = [asyncio.create_task(item.collect()) for item in collectors] + + _, pending = await asyncio.wait(tasks, timeout=self._timeout_ms / 1000.0) + + # any tasks still pending has exceeded the timeout + if pending: + # manually cancel tasks since asyncio.wait will not + for remaining_task in pending: + remaining_task.cancel() + logger.debug(f"Execution of {len(pending)} collectors timed out.") + + async def shutdown(self) -> None: + """Release resources for all registered collectors""" + logger.debug(f"{type(self).__name__} shutting down collectors...") + if list(self.all_collectors): + shutdown_tasks = [] + # create an async tasks to execute all shutdowns in parallel + for item in self.all_collectors: + shutdown_tasks.append(asyncio.create_task(item.shutdown())) + # await until all shutdowns are complete + await asyncio.wait(shutdown_tasks) + logger.debug("Collector shutdown complete...") + + @property + def all_collectors(self) -> t.Sequence[Collector]: + """Get a list of all registered collectors + + :return: a collection of registered collectors for all entities + """ + # flatten and return all the lists-of-collectors that are registered + collectors = itertools.chain.from_iterable(self._collectors.values()) + return [collector for collector in collectors if collector.enabled] + + @property + def dead_collectors(self) -> t.Sequence[Collector]: + 
"""Get a list of all disabled collectors + + :return: a collection of disabled collectors for all entities + """ + collectors = itertools.chain.from_iterable(self._collectors.values()) + return [collector for collector in collectors if not collector.enabled] + + def register_collectors(self, entity: JobEntity) -> None: + """Find all configured collectors for the entity and register them + + :param entity: a `JobEntity` instance that will have all configured collectors + registered for collection. Configuration is found in the `RuntimeManifest` + """ + collectors: t.List[Collector] = [] + + # ONLY db telemetry is implemented at this time. This resolver must + # be updated when non-database or always-on collectors are introduced + if entity.is_db and entity.telemetry_on: + if mem_out := entity.collectors.get("memory", None): + collectors.append(DBMemoryCollector(entity, FileSink(mem_out))) + + if con_out := entity.collectors.get("client", None): + collectors.append(DBConnectionCollector(entity, FileSink(con_out))) + + if num_out := entity.collectors.get("client_count", None): + collectors.append(DBConnectionCountCollector(entity, FileSink(num_out))) + else: + logger.debug(f"Collectors disabled for db {entity.name}") + + self.add_all(collectors) + + def register_all_collectors(self, entities: t.Sequence[JobEntity]) -> None: + """Find all configured collectors for the entity and register them + + :param entities: entities to call `register_collectors` for + """ + for entity in entities: + self.register_collectors(entity) diff --git a/smartsim/_core/utils/telemetry/manifest.py b/smartsim/_core/utils/telemetry/manifest.py new file mode 100644 index 000000000..942fa4ae8 --- /dev/null +++ b/smartsim/_core/utils/telemetry/manifest.py @@ -0,0 +1,242 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024 Hewlett Packard Enterprise +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import json
import logging
import pathlib
import time
import typing as t
from dataclasses import dataclass, field

from smartsim._core.control.job import JobEntity

logger = logging.getLogger("TelemetryMonitor")


@dataclass
class Run:
    """
    A Run contains the collection of entities created when a `SmartSim`
    driver script executes `Experiment.start`"""

    timestamp: int
    """the timestamp at the time the `Experiment.start` is called"""
    models: t.List[JobEntity]
    """models started in this run"""
    orchestrators: t.List[JobEntity]
    """orchestrators started in this run"""
    ensembles: t.List[JobEntity]
    """ensembles started in this run"""

    def flatten(
        self, filter_fn: t.Optional[t.Callable[[JobEntity], bool]] = None
    ) -> t.Sequence[JobEntity]:
        """Flatten all `JobEntity`'s in the `Run` into a 1-dimensional list

        :param filter_fn: optional boolean filter that returns
        True for entities to include in the result
        :return: all entities in the run, optionally filtered
        """
        entities = self.models + self.orchestrators + self.ensembles
        if filter_fn:
            entities = [entity for entity in entities if filter_fn(entity)]
        return entities

    @staticmethod
    def load_entity(
        entity_type: str,
        entity_dict: t.Dict[str, t.Any],
        exp_dir: pathlib.Path,
        raw_experiment: t.Dict[str, t.Any],
    ) -> t.List[JobEntity]:
        """Map entity data persisted in a manifest file to an object

        :param entity_type: type of the associated `SmartSimEntity`
        :param entity_dict: raw dictionary deserialized from entity in manifest JSON
        :param exp_dir: root path to experiment outputs
        :param raw_experiment: raw experiment deserialized from manifest JSON
        :return: list of loaded `JobEntity` instances
        """
        entities = []

        # an entity w/parent keys must create entities for the items that it
        # comprises. Traverse the children and create each entity
        parent_keys = {"shards", "models"}
        parent_keys = parent_keys.intersection(entity_dict.keys())
        if parent_keys:
            container = "shards" if "shards" in parent_keys else "models"
            child_type = "orchestrator" if container == "shards" else "model"
            for child_entity in entity_dict[container]:
                entity = JobEntity.from_manifest(
                    child_type, child_entity, str(exp_dir), raw_experiment
                )
                entities.append(entity)

            return entities

        # not a parent type, just create the entity w/the entity_type passed in
        entity = JobEntity.from_manifest(
            entity_type, entity_dict, str(exp_dir), raw_experiment
        )
        entities.append(entity)
        return entities

    @staticmethod
    def load_entities(
        entity_type: str,
        run: t.Dict[str, t.Any],
        exp_dir: pathlib.Path,
        raw_experiment: t.Dict[str, t.Any],
    ) -> t.Dict[str, t.List[JobEntity]]:
        """Map a collection of entity data persisted in a manifest file to an object

        :param entity_type: type of the associated `SmartSimEntity`
        :param run: raw dictionary containing `Run` data deserialized from JSON
        :param exp_dir: root path to experiment outputs
        :param raw_experiment: raw experiment deserialized from manifest JSON
        :return: mapping of entity type to a list of loaded `JobEntity` instances
        """
        persisted: t.Dict[str, t.List[JobEntity]] = {
            "model": [],
            "orchestrator": [],
        }
        for item in run[entity_type]:
            entities = Run.load_entity(entity_type, item, exp_dir, raw_experiment)
            for new_entity in entities:
                # use setdefault so unexpected entity types (e.g. ensemble)
                # are collected instead of raising KeyError
                persisted.setdefault(new_entity.type, []).append(new_entity)

        return persisted

    @staticmethod
    def load_run(
        raw_run: t.Dict[str, t.Any],
        exp_dir: pathlib.Path,
        raw_experiment: t.Dict[str, t.Any],
    ) -> "Run":
        """Map run data persisted in a manifest file to an object

        :param raw_run: raw dictionary containing `Run` data deserialized from JSON
        :param exp_dir: root path to experiment outputs
        :param raw_experiment: raw experiment deserialized from manifest JSON
        :return: populated `Run` instance
        """

        # create an output mapping to hold the deserialized entities
        run_entities: t.Dict[str, t.List[JobEntity]] = {
            "model": [],
            "orchestrator": [],
            "ensemble": [],
        }

        # use the output mapping keys to load all the target
        # entities from the deserialized JSON
        for entity_type in run_entities:
            _entities = Run.load_entities(entity_type, raw_run, exp_dir, raw_experiment)

            # load_entities may return a mapping containing types different from
            # entity_type IF it was a parent entity. Iterate through the keys in
            # the output dictionary and put them in the right place.
            # NOTE: a distinct loop variable is used to avoid shadowing the
            # outer `entity_type` loop variable
            for parsed_type, new_entities in _entities.items():
                if not new_entities:
                    continue
                run_entities[parsed_type].extend(new_entities)

        loaded_run = Run(
            raw_run["timestamp"],
            run_entities["model"],
            run_entities["orchestrator"],
            run_entities["ensemble"],
        )
        return loaded_run


@dataclass
class RuntimeManifest:
    """The runtime manifest holds information about the entities created
    at runtime during a SmartSim Experiment. The runtime manifest differs
    from a standard manifest - it may contain multiple experiment
    executions in a `runs` collection and holds information that is unknown
    at design-time, such as IP addresses of host machines.
    """

    name: str
    """The name of the `Experiment` associated to the `RuntimeManifest`"""
    path: pathlib.Path
    """The path to the `Experiment` working directory"""
    launcher: str
    """The launcher type used by the `Experiment`"""
    runs: t.List[Run] = field(default_factory=list)
    """A `List` of 0 to many `Run` instances"""

    @staticmethod
    def load_manifest(file_path: str) -> t.Optional["RuntimeManifest"]:
        """Load a persisted manifest and return the content

        :param file_path: path to the manifest file to load
        :return: deserialized `RuntimeManifest` if the manifest file is found,
        otherwise None
        :raises ValueError: if the manifest is missing the experiment or runs
        """
        manifest_dict: t.Optional[t.Dict[str, t.Any]] = None
        try_count, max_attempts = 1, 5

        # allow multiple read attempts in case the manifest is being
        # written at the time load_manifest is called
        while manifest_dict is None and try_count <= max_attempts:
            source = pathlib.Path(file_path)
            source = source.resolve()
            time.sleep(0.01)  # a tiny sleep avoids reading partially written json

            try:
                if text := source.read_text(encoding="utf-8").strip():
                    manifest_dict = json.loads(text)
            except json.JSONDecodeError as ex:
                # use the module logger instead of print for error reporting
                logger.error(f"Error loading manifest: {ex}")
                # hack/fix: handle issues reading file before it is fully written
                time.sleep(0.1 * try_count)
            finally:
                try_count += 1

        if not manifest_dict:
            return None

        # if we don't have an experiment, the manifest is malformed
        exp = manifest_dict.get("experiment", None)
        if not exp:
            raise ValueError("Manifest missing required experiment")

        # if we don't have runs, the manifest is malformed
        runs = manifest_dict.get("runs", None)
        if runs is None:
            raise ValueError("Manifest missing required runs")

        exp_dir = pathlib.Path(exp["path"])
        runs = [Run.load_run(raw_run, exp_dir, exp) for raw_run in runs]

        manifest = RuntimeManifest(
            name=exp["name"],
            path=exp_dir,
            launcher=exp["launcher"],
            runs=runs,
        )
        return manifest
import abc
import logging
import pathlib
import typing as t

logger = logging.getLogger("TelemetryMonitor")


class Sink(abc.ABC):
    """Base class for output sinks. Represents a durable, append-only
    storage mechanism for collected telemetry"""

    @abc.abstractmethod
    async def save(self, *args: t.Any) -> None:
        """Save the args passed to this method to the underlying sink

        :param args: variadic list of values to save
        """


class FileSink(Sink):
    """Telemetry sink that writes to a file"""

    def __init__(self, path: str) -> None:
        """Initialize the FileSink

        :param path: path to a file backing this `Sink`
        :raises ValueError: if an empty path is provided
        """
        super().__init__()
        self._check_init(path)
        self._path = pathlib.Path(path)

    @staticmethod
    def _check_init(filename: str) -> None:
        """Validate initialization arguments and raise a ValueError
        if an invalid filename is passed

        :param filename: path to a file backing this `Sink`
        :raises ValueError: if an empty filename is provided
        """
        if not filename:
            raise ValueError("No filename provided to FileSink")

    @property
    def path(self) -> pathlib.Path:
        """The path to the file this FileSink writes

        :return: path to a file backing this `Sink`
        """
        return self._path

    async def save(self, *args: t.Any) -> None:
        """Append the values to the file as one comma-separated line

        :param args: variadic list of values to save
        """
        # create parent directories lazily so the sink can be constructed
        # before the telemetry output directory exists
        self._path.parent.mkdir(parents=True, exist_ok=True)

        with open(self._path, "a+", encoding="utf-8") as sink_fp:
            values = ",".join(map(str, args)) + "\n"
            sink_fp.write(values)
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import asyncio +import json +import logging +import os +import pathlib +import threading +import typing as t + +from watchdog.events import ( + FileSystemEvent, + LoggingEventHandler, + PatternMatchingEventHandler, +) +from watchdog.observers import Observer +from watchdog.observers.api import BaseObserver + +from smartsim._core.config import CONFIG +from smartsim._core.control.job import JobEntity, _JobKey +from smartsim._core.control.jobmanager import JobManager +from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher +from smartsim._core.launcher.launcher import Launcher +from smartsim._core.launcher.local.local import LocalLauncher +from smartsim._core.launcher.lsf.lsfLauncher import LSFLauncher +from smartsim._core.launcher.pbs.pbsLauncher import PBSLauncher +from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher +from smartsim._core.launcher.stepInfo import StepInfo +from smartsim._core.utils.helpers import get_ts_ms +from 
smartsim._core.utils.serialize import MANIFEST_FILENAME +from smartsim._core.utils.telemetry.collector import CollectorManager +from smartsim._core.utils.telemetry.manifest import Run, RuntimeManifest +from smartsim._core.utils.telemetry.util import map_return_code, write_event +from smartsim.error.errors import SmartSimError +from smartsim.status import TERMINAL_STATUSES + +logger = logging.getLogger("TelemetryMonitor") + + +class ManifestEventHandler(PatternMatchingEventHandler): + """The ManifestEventHandler monitors an experiment and updates a + datastore as needed. This event handler is triggered by changes to + the experiment manifest written to physical disk by a driver. + + It also contains an event loop. The loop checks experiment entities for updates + at each timestep and executes a configurable set of metrics collectors.""" + + def __init__( + self, + pattern: str, + ignore_patterns: t.Optional[t.List[str]] = None, + ignore_directories: bool = True, + case_sensitive: bool = False, + timeout_ms: int = 1000, + ) -> None: + """Initialize the manifest event handler + + :param pattern: a pattern that identifies the files whose + events are of interest by matching their name + :param ignore_patterns: a pattern that identifies the files whose + events should be ignored + :param ignore_directories: set to `True` to avoid directory events + :param case_sensitive: set to `True` to require case sensitivity in + resource names in order to match input patterns + :param timeout_ms: maximum duration (in ms) of a call to the event + loop prior to cancelling tasks + """ + super().__init__( + [pattern], ignore_patterns, ignore_directories, case_sensitive + ) # type: ignore + self._tracked_runs: t.Dict[int, Run] = {} + self._tracked_jobs: t.Dict[_JobKey, JobEntity] = {} + self._completed_jobs: t.Dict[_JobKey, JobEntity] = {} + self._launcher: t.Optional[Launcher] = None + self.job_manager: JobManager = JobManager(threading.RLock()) + self._launcher_map: t.Dict[str, 
t.Type[Launcher]] = { + "slurm": SlurmLauncher, + "pbs": PBSLauncher, + "lsf": LSFLauncher, + "local": LocalLauncher, + "dragon": DragonLauncher, + } + self._collector_mgr = CollectorManager(timeout_ms) + + @property + def tracked_jobs(self) -> t.Sequence[JobEntity]: + """The collection of `JobEntity` that are actively being monitored + + :return: the collection + """ + return list(self._tracked_jobs.values()) + + def init_launcher(self, launcher: str) -> None: + """Initialize the controller with a specific type of launcher. + SmartSim currently supports Slurm, PBS(Pro), LSF, Dragon + and local launching + + :param launcher: the name of the workload manager used by the experiment + :raises ValueError: if a string is passed that is not + a supported launcher + :raises TypeError: if no launcher argument is provided. + """ + if not launcher: + raise TypeError("Must provide a 'launcher' argument") + + if launcher_type := self._launcher_map.get(launcher.lower(), None): + self._launcher = launcher_type() + return + + raise ValueError("Launcher type not supported: " + launcher) + + def init_job_manager(self) -> None: + """Initialize the job manager instance""" + if not self._launcher: + raise TypeError("self._launcher must be initialized") + + self.job_manager.set_launcher(self._launcher) + self.job_manager.start() + + def set_launcher(self, launcher_type: str) -> None: + """Set the launcher for the experiment + :param launcher_type: the name of the workload manager used by the experiment + """ + self.init_launcher(launcher_type) + + if self._launcher is None: + raise SmartSimError("Launcher init failed") + + self.job_manager.set_launcher(self._launcher) + self.job_manager.start() + + def process_manifest(self, manifest_path: str) -> None: + """Read the manifest for the experiment. 
Process the + `RuntimeManifest` by updating the set of tracked jobs + and registered collectors + + :param manifest_path: full path to the manifest file + """ + try: + # it is possible to read the manifest prior to a completed + # write due to no access locking mechanism. log the issue + # and continue. it will retry on the next event loop iteration + manifest = RuntimeManifest.load_manifest(manifest_path) + if not manifest: + logger.debug("No manifest file exists") + return + except json.JSONDecodeError: + logger.error(f"Malformed manifest encountered: {manifest_path}") + return + except ValueError: + logger.error("Manifest content error", exc_info=True) + return + + if self._launcher is None: + self.set_launcher(manifest.launcher) + + if not self._launcher: + raise SmartSimError(f"Unable to set launcher from {manifest_path}") + + # filter out previously added items + runs = [run for run in manifest.runs if run.timestamp not in self._tracked_runs] + + # manifest is stored at /.smartsim/telemetry/manifest.json + exp_dir = pathlib.Path(manifest_path).parent.parent.parent + + for run in runs: + for entity in run.flatten( + filter_fn=lambda e: e.key not in self._tracked_jobs + ): + entity.path = str(exp_dir) + + # track everything coming in (managed and unmanaged) + self._tracked_jobs[entity.key] = entity + + # register collectors for new entities as needed + if entity.telemetry_on: + self._collector_mgr.register_collectors(entity) + + # persist a `start` event for each new entity in the manifest + write_event( + run.timestamp, + entity.task_id, + entity.step_id, + entity.type, + "start", + pathlib.Path(entity.status_dir), + ) + + if entity.is_managed: + # Tell JobManager the task is unmanaged. 
This collects + # status updates but does not try to start a new copy + self.job_manager.add_job( + entity.name, + entity.step_id, + entity, + False, + ) + # Tell the launcher it's managed so it doesn't attempt + # to look for a PID that may no longer exist + self._launcher.step_mapping.add( + entity.name, entity.step_id, "", True + ) + self._tracked_runs[run.timestamp] = run + + def on_modified(self, event: FileSystemEvent) -> None: + """Event handler for when a file or directory is modified. + + :param event: event representing file/directory modification. + """ + super().on_modified(event) + logger.debug(f"Processing manifest modified @ {event.src_path}") + self.process_manifest(event.src_path) + + def on_created(self, event: FileSystemEvent) -> None: + """Event handler for when a file or directory is created. + + :param event: event representing file/directory creation. + """ + super().on_created(event) + logger.debug(f"processing manifest created @ {event.src_path}") + self.process_manifest(event.src_path) + + async def _to_completed( + self, + timestamp: int, + entity: JobEntity, + step_info: StepInfo, + ) -> None: + """Move a monitored entity from the active to completed collection to + stop monitoring for updates during timesteps. 
+ + :param timestamp: current timestamp for event logging + :param entity: running SmartSim Job + :param step_info: `StepInfo` received when requesting a Job status update + """ + # remember completed entities to ignore them after manifest updates + inactive_entity = self._tracked_jobs.pop(entity.key) + if entity.key not in self._completed_jobs: + self._completed_jobs[entity.key] = inactive_entity + + # remove all the registered collectors for the completed entity + await self._collector_mgr.remove(entity) + + job = self.job_manager[entity.name] + self.job_manager.move_to_completed(job) + + status_clause = f"status: {step_info.status}" + error_clause = f", error: {step_info.error}" if step_info.error else "" + + write_path = pathlib.Path(entity.status_dir) + + # persist a `stop` event for an entity that has completed + write_event( + timestamp, + entity.task_id, + entity.step_id, + entity.type, + "stop", + write_path, + detail=f"{status_clause}{error_clause}", + return_code=map_return_code(step_info), + ) + + async def on_timestep(self, timestamp: int) -> None: + """Called at polling frequency to request status updates on + monitored entities + + :param timestamp: current timestamp for event logging + """ + if not self._launcher: + return + + await self._collector_mgr.collect() + + # ensure unmanaged jobs move out of tracked jobs list + u_jobs = [job for job in self._tracked_jobs.values() if not job.is_managed] + for job in u_jobs: + job.check_completion_status() + if job.is_complete: + completed_entity = self._tracked_jobs.pop(job.key) + self._completed_jobs[job.key] = completed_entity + + # consider not using name to avoid collisions + m_jobs = [job for job in self._tracked_jobs.values() if job.is_managed] + if names := {entity.name: entity for entity in m_jobs}: + step_updates: t.List[t.Tuple[str, t.Optional[StepInfo]]] = [] + + try: + task_names = list(names.keys()) + updates = self._launcher.get_step_update(task_names) + step_updates.extend(updates) + 
logger.debug(f"Retrieved updates for: {task_names}") + except Exception: + logger.warning(f"Telemetry step updates failed for {names.keys()}") + + try: + for step_name, step_info in step_updates: + if step_info and step_info.status in TERMINAL_STATUSES: + completed_entity = names[step_name] + await self._to_completed(timestamp, completed_entity, step_info) + except Exception as ex: + msg = f"An error occurred getting step updates on {names}" + logger.error(msg, exc_info=ex) + + async def shutdown(self) -> None: + """Release all resources owned by the `ManifestEventHandler`""" + logger.debug(f"{type(self).__name__} shutting down...") + await self._collector_mgr.shutdown() + logger.debug(f"{type(self).__name__} shutdown complete...") + + +class TelemetryMonitorArgs: + """Strongly typed entity to house logic for validating + configuration passed to the telemetry monitor""" + + def __init__( + self, + exp_dir: str, + frequency: int, + cooldown: int, + log_level: int = logging.DEBUG, + ) -> None: + """Initialize the instance with inputs and defaults + + :param exp_dir: root path to experiment outputs + :param frequency: desired frequency of metric & status updates (in seconds) + :param frequency: cooldown period (in seconds) before automatic shutdown + :param log_level: log level to apply to python logging + """ + self.exp_dir: str = exp_dir + self.frequency: int = frequency # freq in seconds + self.cooldown: int = cooldown # cooldown in seconds + self.log_level: int = log_level + self._validate() + + @property + def min_frequency(self) -> int: + """The minimum duration (in seconds) for the monitoring loop to wait + between executions of the monitoring loop. Shorter frequencies may + not allow the monitoring loop to complete. 
Adjusting the minimum frequency + can result in inconsistent or missing outputs due to the telemetry + monitor cancelling processes that exceed the allotted frequency.""" + return 1 + + @property + def max_frequency(self) -> int: + """The maximum duration (in seconds) for the monitoring loop to wait + between executions of the monitoring loop. Longer frequencies potentially + keep the telemetry monitor alive unnecessarily.""" + return 600 + + @property + def min_cooldown(self) -> int: + """The minimum allowed cooldown period that can be configured. Ensures + the cooldown does not cause the telemetry monitor to shutdown prior to + completing a single pass through the monitoring loop""" + return min(self.frequency + 1, self.cooldown) + + @property + def max_cooldown(self) -> int: + """The maximum allowed cooldown period that can be configured. Ensures the + telemetry monitor can automatically shutdown if not needed""" + return self.max_frequency + + @property + def cooldown_ms(self) -> int: + """The duration of the time period (in ms) the telemetry monitor will + wait for new resources to monitor before shutting down""" + return self.cooldown * 1000 + + @property + def frequency_ms(self) -> int: + """The desired frequency (in ms) of the telemetry monitor attempts + to retrieve status updates and metrics""" + return self.frequency * 1000 + + def _check_exp_dir(self) -> None: + """Validate the existence of the experiment directory""" + if not pathlib.Path(self.exp_dir).exists(): + raise ValueError(f"Experiment directory cannot be found: {self.exp_dir}") + + def _check_frequency(self) -> None: + """Validate the frequency input is in the range + [`min_frequency`, `max_frequency`]""" + if self.max_frequency >= self.frequency >= self.min_frequency: + return + + freq_tpl = "Telemetry collection frequency must be in the range [{0}, {1}]" + raise ValueError(freq_tpl.format(self.min_frequency, self.max_frequency)) + + def _check_log_level(self) -> None: + """Validate the 
frequency log level input. Uses standard python log levels""" + if self.log_level not in [ + logging.DEBUG, + logging.INFO, + logging.WARNING, + logging.ERROR, + ]: + raise ValueError(f"Invalid log_level supplied: {self.log_level}") + + def _validate(self) -> None: + """Execute all validation functions""" + self._check_exp_dir() + self._check_frequency() + self._check_log_level() + + +class TelemetryMonitor: + """The telemetry monitor is a standalone process managed by SmartSim to perform + long-term retrieval of experiment status updates and resource usage + metrics. Note that a non-blocking driver script is likely to complete before + the SmartSim entities complete. Also, the JobManager performs status updates + only as long as the driver is running. This telemetry monitor entrypoint is + started automatically when a SmartSim experiment calls the `start` method + on resources. The entrypoint runs until it has no resources to monitor.""" + + def __init__(self, telemetry_monitor_args: TelemetryMonitorArgs): + """Initialize the telemetry monitor instance + + :param telemetry_monitor_args: configuration for the telemetry monitor + """ + self._observer: BaseObserver = Observer() + """an observer object that triggers the action handler""" + self._args = telemetry_monitor_args + """user-supplied arguments configuring telemetry monitor behavior""" + self._experiment_dir = pathlib.Path(self._args.exp_dir) + """path to the root directory where experiment outputs are written""" + self._telemetry_path = self._experiment_dir / CONFIG.telemetry_subdir + """path to the root directory where telemetry outputs are written""" + self._manifest_path = self._telemetry_path / MANIFEST_FILENAME + """path to the runtime manifest file""" + self._action_handler: t.Optional[ManifestEventHandler] = None + """an event listener holding action handlers for manifest on-change events""" + + def _can_shutdown(self) -> bool: + """Determines if the telemetry monitor can perform shutdown. 
An + automatic shutdown will occur if there are no active jobs being monitored. + Managed jobs and databases are considered separately due to the way they + are stored in the job manager + + :return: return True if capable of automatically shutting down + """ + managed_jobs = ( + list(self._action_handler.job_manager.jobs.values()) + if self._action_handler + else [] + ) + unmanaged_jobs = ( + list(self._action_handler.tracked_jobs) if self._action_handler else [] + ) + # get an individual count of databases for logging + n_dbs: int = len( + [ + job + for job in managed_jobs + unmanaged_jobs + if isinstance(job, JobEntity) and job.is_db + ] + ) + + # if we have no jobs currently being monitored we can shutdown + n_jobs = len(managed_jobs) + len(unmanaged_jobs) - n_dbs + shutdown_ok = n_jobs + n_dbs == 0 + + logger.debug(f"{n_jobs} active job(s), {n_dbs} active db(s)") + return shutdown_ok + + async def monitor(self) -> None: + """The main monitoring loop. Executes a busy wait and triggers + telemetry collectors using frequency from constructor arguments. + Continue monitoring until it satisfies automatic shutdown criteria.""" + elapsed: int = 0 + last_ts: int = get_ts_ms() + shutdown_in_progress = False + + if self._action_handler is None: + raise ValueError("The action handler must be initialized to monitor") + + # Event loop runs until the observer shuts down or + # an automatic shutdown is started. + while self._observer.is_alive() and not shutdown_in_progress: + duration_ms = 0 + start_ts = get_ts_ms() + await self._action_handler.on_timestep(start_ts) + + elapsed += start_ts - last_ts + last_ts = start_ts + + # check if there are no jobs being monitored + if self._can_shutdown(): + # cooldown period begins accumulating when no entities are monitored + if elapsed >= self._args.cooldown_ms: + shutdown_in_progress = True + logger.info("Cooldown complete. 
Beginning shutdown") + await self._action_handler.shutdown() + logger.debug("Beginning file monitor shutdown") + self._observer.stop() # type: ignore + logger.debug("Event loop shutdown complete") + break + else: + # reset cooldown any time jobs are running + elapsed = 0 + + # track time elapsed to execute metric collection + duration_ms = get_ts_ms() - start_ts + wait_ms = max(self._args.frequency_ms - duration_ms, 0) + + # delay next loop if collection time didn't exceed loop frequency + wait_sec = wait_ms / 1000 # convert to seconds for sleep + if elapsed > 0: + completion_pct = elapsed / self._args.cooldown_ms * 100 + logger.info(f"Cooldown {completion_pct:.2f}% complete") + logger.debug(f"Collection in {wait_sec:.2f}s") + await asyncio.sleep(wait_sec) + + logger.info("Exiting telemetry monitor event loop") + + async def run(self) -> int: + """Setup the monitoring entities and start the timer-based loop that + will poll for telemetry data + + :return: return code for the process + """ + logger.info("Executing telemetry monitor") + logger.info(f"Polling frequency: {self._args.frequency}s") + logger.info(f"Experiment directory: {self._experiment_dir}") + logger.info(f"Telemetry output: {self._telemetry_path}") + + # Convert second-based inputs to milliseconds + frequency_ms = int(self._args.frequency * 1000) + + # Create event handlers to trigger when target files are changed + log_handler = LoggingEventHandler(logger) + self._action_handler = ManifestEventHandler( + str(MANIFEST_FILENAME), + timeout_ms=frequency_ms, + ignore_patterns=["*.out", "*.err"], + ) + + try: + # The manifest may not exist when the telemetry monitor starts + if self._manifest_path.exists(): + self._action_handler.process_manifest(str(self._manifest_path)) + + # Add a handler to log file-system events + self._observer.schedule(log_handler, self._telemetry_path) # type:ignore + # Add a handler to perform actions on file-system events + self._observer.schedule( + self._action_handler, 
self._telemetry_path + ) # type:ignore + self._observer.start() # type: ignore + + # kick off the 'infinite' monitoring loop + await self.monitor() + return os.EX_OK + except Exception as ex: + logger.error(ex) + finally: + await self._action_handler.shutdown() + self.cleanup() + logger.info("Telemetry monitor shutdown complete") + + return os.EX_SOFTWARE + + def cleanup(self) -> None: + """Perform cleanup for all allocated resources""" + if self._observer is not None and self._observer.is_alive(): + logger.debug("Cleaning up manifest observer") + self._observer.stop() # type: ignore + self._observer.join() diff --git a/smartsim/_core/utils/telemetry/util.py b/smartsim/_core/utils/telemetry/util.py new file mode 100644 index 000000000..2c51d9600 --- /dev/null +++ b/smartsim/_core/utils/telemetry/util.py @@ -0,0 +1,113 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024 Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# import asyncio +import json +import logging +import os +import pathlib +import typing as t + +from smartsim._core.launcher.stepInfo import StepInfo +from smartsim.status import TERMINAL_STATUSES, SmartSimStatus + +_EventClass = t.Literal["start", "stop", "timestep"] + +logger = logging.getLogger("TelemetryMonitor") + + +def write_event( + timestamp: int, + task_id: t.Union[int, str], + step_id: str, + entity_type: str, + event_type: _EventClass, + status_dir: pathlib.Path, + detail: str = "", + return_code: t.Optional[int] = None, +) -> None: + """Write a record to durable storage for a SmartSimEntity lifecycle event. + Does not overwrite existing records. + + :param timestamp: when the event occurred + :param task_id: the task_id of a managed task + :param step_id: the step_id of an unmanaged task + :param entity_type: the SmartSimEntity subtype + (e.g. `orchestrator`, `ensemble`, `model`, `dbnode`, ...) 
+ :param event_type: the event subtype + :param status_dir: path where the SmartSimEntity outputs are written + :param detail: (optional) additional information to write with the event + :param return_code: (optional) the return code of a completed task + """ + tgt_path = status_dir / f"{event_type}.json" + tgt_path.parent.mkdir(parents=True, exist_ok=True) + + try: + if task_id: + task_id = int(task_id) + except ValueError: + if not isinstance(task_id, str): + logger.exception(f"Unable to parse task_id: {task_id}") + + entity_dict = { + "timestamp": timestamp, + "job_id": task_id, + "step_id": step_id, + "type": entity_type, + "action": event_type, + } + + if detail is not None: + entity_dict["detail"] = detail + + if return_code is not None: + entity_dict["return_code"] = return_code + + try: + if not tgt_path.exists(): + # Don't overwrite existing tracking files + bytes_written = tgt_path.write_text(json.dumps(entity_dict, indent=2)) + if bytes_written < 1: + logger.warning("event tracking failed to write tracking file.") + except Exception: + logger.error("Unable to write tracking file.", exc_info=True) + + +def map_return_code(step_info: StepInfo) -> t.Optional[int]: + """Converts a return code from a workload manager into a SmartSim status. + + A non-terminal status is converted to null. This indicates + that the process referenced in the `StepInfo` is running + and does not yet have a return code. 
+ + :param step_info: step information produced by job manager status update queries + :return: a return code if the step is finished, otherwise None + """ + rc_map = {s: 1 for s in TERMINAL_STATUSES} # return `1` for all terminal statuses + rc_map.update( + {SmartSimStatus.STATUS_COMPLETED: os.EX_OK} + ) # return `0` for full success + + return rc_map.get(step_info.status, None) # return `None` when in-progress diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 431cb43c5..f6ce0310f 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -23,7 +23,11 @@ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# pylint: disable=too-many-lines + import itertools +import os.path as osp import sys import typing as t from os import environ, getcwd, getenv @@ -37,8 +41,13 @@ from .._core.utils import db_is_active from .._core.utils.helpers import is_valid_cmd, unpack_db_identifier from .._core.utils.network import get_ip_from_host -from ..entity import DBNode, EntityList -from ..error import SmartSimError, SSConfigError, SSUnsupportedError +from ..entity import DBNode, EntityList, TelemetryConfiguration +from ..error import ( + SmartSimError, + SSConfigError, + SSDBFilesNotParseable, + SSUnsupportedError, +) from ..log import get_logger from ..servertype import CLUSTERED, STANDALONE from ..settings import ( @@ -60,6 +69,7 @@ logger = get_logger(__name__) by_launcher: t.Dict[str, t.List[str]] = { + "dragon": [""], "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], "pals": ["mpiexec"], @@ -71,7 +81,7 @@ def _detect_command(launcher: str) -> str: if launcher in by_launcher: for cmd in by_launcher[launcher]: - if launcher == "local": + if launcher in ["local", "dragon"]: return cmd if is_valid_cmd(cmd): 
return cmd @@ -105,10 +115,15 @@ def _check_run_command(launcher: str, run_command: str) -> None: raise SmartSimError(msg) -def _get_single_command(run_command: str, batch: bool, single_cmd: bool) -> bool: +def _get_single_command( + run_command: str, launcher: str, batch: bool, single_cmd: bool +) -> bool: if not single_cmd: return single_cmd + if launcher == "dragon": + return False + if run_command == "srun" and getenv("SLURM_HET_SIZE") is not None: msg = ( "srun can not launch an orchestrator with single_cmd=True in " @@ -138,6 +153,7 @@ def _check_local_constraints(launcher: str, batch: bool) -> None: raise SmartSimError(msg) +# pylint: disable-next=too-many-public-methods class Orchestrator(EntityList[DBNode]): """The Orchestrator is an in-memory database that can be launched alongside entities in SmartSim. Data can be transferred between @@ -147,6 +163,7 @@ class Orchestrator(EntityList[DBNode]): def __init__( self, + path: t.Optional[str] = getcwd(), port: int = 6379, interface: t.Union[str, t.List[str]] = "lo", launcher: str = "local", @@ -165,28 +182,39 @@ def __init__( db_identifier: str = "orchestrator", **kwargs: t.Any, ) -> None: - """Initialize an Orchestrator reference for local launch - - :param port: TCP/IP port, defaults to 6379 - :type port: int, optional - :param interface: network interface(s), defaults to "lo" - :type interface: str, list[str], optional + """Initialize an ``Orchestrator`` reference for local launch Extra configurations for RedisAI - See https://oss.redislabs.com/redisai/configuration/ - + See https://oss.redis.com/redisai/configuration/ + + :param path: path to location of ``Orchestrator`` directory + :param port: TCP/IP port + :param interface: network interface(s) + :param launcher: type of launcher being used, options are "slurm", "pbs", + "lsf", or "local". If set to "auto", + an attempt will be made to find an available launcher + on the system. 
+ :param run_command: specify launch binary or detect automatically + :param db_nodes: number of database shards + :param batch: run as a batch workload + :param hosts: specify hosts to launch on + :param account: account to run batch on + :param time: walltime for batch 'HH:MM:SS' format + :param alloc: allocation to launch database on + :param single_cmd: run all shards with one (MPMD) command :param threads_per_queue: threads per GPU device - :type threads_per_queue: int, optional - :param inter_op_threads: threads accross CPU operations - :type inter_op_threads: int, optional + :param inter_op_threads: threads across CPU operations :param intra_op_threads: threads per CPU operation - :type intra_op_threads: int, optional + :param db_identifier: an identifier to distinguish this orchestrator in + multiple-database experiments """ self.launcher, self.run_command = _autodetect(launcher, run_command) _check_run_command(self.launcher, self.run_command) _check_local_constraints(self.launcher, batch) - single_cmd = _get_single_command(self.run_command, batch, single_cmd) + single_cmd = _get_single_command( + self.run_command, self.launcher, batch, single_cmd + ) self.ports: t.List[int] = [] self._hosts: t.List[str] = [] self._user_hostlist: t.List[str] = [] @@ -197,16 +225,16 @@ def __init__( self.queue_threads = threads_per_queue self.inter_threads = inter_op_threads self.intra_threads = intra_op_threads + self._telemetry_cfg = TelemetryConfiguration() gpus_per_shard: t.Optional[int] = None cpus_per_shard: t.Optional[int] = None if self.launcher == "lsf": gpus_per_shard = int(kwargs.pop("gpus_per_shard", 0)) cpus_per_shard = int(kwargs.pop("cpus_per_shard", 4)) - super().__init__( name=db_identifier, - path=getcwd(), + path=str(path), port=port, interface=interface, db_nodes=db_nodes, @@ -265,18 +293,16 @@ def db_identifier(self) -> str: """Return the DB identifier, which is common to a DB and all of its nodes :return: DB identifier - :rtype: str """ return self.name 
@property def num_shards(self) -> int: - """Return the number of DB shards contained in the orchestrator. + """Return the number of DB shards contained in the Orchestrator. This might differ from the number of ``DBNode`` objects, as each ``DBNode`` may start more than one shard (e.g. with MPMD). - :returns: num_shards - :rtype: int + :returns: the number of DB shards contained in the Orchestrator """ return sum(node.num_shards for node in self.entities) @@ -288,24 +314,30 @@ def db_nodes(self) -> int: an alias to the ``num_shards`` attribute. :returns: Number of database nodes - :rtype: int """ return self.num_shards @property def hosts(self) -> t.List[str]: - """Return the hostnames of orchestrator instance hosts + """Return the hostnames of Orchestrator instance hosts Note that this will only be populated after the orchestrator has been launched by SmartSim. - :return: hostnames - :rtype: list[str] + :return: the hostnames of Orchestrator instance hosts """ if not self._hosts: self._hosts = self._get_db_hosts() return self._hosts + @property + def telemetry(self) -> TelemetryConfiguration: + """Return the telemetry configuration for this entity. 
+ + :returns: configuration of telemetry for this entity + """ + return self._telemetry_cfg + def reset_hosts(self) -> None: """Clear hosts or reset them to last user choice""" for node in self.entities: @@ -325,7 +357,6 @@ def get_address(self) -> t.List[str]: """Return database addresses :return: addresses - :rtype: list[str] :raises SmartSimError: If database address cannot be found or is not active """ @@ -345,12 +376,12 @@ def is_active(self) -> bool: """Check if the database is active :return: True if database is active, False otherwise - :rtype: bool """ - if not self._hosts: + try: + hosts = self.hosts + except SSDBFilesNotParseable: return False - - return db_is_active(self._hosts, self.ports, self.num_shards) + return db_is_active(hosts, self.ports, self.num_shards) @property def _rai_module(self) -> t.Tuple[str, ...]: @@ -358,7 +389,6 @@ def _rai_module(self) -> t.Tuple[str, ...]: :return: Tuple of args to pass to the orchestrator exe to load and configure the RedisAI - :rtype: tuple[str] """ module = ["--loadmodule", CONFIG.redisai] if self.queue_threads: @@ -377,6 +407,14 @@ def _redis_exe(self) -> str: def _redis_conf(self) -> str: return CONFIG.database_conf + @property + def checkpoint_file(self) -> str: + """Get the path to the checkpoint file for this Orchestrator + + :return: Path to the checkpoint file if it exists, otherwise a None + """ + return osp.join(self.path, "smartsim_db.dat") + def set_cpus(self, num_cpus: int) -> None: """Set the number of CPUs available to each database shard @@ -384,7 +422,6 @@ def set_cpus(self, num_cpus: int) -> None: compute threads, background threads, and network I/O. :param num_cpus: number of cpus to set - :type num_cpus: int """ if self.batch: if self.launcher == "pbs": @@ -408,7 +445,6 @@ def set_walltime(self, walltime: str) -> None: Note: This will only effect orchestrators launched as a batch :param walltime: amount of time e.g. 
10 hours is 10:00:00 - :type walltime: str :raises SmartSimError: if orchestrator isn't launching as batch """ if not self.batch: @@ -421,7 +457,6 @@ def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: """Specify the hosts for the ``Orchestrator`` to launch on :param host_list: list of host (compute node names) - :type host_list: str, list[str] :raises TypeError: if wrong type """ if isinstance(host_list, str): @@ -432,9 +467,8 @@ def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: raise TypeError("host_list argument must be list of strings") self._user_hostlist = host_list.copy() # TODO check length - if self.batch: - if hasattr(self, "batch_settings") and self.batch_settings: - self.batch_settings.set_hostlist(host_list) + if self.batch and hasattr(self, "batch_settings") and self.batch_settings: + self.batch_settings.set_hostlist(host_list) if self.launcher == "lsf": for db in self.entities: @@ -465,9 +499,7 @@ def set_batch_arg(self, arg: str, value: t.Optional[str] = None) -> None: by SmartSim and will not be allowed to be set. :param arg: batch argument to set e.g. "exclusive" - :type arg: str :param value: batch param - set to None if no param value - :type value: str | None :raises SmartSimError: if orchestrator not launching as batch """ if not hasattr(self, "batch_settings") or not self.batch_settings: @@ -479,8 +511,7 @@ def set_batch_arg(self, arg: str, value: t.Optional[str] = None) -> None: "it is a reserved keyword in Orchestrator" ) else: - if hasattr(self, "batch_settings") and self.batch_settings: - self.batch_settings.batch_args[arg] = value + self.batch_settings.batch_args[arg] = value def set_run_arg(self, arg: str, value: t.Optional[str] = None) -> None: """Set a run argument the orchestrator should launch @@ -491,9 +522,7 @@ def set_run_arg(self, arg: str, value: t.Optional[str] = None) -> None: For example, "n", "N", etc. 
:param arg: run argument to set - :type arg: str :param value: run parameter - set to None if no parameter value - :type value: str | None """ if arg in self._reserved_run_args[type(self.entities[0].run_settings)]: logger.warning( @@ -514,7 +543,6 @@ def enable_checkpoints(self, frequency: int) -> None: after 900 seconds if there is at least 1 change to the dataset. :param frequency: the given number of seconds before the DB saves - :type frequency: int """ self.set_db_conf("save", f"{frequency} 1") @@ -523,15 +551,15 @@ def set_max_memory(self, mem: str) -> None: Setting max memory to zero also results in no memory limit. Once a limit is surpassed, keys will be removed according to the eviction strategy. The specified memory size is case insensitive and supports the typical forms of: - 1k => 1000 bytes - 1kb => 1024 bytes - 1m => 1000000 bytes - 1mb => 1024*1024 bytes - 1g => 1000000000 bytes + + 1k => 1000 bytes \n + 1kb => 1024 bytes \n + 1m => 1000000 bytes \n + 1mb => 1024*1024 bytes \n + 1g => 1000000000 bytes \n 1gb => 1024*1024*1024 bytes :param mem: the desired max memory size e.g. 3gb - :type mem: str :raises SmartSimError: If 'mem' is an invalid memory value :raises SmartSimError: If database is not active """ @@ -543,7 +571,6 @@ def set_eviction_strategy(self, strategy: str) -> None: :param strategy: The max memory policy to use e.g. "volatile-lru", "allkeys-lru", etc. - :type strategy: str :raises SmartSimError: If 'strategy' is an invalid maxmemory policy :raises SmartSimError: If database is not active """ @@ -556,7 +583,6 @@ def set_max_clients(self, clients: int = 50_000) -> None: incoming and another outgoing. :param clients: the maximum number of connected clients - :type clients: int, optional """ self.set_db_conf("maxclients", str(clients)) @@ -569,7 +595,6 @@ def set_max_message_size(self, size: int = 1_073_741_824) -> None: to 1gb, use 1024*1024*1024. 
:param size: maximum message size in bytes - :type size: int, optional """ self.set_db_conf("proto-max-bulk-len", str(size)) @@ -580,9 +605,7 @@ def set_db_conf(self, key: str, value: str) -> None: will take effect starting with the next command executed. :param key: the configuration parameter - :type key: str :param value: the database configuration parameter's new value - :type value: str """ if self.is_active(): addresses = [] @@ -847,6 +870,7 @@ def _get_start_script_args( ] if cluster: cmd.append("+cluster") # is the shard part of a cluster + return cmd def _get_db_hosts(self) -> t.List[str]: diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index 4566cd76f..40f03fcdd 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -27,7 +27,7 @@ from .dbnode import DBNode from .dbobject import * from .ensemble import Ensemble -from .entity import SmartSimEntity +from .entity import SmartSimEntity, TelemetryConfiguration from .entityList import EntityList, EntitySequence from .files import TaggedFilesHierarchy from .model import Model diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 9b67687f0..d371357f8 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -34,7 +34,7 @@ from dataclasses import dataclass from .._core.config import CONFIG -from ..error import SmartSimError +from ..error import SSDBFilesNotParseable from ..log import get_logger from ..settings.base import RunSettings from .entity import SmartSimEntity @@ -146,9 +146,7 @@ def _get_cluster_conf_filenames(self, port: int) -> t.List[str]: # cov-lsf This function should bu used if and only if ``_mpmd==True`` :param port: port number - :type port: int :return: the dbnode configuration file name - :rtype: str """ if self.num_shards == 1: return [f"nodes-{self.name}-{port}.conf"] @@ -186,9 +184,8 @@ def _parse_launched_shard_info_from_files( def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": """Parse the 
launched database shard info from the output files - :raises SmartSimError: if all shard info could not be found + :raises SSDBFilesNotParseable: if all shard info could not be found :return: The found launched shard info - :rtype: list[LaunchedShardData] """ ips: "t.List[LaunchedShardData]" = [] trials = CONFIG.database_file_parse_trials @@ -214,7 +211,7 @@ def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": f"{len(ips)} out of {self.num_shards} DB shards." ) logger.error(msg) - raise SmartSimError(msg) + raise SSDBFilesNotParseable(msg) return ips def _parse_db_hosts(self) -> t.List[str]: @@ -223,9 +220,8 @@ def _parse_db_hosts(self) -> t.List[str]: The IP address is preferred, but if hostname is only present then a lookup to /etc/hosts is done through the socket library. - :raises SmartSimError: if host/ip could not be found + :raises SSDBFilesNotParseable: if host/ip could not be found :return: ip addresses | hostnames - :rtype: list[str] """ return list({shard.hostname for shard in self.get_launched_shard_info()}) diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index 0a495f066..5cb0d061f 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -27,7 +27,7 @@ import typing as t from pathlib import Path -from .._core.utils import init_default +from .._core._install.builder import Device from ..error import SSUnsupportedError __all__ = ["DBObject", "DBModel", "DBScript"] @@ -46,7 +46,7 @@ def __init__( name: str, func: t.Optional[_DBObjectFuncT], file_path: t.Optional[str], - device: t.Literal["CPU", "GPU"], + device: str, devices_per_node: int, first_device: int, ) -> None: @@ -75,9 +75,6 @@ def _check_tensor_args( inputs: t.Union[str, t.Optional[t.List[str]]], outputs: t.Union[str, t.Optional[t.List[str]]], ) -> t.Tuple[t.List[str], t.List[str]]: - inputs = init_default([], inputs, (list, str)) - outputs = init_default([], outputs, (list, str)) - if isinstance(inputs, str): inputs = [inputs] if 
isinstance(outputs, str): @@ -103,9 +100,9 @@ def _check_filepath(file: str) -> Path: return file_path @staticmethod - def _check_device(device: t.Literal["CPU", "GPU"]) -> str: - device = t.cast(t.Literal["CPU", "GPU"], device.upper()) - if not device.startswith("CPU") and not device.startswith("GPU"): + def _check_device(device: str) -> str: + valid_devices = [Device.CPU.value, Device.GPU.value] + if not any(device.lower().startswith(dev) for dev in valid_devices): raise ValueError("Device argument must start with either CPU or GPU") return device @@ -113,9 +110,7 @@ def _enumerate_devices(self) -> t.List[str]: """Enumerate devices for a DBObject :param dbobject: DBObject to enumerate - :type dbobject: DBObject :return: list of device names - :rtype: list[str] """ if self.device == "GPU" and self.devices_per_node > 1: @@ -130,16 +125,16 @@ def _enumerate_devices(self) -> t.List[str]: @staticmethod def _check_devices( - device: t.Literal["CPU", "GPU"], + device: str, devices_per_node: int, first_device: int, ) -> None: - if device == "CPU" and devices_per_node > 1: + if device.lower() == Device.CPU.value and devices_per_node > 1: raise SSUnsupportedError( "Cannot set devices_per_node>1 if CPU is specified under devices" ) - if device == "CPU" and first_device > 0: + if device.lower() == Device.CPU.value and first_device > 0: raise SSUnsupportedError( "Cannot set first_device>0 if CPU is specified under devices" ) @@ -160,7 +155,7 @@ def __init__( name: str, script: t.Optional[str] = None, script_path: t.Optional[str] = None, - device: t.Literal["CPU", "GPU"] = "CPU", + device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, ): @@ -178,17 +173,11 @@ def __init__( must be provided :param name: key to store script under - :type name: str :param script: TorchScript code - :type script: str, optional - :param script_path: path to TorchScript code, defaults to None - :type script_path: str, optional - :param device: device for script 
execution, defaults to "CPU" - :type device: str, optional + :param script_path: path to TorchScript code + :param device: device for script execution :param devices_per_node: number of devices to store the script on - :type devices_per_node: int :param first_device: first devices to store the script on - :type first_device: int """ super().__init__( name, script, script_path, device, devices_per_node, first_device @@ -197,13 +186,13 @@ def __init__( raise ValueError("Either script or script_path must be provided") @property - def script(self) -> t.Optional[str]: + def script(self) -> t.Optional[t.Union[bytes, str]]: return self.func def __str__(self) -> str: desc_str = "Name: " + self.name + "\n" if self.func: - desc_str += "Func: " + self.func + "\n" + desc_str += "Func: " + str(self.func) + "\n" if self.file: desc_str += "File path: " + str(self.file) + "\n" devices_str = self.device + ( @@ -222,7 +211,7 @@ def __init__( backend: str, model: t.Optional[bytes] = None, model_file: t.Optional[str] = None, - device: t.Literal["CPU", "GPU"] = "CPU", + device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, batch_size: int = 0, @@ -238,31 +227,18 @@ def __init__( must be provided :param name: key to store model under - :type name: str :param model: model in memory - :type model: str, optional :param model_file: serialized model - :type model_file: file path to model, optional :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) - :type backend: str - :param device: name of device for execution, defaults to "CPU" - :type device: str, optional + :param device: name of device for execution :param devices_per_node: number of devices to store the model on - :type devices_per_node: int :param first_device: The first device to store the model on - :type first_device: int - :param batch_size: batch size for execution, defaults to 0 - :type batch_size: int, optional - :param min_batch_size: minimum batch size for model execution, 
defaults to 0 - :type min_batch_size: int, optional - :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 - :type min_batch_timeout: int, optional - :param tag: additional tag for model information, defaults to "" - :type tag: str, optional - :param inputs: model inputs (TF only), defaults to None - :type inputs: list[str], optional - :param outputs: model outupts (TF only), defaults to None - :type outputs: list[str], optional + :param batch_size: batch size for execution + :param min_batch_size: minimum batch size for model execution + :param min_batch_timeout: time to wait for minimum batch size + :param tag: additional tag for model information + :param inputs: model inputs (TF only) + :param outputs: model outupts (TF only) """ super().__init__( name, model, model_file, device, devices_per_node, first_device diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index b30f82542..cab138685 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -24,13 +24,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os.path as osp import typing as t from copy import deepcopy from os import getcwd from tabulate import tabulate -from .._core.utils.helpers import init_default +from .._core._install.builder import Device from ..error import ( EntityExistsError, SmartSimError, @@ -61,6 +62,7 @@ def __init__( self, name: str, params: t.Dict[str, t.Any], + path: t.Optional[str] = getcwd(), params_as_args: t.Optional[t.List[str]] = None, batch_settings: t.Optional[BatchSettings] = None, run_settings: t.Optional[RunSettings] = None, @@ -73,43 +75,33 @@ def __init__( parameters to the permutation strategy. 
:param name: name of the ensemble - :type name: str :param params: parameters to expand into ``Model`` members - :type params: dict[str, Any] :param params_as_args: list of params that should be used as command line arguments to the ``Model`` member executables and not written to generator files - :type params_as_args: list[str] :param batch_settings: describes settings for ``Ensemble`` as batch workload - :type batch_settings: BatchSettings, optional :param run_settings: describes how each ``Model`` should be executed - :type run_settings: RunSettings, optional :param replicas: number of ``Model`` replicas to create - a keyword argument of kwargs - :type replicas: int, optional :param perm_strategy: strategy for expanding ``params`` into ``Model`` instances from params argument options are "all_perm", "step", "random" - or a callable function. Defaults to "all_perm". - :type perm_strategy: str + or a callable function. :return: ``Ensemble`` instance - :rtype: ``Ensemble`` """ - self.params = init_default({}, params, dict) - self.params_as_args = init_default({}, params_as_args, (list, str)) + self.params = params or {} + self.params_as_args = params_as_args or [] self._key_prefixing_enabled = True - self.batch_settings = init_default({}, batch_settings, BatchSettings) - self.run_settings = init_default({}, run_settings, RunSettings) + self.batch_settings = batch_settings + self.run_settings = run_settings + self.replicas: str - super().__init__(name, getcwd(), perm_strat=perm_strat, **kwargs) + super().__init__(name, str(path), perm_strat=perm_strat, **kwargs) @property - def models(self) -> t.Iterable[Model]: - """ - Helper property to cast self.entities to Model type for type correctness - """ - model_entities = [node for node in self.entities if isinstance(node, Model)] - return model_entities + def models(self) -> t.Collection[Model]: + """An alias for a shallow copy of the ``entities`` attribute""" + return list(self.entities) def _initialize_entities(self, 
**kwargs: t.Any) -> None: """Initialize all the models within the ensemble based @@ -120,6 +112,7 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: """ strategy = self._set_strategy(kwargs.pop("perm_strat")) replicas = kwargs.pop("replicas", None) + self.replicas = replicas # if a ensemble has parameters and run settings, create # the ensemble and assign run_settings to each member @@ -139,9 +132,9 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: run_settings = deepcopy(self.run_settings) model_name = "_".join((self.name, str(i))) model = Model( - model_name, - param_set, - self.path, + name=model_name, + params=param_set, + path=osp.join(self.path, model_name), run_settings=run_settings, params_as_args=self.params_as_args, ) @@ -163,9 +156,9 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: for i in range(replicas): model_name = "_".join((self.name, str(i))) model = Model( - model_name, - {}, - self.path, + name=model_name, + params={}, + path=osp.join(self.path, model_name), run_settings=deepcopy(self.run_settings), ) model.enable_key_prefixing() @@ -191,7 +184,6 @@ def add_model(self, model: Model) -> None: """Add a model to this ensemble :param model: model instance to be added - :type model: Model :raises TypeError: if model is not an instance of ``Model`` :raises EntityExistsError: if model already exists in this ensemble """ @@ -222,7 +214,6 @@ def register_incoming_entity(self, incoming_entity: SmartSimEntity) -> None: Only python clients can have multiple incoming connections :param incoming_entity: The entity that data will be received from - :type incoming_entity: SmartSimEntity """ for model in self.models: model.register_incoming_entity(incoming_entity) @@ -238,7 +229,6 @@ def query_key_prefixing(self) -> bool: """Inquire as to whether each model within the ensemble will prefix their keys :returns: True if all models have key prefixing enabled, False otherwise - :rtype: bool """ return all(model.query_key_prefixing() for 
model in self.models) @@ -264,12 +254,9 @@ def attach_generator_files( would like to change. The tag is settable but defaults to a semicolon e.g. THERMO = ;10; - :param to_copy: files to copy, defaults to [] - :type to_copy: list, optional - :param to_symlink: files to symlink, defaults to [] - :type to_symlink: list, optional - :param to_configure: input files with tagged parameters, defaults to [] - :type to_configure: list, optional + :param to_copy: files to copy + :param to_symlink: files to symlink + :param to_configure: input files with tagged parameters """ for model in self.models: model.attach_generator_files( @@ -282,7 +269,6 @@ def attached_files_table(self) -> str: attached to models belonging to this ensemble. :returns: A table of all files attached to all models - :rtype: str """ if not self.models: return "The ensemble is empty, no files to show." @@ -305,10 +291,8 @@ def _set_strategy(strategy: str) -> StrategyFunction: the ensemble :param strategy: name of the strategy or callable function - :type strategy: str :raises SSUnsupportedError: if str name is not supported :return: strategy function - :rtype: callable """ if strategy == "all_perm": return create_all_permutations @@ -328,7 +312,6 @@ def _read_model_parameters(self) -> t.Tuple[t.List[str], t.List[t.List[str]]]: :raises TypeError: if params are of the wrong type :return: param names and values for permutation strategy - :rtype: tuple[list, list] """ if not isinstance(self.params, dict): @@ -359,7 +342,7 @@ def add_ml_model( backend: str, model: t.Optional[bytes] = None, model_path: t.Optional[str] = None, - device: t.Literal["CPU", "GPU"] = "CPU", + device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, batch_size: int = 0, @@ -379,33 +362,19 @@ def add_ml_model( must be provided :param name: key to store model under - :type name: str :param model: model in memory - :type model: str | bytes | None :param model_path: serialized model - :type model_path: 
file path to model :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) - :type backend: str - :param device: name of device for execution, defaults to "CPU" - :type device: str, optional - :param devices_per_node: number of GPUs per node in multiGPU nodes, - defaults to 1 - :type devices_per_node: int, optional + :param device: name of device for execution + :param devices_per_node: number of GPUs per node in multiGPU nodes :param first_device: first device in multi-GPU nodes to use for execution, defaults to 0; ignored if devices_per_node is 1 - :type first_device: int, optional - :param batch_size: batch size for execution, defaults to 0 - :type batch_size: int, optional - :param min_batch_size: minimum batch size for model execution, defaults to 0 - :type min_batch_size: int, optional - :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 - :type min_batch_timeout: int, optional - :param tag: additional tag for model information, defaults to "" - :type tag: str, optional - :param inputs: model inputs (TF only), defaults to None - :type inputs: list[str], optional - :param outputs: model outupts (TF only), defaults to None - :type outputs: list[str], optional + :param batch_size: batch size for execution + :param min_batch_size: minimum batch size for model execution + :param min_batch_timeout: time to wait for minimum batch size + :param tag: additional tag for model information + :param inputs: model inputs (TF only) + :param outputs: model outupts (TF only) """ db_model = DBModel( name=name, @@ -443,7 +412,7 @@ def add_script( name: str, script: t.Optional[str] = None, script_path: t.Optional[str] = None, - device: t.Literal["CPU", "GPU"] = "CPU", + device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, ) -> None: @@ -463,17 +432,11 @@ def add_script( must be provided :param name: key to store script under - :type name: str :param script: TorchScript code - :type script: str, optional :param 
script_path: path to TorchScript code - :type script_path: str, optional - :param device: device for script execution, defaults to "CPU" - :type device: str, optional + :param device: device for script execution :param devices_per_node: number of devices on each host - :type devices_per_node: int :param first_device: first device to use on each host - :type first_device: int """ db_script = DBScript( name=name, @@ -503,7 +466,7 @@ def add_function( self, name: str, function: t.Optional[str] = None, - device: t.Literal["CPU", "GPU"] = "CPU", + device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, ) -> None: @@ -524,15 +487,10 @@ def add_function( being stored on nodes M through M + N - 1. :param name: key to store function under - :type name: str :param function: TorchScript code - :type function: str, optional - :param device: device for script execution, defaults to "CPU" - :type device: str, optional + :param device: device for script execution :param devices_per_node: number of devices on each host - :type devices_per_node: int :param first_device: first device to use on each host - :type first_device: int """ db_script = DBScript( name=name, @@ -568,9 +526,7 @@ def _extend_entity_db_models(model: Model, db_models: t.List[DBModel]) -> None: found. Otherwise, it appends the given list of DBModels to the Ensemble. :param model: SmartSim Model object. - :type model: Model :param db_models: List of DBModels to append to the Ensemble. - :type db_models: t.List[DBModel] """ for add_ml_model in db_models: dupe = next( @@ -598,9 +554,7 @@ def _extend_entity_db_scripts(model: Model, db_scripts: t.List[DBScript]) -> Non Ensemble. :param model: SmartSim Model object. - :type model: Model :param db_scripts: List of DBScripts to append to the Ensemble. 
- :type db_scripts: t.List[DBScript] """ for add_script in db_scripts: dupe = next( diff --git a/smartsim/entity/entity.py b/smartsim/entity/entity.py index 46202ca6a..012a76744 100644 --- a/smartsim/entity/entity.py +++ b/smartsim/entity/entity.py @@ -31,6 +31,64 @@ import smartsim.settings.base +class TelemetryConfiguration: + """A base class for configuraing telemetry production behavior on + existing `SmartSimEntity` subclasses. Any class that will have + optional telemetry collection must expose access to an instance + of `TelemetryConfiguration` such as: + + ``` + @property + def telemetry(self) -> TelemetryConfiguration: + # Return the telemetry configuration for this entity. + # :returns: Configuration object indicating the configuration + # status of telemetry for this entity + return self._telemetry_producer + ``` + + An instance will be used by to conditionally serialize + values to the `RuntimeManifest` + """ + + def __init__(self, enabled: bool = False) -> None: + """Initialize the telemetry producer and immediately call the `_on_enable` hook. + + :param enabled: flag indicating the initial state of telemetry + """ + self._is_on = enabled + + if self._is_on: + self._on_enable() + else: + self._on_disable() + + @property + def is_enabled(self) -> bool: + """Boolean flag indicating if telemetry is currently enabled + + :returns: `True` if enabled, `False` otherwise + """ + return self._is_on + + def enable(self) -> None: + """Enable telemetry for this producer""" + self._is_on = True + self._on_enable() + + def disable(self) -> None: + """Disable telemetry for this producer""" + self._is_on = False + self._on_disable() + + def _on_enable(self) -> None: + """Overridable hook called after telemetry is `enabled`. Allows subclasses + to perform actions when attempts to change configuration are made""" + + def _on_disable(self) -> None: + """Overridable hook called after telemetry is `disabled`. 
Allows subclasses + to perform actions when attempts to change configuration are made""" + + class SmartSimEntity: def __init__( self, name: str, path: str, run_settings: "smartsim.settings.base.RunSettings" @@ -42,12 +100,9 @@ def __init__( share these attributes. :param name: Name of the entity - :type name: str :param path: path to output, error, and configuration files - :type path: str :param run_settings: Launcher settings specified in the experiment entity - :type run_settings: dict """ self.name = name self.run_settings = run_settings diff --git a/smartsim/entity/entityList.py b/smartsim/entity/entityList.py index 6d958bda6..edaa88668 100644 --- a/smartsim/entity/entityList.py +++ b/smartsim/entity/entityList.py @@ -91,16 +91,14 @@ def db_scripts(self) -> t.Iterable["smartsim.entity.DBScript"]: @property def batch(self) -> bool: - try: - if not hasattr(self, "batch_settings"): - return False - - if self.batch_settings: - return True - return False - # local orchestrator cannot launch with batches - except AttributeError: - return False + """Property indicating whether or not the entity sequence should be + launched as a batch job + + :return: ``True`` if entity sequence should be launched as a batch job, + ``False`` if the members will be launched individually. 
+ """ + # pylint: disable-next=no-member + return hasattr(self, "batch_settings") and self.batch_settings @property def type(self) -> str: diff --git a/smartsim/entity/files.py b/smartsim/entity/files.py index 9c282b94e..d00e946e2 100644 --- a/smartsim/entity/files.py +++ b/smartsim/entity/files.py @@ -58,13 +58,10 @@ def __init__( """Initialize an EntityFiles instance :param tagged: tagged files for model configuration - :type tagged: list of str :param copy: files or directories to copy into model or node directories - :type copy: list of str :param symlink: files to symlink into model or node directories - :type symlink: list of str """ self.tagged = tagged or [] self.copy = copy or [] @@ -102,12 +99,9 @@ def _type_check_files( """Check the type of the files provided by the user. :param file_list: either tagged, copy, or symlink files - :type file_list: list of str :param file_type: name of the file type e.g. "tagged" - :type file_type: str :raises TypeError: if incorrect type is provided by user :return: file list provided - :rtype: list of str """ if file_list: if not isinstance(file_list, list): @@ -128,10 +122,8 @@ def _check_path(file_path: str) -> str: the directory or file and create a full path. :param file_path: path to a specific file or directory - :type file_path: str :raises FileNotFoundError: if file or directory does not exist :return: full path to file or directory - :rtype: str """ full_path = path.abspath(file_path) if path.isfile(full_path): @@ -183,12 +175,10 @@ def __init__(self, parent: t.Optional[t.Any] = None, subdir_name: str = "") -> N :param parent: The parent hierarchy of the new hierarchy, must be None if creating a root hierarchy, must be provided if creating a subhierachy - :type parent: TaggedFilesHierarchy | None, optional :param subdir_name: Name of subdirectory representd by the new hierarchy, must be "" if creating a root hierarchy, must be any valid dir name if subhierarchy, invalid names are ".", ".." 
or contain path seperators - :type subdir_name: str, optional :raises ValueError: if given a subdir_name without a parent, if given a parent without a subdir_name, or if the subdir_name is invalid @@ -232,15 +222,12 @@ def from_list_paths( :param path_list: list of absolute paths to tagged files or dirs containing tagged files - :type path_list: list[str] :param dir_contents_to_base: When a top level dir is encountered, if this value is truthy, files in the dir are put into the base hierarchy level. Otherwise, a new sub level is created for the dir - :type dir_contents_to_base: bool :return: A built tagged file hierarchy for the given files - :rtype: TaggedFilesHierarchy """ tagged_file_hierarchy = cls() if dir_contents_to_base: @@ -261,7 +248,6 @@ def _add_file(self, file: str) -> None: """Add a file to the current level in the file hierarchy :param file: absoute path to a file to add to the hierarchy - :type file: str """ self.files.add(file) @@ -271,7 +257,6 @@ def _add_dir(self, dir_path: str) -> None: the new level sub level tagged file hierarchy :param dir: absoute path to a dir to add to the hierarchy - :type dir: str """ tagged_file_hierarchy = TaggedFilesHierarchy(self, path.basename(dir_path)) # pylint: disable-next=protected-access @@ -285,7 +270,6 @@ def _add_paths(self, paths: t.List[str]) -> None: TaggedFilesHierarchy. 
:param paths: list of paths to files or dirs to add to the hierarchy - :type paths: list[str] :raises ValueError: if link to dir is found :raises FileNotFoundError: if path does not exist """ diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index c7b8731c2..3f78e042c 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -26,14 +26,16 @@ from __future__ import annotations -import collections.abc +import itertools import re import sys import typing as t import warnings +from os import getcwd from os import path as osp -from .._core.utils.helpers import cat_arg_and_value, init_default +from .._core._install.builder import Device +from .._core.utils.helpers import cat_arg_and_value from ..error import EntityExistsError, SSUnsupportedError from ..log import get_logger from ..settings.base import BatchSettings, RunSettings @@ -49,31 +51,25 @@ def __init__( self, name: str, params: t.Dict[str, str], - path: str, run_settings: RunSettings, + path: t.Optional[str] = getcwd(), params_as_args: t.Optional[t.List[str]] = None, batch_settings: t.Optional[BatchSettings] = None, ): """Initialize a ``Model`` :param name: name of the model - :type name: str :param params: model parameters for writing into configuration files or to be passed as command line arguments to executable. 
- :type params: dict :param path: path to output, error, and configuration files - :type path: str :param run_settings: launcher settings specified in the experiment - :type run_settings: RunSettings :param params_as_args: list of parameters which have to be interpreted as command line arguments to be added to run_settings - :type params_as_args: list[str] :param batch_settings: Launcher settings for running the individual - model as a batch job, defaults to None - :type batch_settings: BatchSettings | None + model as a batch job """ - super().__init__(name, path, run_settings) + super().__init__(name, str(path), run_settings) self.params = params self.params_as_args = params_as_args self.incoming_entities: t.List[SmartSimEntity] = [] @@ -85,17 +81,26 @@ def __init__( @property def db_models(self) -> t.Iterable[DBModel]: - """Return an immutable collection of attached models""" + """Retrieve an immutable collection of attached models + + :return: Return an immutable collection of attached models + """ return (model for model in self._db_models) @property def db_scripts(self) -> t.Iterable[DBScript]: - """Return an immutable collection attached of scripts""" + """Retrieve an immutable collection attached of scripts + + :return: Return an immutable collection of attached scripts + """ return (script for script in self._db_scripts) @property def colocated(self) -> bool: - """Return True if this Model will run with a colocated Orchestrator""" + """Return True if this Model will run with a colocated Orchestrator + + :return: Return True of the Model will run with a colocated Orchestrator + """ return bool(self.run_settings.colocated_db_settings) def register_incoming_entity(self, incoming_entity: SmartSimEntity) -> None: @@ -106,7 +111,6 @@ def register_incoming_entity(self, incoming_entity: SmartSimEntity) -> None: with that entity :param incoming_entity: The entity that data will be received from - :type incoming_entity: SmartSimEntity :raises SmartSimError: if 
incoming entity has already been registered """ if incoming_entity.name in [ @@ -128,7 +132,10 @@ def disable_key_prefixing(self) -> None: self._key_prefixing_enabled = False def query_key_prefixing(self) -> bool: - """Inquire as to whether this entity will prefix its keys with its name""" + """Inquire as to whether this entity will prefix its keys with its name + + :return: Return True if entity will prefix its keys with its name + """ return self._key_prefixing_enabled def attach_generator_files( @@ -155,16 +162,13 @@ def attach_generator_files( would like to change. The tag is settable but defaults to a semicolon e.g. THERMO = ;10; - :param to_copy: files to copy, defaults to [] - :type to_copy: list, optional - :param to_symlink: files to symlink, defaults to [] - :type to_symlink: list, optional - :param to_configure: input files with tagged parameters, defaults to [] - :type to_configure: list, optional + :param to_copy: files to copy + :param to_symlink: files to symlink + :param to_configure: input files with tagged parameters """ - to_copy = init_default([], to_copy, (list, str)) - to_symlink = init_default([], to_symlink, (list, str)) - to_configure = init_default([], to_configure, (list, str)) + to_copy = to_copy or [] + to_symlink = to_symlink or [] + to_configure = to_configure or [] # Check that no file collides with the parameter file written # by Generator. We check the basename, even though it is more @@ -185,7 +189,6 @@ def attached_files_table(self) -> str: """Return a list of attached files as a plain text table :returns: String version of table - :rtype: str """ if not self.files: return "No file attached to this model." @@ -239,18 +242,12 @@ def colocate_db_uds( Generally these don't need to be changed. 
:param unix_socket: path to where the socket file will be created - :type unix_socket: str, optional :param socket_permissions: permissions for the socketfile - :type socket_permissions: int, optional - :param db_cpus: number of cpus to use for orchestrator, defaults to 1 - :type db_cpus: int, optional + :param db_cpus: number of cpus to use for orchestrator :param custom_pinning: CPUs to pin the orchestrator to. Passing an empty iterable disables pinning - :type custom_pinning: iterable of ints or iterable of ints, optional :param debug: launch Model with extra debug information about the colocated db - :type debug: bool, optional :param kwargs: additional keyword arguments to pass to the orchestrator database - :type kwargs: dict, optional """ if not re.match(r"^[a-zA-Z0-9.:\,_\-/]*$", unix_socket): @@ -305,20 +302,13 @@ def colocate_db_tcp( Generally these don't need to be changed. - :param port: port to use for orchestrator database, defaults to 6379 - :type port: int, optional - :param ifname: interface to use for orchestrator, defaults to "lo" - :type ifname: str | list[str], optional - :param db_cpus: number of cpus to use for orchestrator, defaults to 1 - :type db_cpus: int, optional + :param port: port to use for orchestrator database + :param ifname: interface to use for orchestrator + :param db_cpus: number of cpus to use for orchestrator :param custom_pinning: CPUs to pin the orchestrator to. 
Passing an empty iterable disables pinning - :type custom_pinning: iterable of ints or iterable of ints, optional :param debug: launch Model with extra debug information about the colocated db - :type debug: bool, optional :param kwargs: additional keyword arguments to pass to the orchestrator database - :type kwargs: dict, optional - """ tcp_options = {"port": port, "ifname": ifname} @@ -414,9 +404,10 @@ def _set_colocated_db_settings( def _create_pinning_string( pin_ids: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], cpus: int ) -> t.Optional[str]: - """Create a comma-separated string CPU ids. By default, None returns - 0,1,...,cpus-1; an empty iterable will disable pinning altogether, - and an iterable constructs a comma separate string (e.g. 0,2,5) + """Create a comma-separated string of CPU ids. By default, ``None`` + returns 0,1,...,cpus-1; an empty iterable will disable pinning + altogether, and an iterable constructs a comma separated string of + integers (e.g. ``[0, 2, 5]`` -> ``"0,2,5"``) """ def _stringify_id(_id: int) -> str: @@ -428,40 +419,34 @@ def _stringify_id(_id: int) -> str: raise TypeError(f"Argument is of type '{type(_id)}' not 'int'") - _invalid_input_message = ( - "Expected a cpu pinning specification of type iterable of ints or " - f"iterables of ints. Instead got type `{type(pin_ids)}`" - ) + try: + pin_ids = tuple(pin_ids) if pin_ids is not None else None + except TypeError: + raise TypeError( + "Expected a cpu pinning specification of type iterable of ints or " + f"iterables of ints. Instead got type `{type(pin_ids)}`" + ) from None # Deal with MacOSX limitations first. The "None" (default) disables pinning - # and is equivalent to []. The only invalid option is an iterable + # and is equivalent to []. 
The only invalid option is a non-empty pinning if sys.platform == "darwin": - if pin_ids is None or not pin_ids: - return None - - if isinstance(pin_ids, collections.abc.Iterable): + if pin_ids: warnings.warn( "CPU pinning is not supported on MacOSX. Ignoring pinning " "specification.", RuntimeWarning, ) - return None - raise TypeError(_invalid_input_message) + return None + # Flatten the iterable into a list and check to make sure that the resulting # elements are all ints if pin_ids is None: return ",".join(_stringify_id(i) for i in range(cpus)) if not pin_ids: return None - if isinstance(pin_ids, collections.abc.Iterable): - pin_list = [] - for pin_id in pin_ids: - if isinstance(pin_id, collections.abc.Iterable): - pin_list.extend([_stringify_id(j) for j in pin_id]) - else: - pin_list.append(_stringify_id(pin_id)) - return ",".join(sorted(set(pin_list))) - raise TypeError(_invalid_input_message) + pin_ids = ((x,) if isinstance(x, int) else x for x in pin_ids) + to_fmt = itertools.chain.from_iterable(pin_ids) + return ",".join(sorted({_stringify_id(x) for x in to_fmt})) def params_to_args(self) -> None: """Convert parameters to command line arguments and update run settings.""" @@ -487,7 +472,7 @@ def add_ml_model( backend: str, model: t.Optional[bytes] = None, model_path: t.Optional[str] = None, - device: t.Literal["CPU", "GPU"] = "CPU", + device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, batch_size: int = 0, @@ -507,35 +492,22 @@ def add_ml_model( must be provided :param name: key to store model under - :type name: str :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) - :type backend: str :param model: A model in memory (only supported for non-colocated orchestrators) - :type model: byte string, optional :param model_path: serialized model - :type model_path: file path to model - :param device: name of device for execution, defaults to "CPU" - :type device: str, optional + :param device: name of device for 
execution :param devices_per_node: The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. - :type devices_per_node: int :param first_device: The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. - :type first_device: int - :param batch_size: batch size for execution, defaults to 0 - :type batch_size: int, optional - :param min_batch_size: minimum batch size for model execution, defaults to 0 - :type min_batch_size: int, optional - :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 - :type min_batch_timeout: int, optional - :param tag: additional tag for model information, defaults to "" - :type tag: str, optional - :param inputs: model inputs (TF only), defaults to None - :type inputs: list[str], optional - :param outputs: model outupts (TF only), defaults to None - :type outputs: list[str], optional + :param batch_size: batch size for execution + :param min_batch_size: minimum batch size for model execution + :param min_batch_timeout: time to wait for minimum batch size + :param tag: additional tag for model information + :param inputs: model inputs (TF only) + :param outputs: model outupts (TF only) """ db_model = DBModel( name=name, @@ -559,7 +531,7 @@ def add_script( name: str, script: t.Optional[str] = None, script_path: t.Optional[str] = None, - device: t.Literal["CPU", "GPU"] = "CPU", + device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, ) -> None: @@ -581,21 +553,15 @@ def add_script( must be provided :param name: key to store script under - :type name: str :param script: TorchScript code (only supported for non-colocated orchestrators) - :type script: str, optional :param script_path: path to TorchScript code - :type script_path: str, optional - :param device: device for script execution, defaults to "CPU" - :type device: str, 
optional + :param device: device for script execution :param devices_per_node: The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. - :type devices_per_node: int :param first_device: The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. - :type first_device: int """ db_script = DBScript( name=name, @@ -611,7 +577,7 @@ def add_function( self, name: str, function: t.Optional[str] = None, - device: t.Literal["CPU", "GPU"] = "CPU", + device: str = Device.CPU.value.upper(), devices_per_node: int = 1, first_device: int = 0, ) -> None: @@ -630,19 +596,14 @@ def add_function( in the model being stored in the first N devices of type ``device``. :param name: key to store function under - :type name: str :param function: TorchScript function code - :type function: str, optional - :param device: device for script execution, defaults to "CPU" - :type device: str, optional + :param device: device for script execution :param devices_per_node: The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. - :type devices_per_node: int :param first_device: The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. 
- :type first_device: int """ db_script = DBScript( name=name, diff --git a/smartsim/error/__init__.py b/smartsim/error/__init__.py index 4268905e6..3a40548e7 100644 --- a/smartsim/error/__init__.py +++ b/smartsim/error/__init__.py @@ -32,6 +32,7 @@ ShellError, SmartSimError, SSConfigError, + SSDBFilesNotParseable, SSDBIDConflictError, SSInternalError, SSReservedKeywordError, diff --git a/smartsim/error/errors.py b/smartsim/error/errors.py index 9a6954907..333258a34 100644 --- a/smartsim/error/errors.py +++ b/smartsim/error/errors.py @@ -87,6 +87,12 @@ class SSDBIDConflictError(SmartSimError): """ +class SSDBFilesNotParseable(SmartSimError): + """Raised when the files related to the database cannot be parsed. + Includes the case when the files do not exist. + """ + + # Internal Exceptions @@ -149,3 +155,7 @@ class UnproxyableStepError(TelemetryError): class SmartSimCLIActionCancelled(SmartSimError): """Raised when a `smart` CLI command is terminated""" + + +class PreviewFormatError(SSUnsupportedError): + """Raised when the output format of the preview method call is not supported""" diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 9fcc7b13e..6b9d6a1fb 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -24,19 +24,28 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# pylint: disable=too-many-lines + import os import os.path as osp import typing as t -from os import getcwd +from os import environ, getcwd from tabulate import tabulate +from smartsim._core.config import CONFIG from smartsim.error.errors import SSUnsupportedError +from smartsim.status import SmartSimStatus -from ._core import Controller, Generator, Manifest -from ._core.utils import init_default +from ._core import Controller, Generator, Manifest, previewrenderer from .database import Orchestrator -from .entity import Ensemble, Model, SmartSimEntity +from .entity import ( + Ensemble, + EntitySequence, + Model, + SmartSimEntity, + TelemetryConfiguration, +) from .error import SmartSimError from .log import ctx_exp_path, get_logger, method_contextualizer from .settings import Container, base, settings @@ -54,11 +63,26 @@ def _exp_path_map(exp: "Experiment") -> str: _contextualize = method_contextualizer(ctx_exp_path, _exp_path_map) +class ExperimentTelemetryConfiguration(TelemetryConfiguration): + """Customized telemetry configuration for an `Experiment`. Ensures + backwards compatible behavior with drivers using environment variables + to enable experiment telemetry""" + + def __init__(self) -> None: + super().__init__(enabled=CONFIG.telemetry_enabled) + + def _on_enable(self) -> None: + """Modify the environment variable to enable telemetry.""" + environ["SMARTSIM_FLAG_TELEMETRY"] = "1" + + def _on_disable(self) -> None: + """Modify the environment variable to disable telemetry.""" + environ["SMARTSIM_FLAG_TELEMETRY"] = "0" + + # pylint: disable=no-self-use class Experiment: - """Experiments are the Python user interface for SmartSim. - - Experiment is a factory class that creates stages of a workflow + """Experiment is a factory class that creates stages of a workflow and manages their execution. 
The instances created by an Experiment represent executable code @@ -80,7 +104,7 @@ def __init__( exp_path: t.Optional[str] = None, launcher: str = "local", ): - """Initialize an Experiment instance + """Initialize an Experiment instance. With the default settings, the Experiment will use the local launcher, which will start all Experiment created @@ -101,10 +125,10 @@ def __init__( exp = Experiment(name="my_exp", launcher="slurm") - If you wish your driver script and Experiment to be run across + If you want your Experiment driver script to be run across multiple system with different schedulers (workload managers) - you can also use the `auto` argument to have the Experiment guess - which launcher to use based on system installed binaries and libraries + you can also use the `auto` argument to have the Experiment detect + which launcher to use based on system installed binaries and libraries. .. highlight:: python .. code-block:: python @@ -118,15 +142,11 @@ def __init__( from the Experiment. :param name: name for the ``Experiment`` - :type name: str - :param exp_path: path to location of ``Experiment`` directory if generated - :type exp_path: str, optional + :param exp_path: path to location of ``Experiment`` directory :param launcher: type of launcher being used, options are "slurm", "pbs", "lsf", or "local". If set to "auto", an attempt will be made to find an available launcher on the system. 
- Defaults to "local" - :type launcher: str, optional """ self.name = name if exp_path: @@ -135,21 +155,37 @@ def __init__( if not osp.isdir(osp.abspath(exp_path)): raise NotADirectoryError("Experiment path provided does not exist") exp_path = osp.abspath(exp_path) - self.exp_path: str = init_default(osp.join(getcwd(), name), exp_path, str) + else: + exp_path = osp.join(getcwd(), name) - if launcher == "auto": - launcher = detect_launcher() - if launcher == "cobalt": - raise SSUnsupportedError("Cobalt launcher is no longer supported.") + self.exp_path = exp_path - self._control = Controller(launcher=launcher) self._launcher = launcher.lower() + + if self._launcher == "auto": + self._launcher = detect_launcher() + if self._launcher == "cobalt": + raise SSUnsupportedError("Cobalt launcher is no longer supported.") + + if launcher == "dragon": + self._set_dragon_server_path() + + self._control = Controller(launcher=self._launcher) + self.db_identifiers: t.Set[str] = set() + self._telemetry_cfg = ExperimentTelemetryConfiguration() + + def _set_dragon_server_path(self) -> None: + """Set path for dragon server through environment varialbes""" + if not "SMARTSIM_DRAGON_SERVER_PATH" in environ: + environ["SMARTSIM_DRAGON_SERVER_PATH_EXP"] = osp.join( + self.exp_path, CONFIG.dragon_default_subdir + ) @_contextualize def start( self, - *args: t.Any, + *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], block: bool = True, summary: bool = False, kill_on_interrupt: bool = True, @@ -168,7 +204,7 @@ def start( model = exp.create_model("my_model", settings) exp.start(model) - Multiple instance can also be passed to the start method + Multiple entity instances can also be passed to the start method at once no matter which type of instance they are. These will all be launched together. @@ -194,18 +230,13 @@ def start( zombie processes will need to be manually killed. 
:param block: block execution until all non-database - jobs are finished, defaults to True - :type block: bool, optional - :param summary: print a launch summary prior to launch, - defaults to False - :type summary: bool, optional + jobs are finished + :param summary: print a launch summary prior to launch :param kill_on_interrupt: flag for killing jobs when ^C (SIGINT) signal is received. - - :type kill_on_interrupt: bool, optional """ - start_manifest = Manifest(*args) + self._create_entity_dir(start_manifest) try: if summary: self._launch_summary(start_manifest) @@ -221,7 +252,9 @@ def start( raise @_contextualize - def stop(self, *args: t.Any) -> None: + def stop( + self, *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] + ) -> None: """Stop specific instances launched by this ``Experiment`` Instances of ``Model``, ``Ensemble`` and ``Orchestrator`` @@ -241,6 +274,7 @@ def stop(self, *args: t.Any) -> None: # multiple exp.stop(model_1, model_2, db, ensemble) + :param args: One or more SmartSimEntity or EntitySequence objects. :raises TypeError: if wrong type :raises SmartSimError: if stop request fails """ @@ -260,15 +294,15 @@ def stop(self, *args: t.Any) -> None: @_contextualize def generate( self, - *args: t.Any, + *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]], tag: t.Optional[str] = None, overwrite: bool = False, verbose: bool = False, ) -> None: """Generate the file structure for an ``Experiment`` - ``Experiment.generate`` creates directories for each instance - passed to organize Experiments that launch many instances. + ``Experiment.generate`` creates directories for each entity + passed to organize Experiments that launch many entities. If files or directories are attached to ``Model`` objects using ``Model.attach_generator_files()``, those files or @@ -279,12 +313,8 @@ def generate( can all be passed as arguments to the generate method. 
:param tag: tag used in `to_configure` generator files - :type tag: str, optional - :param overwrite: overwrite existing folders and contents, - defaults to False - :type overwrite: bool, optional + :param overwrite: overwrite existing folders and contents :param verbose: log parameter settings to std out - :type verbose: bool """ try: generator = Generator(self.exp_path, overwrite=overwrite, verbose=verbose) @@ -324,14 +354,10 @@ def poll( that all jobs launched by this experiment will be killed, and the zombie processes will need to be manually killed. - :param interval: frequency (in seconds) of logging to stdout, - defaults to 10 seconds - :type interval: int, optional - :param verbose: set verbosity, defaults to True - :type verbose: bool, optional + :param interval: frequency (in seconds) of logging to stdout + :param verbose: set verbosity :param kill_on_interrupt: flag for killing jobs when SIGINT is received - :type kill_on_interrupt: bool, optional - :raises SmartSimError: + :raises SmartSimError: if poll request fails """ try: self._control.poll(interval, verbose, kill_on_interrupt=kill_on_interrupt) @@ -351,9 +377,7 @@ def finished(self, entity: SmartSimEntity) -> bool: by the user. :param entity: object launched by this ``Experiment`` - :type entity: Model | Ensemble - :returns: True if job has completed, False otherwise - :rtype: bool + :returns: True if the job has finished, False otherwise :raises SmartSimError: if entity has not been launched by this ``Experiment`` """ @@ -364,8 +388,10 @@ def finished(self, entity: SmartSimEntity) -> bool: raise @_contextualize - def get_status(self, *args: t.Any) -> t.List[str]: - """Query the status of launched instances + def get_status( + self, *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] + ) -> t.List[SmartSimStatus]: + """Query the status of launched entity instances Return a smartsim.status string representing the status of the launched instance. 
@@ -387,12 +413,11 @@ def get_status(self, *args: t.Any) -> t.List[str]: assert all(complete) :returns: status of the instances passed as arguments - :rtype: list[str] :raises SmartSimError: if status retrieval fails """ try: manifest = Manifest(*args) - statuses: t.List[str] = [] + statuses: t.List[SmartSimStatus] = [] for entity in manifest.models: statuses.append(self._control.get_entity_status(entity)) for entity_list in manifest.all_entity_lists: @@ -411,6 +436,7 @@ def create_ensemble( run_settings: t.Optional[base.RunSettings] = None, replicas: t.Optional[int] = None, perm_strategy: str = "all_perm", + path: t.Optional[str] = None, **kwargs: t.Any, ) -> Ensemble: """Create an ``Ensemble`` of ``Model`` instances @@ -419,7 +445,7 @@ def create_ensemble( if using a non-local launcher. e.g. slurm Ensembles require one of the following combinations - of arguments + of arguments: - ``run_settings`` and ``params`` - ``run_settings`` and ``replicas`` @@ -428,44 +454,43 @@ def create_ensemble( - ``batch_settings``, ``run_settings``, and ``replicas`` If given solely batch settings, an empty ensemble - will be created that models can be added to manually + will be created that Models can be added to manually through ``Ensemble.add_model()``. - The entire ensemble will launch as one batch. + The entire Ensemble will launch as one batch. Provided batch and run settings, either ``params`` or ``replicas`` must be passed and the entire ensemble will launch as a single batch. Provided solely run settings, either ``params`` - or ``replicas`` must be passed and the ensemble members + or ``replicas`` must be passed and the Ensemble members will each launch sequentially. The kwargs argument can be used to pass custom input parameters to the permutation strategy. 
- :param name: name of the ensemble - :type name: str + :param name: name of the ``Ensemble`` :param params: parameters to expand into ``Model`` members - :type params: dict[str, Any] :param batch_settings: describes settings for ``Ensemble`` as batch workload - :type batch_settings: BatchSettings :param run_settings: describes how each ``Model`` should be executed - :type run_settings: RunSettings :param replicas: number of replicas to create - :type replicas: int :param perm_strategy: strategy for expanding ``params`` into ``Model`` instances from params argument options are "all_perm", "step", "random" - or a callable function. Default is "all_perm". - :type perm_strategy: str, optional + or a callable function. :raises SmartSimError: if initialization fails :return: ``Ensemble`` instance - :rtype: Ensemble """ + if name is None: + raise AttributeError("Entity has no name. Please set name attribute.") + check_path = path or osp.join(self.exp_path, name) + entity_path: str = osp.abspath(check_path) + try: new_ensemble = Ensemble( - name, - params or {}, + name=name, + params=params or {}, + path=entity_path, batch_settings=batch_settings, run_settings=run_settings, perm_strat=perm_strategy, @@ -497,27 +522,27 @@ def create_model( ``Model`` instances can be launched sequentially, as a batch job, or as a group by adding them into an ``Ensemble``. - All models require a reference to run settings to specify which + All ``Models`` require a reference to run settings to specify which executable to launch as well provide options for how to launch the executable with the underlying WLM. Furthermore, batch a - reference to a batch settings can be added to launch the model - as a batch job through ``Experiment.start``. If a model with + reference to a batch settings can be added to launch the ``Model`` + as a batch job through ``Experiment.start``. 
If a ``Model`` with a reference to a set of batch settings is added to a larger entity with its own set of batch settings (for e.g. an ``Ensemble``) the batch settings of the larger entity will take - precedence and the batch setting of the model will be + precedence and the batch setting of the ``Model`` will be strategically ignored. Parameters supplied in the `params` argument can be written into - configuration files supplied at runtime to the model through + configuration files supplied at runtime to the ``Model`` through ``Model.attach_generator_files``. `params` can also be turned into executable arguments by calling ``Model.params_to_args`` By default, ``Model`` instances will be executed in the - current working directory if no `path` argument is supplied. + exp_path/model_name directory if no `path` argument is supplied. If a ``Model`` instance is passed to ``Experiment.generate``, a directory within the ``Experiment`` directory will be created - to house the input and output files from the model. + to house the input and output files from the ``Model``. Example initialization of a ``Model`` instance @@ -553,36 +578,31 @@ def create_model( deprecated, but remains as an alias for ``Model.colocate_db_tcp`` for backward compatibility. 
- :param name: name of the model - :type name: str + :param name: name of the ``Model`` :param run_settings: defines how ``Model`` should be run - :type run_settings: RunSettings - :param params: model parameters for writing into configuration files - :type params: dict, optional - :param path: path to where the model should be executed at runtime - :type path: str, optional - :param enable_key_prefixing: If True, data sent to the Orchestrator + :param params: ``Model`` parameters for writing into configuration files + :param path: path to where the ``Model`` should be executed at runtime + :param enable_key_prefixing: If True, data sent to the ``Orchestrator`` using SmartRedis from this ``Model`` will be prefixed with the ``Model`` name. - Default is True. - :type enable_key_prefixing: bool, optional - :param batch_settings: Settings to run model individually as a batch job, - defaults to None - :type batch_settings: BatchSettings | None + :param batch_settings: Settings to run ``Model`` individually as a batch job. :raises SmartSimError: if initialization fails :return: the created ``Model`` - :rtype: Model """ - path = init_default(getcwd(), path, str) - - if path is None: - path = getcwd() + if name is None: + raise AttributeError("Entity has no name. Please set name attribute.") + check_path = path or osp.join(self.exp_path, name) + entity_path: str = osp.abspath(check_path) if params is None: params = {} try: new_model = Model( - name, params, path, run_settings, batch_settings=batch_settings + name=name, + params=params, + path=entity_path, + run_settings=run_settings, + batch_settings=batch_settings, ) if enable_key_prefixing: new_model.enable_key_prefixing() @@ -605,7 +625,7 @@ def create_run_settings( """Create a ``RunSettings`` instance. run_command="auto" will attempt to automatically - match a run command on the system with a RunSettings + match a run command on the system with a ``RunSettings`` class in SmartSim. 
If found, the class corresponding to that run_command will be created and returned. @@ -626,19 +646,12 @@ class in SmartSim. If found, the class corresponding - jsrun (LSF) :param run_command: command to run the executable - :type run_command: str :param exe: executable to run - :type exe: str :param exe_args: arguments to pass to the executable - :type exe_args: list[str], optional :param run_args: arguments to pass to the ``run_command`` - :type run_args: dict[str, t.Union[int, str, float, None]], optional :param env_vars: environment variables to pass to the executable - :type env_vars: dict[str, str], optional :param container: if execution environment is containerized - :type container: Container, optional :return: the created ``RunSettings`` - :rtype: RunSettings """ try: @@ -689,18 +702,12 @@ def create_batch_settings( batch_args=batch_args) bs.set_account("default") - :param nodes: number of nodes for batch job, defaults to 1 - :type nodes: int, optional - :param time: length of batch job, defaults to "" - :type time: str, optional - :param queue: queue or partition (if slurm), defaults to "" - :type queue: str, optional - :param account: user account name for batch system, defaults to "" - :type account: str, optional - :param batch_args: additional batch arguments, defaults to None - :type batch_args: dict[str, str], optional + :param nodes: number of nodes for batch job + :param time: length of batch job + :param queue: queue or partition (if slurm) + :param account: user account name for batch system + :param batch_args: additional batch arguments :return: a newly created BatchSettings instance - :rtype: BatchSettings :raises SmartSimError: if batch creation fails """ try: @@ -721,11 +728,12 @@ def create_batch_settings( def create_database( self, port: int = 6379, + path: t.Optional[str] = None, db_nodes: int = 1, batch: bool = False, hosts: t.Optional[t.Union[t.List[str], str]] = None, run_command: str = "auto", - interface: str = "ipogif0", + 
interface: t.Union[str, t.List[str]] = "ipogif0", account: t.Optional[str] = None, time: t.Optional[str] = None, queue: t.Optional[str] = None, @@ -733,60 +741,49 @@ def create_database( db_identifier: str = "orchestrator", **kwargs: t.Any, ) -> Orchestrator: - """Initialize an Orchestrator database + """Initialize an ``Orchestrator`` database The ``Orchestrator`` database is a key-value store based - on Redis that can be launched together with other Experiment + on Redis that can be launched together with other ``Experiment`` created instances for online data storage. When launched, ``Orchestrator`` can be used to communicate data between Fortran, Python, C, and C++ applications. Machine Learning models in Pytorch, Tensorflow, and ONNX (i.e. scikit-learn) - can also be stored within the Orchestrator database where they + can also be stored within the ``Orchestrator`` database where they can be called remotely and executed on CPU or GPU where the database is hosted. To enable a SmartSim ``Model`` to communicate with the database the workload must utilize the SmartRedis clients. 
For more information on the database, and SmartRedis clients see the - documentation at www.craylabs.org - - :param port: TCP/IP port, defaults to 6379 - :type port: int, optional - :param db_nodes: number of database shards, defaults to 1 - :type db_nodes: int, optional - :param batch: run as a batch workload, defaults to False - :type batch: bool, optional - :param hosts: specify hosts to launch on, defaults to None - :type hosts: list[str], optional - :param run_command: specify launch binary or detect automatically, - defaults to "auto" - :type run_command: str, optional - :param interface: Network interface, defaults to "ipogif0" - :type interface: str, optional - :param account: account to run batch on, defaults to None - :type account: str, optional - :param time: walltime for batch 'HH:MM:SS' format, defaults to None - :type time: str, optional - :param queue: queue to run the batch on, defaults to None - :type queue: str, optional - :param single_cmd: run all shards with one (MPMD) command, defaults to True - :type single_cmd: bool, optional + documentation at https://www.craylabs.org/docs/smartredis.html + + :param port: TCP/IP port + :param db_nodes: number of database shards + :param batch: run as a batch workload + :param hosts: specify hosts to launch on + :param run_command: specify launch binary or detect automatically + :param interface: Network interface + :param account: account to run batch on + :param time: walltime for batch 'HH:MM:SS' format + :param queue: queue to run the batch on + :param single_cmd: run all shards with one (MPMD) command :param db_identifier: an identifier to distinguish this orchestrator in - multiple-database experiments, defaults to "orchestrator" - :type db_identifier: str, optional + multiple-database experiments :raises SmartSimError: if detection of launcher or of run command fails :raises SmartSimError: if user indicated an incompatible run command for the launcher - :return: Orchestrator - :rtype: Orchestrator or 
derived class + :return: Orchestrator or derived class """ - self.append_to_db_identifier_list(db_identifier) - + self._append_to_db_identifier_list(db_identifier) + check_path = path or osp.join(self.exp_path, db_identifier) + entity_path: str = osp.abspath(check_path) return Orchestrator( port=port, + path=entity_path, db_nodes=db_nodes, batch=batch, hosts=hosts, @@ -813,7 +810,6 @@ def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: :param checkpoint: the `smartsim_db.dat` file created when an ``Orchestrator`` is launched - :type checkpoint: str """ try: orc = self._control.reload_saved_db(checkpoint) @@ -822,6 +818,53 @@ def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: logger.error(e) raise + def preview( + self, + *args: t.Any, + verbosity_level: previewrenderer.Verbosity = previewrenderer.Verbosity.INFO, + output_format: previewrenderer.Format = previewrenderer.Format.PLAINTEXT, + output_filename: t.Optional[str] = None, + ) -> None: + """Preview entity information prior to launch. This method + aggregates multiple pieces of information to give users insight + into what and how entities will be launched. Any instance of + ``Model``, ``Ensemble``, or ``Orchestrator`` created by the + Experiment can be passed as an argument to the preview method. + + Verbosity levels: + - info: Display user-defined fields and entities. + - debug: Display user-defined field and entities and auto-generated + fields. + - developer: Display user-defined field and entities, auto-generated + fields, and run commands. + + :param verbosity_level: verbosity level specified by user, defaults to info. + :param output_format: Set output format. The possible accepted + output formats are ``plain_text``. + Defaults to ``plain_text``. + :param output_filename: Specify name of file and extension to write + preview data to. If no output filename is set, the preview will be + output to stdout. Defaults to None. 
+ """ + + # Retrieve any active orchestrator jobs + active_dbjobs = self._control.active_orchestrator_jobs + + preview_manifest = Manifest(*args) + + previewrenderer.render( + self, + preview_manifest, + verbosity_level, + output_format, + output_filename, + active_dbjobs, + ) + + @property + def launcher(self) -> str: + return self._launcher + @_contextualize def summary(self, style: str = "github") -> str: """Return a summary of the ``Experiment`` @@ -830,12 +873,9 @@ def summary(self, style: str = "github") -> str: launched and completed in this ``Experiment`` :param style: the style in which the summary table is formatted, - for a full list of styles see: - https://github.com/astanin/python-tabulate#table-format, - defaults to "github" - :type style: str, optional + for a full list of styles see the table-format section of: + https://github.com/astanin/python-tabulate :return: tabulate string of ``Experiment`` history - :rtype: str """ values = [] headers = [ @@ -869,11 +909,18 @@ def summary(self, style: str = "github") -> str: disable_numparse=True, ) + @property + def telemetry(self) -> TelemetryConfiguration: + """Return the telemetry configuration for this entity. + + :returns: configuration of telemetry for this entity + """ + return self._telemetry_cfg + def _launch_summary(self, manifest: Manifest) -> None: """Experiment pre-launch summary of entities that will be launched :param manifest: Manifest of deployables. 
- :type manifest: Manifest """ summary = "\n\n=== Launch Summary ===\n" @@ -894,10 +941,27 @@ def _launch_summary(self, manifest: Manifest) -> None: logger.info(summary) + def _create_entity_dir(self, start_manifest: Manifest) -> None: + def create_entity_dir(entity: t.Union[Orchestrator, Model, Ensemble]) -> None: + if not os.path.isdir(entity.path): + os.makedirs(entity.path) + + for model in start_manifest.models: + create_entity_dir(model) + + for orch in start_manifest.dbs: + create_entity_dir(orch) + + for ensemble in start_manifest.ensembles: + create_entity_dir(ensemble) + + for member in ensemble.models: + create_entity_dir(member) + def __str__(self) -> str: return self.name - def append_to_db_identifier_list(self, db_identifier: str) -> None: + def _append_to_db_identifier_list(self, db_identifier: str) -> None: """Check if db_identifier already exists when calling create_database""" if db_identifier in self.db_identifiers: logger.warning( @@ -907,35 +971,3 @@ def append_to_db_identifier_list(self, db_identifier: str) -> None: ) # Otherwise, add self.db_identifiers.add(db_identifier) - - def enable_telemetry(self) -> None: - """Experiments will start producing telemetry for all entities run - through ``Experiment.start`` - - .. warning:: - - This method is currently implemented so that ALL ``Experiment`` - instances will begin producing telemetry data. In the future it - is planned to have this method work on a "per instance" basis! - """ - self._set_telemetry(True) - - def disable_telemetry(self) -> None: - """Experiments will stop producing telemetry for all entities run - through ``Experiment.start`` - - .. warning:: - - This method is currently implemented so that ALL ``Experiment`` - instances will stop producing telemetry data. In the future it - is planned to have this method work on a "per instance" basis! 
- """ - self._set_telemetry(False) - - @staticmethod - def _set_telemetry(switch: bool, /) -> None: - tm_key = "SMARTSIM_FLAG_TELEMETRY" - if switch: - os.environ[tm_key] = "1" - else: - os.environ[tm_key] = "0" diff --git a/smartsim/log.py b/smartsim/log.py index 55cb88afb..3d6c0860e 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -27,6 +27,7 @@ import functools import logging import pathlib +import socket import sys import threading import typing as t @@ -39,7 +40,8 @@ # constants DEFAULT_DATE_FORMAT: t.Final[str] = "%H:%M:%S" DEFAULT_LOG_FORMAT: t.Final[str] = ( - "%(asctime)s %(hostname)s %(name)s[%(process)d] %(levelname)s %(message)s" + "%(asctime)s %(hostname)s %(name)s[%(process)d:%(threadName)s] " + "%(levelname)s %(message)s" ) EXPERIMENT_LOG_FORMAT = DEFAULT_LOG_FORMAT.replace("s[%", "s {%(exp_path)s} [%") @@ -74,9 +76,7 @@ def _translate_log_level(user_log_level: str = "info") -> str: extremely verbose logging. :param user_log_level: log level specified by user, defaults to info - :type user_log_level: str :returns: Log level for coloredlogs - :rtype: str """ user_log_level = user_log_level.lower() if user_log_level in ["info", "debug", "warning"]: @@ -94,17 +94,12 @@ def get_exp_log_paths() -> t.Tuple[t.Optional[pathlib.Path], t.Optional[pathlib. Returns None for both paths if experiment context is unavailable. 
:returns: 2-tuple of paths to experiment logs in form (output_path, error_path) - if telemetry is enabled, a 2-tuple of None otherwise - :rtype: Tuple[pathlib.Path | None, pathlib.Path | None] """ default_paths = None, None - if not CONFIG.telemetry_enabled: - return default_paths - - if _exp_path := ctx_exp_path.get(): - file_out = pathlib.Path(_exp_path) / CONFIG.telemetry_subdir / "smartsim.out" - file_err = pathlib.Path(_exp_path) / CONFIG.telemetry_subdir / "smartsim.err" + if _path := ctx_exp_path.get(): + file_out = pathlib.Path(_path) / CONFIG.telemetry_subdir / "logs/smartsim.out" + file_err = pathlib.Path(_path) / CONFIG.telemetry_subdir / "logs/smartsim.err" return file_out, file_err return default_paths @@ -127,14 +122,34 @@ def filter(self, record: logging.LogRecord) -> bool: """Enrich log records with active experiment context :param record: the record to evaluate for filtering - :type record: logging.LogRecord :returns: always True - :rtype: bool """ record.exp_path = ctx_exp_path.get() return True +class HostnameFilter(logging.Filter): + """Filter that performs enrichment of a log record by adding + the hostname of the machine executing the code""" + + def __init__(self, name: str = "") -> None: + super().__init__(name) + self._hostname = "" + + @property + @functools.lru_cache + def hostname(self) -> str: + """Returns the hostname of the machine executing the code""" + self._hostname = socket.gethostname() + return self._hostname + + def filter(self, record: logging.LogRecord) -> bool: + # the hostname may already added if using the `ColoredLogs` plugin + if not hasattr(record, "hostname"): + record.hostname = self.hostname + return True + + class ContextAwareLogger(logging.Logger): """A logger customized to automatically write experiment logs to a dynamic target directory by inspecting the value of a context var""" @@ -194,13 +209,9 @@ def get_logger( logger.warning("This is a warning message") :param name: the name of the desired logger - :type 
name: str :param log_level: what level to set the logger to - :type log_level: str :param fmt: the format of the log messages - :type fmt: str :returns: logger instance - :rtype: logging.Logger """ # if name is None, then logger is the root logger # if not root logger, get the name of file without prefix. @@ -225,7 +236,6 @@ def __init__(self, maximum_level: str = "INFO"): """Create a low-pass log filter allowing messages below a specific log level :param maximum_level: The maximum log level to be passed by the filter - :type maximum_level: str """ super().__init__() self.max = maximum_level @@ -234,9 +244,7 @@ def filter(self, record: logging.LogRecord) -> bool: """Filter log records; pass those less than or equal to the maximum level :param record: the record to evaluate for filtering - :type record: logging.LogRecord :returns: True if record level passes filter, False otherwise - :rtype: bool """ # If a string representation of the level is passed in, # the corresponding numeric value is returned. @@ -249,12 +257,9 @@ def log_to_file(filename: str, log_level: str = "debug") -> None: allowing subsequent logging calls to be sent to filename. :param filename: the name of the desired log file. - :type filename: str - :param log_level: as defined in get_logger. Can be specified to allow the file to store more or less verbose logging information. - :type log_level: str """ logger = logging.getLogger("SmartSim") stream = open( # pylint: disable=consider-using-with @@ -274,19 +279,13 @@ def log_to_exp_file( allowing subsequent logging calls to be sent to filename. :param filename: the name of the desired log file. - :type filename: str :param log_level: as defined in get_logger. Can be specified to allow the file to store more or less verbose logging information. 
- :type log_level: int | str :param logger: an existing logger to add the handler to - :type logger: (optional) logging.Logger :param fmt: a log format for the handler (otherwise, EXPERIMENT_LOG_FORMAT) - :type fmt: (optional) str :param log_filter: log filter to attach to handler - :type log_filter: (optional) logging.Filter :return: logging.Handler - :rtype: logging.Handler """ # ensure logs are written even if specified dir doesn't exist log_path = pathlib.Path(filename) @@ -322,9 +321,8 @@ def method_contextualizer( must accept an instance of matching type. :param ctx_var: The ContextVar that will be modified - :type ctx_var: ContextVar :param ctx_map: A function that returns the value to be set to ctx_var - :type ctx_map: t.Callable[[_T], _ContextT]""" + """ def _contextualize( fn: "t.Callable[Concatenate[_T, _PR], _RT]", / diff --git a/smartsim/ml/data.py b/smartsim/ml/data.py index 4cdc27c06..6175259b2 100644 --- a/smartsim/ml/data.py +++ b/smartsim/ml/data.py @@ -57,13 +57,9 @@ class DataInfo: can be accessed in ``DataInfo.sample_name`` and ``DataInfo.target_name``. :param list_name: Name of the aggregation list used for sample datasets - :type list_name: str :param sample_name: Name of tensor holding training samples in stored datasets. - :type sample_name: str :param target_name: Name of tensor holding targets or labels in stored datasets. - :type target_name: str :num_classes: Number of classes (for categorical data). - :type num_classes: int | None """ def __init__( @@ -86,7 +82,6 @@ def publish(self, client: Client) -> None: stored as metastrings and integers stored as metascalars. :param client: Client to connect to Database - :type client: SmartRedis.Client """ info_ds = Dataset(self._ds_name) info_ds.add_meta_string("sample_name", self.sample_name) @@ -104,16 +99,16 @@ def download(self, client: Client) -> None: on the DB, the object members are not modified. 
:param client: Client to connect to Database - :type client: SmartRedis.Client """ try: info_ds = client.get_dataset(self._ds_name) - except RedisReplyError: + except RedisReplyError as e: # If the info was not published, proceed with default parameters logger.warning( "Could not retrieve data for DataInfo object, the following " "values will be kept." ) + logger.error(f"Original error from Redis was {e}") logger.warning(str(self)) return self.sample_name = info_ds.get_meta_strings("sample_name")[0] @@ -148,21 +143,13 @@ class TrainingDataUploader: by the attributes of this class. :param list_name: Name of the dataset as stored on the Orchestrator - :type list_name: str :param sample_name: Name of samples tensor in uploaded Datasets - :type sample_name: str :param target_name: Name of targets tensor (if needed) in uploaded Datasets - :type target_name: str :param num_classes: Number of classes of targets, if categorical - :type num_classes: int :param cluster: Whether the SmartSim Orchestrator is being run as a cluster - :type cluster: bool :param address: Address of Redis DB as : - :type address: str :param rank: Rank of DataUploader in multi-process application (e.g. MPI rank). - :type rank: int :param verbose: If output should be logged to screen. - :type verbose: bool """ @@ -266,35 +253,23 @@ class DataDownloader: - shuffle the dataset if `shuffle` is set to ``True``. :param batch_size: Size of batches obtained with __iter__ - :type batch_size: int :param dynamic: Whether new batches should be donwnloaded when ``update_data`` is called. 
- :type dtnamic: bool :param shuffle: whether order of samples has to be shuffled when calling `update_data` - :type shuffle: bool :param data_info_or_list_name: DataInfo object with details about dataset to download, if a string is passed, it is used to download DataInfo data from DB, assuming it was stored with ``list_name=data_info_or_list_name`` - :type data_info_or_list_name: DataInfo | str :param list_name: Name of aggregation list used to upload data - :type list_name: str :param cluster: Whether the Orchestrator will be run as a cluster - :type cluster: bool :param address: Address of Redis client as : - :type address: str :param replica_rank: When StaticDataDownloader is used distributedly, indicates the rank of this object - :type replica_rank: int :param num_replicas: When BatchDownlaoder is used distributedly, indicates the total number of ranks - :type num_replicas: int :param verbose: Whether log messages should be printed - :type verbose: bool :param init_samples: whether samples should be initialized in the constructor - :type init_samples: bool :param max_fetch_trials: maximum number of attempts to initialize data - :type max_fetch_trials: int """ def __init__( @@ -310,6 +285,7 @@ def __init__( verbose: bool = False, init_samples: bool = True, max_fetch_trials: int = -1, + wait_interval: float = 10.0, ) -> None: self.address = address self.cluster = cluster @@ -336,7 +312,7 @@ def __init__( self.set_replica_parameters(replica_rank, num_replicas) if init_samples: - self.init_samples(max_fetch_trials) + self.init_samples(max_fetch_trials, wait_interval) @property def client(self) -> Client: @@ -378,7 +354,6 @@ def need_targets(self) -> bool: """Compute if targets have to be downloaded. 
:return: Whether targets (or labels) should be downloaded - :rtype: bool """ return bool(self.target_name) and not self.autoencoding @@ -404,13 +379,13 @@ def __iter__( self._data_generation(self._calc_indices(idx)) for idx in range(len(self)) ) - def init_samples(self, init_trials: int = -1) -> None: + def init_samples(self, init_trials: int = -1, wait_interval: float = 10.0) -> None: """Initialize samples (and targets, if needed). A new attempt to download samples will be made every ten seconds, for ``init_trials`` times. + :param init_trials: maximum number of attempts to fetch data - :type init_trials: int """ self._client = Client(self.cluster, self.address) @@ -418,10 +393,10 @@ def init_samples(self, init_trials: int = -1) -> None: max_trials = init_trials or -1 while not self and num_trials != max_trials: self._update_samples_and_targets() - self.log( - "DataLoader could not download samples, will try again in 10 seconds" - ) - time.sleep(10) + msg = "DataLoader could not download samples, will try again in " + msg += f"{wait_interval} seconds" + self.log(msg) + time.sleep(wait_interval) num_trials += 1 if not self: diff --git a/smartsim/ml/tf/__init__.py b/smartsim/ml/tf/__init__.py index eb3cb565e..46d89d733 100644 --- a/smartsim/ml/tf/__init__.py +++ b/smartsim/ml/tf/__init__.py @@ -35,21 +35,20 @@ try: import tensorflow as tf - - installed_tf = Version_(tf.__version__) - assert installed_tf >= "2.4.0" - except ImportError: # pragma: no cover raise ModuleNotFoundError( f"TensorFlow {TF_VERSION} is not installed. " - "Please install it to use smartsim.tf" + "Please install it to use smartsim.ml.tf" ) from None + +try: + installed_tf = Version_(tf.__version__) + assert installed_tf >= TF_VERSION except AssertionError: # pragma: no cover - msg = ( + raise SmartSimError( f"TensorFlow >= {TF_VERSION} is required for smartsim. 
" f"tf, you have {tf.__version__}" - ) - raise SmartSimError() from None + ) from None # pylint: disable=wrong-import-position diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index 69c8e2580..cf69b65e5 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -47,13 +47,9 @@ def freeze_model( a trained model and put it inside an ``orchestrator`` instance :param model: TensorFlow or Keras model - :type model: tf.Module :param output_dir: output dir to save model file to - :type output_dir: str :param file_name: name of model file to create - :type file_name: str :return: path to model file, model input layer names, model output layer names - :rtype: str, list[str], list[str] """ # TODO figure out why layer names don't match up to # specified name in Model init. @@ -93,9 +89,7 @@ def serialize_model(model: keras.Model) -> t.Tuple[str, t.List[str], t.List[str] a trained model and put it inside an ``orchestrator`` instance. :param model: TensorFlow or Keras model - :type model: tf.Module :return: serialized model, model input layer names, model output layer names - :rtype: str, list[str], list[str] """ full_model = tf.function(model) diff --git a/smartsim/settings/__init__.py b/smartsim/settings/__init__.py index d417c9ef8..6e8f0bc96 100644 --- a/smartsim/settings/__init__.py +++ b/smartsim/settings/__init__.py @@ -27,6 +27,7 @@ from .alpsSettings import AprunSettings from .base import RunSettings, SettingsBase from .containers import Container, Singularity +from .dragonRunSettings import DragonRunSettings from .lsfSettings import BsubBatchSettings, JsrunSettings from .mpiSettings import MpiexecSettings, MpirunSettings, OrterunSettings from .palsSettings import PalsMpiexecSettings @@ -46,6 +47,7 @@ "SbatchSettings", "SrunSettings", "PalsMpiexecSettings", + "DragonRunSettings", "Container", "Singularity", ] diff --git a/smartsim/settings/alpsSettings.py b/smartsim/settings/alpsSettings.py index 5357312a5..54b9c7525 100644 --- 
a/smartsim/settings/alpsSettings.py +++ b/smartsim/settings/alpsSettings.py @@ -46,13 +46,9 @@ def __init__( ``AprunSettings`` can be used for the `pbs` launcher. :param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ super().__init__( exe, @@ -71,7 +67,6 @@ def make_mpmd(self, settings: RunSettings) -> None: into a single MPMD command joined with ':' :param settings: ``AprunSettings`` instance - :type settings: AprunSettings """ if self.colocated_db_settings: raise SSUnsupportedError( @@ -89,7 +84,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: This sets ``--cpus-per-pe`` :param cpus_per_task: number of cpus to use per task - :type cpus_per_task: int """ self.run_args["cpus-per-pe"] = int(cpus_per_task) @@ -99,7 +93,6 @@ def set_tasks(self, tasks: int) -> None: This sets ``--pes`` :param tasks: number of tasks - :type tasks: int """ self.run_args["pes"] = int(tasks) @@ -109,7 +102,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: This sets ``--pes-per-node`` :param tasks_per_node: number of tasks per node - :type tasks_per_node: int """ self.run_args["pes-per-node"] = int(tasks_per_node) @@ -117,7 +109,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -134,7 +125,6 @@ def set_hostlist_from_file(self, file_path: str) -> None: This sets 
``--node-list-file`` :param file_path: Path to the hostlist file - :type file_path: str """ self.run_args["node-list-file"] = file_path @@ -142,7 +132,6 @@ def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify a list of hosts to exclude for launching this job :param host_list: hosts to exclude - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -159,7 +148,6 @@ def set_cpu_bindings(self, bindings: t.Union[int, t.List[int]]) -> None: This sets ``--cpu-binding`` :param bindings: List of cpu numbers - :type bindings: list[int] | int """ if isinstance(bindings, int): bindings = [bindings] @@ -171,7 +159,6 @@ def set_memory_per_node(self, memory_per_node: int) -> None: This sets ``--memory-per-pe`` in megabytes :param memory_per_node: Per PE memory limit in megabytes - :type memory_per_node: int """ self.run_args["memory-per-pe"] = int(memory_per_node) @@ -181,7 +168,6 @@ def set_verbose_launch(self, verbose: bool) -> None: This sets ``--debug`` arg to the highest level :param verbose: Whether the job should be run verbosely - :type verbose: bool """ if verbose: self.run_args["debug"] = 7 @@ -194,7 +180,6 @@ def set_quiet_launch(self, quiet: bool) -> None: This sets ``--quiet`` :param quiet: Whether the job should be run quietly - :type quiet: bool """ if quiet: self.run_args["quiet"] = None @@ -205,7 +190,6 @@ def format_run_args(self) -> t.List[str]: """Return a list of ALPS formatted run arguments :return: list of ALPS arguments for these settings - :rtype: list[str] """ # args launcher uses args = [] @@ -228,7 +212,6 @@ def format_env_vars(self) -> t.List[str]: """Format the environment variables for aprun :return: list of env vars - :rtype: list[str] """ formatted = [] if self.env_vars: @@ -242,6 +225,5 @@ def set_walltime(self, walltime: str) -> None: Walltime is given in total number of seconds :param walltime: wall time - :type walltime: str """ 
self.run_args["cpu-time-limit"] = str(walltime) diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index 284d435c0..6373b52fd 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -75,19 +75,11 @@ def __init__( rs = RunSettings("echo", "hello", "mpirun", run_args={"-np": "2"}) :param exe: executable to run - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_command: launch binary (e.g. "srun"), defaults to empty str - :type run_command: str, optional - :param run_args: arguments for run command (e.g. `-np` for `mpiexec`), - defaults to None - :type run_args: dict[str, str], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional - :param container: container type for workload (e.g. "singularity"), - defaults to None - :type container: Container, optional + :param exe_args: executable arguments + :param run_command: launch binary (e.g. "srun") + :param run_args: arguments for run command (e.g. `-np` for `mpiexec`) + :param env_vars: environment vars to launch job with + :param container: container type for workload (e.g. "singularity") """ # Do not expand executable if running within a container self.exe = [exe] if container else [expand_exe_path(exe)] @@ -117,26 +109,50 @@ def __init__( @property def exe_args(self) -> t.Union[str, t.List[str]]: + """Return an immutable list of attached executable arguments. + + :returns: attached executable arguments + """ return self._exe_args @exe_args.setter def exe_args(self, value: t.Union[str, t.List[str], None]) -> None: + """Set the executable arguments. + + :param value: executable arguments + """ self._exe_args = self._build_exe_args(value) @property def run_args(self) -> t.Dict[str, t.Union[int, str, float, None]]: + """Return an immutable list of attached run arguments. 
+ + :returns: attached run arguments + """ return self._run_args @run_args.setter def run_args(self, value: t.Dict[str, t.Union[int, str, float, None]]) -> None: + """Set the run arguments. + + :param value: run arguments + """ self._run_args = copy.deepcopy(value) @property def env_vars(self) -> t.Dict[str, t.Optional[str]]: + """Return an immutable list of attached environment variables. + + :returns: attached environment variables + """ return self._env_vars @env_vars.setter def env_vars(self, value: t.Dict[str, t.Optional[str]]) -> None: + """Set the environment variables. + + :param value: environment variables + """ self._env_vars = copy.deepcopy(value) # To be overwritten by subclasses. Set of reserved args a user cannot change @@ -146,7 +162,6 @@ def set_nodes(self, nodes: int) -> None: """Set the number of nodes :param nodes: number of nodes to run with - :type nodes: int """ logger.warning( ( @@ -159,7 +174,6 @@ def set_tasks(self, tasks: int) -> None: """Set the number of tasks to launch :param tasks: number of tasks to launch - :type tasks: int """ logger.warning( ( @@ -172,7 +186,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: """Set the number of tasks per node :param tasks_per_node: number of tasks to launch per node - :type tasks_per_node: int """ logger.warning( ( @@ -185,7 +198,6 @@ def set_task_map(self, task_mapping: str) -> None: """Set a task mapping :param task_mapping: task mapping - :type task_mapping: str """ logger.warning( ( @@ -198,7 +210,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: """Set the number of cpus per task :param cpus_per_task: number of cpus per task - :type cpus_per_task: int """ logger.warning( ( @@ -211,7 +222,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] """ logger.warning( ( @@ -224,7 +234,6 @@ def set_hostlist_from_file(self, file_path: str) -> None: 
"""Use the contents of a file to specify the hostlist for this job :param file_path: Path to the hostlist file - :type file_path: str """ logger.warning( ( @@ -237,7 +246,6 @@ def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify a list of hosts to exclude for launching this job :param host_list: hosts to exclude - :type host_list: str | list[str] """ logger.warning( ( @@ -250,7 +258,6 @@ def set_cpu_bindings(self, bindings: t.Union[int, t.List[int]]) -> None: """Set the cores to which MPI processes are bound :param bindings: List specifing the cores to which MPI processes are bound - :type bindings: list[int] | int """ logger.warning( ( @@ -263,7 +270,6 @@ def set_memory_per_node(self, memory_per_node: int) -> None: """Set the amount of memory required per node in megabytes :param memory_per_node: Number of megabytes per node - :type memory_per_node: int """ logger.warning( ( @@ -276,7 +282,6 @@ def set_verbose_launch(self, verbose: bool) -> None: """Set the job to run in verbose mode :param verbose: Whether the job should be run verbosely - :type verbose: bool """ logger.warning( ( @@ -289,7 +294,6 @@ def set_quiet_launch(self, quiet: bool) -> None: """Set the job to run in quiet mode :param quiet: Whether the job should be run quietly - :type quiet: bool """ logger.warning( ( @@ -302,7 +306,6 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: """Copy executable file to allocated compute nodes :param dest_path: Path to copy an executable file - :type dest_path: str | None """ logger.warning( ( @@ -315,16 +318,25 @@ def set_time(self, hours: int = 0, minutes: int = 0, seconds: int = 0) -> None: """Automatically format and set wall time :param hours: number of hours to run job - :type hours: int :param minutes: number of minutes to run job - :type minutes: int :param seconds: number of seconds to run job - :type seconds: int """ return self.set_walltime( self._fmt_walltime(int(hours), int(minutes), int(seconds)) 
) + def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: + """Specify the node feature for this job + + :param feature_list: node feature to launch on + """ + logger.warning( + ( + "Feature specification not implemented for this " + f"RunSettings type: {type(self)}" + ) + ) + @staticmethod def _fmt_walltime(hours: int, minutes: int, seconds: int) -> str: """Convert hours, minutes, and seconds into valid walltime format @@ -332,13 +344,9 @@ def _fmt_walltime(hours: int, minutes: int, seconds: int) -> str: By defualt the formatted wall time is the total number of seconds. :param hours: number of hours to run job - :type hours: int :param minutes: number of minutes to run job - :type minutes: int :param seconds: number of seconds to run job - :type seconds: int :returns: Formatted walltime - :rtype: str """ time_ = hours * 3600 time_ += minutes * 60 @@ -349,7 +357,6 @@ def set_walltime(self, walltime: str) -> None: """Set the formatted walltime :param walltime: Time in format required by launcher`` - :type walltime: str """ logger.warning( ( @@ -362,7 +369,6 @@ def set_binding(self, binding: str) -> None: """Set binding :param binding: Binding - :type binding: str """ logger.warning( ( @@ -375,7 +381,6 @@ def set_mpmd_preamble(self, preamble_lines: t.List[str]) -> None: """Set preamble to a file to make a job MPMD :param preamble_lines: lines to put at the beginning of a file. - :type preamble_lines: list[str] """ logger.warning( ( @@ -388,7 +393,6 @@ def make_mpmd(self, settings: RunSettings) -> None: """Make job an MPMD job :param settings: ``RunSettings`` instance - :type settings: RunSettings """ logger.warning( ( @@ -404,7 +408,6 @@ def run_command(self) -> t.Optional[str]: Attempt to expand the path to the executable if possible :returns: launch binary e.g. 
mpiexec - :type: str | None """ cmd = self._run_command @@ -428,7 +431,6 @@ def update_env(self, env_vars: t.Dict[str, t.Union[str, int, float, bool]]) -> N :param env_vars: environment variables to update or add - :type env_vars: dict[str, Union[str, int, float, bool]] :raises TypeError: if env_vars values cannot be coerced to strings """ val_types = (str, int, float, bool) @@ -445,16 +447,8 @@ def add_exe_args(self, args: t.Union[str, t.List[str]]) -> None: """Add executable arguments to executable :param args: executable arguments - :type args: str | list[str] - :raises TypeError: if exe args are not strings """ - if isinstance(args, str): - args = args.split() - - for arg in args: - if not isinstance(arg, str): - raise TypeError("Executable arguments should be a list of str") - + args = self._build_exe_args(args) self._exe_args.extend(args) def set( @@ -503,11 +497,8 @@ def set( # otherwise returns ["exclusive", "None"] :param arg: name of the argument - :type arg: str :param value: value of the argument - :type value: str | None :param conditon: set the argument if condition evaluates to True - :type condition: bool """ if not isinstance(arg, str): raise TypeError("Argument name should be of type str") @@ -533,26 +524,26 @@ def set( @staticmethod def _build_exe_args(exe_args: t.Optional[t.Union[str, t.List[str]]]) -> t.List[str]: - """Convert exe_args input to a desired collection format""" - if exe_args: - if isinstance(exe_args, str): - return exe_args.split() - if isinstance(exe_args, list): - exe_args = copy.deepcopy(exe_args) - plain_type = all(isinstance(arg, (str)) for arg in exe_args) - if not plain_type: - nested_type = all( - all(isinstance(arg, (str)) for arg in exe_args_list) - for exe_args_list in exe_args - ) - if not nested_type: - raise TypeError( - "Executable arguments were not list of str or str" - ) - return exe_args - return exe_args - raise TypeError("Executable arguments were not list of str or str") - return [] + """Check and convert 
exe_args input to a desired collection format""" + if not exe_args: + return [] + + if isinstance(exe_args, list): + exe_args = copy.deepcopy(exe_args) + + if not ( + isinstance(exe_args, str) + or ( + isinstance(exe_args, list) + and all(isinstance(arg, str) for arg in exe_args) + ) + ): + raise TypeError("Executable arguments were not a list of str or a str.") + + if isinstance(exe_args, str): + return exe_args.split() + + return exe_args def format_run_args(self) -> t.List[str]: """Return formatted run arguments @@ -561,7 +552,6 @@ def format_run_args(self) -> t.List[str]: literally with no formatting. :return: list run arguments for these settings - :rtype: list[str] """ formatted = [] for arg, value in self.run_args.items(): @@ -573,7 +563,6 @@ def format_env_vars(self) -> t.List[str]: """Build environment variable string :returns: formatted list of strings to export variables - :rtype: list[str] """ formatted = [] for key, val in self.env_vars.items(): @@ -619,7 +608,6 @@ def batch_cmd(self) -> str: command. If we cannot, returns the batch command as is. :returns: batch command - :type: str """ if is_valid_cmd(self._batch_cmd): return expand_exe_path(self._batch_cmd) @@ -628,10 +616,18 @@ def batch_cmd(self) -> str: @property def batch_args(self) -> t.Dict[str, t.Optional[str]]: + """Retrieve attached batch arguments + + :returns: attached batch arguments + """ return self._batch_args @batch_args.setter def batch_args(self, value: t.Dict[str, t.Optional[str]]) -> None: + """Attach batch arguments + + :param value: dictionary of batch arguments + """ self._batch_args = copy.deepcopy(value) if value else {} def set_nodes(self, num_nodes: int) -> None: @@ -656,7 +652,6 @@ def set_batch_command(self, command: str) -> None: """Set the command used to launch the batch e.g. 
``sbatch`` :param command: batch command - :type command: str """ self._batch_cmd = command @@ -667,7 +662,6 @@ def add_preamble(self, lines: t.List[str]) -> None: start virtual environments before running the executables. :param line: lines to add to preamble. - :type line: str or list[str] """ if isinstance(lines, str): self._preamble += [lines] @@ -678,7 +672,10 @@ def add_preamble(self, lines: t.List[str]) -> None: @property def preamble(self) -> t.Iterable[str]: - """Return an iterable of preamble clauses to be prepended to the batch file""" + """Return an iterable of preamble clauses to be prepended to the batch file + + :return: attached preamble clauses + """ return (clause for clause in self._preamble) def __str__(self) -> str: # pragma: no-cover diff --git a/smartsim/settings/containers.py b/smartsim/settings/containers.py index bdba1ce88..d2fd4fca2 100644 --- a/smartsim/settings/containers.py +++ b/smartsim/settings/containers.py @@ -39,13 +39,9 @@ class Container: launch a workload within a container into a single object. :param image: local or remote path to container image - :type image: str :param args: arguments to container command - :type args: str | list[str], optional :param mount: paths to mount (bind) from host machine into image. - :type mount: str | list[str] | dict[str, str], optional :param working_directory: path of the working directory within the container - :type working_directory: str """ def __init__( @@ -70,7 +66,6 @@ def _containerized_run_command(self, run_command: str) -> str: """Return modified run_command with container commands prepended. :param run_command: run command from a RunSettings class - :type run_command: str """ raise NotImplementedError( "Containerized run command specification not implemented for this " @@ -99,11 +94,8 @@ class Singularity(Container): :param image: local or remote path to container image, e.g. 
``docker://sylabsio/lolcow`` - :type image: str :param args: arguments to 'singularity exec' command - :type args: str | list[str], optional :param mount: paths to mount (bind) from host machine into image. - :type mount: str | list[str] | dict[str, str], optional """ def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: diff --git a/smartsim/settings/dragonRunSettings.py b/smartsim/settings/dragonRunSettings.py new file mode 100644 index 000000000..b8baa4708 --- /dev/null +++ b/smartsim/settings/dragonRunSettings.py @@ -0,0 +1,78 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from __future__ import annotations + +import typing as t + +from ..log import get_logger +from .base import RunSettings + +logger = get_logger(__name__) + + +class DragonRunSettings(RunSettings): + def __init__( + self, + exe: str, + exe_args: t.Optional[t.Union[str, t.List[str]]] = None, + env_vars: t.Optional[t.Dict[str, t.Optional[str]]] = None, + **kwargs: t.Any, + ) -> None: + """Initialize run parameters for a Dragon process + + ``DragonRunSettings`` should only be used on systems where Dragon + is available and installed in the current environment. + + If an allocation is specified, the instance receiving these run + parameters will launch on that allocation. + + :param exe: executable to run + :param exe_args: executable arguments, defaults to None + :param env_vars: environment variables for job, defaults to None + :param alloc: allocation ID if running on existing alloc, defaults to None + """ + super().__init__( + exe, + exe_args, + run_command="", + env_vars=env_vars, + **kwargs, + ) + + def set_nodes(self, nodes: int) -> None: + """Set the number of nodes + + :param nodes: number of nodes to run with + """ + self.run_args["nodes"] = nodes + + def set_tasks_per_node(self, tasks_per_node: int) -> None: + """Set the number of tasks for this job + + :param tasks_per_node: number of tasks per node + """ + self.run_args["tasks-per-node"] = tasks_per_node diff --git a/smartsim/settings/lsfSettings.py b/smartsim/settings/lsfSettings.py index 32902c8c6..bce0581c5 100644 --- a/smartsim/settings/lsfSettings.py +++ b/smartsim/settings/lsfSettings.py @@ -51,13 +51,9 @@ def __init__( ``JsrunSettings`` should only be used on LSF-based systems. 
:param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ super().__init__( exe, @@ -81,7 +77,6 @@ def set_num_rs(self, num_rs: t.Union[str, int]) -> None: This sets ``--nrs``. :param num_rs: Number of resource sets or `ALL_HOSTS` - :type num_rs: int or str """ if isinstance(num_rs, str): self.run_args["nrs"] = num_rs @@ -94,7 +89,6 @@ def set_cpus_per_rs(self, cpus_per_rs: int) -> None: This sets ``--cpu_per_rs`` :param cpus_per_rs: number of cpus to use per resource set or ALL_CPUS - :type cpus_per_rs: int or str """ if self.colocated_db_settings: db_cpus = int(t.cast(int, self.colocated_db_settings.get("db_cpus", 0))) @@ -117,7 +111,6 @@ def set_gpus_per_rs(self, gpus_per_rs: int) -> None: This sets ``--gpu_per_rs`` :param gpus_per_rs: number of gpus to use per resource set or ALL_GPUS - :type gpus_per_rs: int or str """ if isinstance(gpus_per_rs, str): self.run_args["gpu_per_rs"] = gpus_per_rs @@ -130,7 +123,6 @@ def set_rs_per_host(self, rs_per_host: int) -> None: This sets ``--rs_per_host`` :param rs_per_host: number of resource sets to use per host - :type rs_per_host: int """ self.run_args["rs_per_host"] = int(rs_per_host) @@ -140,7 +132,6 @@ def set_tasks(self, tasks: int) -> None: This sets ``--np`` :param tasks: number of tasks - :type tasks: int """ self.run_args["np"] = int(tasks) @@ -150,7 +141,6 @@ def set_tasks_per_rs(self, tasks_per_rs: int) -> None: This sets ``--tasks_per_rs`` :param tasks_per_rs: number of tasks per resource set - :type tasks_per_rs: int """ 
self.run_args["tasks_per_rs"] = int(tasks_per_rs) @@ -160,7 +150,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: This function is an alias for `set_tasks_per_rs`. :param tasks_per_node: number of tasks per resource set - :type tasks_per_node: int """ self.set_tasks_per_rs(int(tasks_per_node)) @@ -170,7 +159,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: This function is an alias for `set_cpus_per_rs`. :param cpus_per_task: number of cpus per resource set - :type cpus_per_task: int """ self.set_cpus_per_rs(int(cpus_per_task)) @@ -180,7 +168,6 @@ def set_memory_per_rs(self, memory_per_rs: int) -> None: This sets ``--memory_per_rs`` :param memory_per_rs: Number of megabytes per rs - :type memory_per_rs: int """ self.run_args["memory_per_rs"] = int(memory_per_rs) @@ -190,7 +177,6 @@ def set_memory_per_node(self, memory_per_node: int) -> None: Alias for `set_memory_per_rs`. :param memory_per_node: Number of megabytes per rs - :type memory_per_node: int """ self.set_memory_per_rs(int(memory_per_node)) @@ -200,7 +186,6 @@ def set_binding(self, binding: str) -> None: This sets ``--bind`` :param binding: Binding, e.g. `packed:21` - :type binding: str """ self.run_args["bind"] = binding @@ -213,7 +198,6 @@ def make_mpmd(self, settings: RunSettings) -> None: the list of settings to be launched in the same ERF file. :param settings: ``JsrunSettings`` instance - :type settings: JsrunSettings, optional """ if self.colocated_db_settings: raise SSUnsupportedError( @@ -231,7 +215,6 @@ def set_mpmd_preamble(self, preamble_lines: t.List[str]) -> None: :param preamble_lines: lines to put at the beginning of the ERF file. - :type preamble_lines: list[str] """ self.mpmd_preamble_lines = preamble_lines @@ -249,7 +232,6 @@ def set_erf_sets(self, erf_sets: t.Dict[str, str]) -> None: only `rank` is used. 
:param hosts: dictionary of resources - :type hosts: dict[str,str] """ self.erf_sets = copy.deepcopy(erf_sets) @@ -259,7 +241,6 @@ def format_env_vars(self) -> t.List[str]: its value is propagated from the current environment. :returns: formatted list of strings to export variables - :rtype: list[str] """ format_str = [] for k, v in self.env_vars.items(): @@ -279,8 +260,6 @@ def set_individual_output(self, suffix: t.Optional[str] = None) -> None: :param suffix: Optional suffix to add to output file names, it can contain `%j`, `%h`, `%p`, or `%t`, as specified by `jsrun` options. - :type suffix: str, optional - """ self.run_args["stdio_mode"] = "individual" if suffix: @@ -290,7 +269,6 @@ def format_run_args(self) -> t.List[str]: """Return a list of LSF formatted run arguments :return: list of LSF arguments for these settings - :rtype: list[str] """ # args launcher uses args = [] @@ -403,16 +381,11 @@ def __init__( ) -> None: """Specify ``bsub`` batch parameters for a job - :param nodes: number of nodes for batch, defaults to None - :type nodes: int, optional - :param time: walltime for batch job in format hh:mm, defaults to None - :type time: str, optional - :param project: project for batch launch, defaults to None - :type project: str, optional - :param batch_args: overrides for LSF batch arguments, defaults to None - :type batch_args: dict[str, str], optional - :param smts: SMTs, defaults to 0 - :type smts: int, optional + :param nodes: number of nodes for batch + :param time: walltime for batch job in format hh:mm + :param project: project for batch launch + :param batch_args: overrides for LSF batch arguments + :param smts: SMTs """ self.project: t.Optional[str] = None @@ -445,7 +418,6 @@ def set_walltime(self, walltime: str) -> None: :param walltime: Time in hh:mm format, e.g. 
"10:00" for 10 hours, if time is supplied in hh:mm:ss format, seconds will be ignored and walltime will be set as ``hh:mm`` - :type walltime: str """ # For compatibility with other launchers, as explained in docstring if walltime: @@ -461,7 +433,6 @@ def set_smts(self, smts: int) -> None: takes precedence. :param smts: SMT (e.g on Summit: 1, 2, or 4) - :type smts: int """ self.smts = smts @@ -471,7 +442,6 @@ def set_project(self, project: str) -> None: This sets ``-P``. :param time: project name - :type time: str """ if project: self.project = project @@ -482,7 +452,6 @@ def set_account(self, account: str) -> None: this function is an alias for `set_project`. :param account: project name - :type account: str """ self.set_project(account) @@ -492,7 +461,6 @@ def set_nodes(self, num_nodes: int) -> None: This sets ``-nnodes``. :param nodes: number of nodes - :type nodes: int """ if num_nodes: self.batch_args["nnodes"] = str(int(num_nodes)) @@ -503,6 +471,9 @@ def set_expert_mode_req(self, res_req: str, slots: int) -> None: disregard all other allocation options. 
This sets ``-csm -n slots -R res_req`` + + :param res_req: specific resource requirements + :param slots: number of resources to allocate """ self.expert_mode = True self.batch_args["csm"] = "y" @@ -513,7 +484,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -530,7 +500,6 @@ def set_tasks(self, tasks: int) -> None: This sets ``-n`` :param tasks: number of tasks - :type tasks: int """ self.batch_args["n"] = str(int(tasks)) @@ -538,7 +507,6 @@ def set_queue(self, queue: str) -> None: """Set the queue for this job :param queue: The queue to submit the job on - :type queue: str """ if queue: self.batch_args["q"] = queue @@ -573,7 +541,6 @@ def format_batch_args(self) -> t.List[str]: """Get the formatted batch arguments for a preview :return: list of batch arguments for Qsub - :rtype: list[str] """ opts = [] diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py index ce132bcc5..c64c66cbf 100644 --- a/smartsim/settings/mpiSettings.py +++ b/smartsim/settings/mpiSettings.py @@ -61,16 +61,11 @@ def __init__( None can be provided for arguments that do not have values. :param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, str], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with :param fail_if_missing_exec: Throw an exception of the MPI command is missing. 
Otherwise, throw a warning - :type fail_if_missing_exec: bool, optional """ super().__init__( exe, @@ -101,7 +96,6 @@ def make_mpmd(self, settings: RunSettings) -> None: Model instance :param settings: MpirunSettings instance - :type settings: MpirunSettings """ if self.colocated_db_settings: raise SSUnsupportedError( @@ -117,7 +111,6 @@ def set_task_map(self, task_mapping: str) -> None: For examples, see the man page for ``mpirun`` :param task_mapping: task mapping - :type task_mapping: str """ self.run_args["map-by"] = task_mapping @@ -130,7 +123,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: and will soon be replaced. :param cpus_per_task: number of tasks - :type cpus_per_task: int """ self.run_args["cpus-per-proc"] = int(cpus_per_task) @@ -140,7 +132,6 @@ def set_cpu_binding_type(self, bind_type: str) -> None: This sets ``--bind-to`` for MPI compliant implementations :param bind_type: binding type - :type bind_type: str """ self.run_args["bind-to"] = bind_type @@ -148,7 +139,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: """Set the number of tasks per node :param tasks_per_node: number of tasks to launch per node - :type tasks_per_node: int """ self.run_args["npernode"] = int(tasks_per_node) @@ -158,7 +148,6 @@ def set_tasks(self, tasks: int) -> None: This sets ``-n`` for MPI compliant implementations :param tasks: number of tasks - :type tasks: int """ self.run_args["n"] = int(tasks) @@ -168,7 +157,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: This sets ``--host`` :param host_list: list of host names - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -185,7 +173,6 @@ def set_hostlist_from_file(self, file_path: str) -> None: This sets ``--hostfile`` :param file_path: Path to the hostlist file - :type file_path: str """ self.run_args["hostfile"] = file_path @@ -195,7 +182,6 @@ def set_verbose_launch(self, verbose: bool) -> None: This 
sets ``--verbose`` :param verbose: Whether the job should be run verbosely - :type verbose: bool """ if verbose: self.run_args["verbose"] = None @@ -208,7 +194,6 @@ def set_quiet_launch(self, quiet: bool) -> None: This sets ``--quiet`` :param quiet: Whether the job should be run quietly - :type quiet: bool """ if quiet: self.run_args["quiet"] = None @@ -221,7 +206,6 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: This sets ``--preload-binary`` :param dest_path: Destination path (Ignored) - :type dest_path: str | None """ if dest_path is not None and isinstance(dest_path, str): logger.warning( @@ -238,7 +222,6 @@ def set_walltime(self, walltime: str) -> None: This sets ``--timeout`` :param walltime: number like string of seconds that a job will run in secs - :type walltime: str """ self.run_args["timeout"] = walltime @@ -246,7 +229,6 @@ def format_run_args(self) -> t.List[str]: """Return a list of MPI-standard formatted run arguments :return: list of MPI-standard arguments for these settings - :rtype: list[str] """ # args launcher uses args = [] @@ -265,7 +247,6 @@ def format_env_vars(self) -> t.List[str]: """Format the environment variables for mpirun :return: list of env vars - :rtype: list[str] """ formatted = [] env_string = "-x" @@ -299,13 +280,9 @@ def __init__( None can be provided for arguments that do not have values. 
:param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ super().__init__(exe, exe_args, "mpirun", run_args, env_vars, **kwargs) @@ -330,13 +307,9 @@ def __init__( None can be provided for arguments that do not have values. :param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ super().__init__(exe, exe_args, "mpiexec", run_args, env_vars, **kwargs) @@ -370,12 +343,8 @@ def __init__( None can be provided for arguments that do not have values. 
:param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ super().__init__(exe, exe_args, "orterun", run_args, env_vars, **kwargs) diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index e43cd9466..4100e8efe 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -45,13 +45,9 @@ class PalsMpiexecSettings(_BaseMPISettings): None can be provided for arguments that do not have values. :param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, str], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ def __init__( @@ -74,16 +70,11 @@ def __init__( None can be provided for arguments that do not have values. 
:param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with :param fail_if_missing_exec: Throw an exception of the MPI command is missing. Otherwise, throw a warning - :type fail_if_missing_exec: bool, optional """ super().__init__( exe, @@ -103,7 +94,6 @@ def set_task_map(self, task_mapping: str) -> None: For examples, see the man page for ``mpirun`` :param task_mapping: task mapping - :type task_mapping: str """ logger.warning("set_task_map not supported under PALS") @@ -116,7 +106,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: and will soon be replaced. 
:param cpus_per_task: number of tasks - :type cpus_per_task: int """ logger.warning("set_cpus_per_task not supported under PALS") @@ -126,7 +115,6 @@ def set_cpu_binding_type(self, bind_type: str) -> None: This sets ``--bind-to`` for MPI compliant implementations :param bind_type: binding type - :type bind_type: str """ self.run_args["cpu-bind"] = bind_type @@ -134,7 +122,6 @@ def set_tasks(self, tasks: int) -> None: """Set the number of tasks :param tasks: number of total tasks to launch - :type tasks: int """ self.run_args["np"] = int(tasks) @@ -142,7 +129,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: """Set the number of tasks per node :param tasks_per_node: number of tasks to launch per node - :type tasks_per_node: int """ self.run_args["ppn"] = int(tasks_per_node) @@ -152,7 +138,6 @@ def set_quiet_launch(self, quiet: bool) -> None: This sets ``--quiet`` :param quiet: Whether the job should be run quietly - :type quiet: bool """ logger.warning("set_quiet_launch not supported under PALS") @@ -163,7 +148,6 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: This sets ``--preload-binary`` :param dest_path: Destination path (Ignored) - :type dest_path: str | None """ if dest_path is not None and isinstance(dest_path, str): logger.warning( @@ -178,7 +162,6 @@ def set_walltime(self, walltime: str) -> None: """Set the maximum number of seconds that a job will run :param walltime: number like string of seconds that a job will run in secs - :type walltime: str """ logger.warning("set_walltime not supported under PALS") @@ -186,7 +169,6 @@ def set_gpu_affinity_script(self, affinity: str, *args: t.Any) -> None: """Set the GPU affinity through a bash script :param affinity: path to the affinity script - :type affinity: str """ self.affinity_script.append(str(affinity)) for arg in args: @@ -196,7 +178,6 @@ def format_run_args(self) -> t.List[str]: """Return a list of MPI-standard formatted run arguments :return: list of MPI-standard 
arguments for these settings - :rtype: list[str] """ # args launcher uses args = [] @@ -219,7 +200,6 @@ def format_env_vars(self) -> t.List[str]: """Format the environment variables for mpirun :return: list of env vars - :rtype: list[str] """ formatted = [] @@ -242,7 +222,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: This sets ``--hosts`` :param host_list: list of host names - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 19a58b11c..09d48181a 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -53,20 +53,13 @@ def __init__( the value for select statement supplied in ``resources`` will override. - :param nodes: number of nodes for batch, defaults to None - :type nodes: int, optional - :param ncpus: number of cpus per node, defaults to None - :type ncpus: int, optional - :param time: walltime for batch job, defaults to None - :type time: str, optional - :param queue: queue to run batch in, defaults to None - :type queue: str, optional - :param account: account for batch launch, defaults to None - :type account: str, optional - :param resources: overrides for resource arguments, defaults to None - :type resources: dict[str, str], optional - :param batch_args: overrides for PBS batch arguments, defaults to None - :type batch_args: dict[str, str], optional + :param nodes: number of nodes for batch + :param ncpus: number of cpus per node + :param time: walltime for batch job + :param queue: queue to run batch in + :param account: account for batch launch + :param resources: overrides for resource arguments + :param batch_args: overrides for PBS batch arguments """ self._ncpus = ncpus @@ -112,7 +105,6 @@ def set_nodes(self, num_nodes: int) -> None: nodes here is sets the 'nodes' resource. 
:param num_nodes: number of nodes - :type num_nodes: int """ if num_nodes: @@ -122,7 +114,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -143,7 +134,6 @@ def set_walltime(self, walltime: str) -> None: this value will be overridden :param walltime: wall time - :type walltime: str """ if walltime: self.set_resource("walltime", walltime) @@ -152,7 +142,6 @@ def set_queue(self, queue: str) -> None: """Set the queue for the batch job :param queue: queue name - :type queue: str """ if queue: self.batch_args["q"] = str(queue) @@ -165,7 +154,6 @@ def set_ncpus(self, num_cpus: t.Union[int, str]) -> None: this value will be overridden :param num_cpus: number of cpus per node in select - :type num_cpus: int """ self._ncpus = int(num_cpus) @@ -173,7 +161,6 @@ def set_account(self, account: str) -> None: """Set the account for this batch job :param acct: account id - :type acct: str """ if account: self.batch_args["A"] = str(account) @@ -185,9 +172,7 @@ def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None: arguments will be overridden. Likewise for Walltime :param resource_name: name of resource, e.g. walltime - :type resource_name: str :param value: value - :type value: str """ # TODO add error checking here # TODO include option to overwrite place (warning for orchestrator?) 
@@ -200,7 +185,6 @@ def format_batch_args(self) -> t.List[str]: """Get the formatted batch arguments for a preview :return: batch arguments for Qsub - :rtype: list[str] :raises ValueError: if options are supplied without values """ opts = self._create_resource_list() diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index 6e6172507..5f7fc3fe2 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -32,6 +32,7 @@ AprunSettings, BsubBatchSettings, Container, + DragonRunSettings, JsrunSettings, MpiexecSettings, MpirunSettings, @@ -63,19 +64,12 @@ def create_batch_settings( :param launcher: launcher for this experiment, if set to 'auto', an attempt will be made to find an available launcher on the system - :type launcher: str - :param nodes: number of nodes for batch job, defaults to 1 - :type nodes: int, optional - :param time: length of batch job, defaults to "" - :type time: str, optional - :param queue: queue or partition (if slurm), defaults to "" - :type queue: str, optional - :param account: user account name for batch system, defaults to "" - :type account: str, optional - :param batch_args: additional batch arguments, defaults to None - :type batch_args: dict[str, str], optional + :param nodes: number of nodes for batch job + :param time: length of batch job + :param queue: queue or partition (if slurm) + :param account: user account name for batch system + :param batch_args: additional batch arguments :return: a newly created BatchSettings instance - :rtype: BatchSettings :raises SmartSimError: if batch creation fails """ # all supported batch class implementations @@ -83,10 +77,13 @@ def create_batch_settings( "pbs": QsubBatchSettings, "slurm": SbatchSettings, "lsf": BsubBatchSettings, + "pals": QsubBatchSettings, } - if launcher == "auto": + if launcher in ["auto", "dragon"]: launcher = detect_launcher() + if launcher == "dragon": + by_launcher["dragon"] = by_launcher[launcher] if launcher == "local": raise 
SmartSimError("Local launcher does not support batch workloads") @@ -127,21 +124,13 @@ def create_run_settings( :param launcher: launcher to create settings for, if set to 'auto', an attempt will be made to find an available launcher on the system - :type launcher: str :param run_command: command to run the executable - :type run_command: str :param exe: executable to run - :type exe: str :param exe_args: arguments to pass to the executable - :type exe_args: list[str], optional :param run_args: arguments to pass to the ``run_command`` - :type run_args: list[str], optional :param env_vars: environment variables to pass to the executable - :type env_vars: dict[str, str], optional - :param container: container type for workload (e.g. "singularity"), defaults to None - :type container: Container, optional + :param container: container type for workload (e.g. "singularity") :return: the created ``RunSettings`` - :rtype: RunSettings :raises SmartSimError: if run_command=="auto" and detection fails """ # all supported RunSettings child classes @@ -159,6 +148,7 @@ def create_run_settings( # run commands supported by each launcher # in order of suspected user preference by_launcher = { + "dragon": [""], "slurm": ["srun", "mpirun", "mpiexec"], "pbs": ["aprun", "mpirun", "mpiexec"], "pals": ["mpiexec"], @@ -171,7 +161,7 @@ def create_run_settings( def _detect_command(launcher: str) -> str: if launcher in by_launcher: - if launcher == "local": + if launcher in ["local", "dragon"]: return "" for cmd in by_launcher[launcher]: @@ -193,6 +183,11 @@ def _detect_command(launcher: str) -> str: # no auto detection for local, revert to false run_command = _detect_command(launcher) + if launcher == "dragon": + return DragonRunSettings( + exe=exe, exe_args=exe_args, env_vars=env_vars, container=container, **kwargs + ) + # if user specified and supported or auto detection worked if run_command and run_command in supported: return supported[run_command](launcher)( diff --git 
a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index 935a8df39..64f73fa9c 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -55,15 +55,10 @@ def __init__( parameters will launch on that allocation. :param exe: executable to run - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: list[str] | str, optional - :param run_args: srun arguments without dashes, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment variables for job, defaults to None - :type env_vars: dict[str, str], optional - :param alloc: allocation ID if running on existing alloc, defaults to None - :type alloc: str, optional + :param exe_args: executable arguments + :param run_args: srun arguments without dashes + :param env_vars: environment variables for job + :param alloc: allocation ID if running on existing alloc """ super().__init__( exe, @@ -84,7 +79,6 @@ def set_nodes(self, nodes: int) -> None: Effectively this is setting: ``srun --nodes `` :param nodes: number of nodes to run with - :type nodes: int """ self.run_args["nodes"] = int(nodes) @@ -95,7 +89,6 @@ def make_mpmd(self, settings: RunSettings) -> None: Model instance :param settings: SrunSettings instance - :type settings: SrunSettings """ if self.colocated_db_settings: raise SSUnsupportedError( @@ -117,7 +110,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: This sets ``--nodelist`` :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -134,7 +126,6 @@ def set_hostlist_from_file(self, file_path: str) -> None: This sets ``--nodefile`` :param file_path: Path to the hostlist file - :type file_path: str """ self.run_args["nodefile"] = file_path @@ -142,7 +133,6 @@ def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: 
"""Specify a list of hosts to exclude for launching this job :param host_list: hosts to exclude - :type host_list: list[str] :raises TypeError: """ if isinstance(host_list, str): @@ -159,7 +149,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: This sets ``--cpus-per-task`` :param num_cpus: number of cpus to use per task - :type num_cpus: int """ self.run_args["cpus-per-task"] = int(cpus_per_task) @@ -169,7 +158,6 @@ def set_tasks(self, tasks: int) -> None: This sets ``--ntasks`` :param tasks: number of tasks - :type tasks: int """ self.run_args["ntasks"] = int(tasks) @@ -179,7 +167,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: This sets ``--ntasks-per-node`` :param tasks_per_node: number of tasks per node - :type tasks_per_node: int """ self.run_args["ntasks-per-node"] = int(tasks_per_node) @@ -189,7 +176,6 @@ def set_cpu_bindings(self, bindings: t.Union[int, t.List[int]]) -> None: This sets ``--cpu-bind`` using the ``map_cpu:`` option :param bindings: List specifing the cores to which MPI processes are bound - :type bindings: list[int] | int """ if isinstance(bindings, int): bindings = [bindings] @@ -203,7 +189,6 @@ def set_memory_per_node(self, memory_per_node: int) -> None: This sets ``--mem`` in megabytes :param memory_per_node: Amount of memory per node in megabytes - :type memory_per_node: int """ self.run_args["mem"] = f"{int(memory_per_node)}M" @@ -213,7 +198,6 @@ def set_verbose_launch(self, verbose: bool) -> None: This sets ``--verbose`` :param verbose: Whether the job should be run verbosely - :type verbose: bool """ if verbose: self.run_args["verbose"] = None @@ -226,7 +210,6 @@ def set_quiet_launch(self, quiet: bool) -> None: This sets ``--quiet`` :param quiet: Whether the job should be run quietly - :type quiet: bool """ if quiet: self.run_args["quiet"] = None @@ -239,10 +222,23 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: This sets ``--bcast`` :param dest_path: Path to copy an executable file - 
:type dest_path: str | None """ self.run_args["bcast"] = dest_path + def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: + """Specify the node feature for this job + + This sets ``-C`` + + :param feature_list: node feature to launch on + :raises TypeError: if not str or list of str + """ + if isinstance(feature_list, str): + feature_list = [feature_list.strip()] + elif not all(isinstance(feature, str) for feature in feature_list): + raise TypeError("node_feature argument must be string or list of strings") + self.run_args["C"] = ",".join(feature_list) + @staticmethod def _fmt_walltime(hours: int, minutes: int, seconds: int) -> str: """Convert hours, minutes, and seconds into valid walltime format @@ -250,13 +246,9 @@ def _fmt_walltime(hours: int, minutes: int, seconds: int) -> str: Converts time to format HH:MM:SS :param hours: number of hours to run job - :type hours: int :param minutes: number of minutes to run job - :type minutes: int :param seconds: number of seconds to run job - :type seconds: int :returns: Formatted walltime - :rtype: str """ return fmt_walltime(hours, minutes, seconds) @@ -266,7 +258,6 @@ def set_walltime(self, walltime: str) -> None: format = "HH:MM:SS" :param walltime: wall time - :type walltime: str """ self.run_args["time"] = str(walltime) @@ -276,7 +267,6 @@ def set_het_group(self, het_group: t.Iterable[int]) -> None: this sets `--het-group` :param het_group: list of heterogeneous groups - :type het_group: int or iterable of ints """ het_size_env = os.getenv("SLURM_HET_SIZE") if het_size_env is None: @@ -305,7 +295,6 @@ def format_run_args(self) -> t.List[str]: """Return a list of slurm formatted run arguments :return: list of slurm arguments for these settings - :rtype: list[str] """ # add additional slurm arguments based on key length opts = [] @@ -338,7 +327,7 @@ def check_env_vars(self) -> None: "environment. If the job is running in an interactive " f"allocation, the value {v} will not be set. 
Please " "consider removing the variable from the environment " - "and re-run the experiment." + "and re-running the experiment." ) logger.warning(msg) @@ -346,7 +335,6 @@ def format_env_vars(self) -> t.List[str]: """Build bash compatible environment variable string for Slurm :returns: the formatted string of environment variables - :rtype: list[str] """ self.check_env_vars() return [f"{k}={v}" for k, v in self.env_vars.items() if "," not in str(v)] @@ -359,7 +347,6 @@ def format_comma_sep_env_vars(self) -> t.Tuple[str, t.List[str]]: for more information on this, see the slurm documentation for srun :returns: the formatted string of environment variables - :rtype: tuple[str, list[str]] """ self.check_env_vars() exportable_env, compound_env, key_only = [], [], [] @@ -392,13 +379,9 @@ def fmt_walltime(hours: int, minutes: int, seconds: int) -> str: Converts time to format HH:MM:SS :param hours: number of hours to run job - :type hours: int :param minutes: number of minutes to run job - :type minutes: int :param seconds: number of seconds to run job - :type seconds: int :returns: Formatted walltime - :rtype: str """ delta = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) fmt_str = str(delta) @@ -427,14 +410,10 @@ def __init__( Initialization values provided (nodes, time, account) will overwrite the same arguments in ``batch_args`` if present - :param nodes: number of nodes, defaults to None - :type nodes: int, optional + :param nodes: number of nodes :param time: walltime for job, e.g. 
"10:00:00" for 10 hours - :type time: str, optional - :param account: account for job, defaults to None - :type account: str, optional - :param batch_args: extra batch arguments, defaults to None - :type batch_args: dict[str, str], optional + :param account: account for job + :param batch_args: extra batch arguments """ super().__init__( "sbatch", @@ -451,7 +430,6 @@ def set_walltime(self, walltime: str) -> None: format = "HH:MM:SS" :param walltime: wall time - :type walltime: str """ # TODO check for formatting here if walltime: @@ -461,7 +439,6 @@ def set_nodes(self, num_nodes: int) -> None: """Set the number of nodes for this batch job :param num_nodes: number of nodes - :type num_nodes: int """ if num_nodes: self.batch_args["nodes"] = str(int(num_nodes)) @@ -470,7 +447,6 @@ def set_account(self, account: str) -> None: """Set the account for this batch job :param account: account id - :type account: str """ if account: self.batch_args["account"] = account @@ -479,7 +455,6 @@ def set_partition(self, partition: str) -> None: """Set the partition for the batch job :param partition: partition name - :type partition: str """ self.batch_args["partition"] = str(partition) @@ -489,7 +464,6 @@ def set_queue(self, queue: str) -> None: Sets the partition for the slurm batch job :param queue: the partition to run the batch job on - :type queue: str """ if queue: self.set_partition(queue) @@ -500,7 +474,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: This sets ``--cpus-per-task`` :param num_cpus: number of cpus to use per task - :type num_cpus: int """ self.batch_args["cpus-per-task"] = str(int(cpus_per_task)) @@ -508,7 +481,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -523,7 +495,6 @@ def format_batch_args(self) -> t.List[str]: """Get the 
formatted batch arguments for a preview :return: batch arguments for Sbatch - :rtype: list[str] """ opts = [] # TODO add restricted here diff --git a/smartsim/status.py b/smartsim/status.py index 409ec8c1a..e42ef3191 100644 --- a/smartsim/status.py +++ b/smartsim/status.py @@ -24,27 +24,21 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +from enum import Enum -# Statuses that are applied to jobs -STATUS_RUNNING = "Running" -STATUS_COMPLETED = "Completed" -STATUS_CANCELLED = "Cancelled" -STATUS_FAILED = "Failed" -STATUS_NEW = "New" -STATUS_PAUSED = "Paused" -STATUS_NEVER_STARTED = "NeverStarted" -# SmartSim status mapping -SMARTSIM_STATUS = { - "Running": STATUS_RUNNING, - "Paused": STATUS_PAUSED, - "Completed": STATUS_COMPLETED, - "Cancelled": STATUS_CANCELLED, - "Failed": STATUS_FAILED, - "New": STATUS_NEW, - "NeverStarted": STATUS_NEVER_STARTED, -} +class SmartSimStatus(Enum): + STATUS_RUNNING = "Running" + STATUS_COMPLETED = "Completed" + STATUS_CANCELLED = "Cancelled" + STATUS_FAILED = "Failed" + STATUS_NEW = "New" + STATUS_PAUSED = "Paused" + STATUS_NEVER_STARTED = "NeverStarted" + -# Status groupings -TERMINAL_STATUSES = {STATUS_CANCELLED, STATUS_COMPLETED, STATUS_FAILED} -LIVE_STATUSES = {STATUS_RUNNING, STATUS_PAUSED, STATUS_NEW} +TERMINAL_STATUSES = { + SmartSimStatus.STATUS_CANCELLED, + SmartSimStatus.STATUS_COMPLETED, + SmartSimStatus.STATUS_FAILED, +} diff --git a/smartsim/templates/templates/preview/plain_text/activeinfra.template b/smartsim/templates/templates/preview/plain_text/activeinfra.template new file mode 100644 index 000000000..8f403fbc0 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/activeinfra.template @@ -0,0 +1,9 @@ + + = Database Identifier: {{ db.entity.db_identifier }} = + Shards: {{ db.entity.num_shards }} + TCP/IP Port(s): + {%- for port in db.entity.ports %} + {{ port }} + {%- endfor %} + Network Interface: 
{{ db.entity.run_settings.exe_args | get_ifname }} + Type: {{ config.database_cli | get_dbtype }} diff --git a/smartsim/templates/templates/preview/plain_text/base.template b/smartsim/templates/templates/preview/plain_text/base.template new file mode 100644 index 000000000..511712554 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/base.template @@ -0,0 +1,52 @@ + +{% include "experiment.template" %} +{%- if manifest.has_deployable or active_dbjobs %} + +=== Entity Preview === + + {%- if active_dbjobs %} + + == Active Infrastructure == + {%- for name, db in active_dbjobs.items() %} + {% include "activeinfra.template" %} + {%- endfor %} + {%- endif %} + {%- if manifest.dbs %} + + == Orchestrators == + {%- for db in manifest.dbs %} + {%- if db.is_active() %} + WARNING: Cannot preview {{ db.name }}, because it is already started. + {%- else %} + {% include "orchestrator.template" %} + {%- endif %} + {%- endfor %} + {%- endif %} + {%- if manifest.models %} + + == Models == + {%- for model in manifest.models %} + + = Model Name: {{ model.name }} = + {%- include "model.template" %} + {%- if model.run_settings.colocated_db_settings or manifest.dbs %} + Client Configuration: + {%- if model.run_settings.colocated_db_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.dbs %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} + {%- endfor %} + {%- endif %} + + {%- if manifest.ensembles %} + + == Ensembles == + {%- for ensemble in manifest.ensembles %} + {%- include "ensemble.template" %} + {%- endfor %} + {%- endif %} + +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfig.template b/smartsim/templates/templates/preview/plain_text/clientconfig.template new file mode 100644 index 000000000..3342918d9 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfig.template @@ -0,0 +1,7 @@ + +{%- if verbosity_level == Verbosity.INFO %} +{%- include 
"clientconfig_info.template" -%} +{%- endif %} +{%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} +{%- include "clientconfig_debug.template" -%} +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfig_debug.template b/smartsim/templates/templates/preview/plain_text/clientconfig_debug.template new file mode 100644 index 000000000..51dafd0d1 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfig_debug.template @@ -0,0 +1,29 @@ + + {%- for db in manifest.dbs %} + {%- if db.name %} + Database Identifier: {{ db.name }} + {%- endif %} + {%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} + Database Backend: {{ config.database_cli | get_dbtype }} + TCP/IP Port(s): + {%- for port in db.ports %} + {{ port }} + {%- endfor %} + Type: Standalone + {%- endif %} + {%- endfor %} + {%- if model.incoming_entities %} + {%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} + Incoming Entities (Available Data Sources): + {%- for incoming in model.incoming_entities %} + {{ incoming.name }} + {%- endfor %} + {%- endif %} + {%- endif %} + {%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} + Outgoing Key Collision Prevention (Key Prefixing): + Tensors: {{ model.query_key_prefixing() | as_toggle }} + Datasets: {{ model.query_key_prefixing() | as_toggle }} + ML Models/Torch Scripts: {{ False | as_toggle }} + Aggregation Lists: {{ model.query_key_prefixing() | as_toggle }} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfig_info.template b/smartsim/templates/templates/preview/plain_text/clientconfig_info.template new file mode 100644 index 000000000..164f4bd4a --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfig_info.template @@ -0,0 +1,19 @@ + + {%- for db in manifest.dbs %} + {%- if db.name %} + Database Identifier: {{ db.name }} 
+ {%- endif %} + Database Backend: {{ config.database_cli | get_dbtype }} + TCP/IP Port(s): + {%- for port in db.ports %} + {{ port }} + {%- endfor %} + Type: Standalone + {%- endfor %} + {%- if model.query_key_prefixing() %} + Outgoing Key Collision Prevention (Key Prefixing): + Tensors: {{ model.query_key_prefixing() | as_toggle }} + Datasets: {{ model.query_key_prefixing() | as_toggle }} + ML Models/Torch Scripts: {{ False | as_toggle }} + Aggregation Lists: {{ model.query_key_prefixing() | as_toggle }} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfigcolo.template b/smartsim/templates/templates/preview/plain_text/clientconfigcolo.template new file mode 100644 index 000000000..c1278a19a --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfigcolo.template @@ -0,0 +1,7 @@ + +{%- if verbosity_level == Verbosity.INFO %} +{%- include "clientconfigcolo_info.template" %} +{% endif %} +{%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} +{%- include "clientconfigcolo_debug.template" %} +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template new file mode 100644 index 000000000..303fd0dca --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template @@ -0,0 +1,37 @@ + + {%- if model.run_settings.colocated_db_settings.db_identifier %} + Database Identifier: {{ model.run_settings.colocated_db_settings.db_identifier }} + {%- else %} + Database Identifier: N/A + {%- endif %} + Database Backend: {{ config.database_cli | get_dbtype }} + {%- if model.run_settings.colocated_db_settings %} + {%- if model.run_settings.colocated_db_settings.port %} + Connection Type: TCP + TCP/IP Port(s): + {{ model.run_settings.colocated_db_settings.port }} + {%- endif %} + {%- if model.run_settings.colocated_db_settings.unix_socket 
%} + Connection Type: UDS + Unix Socket: {{ model.run_settings.colocated_db_settings.unix_socket }} + {%- endif %} + {%- if model.run_settings.colocated_db_settings.ifname %} + {%- if model.run_settings.colocated_db_settings.ifname | is_list %} + Network Interface Name: {{ model.run_settings.colocated_db_settings.ifname[0] }} + {%- else %} + Network Interface Name: {{ model.run_settings.colocated_db_settings.ifname }} + {%- endif %} + {%- endif %} + Type: Colocated + {%- if model.incoming_entities %} + Incoming Entities (Available Data Sources): + {%- for incoming in model.incoming_entities %} + {{ incoming.name }} + {%- endfor %} + {%- endif %} + {%- endif %} + Outgoing Key Collision Prevention (Key Prefixing): + Tensors: {{ model.query_key_prefixing() | as_toggle }} + Datasets: {{ model.query_key_prefixing() | as_toggle }} + ML Models/Torch Scripts: {{ False | as_toggle }} + Aggregation Lists: {{ model.query_key_prefixing() | as_toggle }} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template new file mode 100644 index 000000000..e03d7ce3b --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template @@ -0,0 +1,22 @@ + + {%- if model.run_settings.colocated_db_settings.db_identifier %} + Database Identifier: {{ model.run_settings.colocated_db_settings.db_identifier }} + {%- endif %} + Database Backend: {{ config.database_cli | get_dbtype }} + {%- if model.run_settings.colocated_db_settings.port %} + Connection Type: TCP + TCP/IP Port(s): + {{ model.run_settings.colocated_db_settings.port }} + {%- endif %} + {%- if model.run_settings.colocated_db_settings.unix_socket %} + Connection Type: UDS + Unix Socket: {{ model.run_settings.colocated_db_settings.unix_socket }} + {%- endif %} + Type: Colocated + {%- if model.query_key_prefixing() %} + Outgoing Key Collision Prevention (Key Prefixing): + Tensors: {{ 
model.query_key_prefixing() | as_toggle }} + Datasets: {{ model.query_key_prefixing() | as_toggle }} + ML Models/Torch Scripts: {{ False | as_toggle }} + Aggregation Lists: {{ model.query_key_prefixing() | as_toggle }} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/ensemble.template b/smartsim/templates/templates/preview/plain_text/ensemble.template new file mode 100644 index 000000000..040737cc9 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/ensemble.template @@ -0,0 +1,7 @@ + +{%- if verbosity_level == Verbosity.INFO %} +{%- include "ensemble_info.template" -%} +{%- endif %} +{%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} +{%- include "ensemble_debug.template" -%} +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/ensemble_debug.template b/smartsim/templates/templates/preview/plain_text/ensemble_debug.template new file mode 100644 index 000000000..862db6032 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/ensemble_debug.template @@ -0,0 +1,62 @@ + + {% for ensemble in manifest.ensembles %} + = Ensemble Name: {{ ensemble.name }} = + {%- if ensemble.path %} + Path: {{ ensemble.path }} + {%- endif %} + Members: {{ ensemble|length }} + {%- if ensemble.params %} + Ensemble Parameters: + {%- for key, value in ensemble.params.items() %} + {{ key }}: {{ value | join(", ") | wordwrap(150) | safe | replace('\n', '\n ') }} + {%- endfor %} + {%- endif %} + {%- if ensemble.replicas %} + Replicas: {{ ensemble.replicas }} + {%- elif ensemble.perm_strat %} + Permutation Strategy: {{ ensemble.perm_strat }} + {%- endif %} + {%- if ensemble.batch_settings %} + Batch Launch: True + Batch Command: {{ ensemble.batch_settings.batch_cmd }} + {%- endif %} + {%- if ensemble.batch_settings.batch_args %} + Batch Arguments: + {%- for key, value in ensemble.batch_settings.batch_args.items() %} + {{ key }}: {{ value }} + {%- endfor %} + {%- endif %} + + {%- if 
verbosity_level == Verbosity.DEBUG %} + {%- for model in ensemble.entities %} + + - Model Name: {{ model.name }} - + {%- include 'model.template' %} + {%- if model.run_settings.colocated_db_settings or manifest.dbs %} + Client Configuration: + {%- if model.run_settings.colocated_db_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.dbs %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} + {%- endfor %} + {%- endif %} + {%- if verbosity_level == Verbosity.DEVELOPER %} + {%- for model in ensemble.entities %} + + - Model Name: {{ model.name }} - + {%- include 'model_debug.template' %} + {%- if model.run_settings.colocated_db_settings or manifest.dbs %} + Client Configuration: + {%- if model.run_settings.colocated_db_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.dbs %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- endif %} +{% endfor %} diff --git a/smartsim/templates/templates/preview/plain_text/ensemble_info.template b/smartsim/templates/templates/preview/plain_text/ensemble_info.template new file mode 100644 index 000000000..17d1a4054 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/ensemble_info.template @@ -0,0 +1,51 @@ + = Ensemble Name: {{ ensemble.name }} = + Members: {{ ensemble|length }} + {%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} + {%- if ensemble.params %} + Ensemble Parameters: + {%- for key, value in ensemble.params.items() %} + {{ key }}: {{ '{:^9}'.format(value|string)|truncate(81,true,'...')}} + {%- endfor %} + {%- endif %} + {%- endif %} + {%- if ensemble.models | length > 2 %} + {% set model = ensemble.models[0] %} + - Model Name: {{ model.name }} - + {%- include 'model.template' %} + {%- if model.run_settings.colocated_db_settings or manifest.dbs %} + Client Configuration: + {%- if model.run_settings.colocated_db_settings %} + {%- include 
"clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.dbs %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} + ... + {% set model = ensemble.models[(ensemble.models | length)-1] %} + - Model Name: {{ model.name }} - + {%- include 'model.template' %} + {% if model.run_settings.colocated_db_settings or manifest.dbs %} + Client Configuration: + {%- if model.run_settings.colocated_db_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.dbs %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} + {%- else %} + {% for model in ensemble %} + - Model Name: {{ model.name }} - + {%- include 'model.template' %} + {% if model.run_settings.colocated_db_settings or manifest.dbs %} + Client Configuration: + {%- if model.run_settings.colocated_db_settings %} + {%- include "clientconfigcolo.template" %} + {%- endif %} + {%- if manifest.dbs %} + {%- include "clientconfig.template" %} + {%- endif %} + {%- endif %} + {% endfor %} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/experiment.template b/smartsim/templates/templates/preview/plain_text/experiment.template new file mode 100644 index 000000000..d2ef16c05 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/experiment.template @@ -0,0 +1,5 @@ +=== Experiment Overview === + + Experiment Name: {{ exp_entity.name }} + Experiment Path: {{ exp_entity.exp_path }} + Launcher: {{ exp_entity.launcher }} diff --git a/smartsim/templates/templates/preview/plain_text/model.template b/smartsim/templates/templates/preview/plain_text/model.template new file mode 100644 index 000000000..303beac67 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/model.template @@ -0,0 +1,7 @@ + +{%- if verbosity_level == Verbosity.INFO %} +{%- include "model_info.template" -%} +{%- endif %} +{%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} +{%- include "model_debug.template" 
-%} +{%- endif -%} diff --git a/smartsim/templates/templates/preview/plain_text/model_debug.template b/smartsim/templates/templates/preview/plain_text/model_debug.template new file mode 100644 index 000000000..186746186 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/model_debug.template @@ -0,0 +1,114 @@ + + {%- if model is defined %} + {%- if model.path %} + Path: {{ model.path }} + {%- endif %} + Executable: {{ model.run_settings.exe[0] }} + Executable Arguments: + {%- for param in model.run_settings.exe_args %} + {{ param }} + {%- endfor %} + {%- if model.run_settings.run_command %} + Run Command: {{ model.run_settings.run_command }} + {%- endif %} + {%- if model.run_settings.run_args %} + Run Arguments: + {%- for key, value in model.run_settings.run_args.items() %} + {{ key }} {{ value }} + {%- endfor %} + {%- endif %} + {%- if model.batch_settings %} + Batch Launch: True + Batch Command: {{ model.batch_settings.batch_cmd }} + Batch Arguments: + {%- for key, value in model.batch_settings.batch_args.items() %} + {{ key }}: {{ value }} + {%- endfor %} + {%- endif %} + {%- if model.params %} + Model Parameters: + {%- for param, value in model.params.items() %} + {{ param }}: {{ value }} + {%- endfor %} + {%- endif %} + {%- if model.files %} + {%- if model.files.tagged %} + Tagged Files for Model Configuration: + {%- for tagged in model.files.tagged %} + {{ tagged }} + -> {{ model.path }} + {%- endfor %} + {%- endif %} + {%- if model.files.copy %} + Copy Files: + {%- for copy in model.files.copy %} + {{ copy }} + -> {{ model.path }} + {%- endfor %} + {%- endif %} + {%- if model.files.link %} + Symlink Files: + {%- for link in model.files.link %} + {{ link }} + -> {{ model.path }} + {%- endfor %} + {%- endif %} + {%- endif %} + {%- if model.run_settings.colocated_db_settings %} + Colocated: + {%- if model.run_settings.colocated_db_settings.db_identifier %} + Database Identifier: {{ model.run_settings.colocated_db_settings.db_identifier }} + {%- 
endif %} + {%- if model.run_settings.colocated_db_settings.port %} + Connection Type: TCP + TCP/IP Port(s): + {{ model.run_settings.colocated_db_settings.port }} + {%- endif %} + {%- if model.run_settings.colocated_db_settings.unix_socket %} + Connection Type: UDS + Unix Socket: {{ model.run_settings.colocated_db_settings.unix_socket }} + {%- endif %} + {%- if model.run_settings.colocated_db_settings.ifname %} + {%- if model.run_settings.colocated_db_settings.ifname | is_list %} + Network Interface Name: {{ model.run_settings.colocated_db_settings.ifname[0] }} + {%- else %} + Network Interface Name: {{ model.run_settings.colocated_db_settings.ifname }} + {%- endif %} + {%- endif %} + CPUs: {{ model.run_settings.colocated_db_settings.cpus }} + Custom Pinning: {{ model.run_settings.colocated_db_settings.custom_pinning }} + {%- endif %} + {%- if model._db_scripts %} + Torch Scripts: + {%- for script in model._db_scripts%} + Name: {{ script.name }} + Path: {{ script.file }} + Backend: {{ script.device }} + Devices Per Node: {{ script.devices_per_node }} + {%- endfor %} + {%- endif %} + {%- if model._db_models %} + ML Models: + {%- for mlmodel in model._db_models %} + Name: {{ mlmodel.name }} + Path: {{ mlmodel.file }} + Backend: {{ mlmodel.backend }} + Device: {{ mlmodel.device }} + Devices Per Node: {{ mlmodel.devices_per_node }} + {%- if mlmodel.device == "GPU" %} + First Device: {{ mlmodel.first_device }} + {%- endif %} + {%- for input in mlmodel.inputs %} + Inputs: + {{ input }} + {%- endfor %} + {%- for output in mlmodel.outputs %} + Outputs: + {{ output }} + {%- endfor %} + {%- endfor %} + {%- endif %} + {%- if model.query_key_prefixing()%} + Key Prefix: {{ model.name }} + {%- endif %} +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/model_info.template b/smartsim/templates/templates/preview/plain_text/model_info.template new file mode 100644 index 000000000..f746208e5 --- /dev/null +++ 
b/smartsim/templates/templates/preview/plain_text/model_info.template @@ -0,0 +1,54 @@ + + + {%- if model.batch_settings %} + Batch Launch: True + {% endif %} + {%- if model.params %} + Model Parameters: + {%- for param, value in model.params.items() %} + {{ param }}: {{ value }} + {%- endfor %} + {%- endif %} + + {%- if model.run_settings.colocated_db_settings %} + Colocated: + {%- if model.run_settings.colocated_db_settings.db_identifier %} + Database Identifier: {{ model.run_settings.colocated_db_settings.db_identifier }} + {%- endif %} + {%- if model.run_settings.colocated_db_settings.port %} + Connection Type: TCP + TCP/IP Port(s): + {{ model.run_settings.colocated_db_settings.port }} + {%- endif %} + {%- if model.run_settings.colocated_db_settings.unix_socket %} + Connection Type: UDS + Unix Socket: {{ model.run_settings.colocated_db_settings.unix_socket }} + {%- endif %} + {%- endif %} + + {%- if model.run_settings.colocated_db_settings['db_scripts'] %} + Torch Scripts: + {%- for script in model.run_settings.colocated_db_settings['db_scripts'] %} + Name: {{ script.name }} + Path: {{ script.script_path }} + {%- endfor %} + {%- endif %} + {%- if model.run_settings.colocated_db_settings['db_models'] %} + ML Models: + {%- for mlmodel in model.run_settings.colocated_db_settings['db_models'] %} + Name: {{ mlmodel.name }} + Path: {{ mlmodel.model_file }} + Backend: {{ mlmodel.backend }} + {%- for input in mlmodel.inputs %} + Inputs: + {{ input }} + {%- endfor %} + {%- for output in mlmodel.outputs %} + Outputs: + {{ output }} + {%- endfor %} + {%- endfor %} + {%- endif %} + {%- if model.query_key_prefixing() %} + Key Prefix: {{ model.name }} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/orchestrator.template b/smartsim/templates/templates/preview/plain_text/orchestrator.template new file mode 100644 index 000000000..813b062b3 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/orchestrator.template @@ -0,0 +1,7 @@ + 
+{%- if verbosity_level == Verbosity.INFO %} +{%- include "orchestrator_info.template" -%} +{%- endif %} +{%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} +{%- include "orchestrator_debug.template" -%} +{%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/orchestrator_debug.template b/smartsim/templates/templates/preview/plain_text/orchestrator_debug.template new file mode 100644 index 000000000..127a4949e --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/orchestrator_debug.template @@ -0,0 +1,33 @@ + + = Database Identifier: {{ db.name }} = + {%- if db.path %} + Path: {{ db.path }} + {%- endif %} + Shards: {{ db.num_shards }} + TCP/IP Port(s): + {%- for port in db.ports %} + {{ port }} + {%- endfor %} + Network Interface: {{ db._interfaces[0] }} + Type: {{ config.database_cli | get_dbtype }} + Executable: {{ config.database_exe }} + {%- if db.run_settings %} + Run Command: {{ db.run_settings.run_command }} + {%- if db.run_settings.run_args %} + Run Arguments: + {%- for key, value in db.run_settings.run_args.items() %} + {{ key }}: {{ value }} + {%- endfor %} + {%- endif %} + {%- endif %} + {%- if db.run_command %} + Run Command: {{ db.run_command }} + {%- endif %} + {%- if db.batch_settings %} + Batch Launch: True + Batch Command: {{ db.batch_settings.batch_cmd }} + Batch Arguments: + {%- for key, value in db.batch_settings.batch_args.items() %} + {{ key }}: {{ value }} + {%- endfor %} + {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/orchestrator_info.template b/smartsim/templates/templates/preview/plain_text/orchestrator_info.template new file mode 100644 index 000000000..11608d6c5 --- /dev/null +++ b/smartsim/templates/templates/preview/plain_text/orchestrator_info.template @@ -0,0 +1,11 @@ + + = Database Identifier: {{ db.name }} = + TCP/IP Port(s): + {%- for port in db.ports %} + {{ port }} + {%- endfor %} + Network Interface: {{ db._interfaces[0] }} + Type: 
{{ config.database_cli | get_dbtype }} + {%- if db.batch %} + Batch Launch: {{ db.batch }} + {%- endif %} diff --git a/smartsim/wlm/__init__.py b/smartsim/wlm/__init__.py index 3a82a81e5..a5d20d0c9 100644 --- a/smartsim/wlm/__init__.py +++ b/smartsim/wlm/__init__.py @@ -75,9 +75,7 @@ def get_hosts(launcher: t.Optional[str] = None) -> t.List[str]: :param launcher: Name of the WLM to use to collect allocation info. If no launcher is provided ``detect_launcher`` is used to select a launcher. - :type launcher: str | None :returns: Names of the hosts - :rtype: list[str] :raises SSUnsupportedError: User attempted to use an unsupported WLM """ if launcher is None: @@ -94,9 +92,7 @@ def get_queue(launcher: t.Optional[str] = None) -> str: :param launcher: Name of the WLM to use to collect allocation info. If no launcher is provided ``detect_launcher`` is used to select a launcher. - :type launcher: str | None :returns: Name of the queue - :rtype: str :raises SSUnsupportedError: User attempted to use an unsupported WLM """ if launcher is None: @@ -113,9 +109,7 @@ def get_tasks(launcher: t.Optional[str] = None) -> int: :param launcher: Name of the WLM to use to collect allocation info. If no launcher is provided ``detect_launcher`` is used to select a launcher. - :type launcher: str | None :returns: Number of tasks - :rtype: int :raises SSUnsupportedError: User attempted to use an unsupported WLM """ if launcher is None: @@ -132,9 +126,7 @@ def get_tasks_per_node(launcher: t.Optional[str] = None) -> t.Dict[str, int]: :param launcher: Name of the WLM to use to collect allocation info. If no launcher is provided ``detect_launcher`` is used to select a launcher. 
- :type launcher: str | None :returns: Map of nodes to number of processes on that node - :rtype: dict[str, int] :raises SSUnsupportedError: User attempted to use an unsupported WLM """ if launcher is None: diff --git a/smartsim/wlm/pbs.py b/smartsim/wlm/pbs.py index eda5baf24..5b559c1e6 100644 --- a/smartsim/wlm/pbs.py +++ b/smartsim/wlm/pbs.py @@ -38,7 +38,6 @@ def get_hosts() -> t.List[str]: """Get the name of the hosts used in a PBS allocation. :returns: Names of the host nodes - :rtype: list[str] :raises SmartSimError: ``PBS_NODEFILE`` is not set """ hosts = [] @@ -59,7 +58,6 @@ def get_queue() -> str: """Get the name of queue in a PBS allocation. :returns: The name of the queue - :rtype: str :raises SmartSimError: ``PBS_QUEUE`` is not set """ if "PBS_QUEUE" in os.environ: @@ -76,7 +74,6 @@ def get_tasks() -> int: node from which it is run. :returns: Then number of tasks in the allocation - :rtype: int :raises LauncherError: Could not access ``qstat`` :raises SmartSimError: ``PBS_JOBID`` is not set """ @@ -103,8 +100,7 @@ def get_tasks_per_node() -> t.Dict[str, int]: This method requires ``qstat`` be installed on the node from which it is run. 
- :returns: Map of chunks to number of processes on that chunck - :rtype: dict[str, int] + :returns: Map of chunks to number of processes on that chunk :raises LauncherError: Could not access ``qstat`` :raises SmartSimError: ``PBS_JOBID`` is not set """ diff --git a/smartsim/wlm/slurm.py b/smartsim/wlm/slurm.py index 9308eea98..ae7299f28 100644 --- a/smartsim/wlm/slurm.py +++ b/smartsim/wlm/slurm.py @@ -31,7 +31,6 @@ from .._core.launcher.slurm.slurmCommands import salloc, scancel, scontrol, sinfo from .._core.launcher.slurm.slurmParser import parse_salloc, parse_salloc_error from .._core.launcher.util.launcherUtil import ComputeNode, Partition -from .._core.utils.helpers import init_default from ..error import ( AllocationError, LauncherError, @@ -60,31 +59,26 @@ def get_allocation( The options can be used to pass extra settings to the workload manager such as the following for Slurm: - - nodelist="nid00004" + - nodelist="nid00004" For arguments without a value, pass None or and empty string as the value. 
For Slurm: - - exclusive=None + - exclusive=None - :param nodes: number of nodes for the allocation, defaults to 1 - :type nodes: int, optional - :param time: wall time of the allocation, HH:MM:SS format, defaults to None - :type time: str, optional - :param account: account id for allocation, defaults to None - :type account: str, optional - :param options: additional options for the slurm wlm, defaults to None - :type options: dict[str, str], optional + :param nodes: number of nodes for the allocation + :param time: wall time of the allocation, HH:MM:SS format + :param account: account id for allocation + :param options: additional options for the slurm wlm :raises LauncherError: if the allocation is not successful :return: the id of the allocation - :rtype: str """ if not which("salloc"): raise LauncherError( "Attempted slurm function without access to slurm(salloc) at the call site" ) - options = init_default({}, options, dict) + options = options or {} salloc_args = _get_alloc_cmd(nodes, time, account, options=options) debug_msg = " ".join(salloc_args[1:]) @@ -108,7 +102,6 @@ def release_allocation(alloc_id: str) -> None: """Free an allocation's resources :param alloc_id: allocation id - :type alloc_id: str :raises LauncherError: if allocation could not be freed """ if not which("scancel"): @@ -137,15 +130,11 @@ def validate(nodes: int = 1, ppn: int = 1, partition: t.Optional[str] = None) -> if no partition is provided, the default partition is found and used. 
- :param nodes: Override the default node count to validate, defaults to 1 - :type nodes: int, optional - :param ppn: Override the default processes per node to validate, defaults to 1 - :type ppn: int, optional - :param partition: partition to validate, defaults to None - :type partition: str, optional + :param nodes: Override the default node count to validate + :param ppn: Override the default processes per node to validate + :param partition: partition to validate :raises: LauncherError :returns: True if resources are available, False otherwise - :rtype: bool """ sys_partitions = _get_system_partition_info() @@ -189,7 +178,6 @@ def get_default_partition() -> str: a star following its partition name in sinfo output :returns: the name of the default partition - :rtype: str """ sinfo_output, _ = sinfo(["--noheader", "--format", "%P"]) @@ -206,7 +194,6 @@ def get_default_partition() -> str: def _get_system_partition_info() -> t.Dict[str, Partition]: """Build a dictionary of slurm partitions :returns: dict of Partition objects - :rtype: dict """ sinfo_output, _ = sinfo(["--noheader", "--format", "%R %n %c"]) @@ -280,9 +267,7 @@ def _validate_time_format(time: str) -> str: By defualt the formatted wall time is the total number of seconds. :param time: number of hours to run job - :type time: str :returns: Formatted walltime - :rtype: str """ try: hours, minutes, seconds = map(int, time.split(":")) @@ -302,7 +287,6 @@ def get_hosts() -> t.List[str]: on which it is run :returns: Names of the host nodes - :rtype: list[str] :raises LauncherError: Could not access ``scontrol`` :raises SmartSimError: ``SLURM_JOB_NODELIST`` is not set """ @@ -325,7 +309,6 @@ def get_queue() -> str: """Get the name of queue in a slurm allocation. 
:returns: The name of the queue - :rtype: str :raises SmartSimError: ``SLURM_JOB_PARTITION`` is not set """ if job_partition := os.environ.get("SLURM_JOB_PARTITION", None): @@ -337,7 +320,6 @@ def get_tasks() -> int: """Get the number of tasks in a slurm allocation. :returns: Then number of tasks in the allocation - :rtype: int :raises SmartSimError: ``SLURM_NTASKS`` is not set """ if ntasks_str := os.environ.get("SLURM_NTASKS", 0): @@ -354,7 +336,6 @@ def get_tasks_per_node() -> t.Dict[str, int]: on which it is run :returns: Map of nodes to number of tasks on that node - :rtype: dict[str, int] :raises SmartSimError: ``SLURM_TASKS_PER_NODE`` is not set """ if "SLURM_TASKS_PER_NODE" in os.environ: diff --git a/tests/backends/test_cli_mini_exp.py b/tests/backends/test_cli_mini_exp.py index f02f44270..2fde2ff5f 100644 --- a/tests/backends/test_cli_mini_exp.py +++ b/tests/backends/test_cli_mini_exp.py @@ -31,6 +31,7 @@ import smartredis import smartsim._core._cli.validate +import smartsim._core._install.builder as build from smartsim._core.utils.helpers import installed_redisai_backends sklearn_available = True @@ -47,6 +48,7 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( + prepare_db, local_db, test_dir, monkeypatch, @@ -56,9 +58,11 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( to ensure that it does not accidentally report false positive/negatives """ + db = prepare_db(local_db).orchestrator + @contextmanager def _mock_make_managed_local_orc(*a, **kw): - (client_addr,) = local_db.get_address() + (client_addr,) = db.get_address() yield smartredis.Client(False, address=client_addr) monkeypatch.setattr( @@ -67,7 +71,7 @@ def _mock_make_managed_local_orc(*a, **kw): _mock_make_managed_local_orc, ) backends = installed_redisai_backends() - (db_port,) = local_db.ports + (db_port,) = db.ports smartsim._core._cli.validate.test_install( # Shouldn't matter bc we are stubbing creation of orc @@ -75,7 +79,7 @@ def _mock_make_managed_local_orc(*a, **kw): 
location=test_dir, port=db_port, # Always test on CPU, heads don't always have GPU - device="CPU", + device=build.Device.CPU, # Test the backends the dev has installed with_tf="tensorflow" in backends, with_pt="torch" in backends, diff --git a/tests/backends/test_dataloader.py b/tests/backends/test_dataloader.py index d02f3f33c..de4bf6d8e 100644 --- a/tests/backends/test_dataloader.py +++ b/tests/backends/test_dataloader.py @@ -35,7 +35,7 @@ from smartsim.experiment import Experiment from smartsim.log import get_logger from smartsim.ml.data import DataInfo, TrainingDataUploader -from smartsim.status import STATUS_COMPLETED +from smartsim.status import SmartSimStatus logger = get_logger(__name__) @@ -167,19 +167,16 @@ def train_tf(generator): @pytest.mark.skipif(not shouldrun_tf, reason="Test needs TensorFlow to run") -def test_tf_dataloaders(test_dir, wlmutils): - exp = Experiment( - "test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher() - ) - orc: Orchestrator = wlmutils.get_orchestrator() - exp.generate(orc) - exp.start(orc) +def test_tf_dataloaders(wlm_experiment, prepare_db, single_db, monkeypatch): + + db = prepare_db(single_db).orchestrator + orc = wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + monkeypatch.setenv("SSDB", orc.get_address()[0]) + monkeypatch.setenv("SSKEYIN", "test_uploader_0,test_uploader_1") try: - os.environ["SSDB"] = orc.get_address()[0] data_info = run_local_uploaders(mpi_size=2, format="tf") - os.environ["SSKEYIN"] = "test_uploader_0,test_uploader_1" for rank in range(2): tf_dynamic = TFDataGenerator( data_info_or_list_name="test_data_list", @@ -190,6 +187,7 @@ def test_tf_dataloaders(test_dir, wlmutils): batch_size=4, max_fetch_trials=5, dynamic=False, # catch wrong arg + wait_interval=0.1, ) train_tf(tf_dynamic) assert len(tf_dynamic) == 4 @@ -204,6 +202,7 @@ def test_tf_dataloaders(test_dir, wlmutils): batch_size=4, max_fetch_trials=5, dynamic=True, # catch wrong arg + wait_interval=0.1, ) 
train_tf(tf_static) assert len(tf_static) == 4 @@ -211,11 +210,6 @@ def test_tf_dataloaders(test_dir, wlmutils): except Exception as e: raise e - finally: - exp.stop(orc) - os.environ.pop("SSDB", "") - os.environ.pop("SSKEYIN", "") - os.environ.pop("SSKEYOUT", "") def create_trainer_torch(experiment: Experiment, filedir, wlmutils): @@ -234,20 +228,18 @@ def create_trainer_torch(experiment: Experiment, filedir, wlmutils): @pytest.mark.skipif(not shouldrun_torch, reason="Test needs Torch to run") -def test_torch_dataloaders(fileutils, test_dir, wlmutils): - exp = Experiment( - "test_tf_dataloaders", test_dir, launcher=wlmutils.get_test_launcher() - ) - orc: Orchestrator = wlmutils.get_orchestrator() +def test_torch_dataloaders( + wlm_experiment, prepare_db, single_db, fileutils, test_dir, wlmutils, monkeypatch +): config_dir = fileutils.get_test_dir_path("ml") - exp.generate(orc) - exp.start(orc) + db = prepare_db(single_db).orchestrator + orc = wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + monkeypatch.setenv("SSDB", orc.get_address()[0]) + monkeypatch.setenv("SSKEYIN", "test_uploader_0,test_uploader_1") try: - os.environ["SSDB"] = orc.get_address()[0] data_info = run_local_uploaders(mpi_size=2) - os.environ["SSKEYIN"] = "test_uploader_0,test_uploader_1" for rank in range(2): torch_dynamic = TorchDataGenerator( data_info_or_list_name="test_data_list", @@ -258,11 +250,12 @@ def test_torch_dataloaders(fileutils, test_dir, wlmutils): batch_size=4, max_fetch_trials=5, dynamic=False, # catch wrong arg - init_samples=True, # catch wrong arg + init_samples=True, + wait_interval=0.1, ) check_dataloader(torch_dynamic, rank, dynamic=True) - torch_dynamic.init_samples(5) + torch_dynamic.init_samples(5, 0.1) for _ in range(2): for _ in torch_dynamic: continue @@ -278,26 +271,22 @@ def test_torch_dataloaders(fileutils, test_dir, wlmutils): max_fetch_trials=5, dynamic=True, # catch wrong arg init_samples=True, # catch wrong arg + wait_interval=0.1, ) 
check_dataloader(torch_static, rank, dynamic=False) - torch_static.init_samples(5) + torch_static.init_samples(5, 0.1) for _ in range(2): for _ in torch_static: continue - trainer = create_trainer_torch(exp, config_dir, wlmutils) - exp.start(trainer, block=True) + trainer = create_trainer_torch(wlm_experiment, config_dir, wlmutils) + wlm_experiment.start(trainer, block=True) - assert exp.get_status(trainer)[0] == STATUS_COMPLETED + assert wlm_experiment.get_status(trainer)[0] == SmartSimStatus.STATUS_COMPLETED except Exception as e: raise e - finally: - exp.stop(orc) - os.environ.pop("SSDB", "") - os.environ.pop("SSKEYIN", "") - os.environ.pop("SSKEYOUT", "") def test_data_info_repr(): @@ -331,15 +320,9 @@ def test_data_info_repr(): @pytest.mark.skipif( not (shouldrun_torch or shouldrun_tf), reason="Requires TF or PyTorch" ) -def test_wrong_dataloaders(test_dir, wlmutils): - exp = Experiment( - "test-wrong-dataloaders", - exp_path=test_dir, - launcher=wlmutils.get_test_launcher(), - ) - orc = wlmutils.get_orchestrator() - exp.generate(orc) - exp.start(orc) +def test_wrong_dataloaders(wlm_experiment, prepare_db, single_db): + db = prepare_db(single_db).orchestrator + orc = wlm_experiment.reconnect_orchestrator(db.checkpoint_file) if shouldrun_tf: with pytest.raises(SSInternalError): @@ -365,5 +348,3 @@ def test_wrong_dataloaders(test_dir, wlmutils): cluster=False, ) torch_data_gen.init_samples(init_trials=1) - - exp.stop(orc) diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 75e9f515d..6155b6884 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -29,12 +29,13 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends from smartsim.entity import Ensemble from smartsim.entity.dbobject import DBModel from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger +from smartsim.status 
import SmartSimStatus logger = get_logger(__name__) @@ -145,36 +146,30 @@ def save_torch_cnn(path, file_name): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): +def test_tf_db_model( + wlm_experiment, prepare_db, single_db, fileutils, test_dir, mlutils +): """Test TensorFlow DB Models on remote DB""" - # Set experiment name - exp_name = "test-tf-db-model" - # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = 1 # TF backend fails on multiple GPUs test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - # Create the SmartSim Experiment - exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create RunSettings - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings = wlm_experiment.create_run_settings( + exe=sys.executable, exe_args=test_script + ) run_settings.set_nodes(1) run_settings.set_tasks(1) # Create Model - smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model = wlm_experiment.create_model("smartsim_model", run_settings) # Create database - host = wlmutils.choose_host(run_settings) - db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db) + db = prepare_db(single_db).orchestrator + wlm_experiment.reconnect_orchestrator(db.checkpoint_file) # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() @@ -211,50 +206,41 @@ def test_tf_db_model(fileutils, test_dir, wlmutils, mlutils): # Assert we have added both models assert len(smartsim_model._db_models) == 2 - exp.generate(smartsim_model) + wlm_experiment.generate(smartsim_model) # Launch and check successful completion - try: - exp.start(db, smartsim_model, block=True) - 
statuses = exp.get_status(smartsim_model) - assert all( - stat == status.STATUS_COMPLETED for stat in statuses - ), f"Statuses: {statuses}" - finally: - exp.stop(db) + wlm_experiment.start(smartsim_model, block=True) + statuses = wlm_experiment.get_status(smartsim_model) + assert all( + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): +def test_pt_db_model( + wlm_experiment, prepare_db, single_db, fileutils, test_dir, mlutils +): """Test PyTorch DB Models on remote DB""" - # Set experiment name - exp_name = "test-pt-db-model" - # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_pt_dbmodel_smartredis.py") - # Create the SmartSim Experiment - exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create RunSettings - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings = wlm_experiment.create_run_settings( + exe=sys.executable, exe_args=test_script + ) run_settings.set_nodes(1) run_settings.set_tasks(1) # Create Model - smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model = wlm_experiment.create_model("smartsim_model", run_settings) # Create database - host = wlmutils.choose_host(run_settings) - db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db) + db = prepare_db(single_db).orchestrator + wlm_experiment.reconnect_orchestrator(db.checkpoint_file) # Create and save ML model to filesystem save_torch_cnn(test_dir, "model1.pt") @@ -278,55 +264,46 @@ def 
test_pt_db_model(fileutils, test_dir, wlmutils, mlutils): # Assert we have added both models assert len(smartsim_model._db_models) == 1 - exp.generate(smartsim_model) + wlm_experiment.generate(smartsim_model) # Launch and check successful completion - try: - exp.start(db, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - assert all( - stat == status.STATUS_COMPLETED for stat in statuses - ), f"Statuses: {statuses}" - finally: - exp.stop(db) + wlm_experiment.start(smartsim_model, block=True) + statuses = wlm_experiment.get_status(smartsim_model) + assert all( + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): +def test_db_model_ensemble( + wlm_experiment, prepare_db, single_db, fileutils, test_dir, wlmutils, mlutils +): """Test DBModels on remote DB, with an ensemble""" - # Set experiment name - exp_name = "test-db-model-ensemble" - # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = 1 # TF backend fails on multiple GPUs test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") - # Create the SmartSim Experiment - exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create RunSettings - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings = wlm_experiment.create_run_settings( + exe=sys.executable, exe_args=test_script + ) run_settings.set_nodes(1) run_settings.set_tasks(1) # Create ensemble - smartsim_ensemble = exp.create_ensemble( + smartsim_ensemble = wlm_experiment.create_ensemble( "smartsim_model", run_settings=run_settings, replicas=2 ) # Create Model - smartsim_model = 
exp.create_model("smartsim_model", run_settings) + smartsim_model = wlm_experiment.create_model("smartsim_model", run_settings) # Create database - host = wlmutils.choose_host(run_settings) - db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db) + db = prepare_db(single_db).orchestrator + wlm_experiment.reconnect_orchestrator(db.checkpoint_file) # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() @@ -379,17 +356,14 @@ def test_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): # Assert we have added two models to each entity assert all([len(entity._db_models) == 2 for entity in smartsim_ensemble]) - exp.generate(smartsim_ensemble) + wlm_experiment.generate(smartsim_ensemble) # Launch and check successful completion - try: - exp.start(db, smartsim_ensemble, block=True) - statuses = exp.get_status(smartsim_ensemble) - assert all( - stat == status.STATUS_COMPLETED for stat in statuses - ), f"Statuses: {statuses}" - finally: - exp.stop(db) + wlm_experiment.start(smartsim_ensemble, block=True) + statuses = wlm_experiment.get_status(smartsim_ensemble) + assert all( + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses + ), f"Statuses: {statuses}" @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") @@ -458,7 +432,7 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: exp.stop(colo_model) @@ -518,7 +492,7 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" 
finally: exp.stop(colo_model) @@ -557,7 +531,6 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): # Create a third model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( port=test_port, db_cpus=1, debug=True, ifname=test_interface ) @@ -620,7 +593,7 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: exp.stop(colo_ensemble) @@ -724,7 +697,7 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" finally: exp.stop(colo_ensemble) @@ -756,7 +729,6 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( port=test_port, db_cpus=1, debug=True, ifname=test_interface ) @@ -813,7 +785,6 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): colo_ensemble2 = exp.create_ensemble( "colocated_ens", run_settings=colo_settings2, replicas=2 ) - colo_ensemble2.set_path(test_dir) colo_ensemble2.add_ml_model( "cnn", "TF", diff --git a/tests/backends/test_dbscript.py b/tests/backends/test_dbscript.py index 2bffd1da6..2c04bf5db 100644 --- a/tests/backends/test_dbscript.py +++ b/tests/backends/test_dbscript.py @@ -30,12 +30,13 @@ import pytest from smartredis import * -from smartsim import Experiment, status +from smartsim import 
Experiment from smartsim._core.utils import installed_redisai_backends from smartsim.entity.dbobject import DBScript from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger from smartsim.settings import MpiexecSettings, MpirunSettings +from smartsim.status import SmartSimStatus logger = get_logger(__name__) @@ -56,37 +57,29 @@ def timestwo(x): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script(fileutils, test_dir, wlmutils, mlutils): +def test_db_script(wlm_experiment, prepare_db, single_db, fileutils, mlutils): """Test DB scripts on remote DB""" - # Set experiment name - exp_name = "test-db-script" - - # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - # Create the SmartSim Experiment - exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create the RunSettings - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings = wlm_experiment.create_run_settings( + exe=sys.executable, exe_args=test_script + ) run_settings.set_nodes(1) run_settings.set_tasks(1) # Create the SmartSim Model - smartsim_model = exp.create_model("smartsim_model", run_settings) + smartsim_model = wlm_experiment.create_model("smartsim_model", run_settings) # Create the SmartSim database - host = wlmutils.choose_host(run_settings) - db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db, smartsim_model) + db = prepare_db(single_db).orchestrator + wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + 
wlm_experiment.generate(smartsim_model) # Define the torch script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" @@ -122,51 +115,42 @@ def test_db_script(fileutils, test_dir, wlmutils, mlutils): assert len(smartsim_model._db_scripts) == 3 # Launch and check successful completion - try: - exp.start(db, smartsim_model, block=True) - statuses = exp.get_status(smartsim_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) - finally: - exp.stop(db) + wlm_experiment.start(smartsim_model, block=True) + statuses = wlm_experiment.get_status(smartsim_model) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): +def test_db_script_ensemble(wlm_experiment, prepare_db, single_db, fileutils, mlutils): """Test DB scripts on remote DB""" - # Set experiment name - exp_name = "test-db-script" + # Set wlm_experimenteriment name + wlm_experiment_name = "test-db-script" # Retrieve parameters from testing environment - test_launcher = wlmutils.get_test_launcher() - test_interface = wlmutils.get_test_interface() - test_port = wlmutils.get_test_port() test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") torch_script = fileutils.get_test_conf_path("torchscript.py") - # Create SmartSim Experiment - exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create RunSettings - run_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + run_settings = wlm_experiment.create_run_settings( + exe=sys.executable, exe_args=test_script + ) run_settings.set_nodes(1) run_settings.set_tasks(1) + db = prepare_db(single_db).orchestrator + wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + # Create Ensemble with two 
identical models - ensemble = exp.create_ensemble( + ensemble = wlm_experiment.create_ensemble( "dbscript_ensemble", run_settings=run_settings, replicas=2 ) # Create SmartSim model - smartsim_model = exp.create_model("smartsim_model", run_settings) - - # Create SmartSim database - host = wlmutils.choose_host(run_settings) - db = exp.create_database(port=test_port, interface=test_interface, hosts=host) - exp.generate(db) + smartsim_model = wlm_experiment.create_model("smartsim_model", run_settings) # Create the script string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" @@ -216,14 +200,11 @@ def test_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): # Assert we have added all three models to entities in ensemble assert all([len(entity._db_scripts) == 3 for entity in ensemble]) - exp.generate(ensemble) + wlm_experiment.generate(ensemble) - try: - exp.start(db, ensemble, block=True) - statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) - finally: - exp.stop(db) + wlm_experiment.start(ensemble, block=True) + statuses = wlm_experiment.get_status(ensemble) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") @@ -288,7 +269,7 @@ def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): try: exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: exp.stop(colo_model) @@ -388,7 +369,7 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): try: exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: exp.stop(colo_ensemble) @@ 
-486,7 +467,7 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m try: exp.start(colo_ensemble, block=True) statuses = exp.get_status(colo_ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) finally: exp.stop(colo_ensemble) diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py index 7c0e97e41..29771bb1c 100644 --- a/tests/backends/test_onnx.py +++ b/tests/backends/test_onnx.py @@ -25,13 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +import sys from pathlib import Path import pytest from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends -from smartsim.status import STATUS_FAILED +from smartsim.status import SmartSimStatus sklearn_available = True try: @@ -56,7 +57,7 @@ ) -def test_sklearn_onnx(test_dir, mlutils, wlmutils): +def test_sklearn_onnx(wlm_experiment, prepare_db, single_db, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 some sklearn models here we test the following sklearn models: @@ -73,29 +74,24 @@ def test_sklearn_onnx(test_dir, mlutils, wlmutils): You may need to put CUDNN in your LD_LIBRARY_PATH if running on GPU """ - - exp_name = "test_sklearn_onnx" - - exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() + db = prepare_db(single_db).orchestrator + wlm_experiment.reconnect_orchestrator(db.checkpoint_file) - db = wlmutils.get_orchestrator(nodes=1) - db.set_path(test_dir) - exp.start(db) - - run_settings = exp.create_run_settings( - "python", f"run_sklearn_onnx.py --device={test_device}" + run_settings = wlm_experiment.create_run_settings( + sys.executable, f"run_sklearn_onnx.py --device={test_device}" ) - model = exp.create_model("onnx_models", run_settings) + if wlmutils.get_test_launcher() != "local": + 
run_settings.set_tasks(1) + model = wlm_experiment.create_model("onnx_models", run_settings) script_dir = os.path.dirname(os.path.abspath(__file__)) script_path = Path(script_dir, "run_sklearn_onnx.py").resolve() model.attach_generator_files(to_copy=str(script_path)) - exp.generate(model) + wlm_experiment.generate(model) - exp.start(model, block=True) + wlm_experiment.start(model, block=True) - exp.stop(db) # if model failed, test will fail - model_status = exp.get_status(model) - assert model_status[0] != STATUS_FAILED + model_status = wlm_experiment.get_status(model) + assert model_status[0] != SmartSimStatus.STATUS_FAILED diff --git a/tests/backends/test_tf.py b/tests/backends/test_tf.py index af04c89cb..adf0e9daa 100644 --- a/tests/backends/test_tf.py +++ b/tests/backends/test_tf.py @@ -32,7 +32,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends from smartsim.error import SmartSimError -from smartsim.status import STATUS_FAILED +from smartsim.status import SmartSimStatus tf_available = True try: @@ -50,7 +50,7 @@ (not tf_backend_available) or (not tf_available), reason="Requires RedisAI TF backend", ) -def test_keras_model(test_dir, mlutils, wlmutils): +def test_keras_model(wlm_experiment, prepare_db, single_db, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 for a keras model script this test can run on CPU/GPU by setting SMARTSIM_TEST_DEVICE=GPU @@ -60,34 +60,28 @@ def test_keras_model(test_dir, mlutils, wlmutils): You may need to put CUDNN in your LD_LIBRARY_PATH if running on GPU """ - exp_name = "test_keras_model" - - exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) test_device = mlutils.get_test_device() + db = prepare_db(single_db).orchestrator + wlm_experiment.reconnect_orchestrator(db.checkpoint_file) - db = wlmutils.get_orchestrator(nodes=1) - db.set_path(test_dir) - exp.start(db) - - run_settings = exp.create_run_settings( + run_settings = 
wlm_experiment.create_run_settings( "python", f"run_tf.py --device={test_device}" ) if wlmutils.get_test_launcher() != "local": run_settings.set_tasks(1) - model = exp.create_model("tf_script", run_settings) + model = wlm_experiment.create_model("tf_script", run_settings) script_dir = os.path.dirname(os.path.abspath(__file__)) script_path = Path(script_dir, "run_tf.py").resolve() model.attach_generator_files(to_copy=str(script_path)) - exp.generate(model) + wlm_experiment.generate(model) - exp.start(model, block=True) + wlm_experiment.start(model, block=True) - exp.stop(db) # if model failed, test will fail - model_status = exp.get_status(model)[0] - assert model_status != STATUS_FAILED + model_status = wlm_experiment.get_status(model)[0] + assert model_status != SmartSimStatus.STATUS_FAILED def create_tf_model(): diff --git a/tests/backends/test_torch.py b/tests/backends/test_torch.py index 76a989a2e..c995f76ca 100644 --- a/tests/backends/test_torch.py +++ b/tests/backends/test_torch.py @@ -31,7 +31,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends -from smartsim.status import STATUS_FAILED +from smartsim.status import SmartSimStatus torch_available = True try: @@ -48,7 +48,9 @@ ) -def test_torch_model_and_script(test_dir, mlutils, wlmutils): +def test_torch_model_and_script( + wlm_experiment, prepare_db, single_db, mlutils, wlmutils +): """This test needs two free nodes, 1 for the db and 1 for a torch model script Here we test both the torchscipt API and the NN API from torch @@ -60,30 +62,24 @@ def test_torch_model_and_script(test_dir, mlutils, wlmutils): You may need to put CUDNN in your LD_LIBRARY_PATH if running on GPU """ - exp_name = "test_torch_model_and_script" - - exp = Experiment(exp_name, exp_path=test_dir, launcher=wlmutils.get_test_launcher()) + db = prepare_db(single_db).orchestrator + wlm_experiment.reconnect_orchestrator(db.checkpoint_file) test_device = mlutils.get_test_device() - db = 
wlmutils.get_orchestrator(nodes=1) - db.set_path(test_dir) - exp.start(db) - - run_settings = exp.create_run_settings( + run_settings = wlm_experiment.create_run_settings( "python", f"run_torch.py --device={test_device}" ) if wlmutils.get_test_launcher() != "local": run_settings.set_tasks(1) - model = exp.create_model("torch_script", run_settings) + model = wlm_experiment.create_model("torch_script", run_settings) script_dir = os.path.dirname(os.path.abspath(__file__)) script_path = Path(script_dir, "run_torch.py").resolve() model.attach_generator_files(to_copy=str(script_path)) - exp.generate(model) + wlm_experiment.generate(model) - exp.start(model, block=True) + wlm_experiment.start(model, block=True) - exp.stop(db) # if model failed, test will fail - model_status = exp.get_status(model)[0] - assert model_status != STATUS_FAILED + model_status = wlm_experiment.get_status(model)[0] + assert model_status != SmartSimStatus.STATUS_FAILED diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index c69b1746a..fd8017c7c 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -28,8 +28,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.settings import QsubBatchSettings +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -44,7 +45,10 @@ def add_batch_resources(wlmutils, batch_settings): if isinstance(batch_settings, QsubBatchSettings): for key, value in wlmutils.get_batch_resources().items(): - batch_settings.set_resource(key, value) + if key == "queue": + batch_settings.set_queue(value) + else: + batch_settings.set_resource(key, value) def test_batch_model(fileutils, test_dir, wlmutils): @@ -54,7 +58,7 @@ def test_batch_model(fileutils, test_dir, wlmutils): exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), 
exp_path=test_dir) script = fileutils.get_test_conf_path("sleep.py") - batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") + batch_settings = exp.create_batch_settings(nodes=1, time="00:05:00") batch_settings.set_account(wlmutils.get_test_account()) add_batch_resources(wlmutils, batch_settings) @@ -62,12 +66,12 @@ def test_batch_model(fileutils, test_dir, wlmutils): model = exp.create_model( "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings ) - model.set_path(test_dir) + exp.generate(model) exp.start(model, block=True) statuses = exp.get_status(model) assert len(statuses) == 1 - assert statuses[0] == status.STATUS_COMPLETED + assert statuses[0] == SmartSimStatus.STATUS_COMPLETED def test_batch_ensemble(fileutils, test_dir, wlmutils): @@ -88,11 +92,11 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils): ensemble = exp.create_ensemble("batch-ens", batch_settings=batch) ensemble.add_model(M1) ensemble.add_model(M2) - ensemble.set_path(test_dir) + exp.generate(ensemble) exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils): @@ -109,8 +113,7 @@ def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils): ensemble = exp.create_ensemble( "batch-ens-replicas", batch_settings=batch, run_settings=settings, replicas=2 ) - ensemble.set_path(test_dir) exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index 058aef895..2a5627d6d 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ 
b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -25,11 +25,14 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os.path as osp +import pathlib import time import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.settings.pbsSettings import QsubBatchSettings +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -41,6 +44,15 @@ ) +def add_batch_resources(wlmutils, batch_settings): + if isinstance(batch_settings, QsubBatchSettings): + for key, value in wlmutils.get_batch_resources().items(): + if key == "queue": + batch_settings.set_queue(value) + else: + batch_settings.set_resource(key, value) + + def test_launch_orc_auto_batch(test_dir, wlmutils): """test single node orchestrator""" launcher = wlmutils.get_test_launcher() @@ -58,21 +70,22 @@ def test_launch_orc_auto_batch(test_dir, wlmutils): ) orc.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, orc.batch_settings) - orc.batch_settings.set_walltime("00:02:00") + orc.batch_settings.set_walltime("00:05:00") orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) # don't use assert so that we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) def test_launch_cluster_orc_batch_single(test_dir, wlmutils): @@ -94,21 +107,22 @@ def test_launch_cluster_orc_batch_single(test_dir, wlmutils): ) orc.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, orc.batch_settings) - orc.batch_settings.set_walltime("00:02:00") + orc.batch_settings.set_walltime("00:05:00") orc.set_path(test_dir) 
exp.start(orc, block=True) statuses = exp.get_status(orc) # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): @@ -130,63 +144,88 @@ def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): ) orc.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, orc.batch_settings) - orc.batch_settings.set_walltime("00:03:00") + orc.batch_settings.set_walltime("00:05:00") orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) def test_launch_cluster_orc_reconnect(test_dir, wlmutils): """test reconnecting to clustered 3-node orchestrator""" + p_test_dir = pathlib.Path(test_dir) launcher = wlmutils.get_test_launcher() exp_name = "test-launch-cluster-orc-batch-reconect" - exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + exp_1_dir = p_test_dir / exp_name + exp_1_dir.mkdir() + exp = Experiment(exp_name, launcher=launcher, exp_path=str(exp_1_dir)) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() orc = exp.create_database( wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface ) - orc.set_path(test_dir) orc.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, 
orc.batch_settings) - orc.batch_settings.set_walltime("00:03:00") + orc.batch_settings.set_walltime("00:05:00") exp.start(orc, block=True) statuses = exp.get_status(orc) - # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: + try: + assert all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses) + except Exception: exp.stop(orc) - assert False - - exp.stop(orc) + raise exp_name = "test-orc-cluster-orc-batch-reconnect-2nd" - exp_2 = Experiment(exp_name, launcher=launcher) - - checkpoint = osp.join(test_dir, "smartsim_db.dat") - reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) - - # let statuses update once - time.sleep(5) - - statuses = exp_2.get_status(reloaded_orc) - for stat in statuses: - if stat == status.STATUS_FAILED: - exp_2.stop(reloaded_orc) - assert False - - exp_2.stop(reloaded_orc) + exp_2_dir = p_test_dir / exp_name + exp_2_dir.mkdir() + exp_2 = Experiment(exp_name, launcher=launcher, exp_path=str(exp_2_dir)) + + try: + checkpoint = osp.join(orc.path, "smartsim_db.dat") + reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) + + # let statuses update once + time.sleep(5) + + statuses = exp_2.get_status(reloaded_orc) + assert all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses) + except Exception: + # Something went wrong! Let the experiment that started the DB + # clean up the DB + exp.stop(orc) + raise + + try: + # Test experiment 2 can stop the DB + exp_2.stop(reloaded_orc) + assert all( + stat == SmartSimStatus.STATUS_CANCELLED + for stat in exp_2.get_status(reloaded_orc) + ) + except Exception: + # Something went wrong! 
Let the experiment that started the DB + # clean up the DB + exp.stop(orc) + raise + else: + # Ensure it is the same DB that Experiment 1 was tracking + time.sleep(5) + assert not any( + stat == SmartSimStatus.STATUS_RUNNING for stat in exp.get_status(orc) + ) diff --git a/tests/full_wlm/test_mpmd.py b/tests/full_wlm/test_mpmd.py index 7f6cc2ea2..0167a8f08 100644 --- a/tests/full_wlm/test_mpmd.py +++ b/tests/full_wlm/test_mpmd.py @@ -28,8 +28,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim._core.utils.helpers import is_valid_cmd +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -86,11 +87,13 @@ def prune_commands(launcher): settings.make_mpmd(deepcopy(settings)) settings.make_mpmd(deepcopy(settings)) - mpmd_model = exp.create_model("mmpd", path=test_dir, run_settings=settings) + mpmd_model = exp.create_model( + f"mpmd-{run_command}", path=test_dir, run_settings=settings + ) exp.start(mpmd_model, block=True) statuses = exp.get_status(mpmd_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) exp.start(mpmd_model, block=True) statuses = exp.get_status(mpmd_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/full_wlm/test_symlinking.py b/tests/full_wlm/test_symlinking.py new file mode 100644 index 000000000..c5b5b90ba --- /dev/null +++ b/tests/full_wlm/test_symlinking.py @@ -0,0 +1,180 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import pathlib +import time + +import pytest + +from smartsim import Experiment + +if pytest.test_launcher not in pytest.wlm_options: + pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") + + +def test_batch_model_and_ensemble(test_dir, wlmutils): + exp_name = "test-batch" + launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + rs = exp.create_run_settings("echo", ["spam", "eggs"]) + bs = exp.create_batch_settings() + + test_model = exp.create_model( + "test_model", path=test_dir, run_settings=rs, batch_settings=bs + ) + exp.generate(test_model) + exp.start(test_model, block=True) + + assert pathlib.Path(test_model.path).exists() + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.out"), True) + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.err"), False) + _should_not_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.sh")) + + test_ensemble = exp.create_ensemble( + "test_ensemble", params={}, batch_settings=bs, run_settings=rs, replicas=3 + ) + exp.generate(test_ensemble) + exp.start(test_ensemble, block=True) + + assert pathlib.Path(test_ensemble.path).exists() + for i in range(len(test_ensemble.models)): + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.out", + ), + True, + ) + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.err", + ), + False, + ) + + _should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) + + +def test_batch_ensemble_symlinks(test_dir, wlmutils): + exp_name = "test-batch-ensemble" + launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + rs = exp.create_run_settings("echo", ["spam", "eggs"]) + bs = exp.create_batch_settings() + test_ensemble = exp.create_ensemble( + "test_ensemble", 
params={}, batch_settings=bs, run_settings=rs, replicas=3 + ) + exp.generate(test_ensemble) + exp.start(test_ensemble, block=True) + + for i in range(len(test_ensemble.models)): + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.out", + ), + True, + ) + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.err", + ), + False, + ) + + _should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) + + +def test_batch_model_symlinks(test_dir, wlmutils): + exp_name = "test-batch-model" + launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + rs = exp.create_run_settings("echo", ["spam", "eggs"]) + bs = exp.create_batch_settings() + test_model = exp.create_model( + "test_model", path=test_dir, run_settings=rs, batch_settings=bs + ) + exp.generate(test_model) + exp.start(test_model, block=True) + + assert pathlib.Path(test_model.path).exists() + + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.out"), True) + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.err"), False) + _should_not_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.sh")) + + +def test_batch_orchestrator_symlinks(test_dir, wlmutils): + exp_name = "test-batch-orc" + launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + port = 2424 + db = exp.create_database( + db_nodes=3, + port=port, + batch=True, + interface=wlmutils.get_test_interface(), + single_cmd=False, + ) + exp.generate(db) + exp.start(db, block=True) + time.sleep(2) + exp.stop(db) + + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}.out"), False) + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}.err"), False) + + for i in range(db.db_nodes): + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.out"), False) + 
_should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.err"), False) + _should_not_be_symlinked( + pathlib.Path(db.path, f"nodes-orchestrator_{i}-{port}.conf") + ) + + +def _should_not_be_symlinked(non_linked_path: pathlib.Path): + """Helper function for assertions about paths that should NOT be symlinked""" + assert non_linked_path.exists() + assert not non_linked_path.is_symlink() + + +def _should_be_symlinked(linked_path: pathlib.Path, open_file: bool): + """Helper function for assertions about paths that SHOULD be symlinked""" + assert linked_path.exists() + assert linked_path.is_symlink() + # ensure the source file exists + assert pathlib.Path(os.readlink(linked_path)).exists() + if open_file: + with open(pathlib.Path(os.readlink(linked_path)), "r") as file: + log_contents = file.read() + assert "spam eggs" in log_contents diff --git a/tests/install/test_builder.py b/tests/install/test_builder.py index 5e6c8e597..feaf7e54f 100644 --- a/tests/install/test_builder.py +++ b/tests/install/test_builder.py @@ -27,8 +27,7 @@ import functools import pathlib -import platform -import threading +import textwrap import time import pytest @@ -41,7 +40,9 @@ RAI_VERSIONS = RedisAIVersion("1.2.7") -for_each_device = pytest.mark.parametrize("device", ["cpu", "gpu"]) +for_each_device = pytest.mark.parametrize( + "device", [build.Device.CPU, build.Device.GPU] +) _toggle_build_optional_backend = lambda backend: pytest.mark.parametrize( f"build_{backend}", @@ -163,7 +164,7 @@ def test_rai_builder_will_add_dep_if_backend_requested_wo_duplicates( rai_builder = build.RedisAIBuilder( build_tf=build_tf, build_torch=build_pt, build_onnx=build_ort ) - requested_backends = rai_builder._get_deps_to_fetch_for(device) + requested_backends = rai_builder._get_deps_to_fetch_for(build.Device(device)) assert dlpack_dep_presence(requested_backends) assert tf_dep_presence(build_tf, requested_backends) assert pt_dep_presence(build_pt, requested_backends) @@ -212,7 +213,7 @@ def 
test_rai_builder_raises_if_it_fetches_an_unexpected_number_of_ml_deps( build.BuildError, match=r"Expected to place \d+ dependencies, but only found \d+", ): - rai_builder._fetch_deps_for("cpu") + rai_builder._fetch_deps_for(build.Device.CPU) def test_threaded_map(): @@ -251,18 +252,24 @@ def test_PTArchiveMacOSX_url(): arch = build.Architecture.X64 pt_version = RAI_VERSIONS.torch - pt_linux_cpu = build._PTArchiveLinux(build.Architecture.X64, "cpu", pt_version) + pt_linux_cpu = build._PTArchiveLinux( + build.Architecture.X64, build.Device.CPU, pt_version, False + ) x64_prefix = "https://download.pytorch.org/libtorch/" assert x64_prefix in pt_linux_cpu.url - pt_macosx_cpu = build._PTArchiveMacOSX(build.Architecture.ARM64, "cpu", pt_version) + pt_macosx_cpu = build._PTArchiveMacOSX( + build.Architecture.ARM64, build.Device.CPU, pt_version, False + ) arm64_prefix = "https://github.com/CrayLabs/ml_lib_builder/releases/download/" assert arm64_prefix in pt_macosx_cpu.url def test_PTArchiveMacOSX_gpu_error(): with pytest.raises(build.BuildError, match="support GPU on Mac OSX"): - build._PTArchiveMacOSX(build.Architecture.ARM64, "gpu", RAI_VERSIONS.torch).url + build._PTArchiveMacOSX( + build.Architecture.ARM64, build.Device.GPU, RAI_VERSIONS.torch, False + ).url def test_valid_platforms(): @@ -362,3 +369,36 @@ def test_valid_platforms(): ) def test_git_commands_are_configered_correctly_for_platforms(plat, cmd, expected_cmd): assert build.config_git_command(plat, cmd) == expected_cmd + + +def test_modify_source_files(p_test_dir): + def make_text_blurb(food): + return textwrap.dedent(f"""\ + My favorite food is {food} + {food} is an important part of a healthy breakfast + {food} {food} {food} {food} + This line should be unchanged! 
+ --> {food} <-- + """) + + original_word = "SPAM" + mutated_word = "EGGS" + + source_files = [] + for i in range(3): + source_file = p_test_dir / f"test_{i}" + source_file.touch() + source_file.write_text(make_text_blurb(original_word)) + source_files.append(source_file) + # Modify a single file + build._modify_source_files(source_files[0], original_word, mutated_word) + assert source_files[0].read_text() == make_text_blurb(mutated_word) + assert source_files[1].read_text() == make_text_blurb(original_word) + assert source_files[2].read_text() == make_text_blurb(original_word) + + # Modify multiple files + build._modify_source_files( + (source_files[1], source_files[2]), original_word, mutated_word + ) + assert source_files[1].read_text() == make_text_blurb(mutated_word) + assert source_files[2].read_text() == make_text_blurb(mutated_word) diff --git a/tests/on_wlm/test_base_settings_on_wlm.py b/tests/on_wlm/test_base_settings_on_wlm.py index 0b31eedd2..77bebd524 100644 --- a/tests/on_wlm/test_base_settings_on_wlm.py +++ b/tests/on_wlm/test_base_settings_on_wlm.py @@ -28,7 +28,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus """ Test the launch and stop of models and ensembles using base @@ -54,7 +55,7 @@ def test_model_on_wlm(fileutils, test_dir, wlmutils): for _ in range(2): exp.start(M1, M2, block=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_model_stop_on_wlm(fileutils, test_dir, wlmutils): @@ -74,4 +75,4 @@ def test_model_stop_on_wlm(fileutils, test_dir, wlmutils): assert M1.name in exp._control._jobs.completed assert M2.name in exp._control._jobs.completed statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in 
statuses]) diff --git a/tests/on_wlm/test_colocated_model.py b/tests/on_wlm/test_colocated_model.py index 8baf74bf4..97a47542d 100644 --- a/tests/on_wlm/test_colocated_model.py +++ b/tests/on_wlm/test_colocated_model.py @@ -28,8 +28,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.entity import Model +from smartsim.status import SmartSimStatus if sys.platform == "darwin": supported_dbs = ["tcp", "deprecated"] @@ -60,14 +61,14 @@ def test_launch_colocated_model_defaults(fileutils, test_dir, coloutils, db_type exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" # test restarting the colocated model exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" @@ -91,7 +92,7 @@ def test_colocated_model_disable_pinning(fileutils, test_dir, coloutils, db_type exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" @@ -114,7 +115,7 @@ def test_colocated_model_pinning_auto_2cpu(fileutils, test_dir, coloutils, db_ty exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" @@ -139,7 +140,7 @@ def test_colocated_model_pinning_range(fileutils, test_dir, coloutils, db_type): exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == 
SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" @@ -164,7 +165,7 @@ def test_colocated_model_pinning_list(fileutils, test_dir, coloutils, db_type): exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" @@ -189,5 +190,5 @@ def test_colocated_model_pinning_mixed(fileutils, test_dir, coloutils, db_type): exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses: {statuses}" diff --git a/tests/on_wlm/test_containers_wlm.py b/tests/on_wlm/test_containers_wlm.py index 8dc4baae0..21f1e1c5e 100644 --- a/tests/on_wlm/test_containers_wlm.py +++ b/tests/on_wlm/test_containers_wlm.py @@ -28,9 +28,10 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.entity import Ensemble from smartsim.settings.containers import Singularity +from smartsim.status import SmartSimStatus """Test SmartRedis container integration on a supercomputer with a WLM.""" @@ -49,10 +50,9 @@ def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): """ launcher = wlmutils.get_test_launcher() - print(launcher) - if launcher not in ["pbs", "slurm"]: + if launcher not in ["pbs", "slurm", "dragon"]: pytest.skip( - f"Test only runs on systems with PBS or Slurm as WLM. Current launcher: {launcher}" + f"Test only runs on systems with PBS, Dragon, or Slurm as WLM. 
Current launcher: {launcher}" ) exp = Experiment( @@ -92,7 +92,7 @@ def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): # get and confirm statuses statuses = exp.get_status(ensemble) - if not all([stat == status.STATUS_COMPLETED for stat in statuses]): + if not all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]): exp.stop(orc) assert False # client ensemble failed diff --git a/tests/on_wlm/test_dragon.py b/tests/on_wlm/test_dragon.py new file mode 100644 index 000000000..a05d38141 --- /dev/null +++ b/tests/on_wlm/test_dragon.py @@ -0,0 +1,94 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest + +from smartsim import Experiment +from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher +from smartsim.status import SmartSimStatus + +# retrieved from pytest fixtures +if pytest.test_launcher != "dragon": + pytestmark = pytest.mark.skip(reason="Test is only for Dragon WLM systems") + + +def test_dragon_global_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch): + monkeypatch.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) + exp: Experiment = Experiment( + "test_dragon_connection", + exp_path=test_dir, + launcher=wlmutils.get_test_launcher(), + ) + rs = exp.create_run_settings(exe="sleep", exe_args=["1"]) + model = exp.create_model("sleep", run_settings=rs) + + exp.generate(model) + exp.start(model, block=True) + + try: + assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED + finally: + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() + + +def test_dragon_exp_path(global_dragon_teardown, wlmutils, test_dir, monkeypatch): + monkeypatch.delenv("SMARTSIM_DRAGON_SERVER_PATH", raising=False) + monkeypatch.delenv("SMARTSIM_DRAGON_SERVER_PATH_EXP", raising=False) + exp: Experiment = Experiment( + "test_dragon_connection", + exp_path=test_dir, + launcher=wlmutils.get_test_launcher(), + ) + rs = exp.create_run_settings(exe="sleep", exe_args=["1"]) + model = exp.create_model("sleep", run_settings=rs) + + exp.generate(model) + exp.start(model, block=True) + try: + assert exp.get_status(model)[0] == SmartSimStatus.STATUS_COMPLETED + finally: + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() + + +def test_dragon_cannot_honor(wlmutils, test_dir): + exp: Experiment = Experiment( + "test_dragon_cannot_honor", + exp_path=test_dir, + launcher=wlmutils.get_test_launcher(), + ) + rs = exp.create_run_settings(exe="sleep", exe_args=["1"]) + rs.set_nodes(100) + model = exp.create_model("sleep", run_settings=rs) + + exp.generate(model) + exp.start(model, block=True) + + try: + assert 
exp.get_status(model)[0] == SmartSimStatus.STATUS_FAILED + finally: + launcher: DragonLauncher = exp._control._launcher + launcher.cleanup() diff --git a/tests/on_wlm/test_dragon_entrypoint.py b/tests/on_wlm/test_dragon_entrypoint.py new file mode 100644 index 000000000..025b5692f --- /dev/null +++ b/tests/on_wlm/test_dragon_entrypoint.py @@ -0,0 +1,295 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os +import pathlib +import typing as t + +import pytest + +# retrieved from pytest fixtures +if pytest.test_launcher != "dragon": + pytestmark = pytest.mark.skip(reason="Test is only for Dragon WLM systems") + +try: + import smartsim._core.entrypoints.dragon as drg +except: + pytest.skip("Unable to import Dragon library", allow_module_level=True) + + +@pytest.fixture +def mock_argv() -> t.List[str]: + """Fixture for returning valid arguments to the entrypoint""" + return ["+launching_address", "mock-addr", "+interface", "mock-interface"] + + +def test_file_removal(test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Verify that the log file is removed when expected""" + mock_file_name = "mocked_file_name.txt" + expected_path = pathlib.Path(test_dir) / mock_file_name + expected_path.touch() + + with monkeypatch.context() as ctx: + # ensure we get outputs in the test directory + ctx.setattr( + "smartsim._core.entrypoints.dragon.get_log_path", lambda: str(expected_path) + ) + + drg.remove_config_log() + assert not expected_path.exists(), "Dragon config file was not removed" + + +def test_file_removal_on_bad_path(test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Verify that file removal doesn't blow up if the log file wasn't created""" + mock_file_name = "mocked_file_name.txt" + expected_path = pathlib.Path(test_dir) / mock_file_name + + with monkeypatch.context() as ctx: + # ensure we get outputs in the test directory + ctx.setattr( + "smartsim._core.entrypoints.dragon.get_log_path", lambda: str(expected_path) + ) + + # confirm the file doesn't exist... 
+ assert not expected_path.exists(), "Dragon config file was not found" + + try: + # ensure we don't blow up + drg.remove_config_log() + except: + assert False + + +def test_dragon_failure( + mock_argv: t.List[str], test_dir: str, monkeypatch: pytest.MonkeyPatch +): + """Verify that the expected cleanup actions are taken when the dragon + entrypoint exits""" + mock_file_name = "mocked_file_name.txt" + expected_path = pathlib.Path(test_dir) / mock_file_name + expected_path.touch() + + with monkeypatch.context() as ctx: + # ensure we get outputs in the test directory + ctx.setattr( + "smartsim._core.entrypoints.dragon.get_log_path", lambda: str(expected_path) + ) + + def raiser(args_) -> int: + raise Exception("Something bad...") + + # we don't need to execute the entrypoint... + ctx.setattr("smartsim._core.entrypoints.dragon.execute_entrypoint", raiser) + + return_code = drg.main(mock_argv) + + # ensure our exception error code is returned + assert return_code == -1 + + +def test_dragon_main( + mock_argv: t.List[str], test_dir: str, monkeypatch: pytest.MonkeyPatch +): + """Verify that the expected startup & cleanup actions are taken when the dragon + entrypoint exits""" + mock_file_name = "mocked_file_name.txt" + expected_path = pathlib.Path(test_dir) / mock_file_name + expected_path.touch() + + with monkeypatch.context() as ctx: + # ensure we get outputs in the test directory + ctx.setattr( + "smartsim._core.entrypoints.dragon.get_log_path", lambda: str(expected_path) + ) + # we don't need to execute the actual entrypoint... + ctx.setattr( + "smartsim._core.entrypoints.dragon.execute_entrypoint", lambda args_: 0 + ) + + return_code = drg.main(mock_argv) + + # execute_entrypoint should return 0 from our mock + assert return_code == 0 + # the cleanup should remove our config file + assert not expected_path.exists(), "Dragon config file was not removed!" 
+ # the environment should be set as expected + assert os.environ.get("PYTHONUNBUFFERED", None) == "1" + + +@pytest.mark.parametrize( + "signal_num", + [ + pytest.param(0, id="non-truthy signal"), + pytest.param(-1, id="negative signal"), + pytest.param(1, id="positive signal"), + ], +) +def test_signal_handler(signal_num: int, monkeypatch: pytest.MonkeyPatch): + """Verify that the signal handler performs expected actions""" + counter: int = 0 + + def increment_counter(*args, **kwargs): + nonlocal counter + counter += 1 + + with monkeypatch.context() as ctx: + ctx.setattr("smartsim._core.entrypoints.dragon.cleanup", increment_counter) + ctx.setattr("smartsim._core.entrypoints.dragon.logger.info", increment_counter) + + drg.handle_signal(signal_num, None) + + # show that we log informational message & do cleanup (take 2 actions) + assert counter == 2 + + +def test_log_path(monkeypatch: pytest.MonkeyPatch): + """Verify that the log path is loaded & returned as expected""" + + with monkeypatch.context() as ctx: + expected_filename = "foo.log" + ctx.setattr( + "smartsim._core.config.config.Config.dragon_log_filename", expected_filename + ) + + log_path = drg.get_log_path() + + assert expected_filename in log_path + + +def test_summary(test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Verify that the summary is written to expected location w/expected information""" + + with monkeypatch.context() as ctx: + expected_ip = "127.0.0.111" + expected_interface = "mock_int0" + summary_file = pathlib.Path(test_dir) / "foo.log" + expected_hostname = "mockhostname" + + ctx.setattr( + "smartsim._core.config.config.Config.dragon_log_filename", + str(summary_file), + ) + ctx.setattr( + "smartsim._core.entrypoints.dragon.socket.gethostname", + lambda: expected_hostname, + ) + + drg.print_summary(expected_interface, expected_ip) + + summary = summary_file.read_text() + + assert expected_ip in summary + assert expected_interface in summary + assert expected_hostname in summary + + 
+def test_cleanup(monkeypatch: pytest.MonkeyPatch): + """Verify that the cleanup function attempts to remove the log file""" + counter: int = 0 + + def increment_counter(*args, **kwargs): + nonlocal counter + counter += 1 + + with monkeypatch.context() as ctx: + ctx.setattr( + "smartsim._core.entrypoints.dragon.remove_config_log", increment_counter + ) + drg.SHUTDOWN_INITIATED = False + drg.cleanup() + + # show that cleanup removes config + assert counter == 1 + # show that cleanup alters the flag to enable shutdown + assert drg.SHUTDOWN_INITIATED + + +def test_signal_handler_registration(test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Verify that signal handlers are registered for all expected signals""" + sig_nums: t.List[int] = [] + + def track_args(*args, **kwargs): + nonlocal sig_nums + sig_nums.append(args[0]) + + with monkeypatch.context() as ctx: + ctx.setattr("smartsim._core.entrypoints.dragon.signal.signal", track_args) + + # ensure valid start point + assert not sig_nums + + drg.register_signal_handlers() + + # ensure all expected handlers are registered + assert set(sig_nums) == set(drg.SIGNALS) + + +def test_arg_parser__no_args(): + """Verify arg parser fails when no args are not supplied""" + args_list = [] + + with pytest.raises(SystemExit) as ex: + # ensure that parser complains about missing required arguments + drg.parse_arguments(args_list) + + +def test_arg_parser__invalid_launch_addr(): + """Verify arg parser fails with empty launch_address""" + addr_flag = "+launching_address" + addr_value = "" + + args_list = [addr_flag, addr_value] + + with pytest.raises(ValueError) as ex: + args = drg.parse_arguments(args_list) + + +def test_arg_parser__required_only(): + """Verify arg parser succeeds when optional args are omitted""" + addr_flag = "+launching_address" + addr_value = "mock-address" + + args_list = [addr_flag, addr_value] + + args = drg.parse_arguments(args_list) + + assert args.launching_address == addr_value + assert not 
args.interface + + +def test_arg_parser__with_optionals(): + """Verify arg parser succeeds when optional args are included""" + addr_flag = "+launching_address" + addr_value = "mock-address" + + interface_flag = "+interface" + interface_value = "mock-int" + + args_list = [interface_flag, interface_value, addr_flag, addr_value] + + args = drg.parse_arguments(args_list) + + assert args.launching_address == addr_value + assert args.interface == interface_value diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index 6cf1c3918..cacdd5be5 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -26,7 +26,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -49,19 +50,18 @@ def test_launch_orc_auto(test_dir, wlmutils): single_cmd=False, hosts=wlmutils.get_test_hostlist(), ) - orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) # don't use assert so that we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) def test_launch_cluster_orc_single(test_dir, wlmutils): @@ -82,19 +82,18 @@ def test_launch_cluster_orc_single(test_dir, wlmutils): single_cmd=True, hosts=wlmutils.get_test_hostlist(), ) - orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == 
status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) def test_launch_cluster_orc_multi(test_dir, wlmutils): @@ -115,16 +114,15 @@ def test_launch_cluster_orc_multi(test_dir, wlmutils): single_cmd=False, hosts=wlmutils.get_test_hostlist(), ) - orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) # don't use assert so that orc we don't leave an orphan process - if status.STATUS_FAILED in statuses: + if SmartSimStatus.STATUS_FAILED in statuses: exp.stop(orc) assert False exp.stop(orc) statuses = exp.get_status(orc) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) diff --git a/tests/on_wlm/test_het_job.py b/tests/on_wlm/test_het_job.py index 5a039a7c9..aeea7b474 100644 --- a/tests/on_wlm/test_het_job.py +++ b/tests/on_wlm/test_het_job.py @@ -34,10 +34,10 @@ pytestmark = pytest.mark.skip(reason="Test is only for Slurm WLM systems") -def test_mpmd_errors(monkeypatch): +def test_mpmd_errors(monkeypatch, test_dir): monkeypatch.setenv("SLURM_HET_SIZE", "1") exp_name = "test-het-job-errors" - exp = Experiment(exp_name, launcher="slurm") + exp = Experiment(exp_name, exp_path=test_dir, launcher="slurm") rs: SrunSettings = exp.create_run_settings("sleep", "1", run_command="srun") rs2: SrunSettings = exp.create_run_settings("sleep", "1", run_command="srun") with pytest.raises(ValueError): @@ -49,11 +49,11 @@ def test_mpmd_errors(monkeypatch): rs.set_het_group(1) -def test_set_het_groups(monkeypatch): +def test_set_het_groups(monkeypatch, test_dir): """Test ability to set one or more het groups to run setting""" monkeypatch.setenv("SLURM_HET_SIZE", "4") exp_name = "test-set-het-group" - exp = Experiment(exp_name, launcher="slurm") + exp = Experiment(exp_name, exp_path=test_dir, launcher="slurm") rs: SrunSettings = exp.create_run_settings("sleep", "1", run_command="srun") 
rs.set_het_group([1]) assert rs.run_args["het-group"] == "1" @@ -63,11 +63,11 @@ def test_set_het_groups(monkeypatch): rs.set_het_group([4]) -def test_orch_single_cmd(monkeypatch, wlmutils): +def test_orch_single_cmd(monkeypatch, wlmutils, test_dir): """Test that single cmd is rejected in a heterogeneous job""" monkeypatch.setenv("SLURM_HET_SIZE", "1") exp_name = "test-orch-single-cmd" - exp = Experiment(exp_name, launcher="slurm") + exp = Experiment(exp_name, launcher="slurm", exp_path=test_dir) orc = exp.create_database( wlmutils.get_test_port(), db_nodes=3, diff --git a/tests/on_wlm/test_launch_errors.py b/tests/on_wlm/test_launch_errors.py index 905d96f54..2498a5a91 100644 --- a/tests/on_wlm/test_launch_errors.py +++ b/tests/on_wlm/test_launch_errors.py @@ -28,8 +28,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.error import SmartSimError +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -54,7 +55,7 @@ def test_failed_status(fileutils, test_dir, wlmutils): time.sleep(2) stat = exp.get_status(model) assert len(stat) == 1 - assert stat[0] == status.STATUS_FAILED + assert stat[0] == SmartSimStatus.STATUS_FAILED def test_bad_run_command_args(fileutils, test_dir, wlmutils): diff --git a/tests/on_wlm/test_launch_ompi_lsf.py b/tests/on_wlm/test_launch_ompi_lsf.py index ed5de291b..51c82e418 100644 --- a/tests/on_wlm/test_launch_ompi_lsf.py +++ b/tests/on_wlm/test_launch_ompi_lsf.py @@ -26,7 +26,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -49,4 +50,4 @@ def test_launch_openmpi_lsf(fileutils, test_dir, wlmutils): model = exp.create_model("ompi-model", path=test_dir, run_settings=settings) exp.start(model, block=True) statuses = exp.get_status(model) - 
assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/on_wlm/test_local_step.py b/tests/on_wlm/test_local_step.py index 4e5f45e0b..8f7d823b8 100644 --- a/tests/on_wlm/test_local_step.py +++ b/tests/on_wlm/test_local_step.py @@ -29,7 +29,7 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.settings import RunSettings # retrieved from pytest fixtures diff --git a/tests/on_wlm/test_preview_wlm.py b/tests/on_wlm/test_preview_wlm.py new file mode 100644 index 000000000..78da30c9a --- /dev/null +++ b/tests/on_wlm/test_preview_wlm.py @@ -0,0 +1,409 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from os import path as osp + +import numpy as np +import pytest +from jinja2.filters import FILTERS + +from smartsim import Experiment +from smartsim._core import Manifest, previewrenderer +from smartsim._core.config import CONFIG +from smartsim.database import Orchestrator +from smartsim.settings import QsubBatchSettings, RunSettings + +pytestmark = pytest.mark.slow_tests + +on_wlm = (pytest.test_launcher in pytest.wlm_options,) + + +@pytest.fixture +def choose_host(): + def _choose_host(wlmutils, index: int = 0): + hosts = wlmutils.get_test_hostlist() + if hosts: + return hosts[index] + return None + + return _choose_host + + +def add_batch_resources(wlmutils, batch_settings): + if isinstance(batch_settings, QsubBatchSettings): + for key, value in wlmutils.get_batch_resources().items(): + batch_settings.set_resource(key, value) + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_wlm_run_commands_cluster_orc_model( + test_dir, coloutils, fileutils, wlmutils +): + """ + Test preview of wlm run command and run aruguments on a + orchestrator and model + """ + + exp_name = "test-preview-orc-model" + launcher = wlmutils.get_test_launcher() + test_port = wlmutils.get_test_port() + test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + + network_interface = 
wlmutils.get_test_interface() + orc = exp.create_database( + wlmutils.get_test_port(), + db_nodes=3, + batch=False, + interface=network_interface, + single_cmd=True, + hosts=wlmutils.get_test_hostlist(), + db_identifier="testdb_reg", + ) + + db_args = { + "port": test_port, + "db_cpus": 1, + "debug": True, + "db_identifier": "testdb_colo", + } + + # Create model with colocated database + smartsim_model = coloutils.setup_test_colo( + fileutils, "uds", exp, test_script, db_args, on_wlm=on_wlm + ) + + preview_manifest = Manifest(orc, smartsim_model) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + if pytest.test_launcher != "dragon": + assert "Run Command" in output + assert "ntasks" in output + assert "Run Arguments" in output + assert "nodes" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_model_on_wlm(fileutils, test_dir, wlmutils): + """ + Test preview of wlm run command and run aruguments for a model + """ + exp_name = "test-preview-model-wlm" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + + script = fileutils.get_test_conf_path("sleep.py") + settings1 = wlmutils.get_base_run_settings("python", f"{script} --time=5") + settings2 = wlmutils.get_base_run_settings("python", f"{script} --time=5") + M1 = exp.create_model("m1", path=test_dir, run_settings=settings1) + M2 = exp.create_model("m2", path=test_dir, run_settings=settings2) + + preview_manifest = Manifest(M1, M2) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + if pytest.test_launcher != "dragon": + assert "Run Command" in output + assert "ntasks" in output + assert "time" in output + assert "nodes" in output + assert "Run Arguments" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + 
reason="Not testing WLM integrations", +) +def test_preview_batch_model(fileutils, test_dir, wlmutils): + """Test the preview of a model with batch settings""" + + exp_name = "test-batch-model" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + + script = fileutils.get_test_conf_path("sleep.py") + batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") + + batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, batch_settings) + run_settings = wlmutils.get_run_settings("python", f"{script} --time=5") + model = exp.create_model( + "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings + ) + model.set_path(test_dir) + + preview_manifest = Manifest(model) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + assert "Batch Launch: True" in output + assert "Batch Command" in output + assert "Batch Arguments" in output + assert "nodes" in output + assert "time" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_batch_ensemble(fileutils, test_dir, wlmutils): + """Test preview of a batch ensemble""" + + exp_name = "test-preview-batch-ensemble" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + + script = fileutils.get_test_conf_path("sleep.py") + settings = wlmutils.get_run_settings("python", f"{script} --time=5") + M1 = exp.create_model("m1", path=test_dir, run_settings=settings) + M2 = exp.create_model("m2", path=test_dir, run_settings=settings) + + batch = exp.create_batch_settings(nodes=1, time="00:01:00") + add_batch_resources(wlmutils, batch) + + batch.set_account(wlmutils.get_test_account()) + ensemble = exp.create_ensemble("batch-ens", batch_settings=batch) + ensemble.add_model(M1) + ensemble.add_model(M2) + ensemble.set_path(test_dir) + + preview_manifest = 
Manifest(ensemble) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + assert "Batch Launch: True" in output + assert "Batch Command" in output + assert "Batch Arguments" in output + assert "nodes" in output + assert "time" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_launch_command(test_dir, wlmutils, choose_host): + """Test preview launch command for orchestrator, models, and + ensembles""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + exp_name = "test_preview_launch_command" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # create regular database + orc = exp.create_database( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + ) + + model_params = {"port": 6379, "password": "unbreakable_password"} + rs1 = RunSettings("bash", "multi_tags_template.sh") + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + hello_world_model = exp.create_model( + "echo-hello", run_settings=rs1, params=model_params + ) + + spam_eggs_model = exp.create_model("echo-spam", run_settings=rs2) + + # setup ensemble parameter space + learning_rate = list(np.linspace(0.01, 0.5)) + train_params = {"LR": learning_rate} + + run = exp.create_run_settings(exe="python", exe_args="./train-model.py") + + ensemble = exp.create_ensemble( + "Training-Ensemble", + params=train_params, + params_as_args=["LR"], + run_settings=run, + perm_strategy="random", + n_models=4, + ) + + preview_manifest = Manifest(orc, spam_eggs_model, hello_world_model, ensemble) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + assert "orchestrator" in output + assert "echo-spam" in output + assert "echo-hello" in output + + assert 
"Training-Ensemble" in output + assert "me: Training-Ensemble_0" in output + assert "Training-Ensemble_1" in output + assert "Training-Ensemble_2" in output + assert "Training-Ensemble_3" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_batch_launch_command(fileutils, test_dir, wlmutils): + """Test the preview of a model with batch settings""" + + exp_name = "test-batch-entities" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + + script = fileutils.get_test_conf_path("sleep.py") + batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") + + batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, batch_settings) + run_settings = wlmutils.get_run_settings("python", f"{script} --time=5") + model = exp.create_model( + "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings + ) + model.set_path(test_dir) + + orc = Orchestrator( + wlmutils.get_test_port(), + db_nodes=3, + batch=True, + interface="lo", + launcher="slurm", + run_command="srun", + ) + orc.set_batch_arg("account", "ACCOUNT") + + preview_manifest = Manifest(orc, model) + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Batch Launch: True" in output + assert "Batch Command" in output + assert "Batch Arguments" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_ensemble_batch(test_dir, wlmutils): + """ + Test preview of client configuration and key prefixing in Ensemble preview + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp = Experiment( + "test-preview-ensemble-clientconfig", exp_path=test_dir, launcher=test_launcher + ) + # Create Orchestrator + db = exp.create_database(port=6780, interface="lo") 
+ exp.generate(db, overwrite=True) + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + # Create ensemble + batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") + batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, batch_settings) + ensemble = exp.create_ensemble( + "fd_simulation", run_settings=rs1, batch_settings=batch_settings, replicas=2 + ) + # enable key prefixing on ensemble + ensemble.enable_key_prefixing() + exp.generate(ensemble, overwrite=True) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + # Create model + ml_model = exp.create_model("tf_training", rs2) + + for sim in ensemble.entities: + ml_model.register_incoming_entity(sim) + + exp.generate(ml_model, overwrite=True) + + preview_manifest = Manifest(db, ml_model, ensemble) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Client Configuration" in output + assert "Database Identifier" in output + assert "Database Backend" in output + assert "Type" in output + + +@pytest.mark.skipif( + pytest.test_launcher not in pytest.wlm_options, + reason="Not testing WLM integrations", +) +def test_preview_ensemble_db_script(wlmutils, test_dir): + """ + Test preview of a torch script on a model in an ensemble. 
+ """ + # Initialize the Experiment and set the launcher to auto + test_launcher = wlmutils.get_test_launcher() + exp = Experiment("getting-started", launcher=test_launcher) + + orch = exp.create_database(db_identifier="test_db1") + orch_2 = exp.create_database(db_identifier="test_db2", db_nodes=3) + # Initialize a RunSettings object + model_settings = exp.create_run_settings(exe="python", exe_args="params.py") + model_settings_2 = exp.create_run_settings(exe="python", exe_args="params.py") + model_settings_3 = exp.create_run_settings(exe="python", exe_args="params.py") + # Initialize a Model object + model_instance = exp.create_model("model_name", model_settings) + model_instance_2 = exp.create_model("model_name_2", model_settings_2) + batch = exp.create_batch_settings(time="24:00:00", account="test") + ensemble = exp.create_ensemble( + "ensemble", batch_settings=batch, run_settings=model_settings_3, replicas=2 + ) + ensemble.add_model(model_instance) + ensemble.add_model(model_instance_2) + + # TorchScript string + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + # Attach TorchScript to Model + model_instance.add_script( + name="example_script", + script=torch_script_str, + device="GPU", + devices_per_node=2, + first_device=0, + ) + preview_manifest = Manifest(ensemble, orch, orch_2) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Torch Script" in output diff --git a/tests/on_wlm/test_restart.py b/tests/on_wlm/test_restart.py index 42bbe752c..0116c10d3 100644 --- a/tests/on_wlm/test_restart.py +++ b/tests/on_wlm/test_restart.py @@ -28,7 +28,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -48,10 +49,10 @@ def test_restart(fileutils, test_dir, wlmutils): 
exp.start(M1, M2, block=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) exp.start(M1, M2, block=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) # TODO add job history check here. diff --git a/tests/on_wlm/test_simple_base_settings_on_wlm.py b/tests/on_wlm/test_simple_base_settings_on_wlm.py index 1611781eb..caa55da3e 100644 --- a/tests/on_wlm/test_simple_base_settings_on_wlm.py +++ b/tests/on_wlm/test_simple_base_settings_on_wlm.py @@ -28,8 +28,9 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.settings.settings import RunSettings +from smartsim.status import SmartSimStatus """ Test the launch and stop of simple models and ensembles that use base @@ -63,7 +64,7 @@ def test_simple_model_on_wlm(fileutils, test_dir, wlmutils): # launch model twice to show that it can also be restarted for _ in range(2): exp.start(M, block=True) - assert exp.get_status(M)[0] == status.STATUS_COMPLETED + assert exp.get_status(M)[0] == SmartSimStatus.STATUS_COMPLETED def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): @@ -83,4 +84,4 @@ def test_simple_model_stop_on_wlm(fileutils, test_dir, wlmutils): time.sleep(2) exp.stop(M) assert M.name in exp._control._jobs.completed - assert exp.get_status(M)[0] == status.STATUS_CANCELLED + assert exp.get_status(M)[0] == SmartSimStatus.STATUS_CANCELLED diff --git a/tests/on_wlm/test_simple_entity_launch.py b/tests/on_wlm/test_simple_entity_launch.py index 1ecc27442..28ddf92f7 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -24,11 +24,14 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED 
OF THE POSSIBILITY OF SUCH DAMAGE. +import os.path from copy import deepcopy +from pathlib import Path import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus """ Test the launch of simple entity types on pre-existing allocations. @@ -59,7 +62,38 @@ def test_models(fileutils, test_dir, wlmutils): exp.start(M1, M2, block=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) + + +def test_multinode_app(mpi_app_path, test_dir, wlmutils): + + if not mpi_app_path: + pytest.skip("Test needs MPI to run") + + exp_name = "test-mpi-app" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + + settings = exp.create_run_settings(str(mpi_app_path), []) + settings.set_nodes(3) + + model = exp.create_model("mpi_app", run_settings=settings) + exp.generate(model) + + exp.start(model, block=True) + + p = Path(model.path) + output_files = sorted([str(path) for path in p.glob("mpi_hello*")]) + expected_files = sorted( + [os.path.join(model.path, f"mpi_hello.{idx}.log") for idx in range(3)] + ) + + assert output_files == expected_files + + for index, file in enumerate(output_files): + with open(file) as f: + assert f.readlines() == [ + f"Hello world from rank {index} out of 3 processors\n" + ] def test_ensemble(fileutils, test_dir, wlmutils): @@ -71,11 +105,10 @@ def test_ensemble(fileutils, test_dir, wlmutils): settings.set_tasks(1) ensemble = exp.create_ensemble("e1", run_settings=settings, replicas=2) - ensemble.set_path(test_dir) exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_summary(fileutils, test_dir, wlmutils): @@ -84,21 +117,21 @@ def test_summary(fileutils, 
test_dir, wlmutils): exp_name = "test-launch-summary" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) - sleep = fileutils.get_test_conf_path("sleep.py") + sleep_exp = fileutils.get_test_conf_path("sleep.py") bad = fileutils.get_test_conf_path("bad.py") - sleep_settings = exp.create_run_settings("python", f"{sleep} --time=3") + sleep_settings = exp.create_run_settings("python", f"{sleep_exp} --time=3") sleep_settings.set_tasks(1) bad_settings = exp.create_run_settings("python", f"{bad} --time=6") bad_settings.set_tasks(1) - sleep = exp.create_model("sleep", path=test_dir, run_settings=sleep_settings) + sleep_exp = exp.create_model("sleep", path=test_dir, run_settings=sleep_settings) bad = exp.create_model("bad", path=test_dir, run_settings=bad_settings) # start and poll - exp.start(sleep, bad) - assert exp.get_status(bad)[0] == status.STATUS_FAILED - assert exp.get_status(sleep)[0] == status.STATUS_COMPLETED + exp.start(sleep_exp, bad) + assert exp.get_status(bad)[0] == SmartSimStatus.STATUS_FAILED + assert exp.get_status(sleep_exp)[0] == SmartSimStatus.STATUS_COMPLETED summary_str = exp.summary(style="plain") print(summary_str) @@ -106,13 +139,18 @@ def test_summary(fileutils, test_dir, wlmutils): rows = [s.split() for s in summary_str.split("\n")] headers = ["Index"] + rows.pop(0) + # There is no guarantee that the order of + # the rows will be sleep, bad row = dict(zip(headers, rows[0])) - assert sleep.name == row["Name"] - assert sleep.type == row["Entity-Type"] + row_1 = dict(zip(headers, rows[1])) + if row["Name"] != sleep_exp.name: + row_1, row = row, row_1 + + assert sleep_exp.name == row["Name"] + assert sleep_exp.type == row["Entity-Type"] assert 0 == int(row["RunID"]) assert 0 == int(row["Returncode"]) - row_1 = dict(zip(headers, rows[1])) assert bad.name == row_1["Name"] assert bad.type == row_1["Entity-Type"] assert 0 == int(row_1["RunID"]) diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index 
8d75d9f65..abc7441bb 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -28,7 +28,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus """ Test Stopping launched entities. @@ -55,7 +56,7 @@ def test_stop_entity(fileutils, test_dir, wlmutils): time.sleep(5) exp.stop(M1) assert M1.name in exp._control._jobs.completed - assert exp.get_status(M1)[0] == status.STATUS_CANCELLED + assert exp.get_status(M1)[0] == SmartSimStatus.STATUS_CANCELLED def test_stop_entity_list(fileutils, test_dir, wlmutils): @@ -67,11 +68,10 @@ def test_stop_entity_list(fileutils, test_dir, wlmutils): settings.set_tasks(1) ensemble = exp.create_ensemble("e1", run_settings=settings, replicas=2) - ensemble.set_path(test_dir) exp.start(ensemble, block=False) time.sleep(5) exp.stop(ensemble) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_CANCELLED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) assert all([m.name in exp._control._jobs.completed for m in ensemble]) diff --git a/tests/on_wlm/test_wlm_orc_config_settings.py b/tests/on_wlm/test_wlm_orc_config_settings.py index f9ab60609..c74f2a497 100644 --- a/tests/on_wlm/test_wlm_orc_config_settings.py +++ b/tests/on_wlm/test_wlm_orc_config_settings.py @@ -27,6 +27,9 @@ import pytest from smartsim.error import SmartSimError +from smartsim.log import get_logger + +logger = get_logger(__name__) # retrieved from pytest fixtures if pytest.test_launcher not in pytest.wlm_options: @@ -40,13 +43,15 @@ pytestmark = pytest.mark.skip(reason="SmartRedis version is < 0.3.1") -def test_config_methods_on_wlm_single(dbutils, db): +def test_config_methods_on_wlm_single(dbutils, prepare_db, single_db): """Test all configuration file edit methods on single node WLM db""" + db = prepare_db(single_db).orchestrator # test the happy path and ensure all configuration file edit methods # 
successfully execute when given correct key-value pairs configs = dbutils.get_db_configs() for setting, value in configs.items(): + logger.debug(f"Setting {setting}={value}") config_set_method = dbutils.get_config_edit_method(db, setting) config_set_method(value) @@ -67,14 +72,16 @@ def test_config_methods_on_wlm_single(dbutils, db): db.set_db_conf(key, value) -def test_config_methods_on_wlm_cluster(dbutils, db_cluster): +def test_config_methods_on_wlm_cluster(dbutils, prepare_db, clustered_db): """Test all configuration file edit methods on an active clustered db""" + db = prepare_db(clustered_db).orchestrator # test the happy path and ensure all configuration file edit methods # successfully execute when given correct key-value pairs configs = dbutils.get_db_configs() for setting, value in configs.items(): - config_set_method = dbutils.get_config_edit_method(db_cluster, setting) + logger.debug(f"Setting {setting}={value}") + config_set_method = dbutils.get_config_edit_method(db, setting) config_set_method(value) # ensure SmartSimError is raised when a clustered database's @@ -83,7 +90,8 @@ def test_config_methods_on_wlm_cluster(dbutils, db_cluster): for key, value_list in ss_error_configs.items(): for value in value_list: with pytest.raises(SmartSimError): - db_cluster.set_db_conf(key, value) + logger.debug(f"Setting {key}={value}") + db.set_db_conf(key, value) # ensure TypeError is raised when a clustered database's # Orchestrator.set_db_conf is given invalid CONFIG key-value pairs @@ -91,4 +99,5 @@ def test_config_methods_on_wlm_cluster(dbutils, db_cluster): for key, value_list in type_error_configs.items(): for value in value_list: with pytest.raises(TypeError): - db_cluster.set_db_conf(key, value) + logger.debug(f"Setting {key}={value}") + db.set_db_conf(key, value) diff --git a/tests/test_collector_manager.py b/tests/test_collector_manager.py new file mode 100644 index 000000000..56add1ef7 --- /dev/null +++ b/tests/test_collector_manager.py @@ -0,0 +1,481 @@ 
+# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import asyncio +import datetime + +import pytest + +from conftest import MockCollectorEntityFunc +from smartsim._core.utils.telemetry.collector import ( + CollectorManager, + DBConnectionCollector, + DBConnectionCountCollector, + DBMemoryCollector, + FileSink, + redisa, +) +from smartsim._core.utils.telemetry.telemetry import JobEntity + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +def test_collector_manager_add(mock_entity: MockCollectorEntityFunc, mock_sink) -> None: + """Ensure that collector manager add & clear work as expected""" + entity1 = mock_entity(telemetry_on=True) + + con_col = DBConnectionCollector(entity1, mock_sink()) + mem_col = DBMemoryCollector(entity1, mock_sink()) + + manager = CollectorManager() + + # ensure manager starts empty + assert len(list(manager.all_collectors)) == 0 + + # ensure added item is in the collector list + manager.add(con_col) + assert len(list(manager.all_collectors)) == 1 + + # ensure a duplicate isn't added + manager.add(con_col) + assert len(list(manager.all_collectors)) == 1 + + # ensure another collector for the same entity is added + manager.add(mem_col) + assert len(list(manager.all_collectors)) == 2 + + # create a collector for another entity + entity2 = mock_entity(telemetry_on=True) + con_col2 = DBConnectionCollector(entity2, mock_sink()) + + # ensure collectors w/same type for new entities are not treated as dupes + manager.add(con_col2) + assert len(list(manager.all_collectors)) == 3 + + # verify no dupe on second entity + manager.add(con_col2) + assert len(list(manager.all_collectors)) == 3 + + manager.clear() + assert len(list(manager.all_collectors)) == 0 + + # ensure post-clear adding still works + manager.add(con_col2) + assert len(list(manager.all_collectors)) == 1 + + +def test_collector_manager_add_multi( + mock_entity: MockCollectorEntityFunc, mock_sink +) -> None: + """Ensure that collector manager multi-add works as expected""" + entity = 
mock_entity(telemetry_on=True) + + con_col = DBConnectionCollector(entity, mock_sink()) + mem_col = DBMemoryCollector(entity, mock_sink()) + manager = CollectorManager() + + # add multiple items at once + manager.add_all([con_col, mem_col]) + + assert len(list(manager.all_collectors)) == 2 + + # ensure multi-add does not produce dupes + con_col2 = DBConnectionCollector(entity, mock_sink()) + mem_col2 = DBMemoryCollector(entity, mock_sink()) + + manager.add_all([con_col2, mem_col2]) + assert len(list(manager.all_collectors)) == 2 + + +@pytest.mark.asyncio +async def test_collector_manager_remove( + mock_entity: MockCollectorEntityFunc, mock_sink +) -> None: + """Ensure that collector manager solo remove works as expected""" + entity1 = mock_entity(telemetry_on=True) + entity2 = mock_entity(telemetry_on=True) + + con_col1 = DBConnectionCollector(entity1, mock_sink()) + mem_col1 = DBMemoryCollector(entity1, mock_sink()) + manager = CollectorManager() + + # ensure multi-add does not produce dupes + con_col2 = DBConnectionCollector(entity2, mock_sink()) + mem_col2 = DBMemoryCollector(entity2, mock_sink()) + + manager.add_all([con_col1, mem_col1, con_col2, mem_col2]) + assert len(manager.all_collectors) == 4 + + await manager.remove(entity1) + assert len(manager.all_collectors) == 2 + + await manager.remove(entity1) + assert len(manager.all_collectors) == 2 + + await manager.remove(entity2) + assert len(manager.all_collectors) == 0 + + +@pytest.mark.asyncio +async def test_collector_manager_remove_all( + mock_entity: MockCollectorEntityFunc, mock_sink +) -> None: + """Ensure that collector manager multi-remove works as expected""" + entity1 = mock_entity(telemetry_on=True) + entity2 = mock_entity(telemetry_on=True) + + con_col1 = DBConnectionCollector(entity1, mock_sink()) + mem_col1 = DBMemoryCollector(entity1, mock_sink()) + manager = CollectorManager() + + # ensure multi-add does not produce dupes + con_col2 = DBConnectionCollector(entity2, mock_sink()) + mem_col2 = 
DBMemoryCollector(entity2, mock_sink()) + + manager.add_all([con_col1, mem_col1, con_col2, mem_col2]) + assert len(manager.all_collectors) == 4 + + await manager.remove_all([entity1, entity2]) + assert len(manager.all_collectors) == 0 + + +@pytest.mark.asyncio +async def test_collector_manager_collect( + mock_entity: MockCollectorEntityFunc, + mock_redis, + monkeypatch: pytest.MonkeyPatch, + mock_con, + mock_mem, + mock_sink, +) -> None: + """Ensure that all collectors are executed and some metric is retrieved + NOTE: responses & producer are mocked""" + entity1 = mock_entity(port=1234, name="entity1", telemetry_on=True) + entity2 = mock_entity(port=2345, name="entity2", telemetry_on=True) + + sinks = [mock_sink(), mock_sink(), mock_sink()] + con_col1 = DBConnectionCollector(entity1, sinks[0]) + mem_col1 = DBMemoryCollector(entity1, sinks[1]) + mem_col2 = DBMemoryCollector(entity2, sinks[2]) + + manager = CollectorManager() + manager.add_all([con_col1, mem_col1, mem_col2]) + + # Execute collection + with monkeypatch.context() as ctx: + ctx.setattr( + redisa, + "Redis", + mock_redis(client_stats=mock_con(1, 10), mem_stats=mock_mem(1, 10)), + ) + await manager.collect() + + # verify each collector retrieved some metric & sent it to the sink + for sink in sinks: + value = sink.args + assert value + + +@pytest.mark.asyncio +async def test_collector_manager_collect_filesink( + mock_entity: MockCollectorEntityFunc, + mock_redis, + monkeypatch, + mock_mem, + mock_con, +) -> None: + """Ensure that all collectors are executed and some metric is retrieved + and the FileSink is written to as expected""" + entity1 = mock_entity(port=1234, name="entity1", telemetry_on=True) + entity2 = mock_entity(port=2345, name="entity2", telemetry_on=True) + + sinks = [ + FileSink(entity1.status_dir + "/1_con.csv"), + FileSink(entity1.status_dir + "/1_mem.csv"), + FileSink(entity2.status_dir + "/2_mem.csv"), + ] + con_col1 = DBConnectionCollector(entity1, sinks[0]) + mem_col1 = 
DBMemoryCollector(entity1, sinks[1]) + mem_col2 = DBMemoryCollector(entity2, sinks[2]) + + manager = CollectorManager() + manager.add_all([con_col1, mem_col1, mem_col2]) + + # Execute collection + with monkeypatch.context() as ctx: + ctx.setattr( + redisa, + "Redis", + mock_redis(client_stats=mock_con(1, 10), mem_stats=mock_mem(1, 10)), + ) + await manager.collect() + + # verify each collector retrieved some metric & sent it to the sink + for sink in sinks: + save_to = sink.path + assert save_to.exists() + if "con" in str(save_to): + assert "127.0.0." in save_to.read_text() + else: + # look for something multiplied by 1000 + assert "000" in save_to.read_text() + + +@pytest.mark.asyncio +async def test_collector_manager_collect_integration( + test_dir: str, mock_entity: MockCollectorEntityFunc, prepare_db, local_db, mock_sink +) -> None: + """Ensure that all collectors are executed and some metric is retrieved""" + + db = prepare_db(local_db).orchestrator + entity1 = mock_entity(port=db.ports[0], name="e1", telemetry_on=True) + entity2 = mock_entity(port=db.ports[0], name="e2", telemetry_on=True) + + # todo: consider a MockSink so i don't have to save the last value in the collector + sinks = [mock_sink(), mock_sink(), mock_sink()] + con_col1 = DBConnectionCollector(entity1, sinks[0]) + mem_col1 = DBMemoryCollector(entity1, sinks[1]) + mem_col2 = DBMemoryCollector(entity2, sinks[2]) + + manager = CollectorManager() + manager.add_all([con_col1, mem_col1, mem_col2]) + + # Execute collection + await manager.collect() + + # verify each collector retrieved some metric & sent it to the sink + for sink in sinks: + value = sink.args + assert value + + +@pytest.mark.parametrize( + "timeout_at,delay_for,expect_fail", + [ + pytest.param(1000, 5000, True, id="1s timeout"), + pytest.param(2000, 5000, True, id="2s timeout"), + pytest.param(3000, 5000, True, id="3s timeout"), + pytest.param(4000, 5000, True, id="4s timeout"), + pytest.param(2000, 1000, False, id="under timeout"), 
+ ], +) +@pytest.mark.asyncio +async def test_collector_manager_timeout_db( + mock_entity: MockCollectorEntityFunc, + mock_redis, + monkeypatch: pytest.MonkeyPatch, + mock_mem, + mock_con, + timeout_at: int, + delay_for: int, + expect_fail: bool, + mock_sink, +) -> None: + """Ensure that the collector timeout is honored""" + entity1 = mock_entity(port=1234, name="e1", telemetry_on=True) + entity2 = mock_entity(port=2345, name="e2", telemetry_on=True) + + sinks = [mock_sink(), mock_sink(), mock_sink()] + con_col1 = DBConnectionCollector(entity1, sinks[0]) + mem_col1 = DBMemoryCollector(entity1, sinks[1]) + mem_col2 = DBMemoryCollector(entity2, sinks[2]) + + manager = CollectorManager(timeout_ms=timeout_at) + manager.add_all([con_col1, mem_col1, mem_col2]) + + async def snooze() -> None: + await asyncio.sleep(delay_for / 1000) + + # Execute collection + with monkeypatch.context() as ctx: + ctx.setattr( + redisa, + "Redis", + mock_redis( + client_stats=mock_con(1, 10), + mem_stats=mock_mem(1, 10), + coll_side_effect=snooze, + ), + ) + + ts0 = datetime.datetime.utcnow() + await manager.collect() + ts1 = datetime.datetime.utcnow() + + t_diff = ts1 - ts0 + actual_delay = 1000 * t_diff.seconds + + if expect_fail: + assert timeout_at <= actual_delay < delay_for + else: + assert delay_for <= actual_delay < timeout_at + + +@pytest.mark.parametrize( + "e_type,telemetry_on", + [ + pytest.param("model", False, id="models"), + pytest.param("model", True, id="models, telemetry enabled"), + pytest.param("ensemble", False, id="ensemble"), + pytest.param("ensemble", True, id="ensemble, telemetry enabled"), + pytest.param("orchestrator", False, id="orchestrator"), + pytest.param("orchestrator", True, id="orchestrator, telemetry enabled"), + pytest.param("dbnode", False, id="dbnode"), + pytest.param("dbnode", True, id="dbnode, telemetry enabled"), + ], +) +@pytest.mark.asyncio +async def test_collector_manager_find_nondb( + mock_entity: MockCollectorEntityFunc, + e_type: str, + 
telemetry_on: bool, +) -> None: + """Ensure that the number of collectors returned for entity types match expectations + NOTE: even orchestrator returns 0 mapped collectors because no collector output + paths are set on the entity""" + entity = mock_entity(port=1234, name="e1", type=e_type, telemetry_on=telemetry_on) + manager = CollectorManager(timeout_ms=10000) + + # Ask manager to produce appliable collectors + manager.register_collectors(entity) + collectors = manager.all_collectors + + # Verify collector counts, assuming no per-collector config + assert 0 == len(collectors) + + +@pytest.mark.asyncio +async def test_collector_manager_find_db(mock_entity: MockCollectorEntityFunc) -> None: + """Ensure that the manifest allows individually enabling a given collector""" + entity: JobEntity = mock_entity( + port=1234, name="entity1", type="model", telemetry_on=True + ) + manager = CollectorManager() + + # 0. popping all should result in no collectors mapping to the entity + manager.register_collectors(entity) + collectors = manager.all_collectors + + assert len(collectors) == 0 + + # 1. ensure DBConnectionCountCollector is mapped + entity = mock_entity( + port=1234, name="entity1", type="orchestrator", telemetry_on=True + ) + entity.collectors["client"] = "mock/path.csv" + manager = CollectorManager() + + # 2. client count collector should be mapped + manager.register_collectors(entity) + collectors = manager.all_collectors + + assert len(collectors) == 1 + assert isinstance(collectors[0], DBConnectionCollector) + + # 3. ensure DBConnectionCountCollector is mapped + entity = mock_entity( + port=1234, name="entity1", type="orchestrator", telemetry_on=True + ) + entity.collectors["client_count"] = "mock/path.csv" + manager = CollectorManager() + + # 4. 
client count collector should be mapped + manager.register_collectors(entity) + collectors = manager.all_collectors + + assert len(collectors) == 1 + assert isinstance(collectors[0], DBConnectionCountCollector) + + # ensure DbMemoryCollector is mapped + entity = mock_entity( + port=1234, name="entity1", type="orchestrator", telemetry_on=True + ) + entity.collectors["memory"] = "mock/path.csv" + manager = CollectorManager() + + # 5. memory collector should be mapped + manager.register_collectors(entity) + collectors = manager.all_collectors + + assert len(collectors) == 1 + assert isinstance(collectors[0], DBMemoryCollector) + + +@pytest.mark.asyncio +async def test_collector_manager_find_entity_disabled( + mock_entity: MockCollectorEntityFunc, +) -> None: + """Ensure that disabling telemetry on the entity results in no collectors""" + entity: JobEntity = mock_entity(port=1234, name="entity1", type="orchestrator") + + # set paths for all known collectors + entity.collectors["client"] = "mock/path.csv" + entity.collectors["client_count"] = "mock/path.csv" + entity.collectors["memory"] = "mock/path.csv" + + manager = CollectorManager() + + # ON behavior should locate multiple collectors + entity.telemetry_on = True + manager.register_collectors(entity) + collectors = manager.all_collectors + assert len(collectors) > 0 + + # OFF behavior should locate ZERO collectors + entity.telemetry_on = False + manager.register_collectors(entity) + collectors = manager.all_collectors + assert len(collectors) == 0 + + +@pytest.mark.asyncio +async def test_collector_manager_find_entity_unmapped( + mock_entity: MockCollectorEntityFunc, +) -> None: + """Ensure that an entity type that is not mapped results in no collectors""" + entity: JobEntity = mock_entity( + port=1234, name="entity1", type="model", telemetry_on=True + ) + manager = CollectorManager() + + # set paths for all known collectors + entity.collectors["client"] = "mock/path.csv" + entity.collectors["client_count"] = 
"mock/path.csv" + entity.collectors["memory"] = "mock/path.csv" + + manager = CollectorManager() + + # ON behavior should locate ZERO collectors + entity.telemetry_on = True + manager.register_collectors(entity) + collectors = manager.all_collectors + assert len(collectors) == 0 + + # OFF behavior should locate ZERO collectors + entity.telemetry_on = False + manager.register_collectors(entity) + collectors = manager.all_collectors + assert len(collectors) == 0 diff --git a/tests/test_collector_sink.py b/tests/test_collector_sink.py new file mode 100644 index 000000000..148a72ef7 --- /dev/null +++ b/tests/test_collector_sink.py @@ -0,0 +1,107 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import uuid + +import pytest + +from conftest import MockCollectorEntityFunc +from smartsim._core.utils.telemetry.collector import FileSink + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +@pytest.mark.asyncio +async def test_sink_null_filename(mock_entity: MockCollectorEntityFunc) -> None: + """Ensure the filesink handles a null filename as expected""" + with pytest.raises(ValueError): + # pass null file path + sink = FileSink(None) # type: ignore + + +@pytest.mark.asyncio +async def test_sink_write(mock_entity: MockCollectorEntityFunc) -> None: + """Ensure the FileSink writes values to the output file as expected""" + entity = mock_entity(port=1234, name="e1") + sink = FileSink(entity.status_dir + "/test.csv") + + # all values are converted to strings before saving + v1, v2, v3 = str(uuid.uuid4()), str(uuid.uuid4()), str(uuid.uuid4()) + await sink.save(v1, v2, v3) + + # show file was written + path = sink.path + assert path.exists() + + # show each value is found in the file + content = path.read_text() + for value in [v1, v2, v3]: + assert str(value) in content + + +@pytest.mark.asyncio +async def test_sink_write_nonstring_input(mock_entity: MockCollectorEntityFunc) -> None: + """Ensure the FileSink writes values to the output file as expected + when inputs are non-strings""" + entity = mock_entity(port=1234, name="e1") + sink = FileSink(entity.status_dir + "/test.csv") + + # v1, v2 are not 
converted to strings + v1, v2 = 1, uuid.uuid4() + await sink.save(v1, v2) + + # show file was written + path = sink.path + assert path.exists() + + # split down to individual elements to ensure expected default format + content = path.read_text() + lines = content.splitlines() + line = lines[0].split(",") + + # show each value can be found + assert [str(v1), str(v2)] == line + + +@pytest.mark.asyncio +async def test_sink_write_no_inputs(mock_entity: MockCollectorEntityFunc) -> None: + """Ensure the FileSink writes to an output file without error if no + values are supplied""" + entity = mock_entity(port=1234, name="e1") + sink = FileSink(entity.status_dir + "/test.csv") + + num_saves = 5 + for _ in range(num_saves): + await sink.save() + + path = sink.path + assert path.exists() + + # show file was written + content = path.read_text() + + # show a line was written for each call to save + assert len(content.splitlines()) == num_saves diff --git a/tests/test_collectors.py b/tests/test_collectors.py new file mode 100644 index 000000000..2eb61d62d --- /dev/null +++ b/tests/test_collectors.py @@ -0,0 +1,305 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# import pathlib + +import typing as t + +import pytest + +import smartsim._core.entrypoints.telemetrymonitor +import smartsim._core.utils.telemetry.collector +from conftest import MockCollectorEntityFunc, MockSink +from smartsim._core.utils.telemetry.collector import ( + DBConnectionCollector, + DBConnectionCountCollector, + DBMemoryCollector, + redisa, +) + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +PrepareDB = t.Callable[[dict], smartsim.experiment.Orchestrator] + + +@pytest.mark.asyncio +async def test_dbmemcollector_prepare( + mock_entity: MockCollectorEntityFunc, mock_sink +) -> None: + """Ensure that collector preparation succeeds when expected""" + entity = mock_entity(telemetry_on=True) + + collector = DBMemoryCollector(entity, mock_sink()) + await collector.prepare() + assert collector._client + + +@pytest.mark.asyncio +async def test_dbmemcollector_prepare_fail( + mock_entity: MockCollectorEntityFunc, + mock_sink: MockSink, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure that collector preparation reports a failure to connect + when the redis client cannot be created""" + entity = mock_entity(telemetry_on=True) + + with monkeypatch.context() as ctx: + # mock up a redis constructor that returns None + ctx.setattr(redisa, "Redis", lambda host, port: None) + + sink = mock_sink() + collector = DBMemoryCollector(entity, sink) + assert sink.num_saves == 0 + + await 
collector.prepare() + + # Attempt to save header when preparing... + assert not collector._client + assert sink.num_saves == 1 + + +@pytest.mark.asyncio +async def test_dbcollector_config( + mock_entity: MockCollectorEntityFunc, + mock_sink, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure that missing required db collector config causes an exception""" + + # Check that a bad host causes exception + entity = mock_entity(host="", telemetry_on=True) + with pytest.raises(ValueError): + DBMemoryCollector(entity, mock_sink()) + + entity = mock_entity(host=" ", telemetry_on=True) + with pytest.raises(ValueError): + DBMemoryCollector(entity, mock_sink()) + + # Check that a bad port causes exception + entity = mock_entity(port="", telemetry_on=True) # type: ignore + with pytest.raises(ValueError): + DBMemoryCollector(entity, mock_sink()) + + +@pytest.mark.asyncio +async def test_dbmemcollector_prepare_fail_dep( + mock_entity: MockCollectorEntityFunc, + mock_sink, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[t.Any], +) -> None: + """Ensure that collector preparation attempts to connect, ensure it + reports a failure if the db conn bombs""" + entity = mock_entity(telemetry_on=True) + + def raiser(*args: t.Any, **kwargs: t.Any) -> None: + # mock raising exception on connect attempts to test err handling + raise redisa.ConnectionError("mock connection failure") + + sink = mock_sink() + collector = DBMemoryCollector(entity, sink) + with monkeypatch.context() as ctx: + ctx.setattr(redisa, "Redis", raiser) + + assert sink.num_saves == 0 + await collector.prepare() + + assert sink.num_saves == 1 + assert not collector._client + + +@pytest.mark.asyncio +async def test_dbmemcollector_collect( + mock_entity: MockCollectorEntityFunc, + mock_redis, + mock_mem, + mock_sink, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure that a valid response is returned as expected""" + entity = mock_entity(telemetry_on=True) + + sink = mock_sink() + collector 
= DBMemoryCollector(entity, sink) + with monkeypatch.context() as ctx: + ctx.setattr(redisa, "Redis", mock_redis(mem_stats=mock_mem(1, 2))) + ctx.setattr( + smartsim._core.utils.telemetry.collector, + "get_ts_ms", + lambda: 12131415, + ) + + await collector.prepare() + await collector.collect() + + reqd_items = { + "timestamp", + "total_system_memory", + "used_memory", + "used_memory_peak", + } + actual_items = set(sink.args) + + reqd_values = {12131415, 1000.0, 1111.0, 1234.0} + actual_values = set(sink.args) + assert actual_values == reqd_values + + +@pytest.mark.asyncio +async def test_dbmemcollector_integration( + mock_entity: MockCollectorEntityFunc, + mock_sink: MockSink, + prepare_db: PrepareDB, + local_db: dict, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Integration test with a real orchestrator instance to ensure + output data matches expectations and proper db client API uage""" + + db = prepare_db(local_db).orchestrator + entity = mock_entity(port=db.ports[0], telemetry_on=True) + + sink = mock_sink() + collector = DBMemoryCollector(entity, sink) + + with monkeypatch.context() as ctx: + ctx.setattr( + smartsim._core.utils.telemetry.collector, + "get_ts_ms", + lambda: 12131415, + ) + assert sink.num_saves == 0 + await collector.prepare() + assert sink.num_saves == 1 + await collector.collect() + assert sink.num_saves == 2 + + stats = sink.args + assert len(stats) == 4 # show we have the expected amount of data points + ts = 12131415 + + assert ts in stats + + +@pytest.mark.asyncio +async def test_dbconncollector_collect( + mock_entity: MockCollectorEntityFunc, + mock_sink, + mock_redis, + mock_con, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure that a valid response is returned as expected""" + entity = mock_entity(telemetry_on=True) + + sink = mock_sink() + collector = DBConnectionCollector(entity, sink) + with monkeypatch.context() as ctx: + ctx.setattr(redisa, "Redis", mock_redis(client_stats=mock_con(1, 2))) + + assert 
sink.num_saves == 0 + await collector.prepare() + assert sink.num_saves == 1 + await collector.collect() + assert sink.num_saves == 3 # save twice w/two datapoints + + stats = sink.args + + idx = 1 + id0, ip0 = f"ABC{idx}", f"127.0.0.{idx}:1234" + id1, ip1 = f"XYZ{idx}", f"127.0.0.{idx}:2345" + exp_clients = [{"id": id0, "addr": ip0}, {"id": id1, "addr": ip1}] + + assert len(exp_clients) + 1 == len(stats) # output includes timestamp + assert id0 in set(client["id"] for client in exp_clients) + assert id1 in set(client["id"] for client in exp_clients) + assert ip0 in set(client["addr"] for client in exp_clients) + assert ip1 in set(client["addr"] for client in exp_clients) + + +@pytest.mark.asyncio +async def test_dbconn_count_collector_collect( + mock_entity: MockCollectorEntityFunc, + mock_sink, + mock_redis, + mock_con, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure that a valid response is returned as expected""" + entity = mock_entity(telemetry_on=True) + + sink = mock_sink() + collector = DBConnectionCountCollector(entity, sink) + with monkeypatch.context() as ctx: + ctx.setattr(redisa, "Redis", mock_redis(client_stats=mock_con(1, 2))) + + assert sink.num_saves == 0 + await collector.prepare() + assert sink.num_saves == 1 + await collector.collect() + assert sink.num_saves == 2 + + stats = sink.args + exp_counts = 2 + + assert exp_counts == len(stats) # output includes timestamp + + +@pytest.mark.asyncio +async def test_dbconncollector_integration( + mock_entity: MockCollectorEntityFunc, + mock_sink: MockSink, + prepare_db: PrepareDB, + local_db: dict, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Integration test with a real orchestrator instance to ensure + output data matches expectations and proper db client API uage""" + + db = prepare_db(local_db).orchestrator + entity = mock_entity(port=db.ports[0], telemetry_on=True) + + sink = mock_sink() + collector = DBConnectionCollector(entity, sink) + + with monkeypatch.context() as ctx: + 
ctx.setattr( + smartsim._core.utils.telemetry.collector, + "get_ts_ms", + lambda: 12131415, + ) + await collector.prepare() + await collector.collect() + stats = sink.args + + ip = "127.0.0.1:" + num_conns = int(stats[1]) + ts = 12131415 + + assert ts in stats + assert num_conns > 0 + assert ip in stats[2] diff --git a/tests/test_colo_model_local.py b/tests/test_colo_model_local.py index 138ceb4b7..fe347ee30 100644 --- a/tests/test_colo_model_local.py +++ b/tests/test_colo_model_local.py @@ -28,9 +28,10 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.entity import Model from smartsim.error import SSUnsupportedError +from smartsim.status import SmartSimStatus # The tests in this file belong to the slow_tests group pytestmark = pytest.mark.slow_tests @@ -139,13 +140,13 @@ def test_launch_colocated_model_defaults( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all(stat == status.STATUS_COMPLETED for stat in statuses) + assert all(stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses) # test restarting the colocated model exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) assert all( - stat == status.STATUS_COMPLETED for stat in statuses + stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses ), f"Statuses {statuses}" @@ -181,12 +182,12 @@ def test_launch_multiple_colocated_models( exp.generate(*colo_models) exp.start(*colo_models, block=True) statuses = exp.get_status(*colo_models) - assert all(stat == status.STATUS_COMPLETED for stat in statuses) + assert all(stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses) # test restarting the colocated model exp.start(*colo_models, block=True) statuses = exp.get_status(*colo_models) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", 
supported_dbs) @@ -212,7 +213,7 @@ def test_colocated_model_disable_pinning( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.parametrize("db_type", supported_dbs) @@ -245,7 +246,7 @@ def test_colocated_model_pinning_auto_2cpu( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") @@ -272,7 +273,7 @@ def test_colocated_model_pinning_range( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") @@ -299,7 +300,7 @@ def test_colocated_model_pinning_list( exp.generate(colo_model) exp.start(colo_model, block=True) statuses = exp.get_status(colo_model) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_colo_uds_verifies_socket_file_name(test_dir, launcher="local"): diff --git a/tests/test_config.py b/tests/test_config.py index 0716ac0d5..00a1fcdd3 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -61,9 +61,7 @@ def get_redisai_env( """Convenience method to create a set of environment variables that include RedisAI-specific variables :param rai_path: The path to the RedisAI library - :type: str (optional) :param lib_path: The path to the SMARTSIM_DEP_INSTALL_PATH - :type: str (optional) :return: A dictionary containing an updated set of environment variables """ env 
= os.environ.copy() @@ -255,3 +253,31 @@ def test_telemetry_cooldown( monkeypatch.delenv("SMARTSIM_TELEMETRY_COOLDOWN", raising=False) config = Config() assert config.telemetry_cooldown == exp_result + + +def test_key_path_unset(monkeypatch: pytest.MonkeyPatch): + """Ensure that the default value of the key path meets expectations""" + monkeypatch.delenv("SMARTSIM_KEY_PATH", raising=False) + + config = Config() + + key_path = config.smartsim_key_path + + exp_default = Path.home() / ".smartsim" / "keys" + assert str(exp_default) == key_path, "Unexpected default key path" + + +def test_key_path_non_default(monkeypatch: pytest.MonkeyPatch): + """Ensure that the environment variable for key path overrides + the default when it is set""" + key_path1 = "/foo/bar" + key_path2 = "/foo/baz" + config = Config() + + monkeypatch.setenv("SMARTSIM_KEY_PATH", key_path1) + actual_value = config.smartsim_key_path + assert key_path1 == actual_value, "Key path 1 didn't match overridden value" + + monkeypatch.setenv("SMARTSIM_KEY_PATH", key_path2) + actual_value = config.smartsim_key_path + assert key_path2 == actual_value, "Key path 2 didn't match overridden value" diff --git a/tests/test_configs/mpi/mpi_hello.c b/tests/test_configs/mpi/mpi_hello.c new file mode 100755 index 000000000..dcf80f3ac --- /dev/null +++ b/tests/test_configs/mpi/mpi_hello.c @@ -0,0 +1,35 @@ +#include +#include +#include +#include +#include +#include + + +int main(int argc, char** argv) { + sleep(1); + // Initialize the MPI environment + MPI_Init(NULL, NULL); + + // Get the number of processes + int world_size; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + + // Get the rank of the process + int world_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + char filename[64]; + sprintf(filename, "mpi_hello.%d.log", world_rank); + FILE *log = fopen(filename, "w"); + + fprintf(log, "Hello world from rank %d out of %d processors\n", + world_rank, world_size); + fflush(log); + + // unlink(filename); + 
fclose(log); + + // Finalize the MPI environment. + MPI_Finalize(); +} diff --git a/tests/test_configs/smartredis/multidbid_colo_env_vars_only.py b/tests/test_configs/smartredis/multidbid_colo_env_vars_only.py new file mode 100644 index 000000000..74a15c010 --- /dev/null +++ b/tests/test_configs/smartredis/multidbid_colo_env_vars_only.py @@ -0,0 +1,52 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import argparse +import os + +from smartredis import Client, ConfigOptions + +if __name__ == "__main__": + """For inclusion in test with two unique database identifiers with multiple + databases where one (presumably colocated) database is started before the + other, and thus only one DB ID is known at application runtime and + available via environment variable. + """ + + parser = argparse.ArgumentParser(description="SmartRedis") + parser.add_argument("--exchange", action="store_true") + parser.add_argument("--should-see-reg-db", action="store_true") + args = parser.parse_args() + + env_vars = [ + "SSDB_testdb_colo", + "SR_DB_TYPE_testdb_colo", + ] + + assert all([var in os.environ for var in env_vars]) + + opts = ConfigOptions.create_from_environment("testdb_colo") + Client(opts, logger_name="SmartSim") diff --git a/tests/test_configs/telemetry/db_and_model.json b/tests/test_configs/telemetry/db_and_model.json index 58c1c841a..36edc7486 100644 --- a/tests/test_configs/telemetry/db_and_model.json +++ b/tests/test_configs/telemetry/db_and_model.json @@ -29,6 +29,9 @@ "conf_file": null, "out_file": "/path/to/some/file.out", "err_file": "/path/to/some/file.err", + "client_file": "/path/to/some/client.log", + "client_count_file": null, + "memory_file": "/path/to/some/mem.log", "telemetry_metadata": { "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", "step_id": "4139111.27", diff --git a/tests/test_configs/telemetry/ensembles.json b/tests/test_configs/telemetry/ensembles.json index 841324ec6..67e53ca09 100644 --- a/tests/test_configs/telemetry/ensembles.json +++ b/tests/test_configs/telemetry/ensembles.json @@ -1,329 +1,329 @@ { - "schema info": { - "schema_name": "entity manifest", - "version": "0.0.1" - }, - "experiment": { - "name": "my-exp", - "path": "/home/someuser/code/ss/my-exp", - "launcher": "Local" - }, - "runs": [ - { - "run_id": "d041b90", - "timestamp": 1698679830384608928, - "model": 
[], - "orchestrator": [], - "ensemble": [ - { - "name": "my-ens", - "params": { - "START": [ - "spam", - "foo" + "schema info": { + "schema_name": "entity manifest", + "version": "0.0.1" + }, + "experiment": { + "name": "my-exp", + "path": "/home/someuser/code/ss/my-exp", + "launcher": "Local" + }, + "runs": [ + { + "run_id": "d041b90", + "timestamp": 1698679830384608928, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" ], - "MID": [ - "eggs", - "bar" + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_0", + "step_id": null, + "task_id": "88118", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_0.out", + "err_file": "/home/someuser/code/ss/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" ], - "END": [ - "ham", - "baz" - ] + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": 
"/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_1", + "step_id": null, + "task_id": "88131", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_1.out", + "err_file": "/home/someuser/code/ss/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_2", + "step_id": null, + "task_id": "88146", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_2.out", + "err_file": "/home/someuser/code/ss/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" + ], + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_3", + "step_id": null, + "task_id": "88170", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_3.out", + "err_file": "/home/someuser/code/ss/my-ens_3.err" }, - "batch_settings": {}, - "models": [ - { - "name": "my-ens_0", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" + { + "name": "my-ens_4", + "path": 
"/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_0", - "step_id": null, - "task_id": "88118", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_0.out", - "err_file": "/home/someuser/code/ss/my-ens_0.err" - }, - { - "name": "my-ens_1", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_1", - "step_id": null, - "task_id": "88131", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_1.out", - "err_file": "/home/someuser/code/ss/my-ens_1.err" - }, - { - "name": "my-ens_2", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { 
+ "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_4", + "step_id": null, + "task_id": "88178", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_4.out", + "err_file": "/home/someuser/code/ss/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_2", - "step_id": null, - "task_id": "88146", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_2.out", - "err_file": "/home/someuser/code/ss/my-ens_2.err" - }, - { - "name": "my-ens_3", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_3", - 
"step_id": null, - "task_id": "88170", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_3.out", - "err_file": "/home/someuser/code/ss/my-ens_3.err" - }, - { - "name": "my-ens_4", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_5", + "step_id": null, + "task_id": "88193", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_5.out", + "err_file": "/home/someuser/code/ss/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_4", - "step_id": null, - "task_id": "88178", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_4.out", - "err_file": "/home/someuser/code/ss/my-ens_4.err" - }, - { - "name": "my-ens_5", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": 
"foo", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_5", - "step_id": null, - "task_id": "88193", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_5.out", - "err_file": "/home/someuser/code/ss/my-ens_5.err" - }, - { - "name": "my-ens_6", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_6", + "step_id": null, + "task_id": "88221", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_6.out", + "err_file": "/home/someuser/code/ss/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/home/someuser/code/ss", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_6", - "step_id": null, - "task_id": "88221", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_6.out", - "err_file": "/home/someuser/code/ss/my-ens_6.err" - }, - { - "name": "my-ens_7", - "path": "/home/someuser/code/ss", - "exe_args": [ - "yo.py" + "run_command": null, + "run_args": {} + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": 
"bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/home/someuser/code/ss/manifest/demo/yo.py" ], - "run_settings": { - "exe": [ - "/home/someuser/.pyenv/versions/3.9.16/envs/ss/bin/python" - ], - "run_command": null, - "run_args": {} - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/home/someuser/code/ss/manifest/demo/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_7", - "step_id": null, - "task_id": "88241", - "managed": false - }, - "out_file": "/home/someuser/code/ss/my-ens_7.out", - "err_file": "/home/someuser/code/ss/my-ens_7.err" - } - ] - } - ] - } - ] - } + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/home/someuser/code/ss/my-exp/.smartsim/telemetry/my-exp/d041b90/ensemble/my-ens/my-ens_7", + "step_id": null, + "task_id": "88241", + "managed": false + }, + "out_file": "/home/someuser/code/ss/my-ens_7.out", + "err_file": "/home/someuser/code/ss/my-ens_7.err" + } + ] + } + ] + } + ] +} diff --git a/tests/test_configs/telemetry/telemetry.json b/tests/test_configs/telemetry/telemetry.json index a380bc5fb..916f5922b 100644 --- a/tests/test_configs/telemetry/telemetry.json +++ b/tests/test_configs/telemetry/telemetry.json @@ -1,946 +1,945 @@ { - "experiment": { - "name": "my-exp", - "path": "/path/to/my-exp", - "launcher": "Slurm" - }, - "runs": [ - { - "run_id": "d999ad89-020f-4e6a-b834-dbd88658ce84", - "timestamp": 1697824072792854287, - "model": [ - { - "name": "my-model", - "path": "/path/to/my-exp/my-model", - "exe_args": [ - "hello", - "world" + "experiment": { + "name": "my-exp", + "path": "/path/to/my-exp", + "launcher": "Slurm" + }, + "runs": [ + { + "run_id": "d999ad89-020f-4e6a-b834-dbd88658ce84", + "timestamp": 1697824072792854287, + "model": [ + { + "name": 
"my-model", + "path": "/path/to/my-exp/my-model", + "exe_args": [ + "hello", + "world" + ], + "run_settings": { + "exe": [ + "/usr/bin/echo" ], - "run_settings": { - "exe": [ - "/usr/bin/echo" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "port": 5757, + "ifname": "lo", + "cpus": 1, + "custom_pinning": "0", + "debug": false, + "db_identifier": "COLO", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null + }, + "extra_db_args": {} }, - "colocated_db": { - "settings": { - "port": 5757, - "ifname": "lo", - "cpus": 1, - "custom_pinning": "0", - "debug": false, - "db_identifier": "COLO", - "rai_args": { - "threads_per_queue": null, - "inter_op_parallelism": null, - "intra_op_parallelism": null - }, - "extra_db_args": {} - }, - "scripts": [], - "models": [ - { - "cnn": { - "backend": "TORCH", - "device": "CPU" - } + "scripts": [], + "models": [ + { + "cnn": { + "backend": "TORCH", + "device": "CPU" } - ] + } + ] + }, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d999ad89-020f-4e6a-b834-dbd88658ce84/model/my-model", + "step_id": "4121050.30", + "task_id": "25230", + "managed": true + }, + "out_file": "/path/to/my-exp/my-model/my-model.out", + "err_file": "/path/to/my-exp/my-model/my-model.err" + } + ], + "orchestrator": [], + "ensemble": [] + }, + { + "run_id": "fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa", + "timestamp": 1697824102122439975, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": 
"orchestrator_1", + "hostname": "10.128.0.70", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_1-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } }, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d999ad89-020f-4e6a-b834-dbd88658ce84/model/my-model", - "step_id": "4121050.30", - "task_id": "25230", - "managed": true + { + "name": "orchestrator_2", + "hostname": "10.128.0.71", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_2-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } }, - "out_file": "/path/to/my-exp/my-model/my-model.out", - "err_file": "/path/to/my-exp/my-model/my-model.err" - } - ], - "orchestrator": [], - "ensemble": [] - }, - { - "run_id": "fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa", - "timestamp": 1697824102122439975, - "model": [], - "orchestrator": [ - { - "name": "orchestrator", - "type": "redis", - "interface": [ - "ipogif0" + { + "name": "orchestrator_0", + "hostname": "10.128.0.69", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_0-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "step_id": "4121050.31+2", + "task_id": "25241", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "d65ae1df-cb5e-45d9-ab09-6fa641755997", + "timestamp": 1697824127962219505, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" ], - "shards": [ - { - "name": "orchestrator_1", - "hostname": "10.128.0.70", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_1-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", - "step_id": "4121050.31+2", - "task_id": "25241", - "managed": true + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/path/to/my-exp/my-ens/my-ens_0", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 } }, - { - "name": "orchestrator_2", - "hostname": "10.128.0.71", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_2-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", - "step_id": "4121050.31+2", - "task_id": "25241", - "managed": true + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + 
"Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_0", + "step_id": "4121050.32", + "task_id": "25639", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" + }, + { + "name": "my-ens_1", + "path": "/path/to/my-exp/my-ens/my-ens_1", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 } }, - { - "name": "orchestrator_0", - "hostname": "10.128.0.69", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_0-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", - "step_id": "4121050.31+2", - "task_id": "25241", - "managed": true + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_1", + "step_id": "4121050.33", + "task_id": "25768", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/path/to/my-exp/my-ens/my-ens_2", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 } - } - ] - } - ], - "ensemble": [] - }, - 
{ - "run_id": "d65ae1df-cb5e-45d9-ab09-6fa641755997", - "timestamp": 1697824127962219505, - "model": [], - "orchestrator": [], - "ensemble": [ - { - "name": "my-ens", - "params": { - "START": [ - "spam", - "foo" + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_2", + "step_id": "4121050.34", + "task_id": "25817", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/path/to/my-exp/my-ens/my-ens_3", + "exe_args": [ + "yo.py" ], - "MID": [ - "eggs", - "bar" + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_3", + "step_id": "4121050.35", + "task_id": "25837", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/path/to/my-exp/my-ens/my-ens_4", + "exe_args": [ + "yo.py" ], - "END": [ - "ham", - "baz" - ] + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + 
}, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_4", + "step_id": "4121050.36", + "task_id": "25872", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" }, - "batch_settings": {}, - "models": [ - { - "name": "my-ens_0", - "path": "/path/to/my-exp/my-ens/my-ens_0", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_0", - "step_id": "4121050.32", - "task_id": "25639", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" + { + "name": "my-ens_5", + "path": "/path/to/my-exp/my-ens/my-ens_5", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } }, - { - "name": "my-ens_1", - "path": "/path/to/my-exp/my-ens/my-ens_1", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - 
], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_1", - "step_id": "4121050.33", - "task_id": "25768", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" }, - { - "name": "my-ens_2", - "path": "/path/to/my-exp/my-ens/my-ens_2", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_2", - "step_id": "4121050.34", - "task_id": "25817", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] }, - { - "name": "my-ens_3", - "path": "/path/to/my-exp/my-ens/my-ens_3", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_3", - 
"step_id": "4121050.35", - "task_id": "25837", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_5", + "step_id": "4121050.37", + "task_id": "25930", + "managed": true }, - { - "name": "my-ens_4", - "path": "/path/to/my-exp/my-ens/my-ens_4", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_4", - "step_id": "4121050.36", - "task_id": "25872", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" + "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" + }, + { + "name": "my-ens_6", + "path": "/path/to/my-exp/my-ens/my-ens_6", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } }, - { - "name": "my-ens_5", - "path": "/path/to/my-exp/my-ens/my-ens_5", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "baz" - }, - 
"files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_5", - "step_id": "4121050.37", - "task_id": "25930", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" }, - { - "name": "my-ens_6", - "path": "/path/to/my-exp/my-ens/my-ens_6", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_6", - "step_id": "4121050.38", - "task_id": "25945", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_6", + "step_id": "4121050.38", + "task_id": "25945", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/path/to/my-exp/my-ens/my-ens_7", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": 
"/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_7", + "step_id": "4121050.39", + "task_id": "25967", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" + } + ] + } + ] + }, + { + "run_id": "e41f8e17-c4b2-441d-adf9-707443ee2c72", + "timestamp": 1697835227560376025, + "model": [ + { + "name": "my-model", + "path": "/path/to/my-exp/my-model", + "exe_args": [ + "hello", + "world" + ], + "run_settings": { + "exe": [ + "/usr/bin/echo" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": {}, + "files": { + "Symlink": [], + "Configure": [], + "Copy": [] + }, + "colocated_db": { + "settings": { + "port": 5757, + "ifname": "lo", + "cpus": 1, + "custom_pinning": "0", + "debug": false, + "db_identifier": "COLO", + "rai_args": { + "threads_per_queue": null, + "inter_op_parallelism": null, + "intra_op_parallelism": null }, + "extra_db_args": {} + }, + "scripts": [], + "models": [ { - "name": "my-ens_7", - "path": "/path/to/my-exp/my-ens/my-ens_7", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_7", - "step_id": "4121050.39", - "task_id": "25967", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" + "cnn": { + "backend": "TORCH", + "device": "CPU" + } } ] - } - ] - }, - { - "run_id": "e41f8e17-c4b2-441d-adf9-707443ee2c72", - "timestamp": 1697835227560376025, - "model": [ - { - "name": "my-model", - "path": "/path/to/my-exp/my-model", - "exe_args": [ - "hello", - "world" - ], - "run_settings": { - "exe": [ - "/usr/bin/echo" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 + }, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/e41f8e17-c4b2-441d-adf9-707443ee2c72/model/my-model", + "step_id": "4121904.0", + "task_id": "28277", + "managed": true + }, + "out_file": "/path/to/my-exp/my-model/my-model.out", + "err_file": "/path/to/my-exp/my-model/my-model.err" + } + ], + "orchestrator": [], + "ensemble": [] + }, + { + "run_id": "b33a5d27-6822-4795-8e0e-cfea18551fa4", + "timestamp": 1697835261956135240, + "model": [], + "orchestrator": [ + { + "name": "orchestrator", + "type": "redis", + "interface": [ + "ipogif0" + ], + "shards": [ + { + "name": "orchestrator_0", + "hostname": "10.128.0.2", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_0-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true } }, - "batch_settings": {}, - "params": {}, - "files": { - "Symlink": [], - "Configure": [], - "Copy": [] + { + "name": "orchestrator_2", + "hostname": "10.128.0.4", 
+ "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_2-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } }, - "colocated_db": { - "settings": { - "port": 5757, - "ifname": "lo", - "cpus": 1, - "custom_pinning": "0", - "debug": false, - "db_identifier": "COLO", - "rai_args": { - "threads_per_queue": null, - "inter_op_parallelism": null, - "intra_op_parallelism": null - }, - "extra_db_args": {} - }, - "scripts": [], - "models": [ - { - "cnn": { - "backend": "TORCH", - "device": "CPU" - } + { + "name": "orchestrator_1", + "hostname": "10.128.0.3", + "port": 2424, + "cluster": true, + "conf_file": "nodes-orchestrator_1-2424.conf", + "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", + "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "step_id": "4121904.1+2", + "task_id": "28289", + "managed": true + } + } + ] + } + ], + "ensemble": [] + }, + { + "run_id": "45772df2-fd80-43fd-adf0-d5e319870182", + "timestamp": 1697835287798613875, + "model": [], + "orchestrator": [], + "ensemble": [ + { + "name": "my-ens", + "params": { + "START": [ + "spam", + "foo" + ], + "MID": [ + "eggs", + "bar" + ], + "END": [ + "ham", + "baz" + ] + }, + "batch_settings": {}, + "models": [ + { + "name": "my-ens_0", + "path": "/path/to/my-exp/my-ens/my-ens_0", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 } - ] - }, - "telemetry_metadata": { - 
"status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/e41f8e17-c4b2-441d-adf9-707443ee2c72/model/my-model", - "step_id": "4121904.0", - "task_id": "28277", - "managed": true + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_0", + "step_id": "4121904.2", + "task_id": "28333", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" }, - "out_file": "/path/to/my-exp/my-model/my-model.out", - "err_file": "/path/to/my-exp/my-model/my-model.err" - } - ], - "orchestrator": [], - "ensemble": [] - }, - { - "run_id": "b33a5d27-6822-4795-8e0e-cfea18551fa4", - "timestamp": 1697835261956135240, - "model": [], - "orchestrator": [ - { - "name": "orchestrator", - "type": "redis", - "interface": [ - "ipogif0" - ], - "shards": [ - { - "name": "orchestrator_0", - "hostname": "10.128.0.2", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_0-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", - "step_id": "4121904.1+2", - "task_id": "28289", - "managed": true + { + "name": "my-ens_1", + "path": "/path/to/my-exp/my-ens/my-ens_1", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 } }, - { - "name": "orchestrator_2", - "hostname": "10.128.0.4", - "port": 2424, - "cluster": true, - "conf_file": 
"nodes-orchestrator_2-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", - "step_id": "4121904.1+2", - "task_id": "28289", - "managed": true + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_1", + "step_id": "4121904.3", + "task_id": "28342", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" + }, + { + "name": "my-ens_2", + "path": "/path/to/my-exp/my-ens/my-ens_2", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 } }, - { - "name": "orchestrator_1", - "hostname": "10.128.0.3", - "port": 2424, - "cluster": true, - "conf_file": "nodes-orchestrator_1-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", - "step_id": "4121904.1+2", - "task_id": "28289", - "managed": true + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_2", + "step_id": "4121904.4", + "task_id": "28353", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" + }, + { + "name": "my-ens_3", + "path": "/path/to/my-exp/my-ens/my-ens_3", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 } - } - ] - } - ], - "ensemble": [] - }, - { - "run_id": "45772df2-fd80-43fd-adf0-d5e319870182", - "timestamp": 1697835287798613875, - "model": [], - "orchestrator": [], - "ensemble": [ - { - "name": "my-ens", - "params": { - "START": [ - "spam", - "foo" + }, + "batch_settings": {}, + "params": { + "START": "spam", + "MID": "bar", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_3", + "step_id": "4121904.5", + "task_id": "28362", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" + }, + { + "name": "my-ens_4", + "path": "/path/to/my-exp/my-ens/my-ens_4", + "exe_args": [ + "yo.py" ], - "MID": [ - "eggs", - "bar" + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "ham" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_4", + "step_id": "4121904.6", + "task_id": "28371", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" + }, + { + "name": "my-ens_5", + "path": "/path/to/my-exp/my-ens/my-ens_5", + "exe_args": [ + "yo.py" ], - "END": [ - "ham", - "baz" - ] + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } + }, + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "eggs", + "END": "baz" + }, + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] + }, + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_5", + "step_id": "4121904.7", + "task_id": "28380", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" }, - "batch_settings": {}, - "models": [ - { - "name": "my-ens_0", - "path": "/path/to/my-exp/my-ens/my-ens_0", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_0", - "step_id": "4121904.2", - "task_id": "28333", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_0/my-ens_0.out", - "err_file": 
"/path/to/my-exp/my-ens/my-ens_0/my-ens_0.err" + { + "name": "my-ens_6", + "path": "/path/to/my-exp/my-ens/my-ens_6", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } }, - { - "name": "my-ens_1", - "path": "/path/to/my-exp/my-ens/my-ens_1", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_1", - "step_id": "4121904.3", - "task_id": "28342", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_1/my-ens_1.err" + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "ham" }, - { - "name": "my-ens_2", - "path": "/path/to/my-exp/my-ens/my-ens_2", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_2", - "step_id": "4121904.4", - "task_id": "28353", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_2/my-ens_2.out", - "err_file": 
"/path/to/my-exp/my-ens/my-ens_2/my-ens_2.err" + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] }, - { - "name": "my-ens_3", - "path": "/path/to/my-exp/my-ens/my-ens_3", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "spam", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_3", - "step_id": "4121904.5", - "task_id": "28362", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_3/my-ens_3.err" + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_6", + "step_id": "4121904.8", + "task_id": "28389", + "managed": true }, - { - "name": "my-ens_4", - "path": "/path/to/my-exp/my-ens/my-ens_4", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_4", - "step_id": "4121904.6", - "task_id": "28371", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_4/my-ens_4.err" + 
"out_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" + }, + { + "name": "my-ens_7", + "path": "/path/to/my-exp/my-ens/my-ens_7", + "exe_args": [ + "yo.py" + ], + "run_settings": { + "exe": [ + "/path/to/my/python3" + ], + "run_command": "/opt/slurm/20.11.5/bin/srun", + "run_args": { + "nodes": 1, + "ntasks": 1 + } }, - { - "name": "my-ens_5", - "path": "/path/to/my-exp/my-ens/my-ens_5", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "eggs", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_5", - "step_id": "4121904.7", - "task_id": "28380", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_5/my-ens_5.err" + "batch_settings": {}, + "params": { + "START": "foo", + "MID": "bar", + "END": "baz" }, - { - "name": "my-ens_6", - "path": "/path/to/my-exp/my-ens/my-ens_6", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "ham" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_6", - "step_id": "4121904.8", - "task_id": "28389", - "managed": true - }, - "out_file": 
"/path/to/my-exp/my-ens/my-ens_6/my-ens_6.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_6/my-ens_6.err" + "files": { + "Symlink": [], + "Configure": [ + "/path/to/yo.py" + ], + "Copy": [] }, - { - "name": "my-ens_7", - "path": "/path/to/my-exp/my-ens/my-ens_7", - "exe_args": [ - "yo.py" - ], - "run_settings": { - "exe": [ - "/path/to/my/python3" - ], - "run_command": "/opt/slurm/20.11.5/bin/srun", - "run_args": { - "nodes": 1, - "ntasks": 1 - } - }, - "batch_settings": {}, - "params": { - "START": "foo", - "MID": "bar", - "END": "baz" - }, - "files": { - "Symlink": [], - "Configure": [ - "/path/to/yo.py" - ], - "Copy": [] - }, - "colocated_db": {}, - "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_7", - "step_id": "4121904.9", - "task_id": "28398", - "managed": true - }, - "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", - "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" - } - ] - } - ] - } - ] - } - + "colocated_db": {}, + "telemetry_metadata": { + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_7", + "step_id": "4121904.9", + "task_id": "28398", + "managed": true + }, + "out_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.out", + "err_file": "/path/to/my-exp/my-ens/my-ens_7/my-ens_7.err" + } + ] + } + ] + } + ] +} diff --git a/tests/test_containers.py b/tests/test_containers.py index 21fe50ad4..5d0f933ff 100644 --- a/tests/test_containers.py +++ b/tests/test_containers.py @@ -32,9 +32,9 @@ import pytest from smartsim import Experiment, status -from smartsim.database import Orchestrator from smartsim.entity import Ensemble from smartsim.settings.containers import Singularity +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -109,7 +109,7 @@ def test_singularity_basic(fileutils, test_dir): # 
get and confirm status stat = exp.get_status(model)[0] - assert stat == status.STATUS_COMPLETED + assert stat == SmartSimStatus.STATUS_COMPLETED print(exp.summary()) @@ -136,13 +136,13 @@ def test_singularity_args(fileutils, test_dir): # get and confirm status stat = exp.get_status(model)[0] - assert stat == status.STATUS_COMPLETED + assert stat == SmartSimStatus.STATUS_COMPLETED print(exp.summary()) @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_smartredis(test_dir, fileutils, wlmutils): +def test_singularity_smartredis(local_experiment, prepare_db, local_db, fileutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a model. @@ -150,18 +150,13 @@ def test_singularity_smartredis(test_dir, fileutils, wlmutils): Note: This is a containerized port of test_smartredis.py """ - exp = Experiment( - "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" - ) - # create and start a database - orc = Orchestrator(port=wlmutils.get_test_port()) - exp.generate(orc) - exp.start(orc, block=False) + db = prepare_db(local_db).orchestrator + local_experiment.reconnect_orchestrator(db.checkpoint_file) container = Singularity(containerURI) - rs = exp.create_run_settings( + rs = local_experiment.create_run_settings( "python3", "producer.py --exchange", container=container ) params = {"mult": [1, -10]} @@ -178,18 +173,12 @@ def test_singularity_smartredis(test_dir, fileutils, wlmutils): config = fileutils.get_test_conf_path("smartredis") ensemble.attach_generator_files(to_copy=[config]) - exp.generate(ensemble) + local_experiment.generate(ensemble) # start the models - exp.start(ensemble, summary=False) + local_experiment.start(ensemble, summary=False) # get and confirm statuses - statuses = exp.get_status(ensemble) - if not all([stat == status.STATUS_COMPLETED for stat in statuses]): - exp.stop(orc) + statuses = 
local_experiment.get_status(ensemble) + if not all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]): assert False # client ensemble failed - - # stop the orchestrator - exp.stop(orc) - - print(exp.summary()) diff --git a/tests/test_controller_errors.py b/tests/test_controller_errors.py index a02c17678..2d623cdd1 100644 --- a/tests/test_controller_errors.py +++ b/tests/test_controller_errors.py @@ -28,15 +28,28 @@ import pytest from smartsim._core.control import Controller, Manifest +from smartsim._core.launcher.step import Step +from smartsim._core.launcher.step.dragonStep import DragonStep from smartsim.database import Orchestrator from smartsim.entity import Model +from smartsim.entity.ensemble import Ensemble from smartsim.error import SmartSimError, SSUnsupportedError from smartsim.error.errors import SSUnsupportedError -from smartsim.settings import RunSettings +from smartsim.settings import RunSettings, SrunSettings # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a +entity_settings = SrunSettings("echo", ["spam", "eggs"]) +model_dup_setting = RunSettings("echo", ["spam_1", "eggs_2"]) +model = Model("model_name", run_settings=entity_settings, params={}, path="") +# Model entity slightly different but with same name +model_2 = Model("model_name", run_settings=model_dup_setting, params={}, path="") +ens = Ensemble("ensemble_name", params={}, run_settings=entity_settings, replicas=2) +# Ensemble entity slightly different but with same name +ens_2 = Ensemble("ensemble_name", params={}, run_settings=entity_settings, replicas=3) +orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") + def test_finished_entity_orc_error(): """Orchestrators are never 'finished', either run forever or stopped by user""" @@ -108,3 +121,84 @@ def test_bad_orc_checkpoint(): cont = Controller(launcher="local") with pytest.raises(FileNotFoundError): cont.reload_saved_db(checkpoint) + + +class MockStep(Step): + 
"""Mock step to implement any abstract methods so that it can be + instanced for test purposes + """ + + def get_launch_cmd(self): + return ["echo", "spam"] + + +@pytest.mark.parametrize( + "entity", + [ + pytest.param(ens, id="Ensemble_running"), + pytest.param(model, id="Model_running"), + pytest.param(orc, id="Orch_running"), + ], +) +def test_duplicate_running_entity(test_dir, wlmutils, entity): + """This test validates that users cannot reuse entity names + that are running in JobManager.jobs or JobManager.db_jobs + """ + step_settings = RunSettings("echo") + step = MockStep("mock-step", test_dir, step_settings) + test_launcher = wlmutils.get_test_launcher() + controller = Controller(test_launcher) + controller._jobs.add_job(entity.name, job_id="1234", entity=entity) + with pytest.raises(SSUnsupportedError) as ex: + controller._launch_step(step, entity=entity) + assert ex.value.args[0] == "SmartSim entities cannot have duplicate names." + + +@pytest.mark.parametrize( + "entity", + [pytest.param(ens, id="Ensemble_running"), pytest.param(model, id="Model_running")], +) +def test_restarting_entity(test_dir, wlmutils, entity): + """Validate restarting a completed Model/Ensemble job""" + step_settings = RunSettings("echo") + test_launcher = wlmutils.get_test_launcher() + step = MockStep("mock-step", test_dir, step_settings) + step.meta["status_dir"] = test_dir + entity.path = test_dir + controller = Controller(test_launcher) + controller._jobs.add_job(entity.name, job_id="1234", entity=entity) + controller._jobs.move_to_completed(controller._jobs.jobs.get(entity.name)) + controller._launch_step(step, entity=entity) + + +def test_restarting_orch(test_dir, wlmutils): + """Validate restarting a completed Orchestrator job""" + step_settings = RunSettings("echo") + test_launcher = wlmutils.get_test_launcher() + step = MockStep("mock-step", test_dir, step_settings) + step.meta["status_dir"] = test_dir + orc.path = test_dir + controller = Controller(test_launcher) + 
controller._jobs.add_job(orc.name, job_id="1234", entity=orc) + controller._jobs.move_to_completed(controller._jobs.db_jobs.get(orc.name)) + controller._launch_step(step, entity=orc) + + +@pytest.mark.parametrize( + "entity,entity_2", + [ + pytest.param(ens, ens_2, id="Ensemble_running"), + pytest.param(model, model_2, id="Model_running"), + ], +) +def test_starting_entity(test_dir, wlmutils, entity, entity_2): + """Test launching a job of Model/Ensemble with same name in completed""" + step_settings = RunSettings("echo") + step = MockStep("mock-step", test_dir, step_settings) + test_launcher = wlmutils.get_test_launcher() + controller = Controller(test_launcher) + controller._jobs.add_job(entity.name, job_id="1234", entity=entity) + controller._jobs.move_to_completed(controller._jobs.jobs.get(entity.name)) + with pytest.raises(SSUnsupportedError) as ex: + controller._launch_step(step, entity=entity_2) + assert ex.value.args[0] == "SmartSim entities cannot have duplicate names." diff --git a/tests/test_dbnode.py b/tests/test_dbnode.py index 227572ac9..04845344c 100644 --- a/tests/test_dbnode.py +++ b/tests/test_dbnode.py @@ -49,22 +49,12 @@ def test_parse_db_host_error(): orc.entities[0].host -def test_hosts(test_dir, wlmutils): - exp_name = "test_hosts" - exp = Experiment(exp_name, exp_path=test_dir) - - orc = Orchestrator(port=wlmutils.get_test_port(), interface="lo", launcher="local") - orc.set_path(test_dir) - exp.start(orc) - - hosts = [] - try: - hosts = orc.hosts - assert len(hosts) == orc.db_nodes == 1 - finally: - # stop the database even if there is an error raised - exp.stop(orc) - orc.remove_stale_files() +def test_hosts(local_experiment, prepare_db, local_db): + db = prepare_db(local_db).orchestrator + orc = local_experiment.reconnect_orchestrator(db.checkpoint_file) + + hosts = orc.hosts + assert len(hosts) == orc.db_nodes == 1 def _random_shard_info(): diff --git a/tests/test_dragon_backend.py b/tests/test_dragon_backend.py new file mode 100644 index 
000000000..a510f660a --- /dev/null +++ b/tests/test_dragon_backend.py @@ -0,0 +1,453 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import collections +import sys +import textwrap +import time +from unittest.mock import MagicMock + +import pytest + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_a + +try: + import dragon +except ImportError: + pass +else: + pytest.skip( + reason="Using dragon as launcher, not running Dragon unit tests", + allow_module_level=True, + ) + +from smartsim._core.config import CONFIG +from smartsim._core.schemas.dragonRequests import * +from smartsim._core.schemas.dragonResponses import * +from smartsim._core.utils.helpers import create_short_id_str +from smartsim.status import TERMINAL_STATUSES, SmartSimStatus + +if t.TYPE_CHECKING: + from smartsim._core.launcher.dragon.dragonBackend import ( + DragonBackend, + ProcessGroupInfo, + ) + + +class NodeMock(MagicMock): + @property + def hostname(self) -> str: + return create_short_id_str() + + +class GroupStateMock(MagicMock): + def Running(self) -> MagicMock: + running = MagicMock(**{"__str__.return_value": "Running"}) + return running + + def Error(self) -> MagicMock: + error = MagicMock(**{"__str__.return_value": "Error"}) + return error + + +class ProcessGroupMock(MagicMock): + puids = [121, 122] + + +def get_mock_backend(monkeypatch: pytest.MonkeyPatch) -> "DragonBackend": + + process_mock = MagicMock(returncode=0) + process_group_mock = MagicMock(**{"Process.return_value": ProcessGroupMock()}) + process_module_mock = MagicMock() + process_module_mock.Process = process_mock + node_mock = NodeMock() + system_mock = MagicMock(nodes=["node1", "node2", "node3"]) + monkeypatch.setitem( + sys.modules, + "dragon", + MagicMock( + **{ + "native.machine.Node.return_value": node_mock, + "native.machine.System.return_value": system_mock, + "native.group_state": GroupStateMock(), + "native.process_group.ProcessGroup.return_value": ProcessGroupMock(), + } + ), + ) + monkeypatch.setitem( + sys.modules, + "dragon.infrastructure.connection", + MagicMock(), + ) + monkeypatch.setitem( + 
sys.modules, + "dragon.infrastructure.policy", + MagicMock(**{"Policy.return_value": MagicMock()}), + ) + monkeypatch.setitem(sys.modules, "dragon.native.process", process_module_mock) + monkeypatch.setitem(sys.modules, "dragon.native.process_group", process_group_mock) + + monkeypatch.setitem(sys.modules, "dragon.native.group_state", GroupStateMock()) + monkeypatch.setitem( + sys.modules, + "dragon.native.machine", + MagicMock( + **{"System.return_value": system_mock, "Node.return_value": node_mock} + ), + ) + from smartsim._core.launcher.dragon.dragonBackend import DragonBackend + + dragon_backend = DragonBackend(pid=99999) + monkeypatch.setattr( + dragon_backend, "_free_hosts", collections.deque(dragon_backend._hosts) + ) + + return dragon_backend + + +def set_mock_group_infos( + monkeypatch: pytest.MonkeyPatch, dragon_backend: "DragonBackend" +) -> t.Dict[str, "ProcessGroupInfo"]: + dragon_mock = MagicMock() + process_mock = MagicMock() + process_mock.configure_mock(**{"returncode": 0}) + dragon_mock.configure_mock(**{"native.process.Process.return_value": process_mock}) + monkeypatch.setitem(sys.modules, "dragon", dragon_mock) + from smartsim._core.launcher.dragon.dragonBackend import ProcessGroupInfo + + running_group = MagicMock(status="Running") + error_group = MagicMock(status="Error") + hosts = dragon_backend._hosts + + group_infos = { + "abc123-1": ProcessGroupInfo( + SmartSimStatus.STATUS_RUNNING, + running_group, + [123], + [], + hosts[0:1], + MagicMock(), + ), + "del999-2": ProcessGroupInfo( + SmartSimStatus.STATUS_CANCELLED, + error_group, + [124], + [-9], + hosts[1:2], + MagicMock(), + ), + "c101vz-3": ProcessGroupInfo( + SmartSimStatus.STATUS_COMPLETED, + MagicMock(), + [125, 126], + [0], + hosts[1:3], + MagicMock(), + ), + "0ghjk1-4": ProcessGroupInfo( + SmartSimStatus.STATUS_FAILED, + error_group, + [127], + [-1], + hosts[2:3], + MagicMock(), + ), + "ljace0-5": ProcessGroupInfo( + SmartSimStatus.STATUS_NEVER_STARTED, None, [], [], [], None + ), + 
} + + monkeypatch.setattr(dragon_backend, "_group_infos", group_infos) + monkeypatch.setattr(dragon_backend, "_free_hosts", collections.deque(hosts[1:3])) + monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: "abc123-1"}) + monkeypatch.setattr(dragon_backend, "_running_steps", ["abc123-1"]) + + return group_infos + + +def test_handshake_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + + handshake_req = DragonHandshakeRequest() + handshake_resp = dragon_backend.process_request(handshake_req) + + assert isinstance(handshake_resp, DragonHandshakeResponse) + assert handshake_resp.dragon_pid == 99999 + + +def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + ) + + run_resp = dragon_backend.process_request(run_req) + assert isinstance(run_resp, DragonRunResponse) + + step_id = run_resp.step_id + assert dragon_backend._queued_steps[step_id] == run_req + + mock_process_group = MagicMock(puids=[123, 124]) + + dragon_backend._group_infos[step_id].process_group = mock_process_group + dragon_backend._group_infos[step_id].puids = [123, 124] + dragon_backend._start_steps() + + assert dragon_backend._running_steps == [step_id] + assert len(dragon_backend._queued_steps) == 0 + assert len(dragon_backend._free_hosts) == 1 + assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + + monkeypatch.setattr( + dragon_backend._group_infos[step_id].process_group, "status", "Running" + ) + + dragon_backend._update() + + assert dragon_backend._running_steps == [step_id] + assert len(dragon_backend._queued_steps) == 0 + assert len(dragon_backend._free_hosts) == 1 + assert 
dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id + assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id + + dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED + + dragon_backend._update() + assert not dragon_backend._running_steps + + +def test_deny_run_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + + dragon_backend._shutdown_requested = True + + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=2, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + ) + + run_resp = dragon_backend.process_request(run_req) + assert isinstance(run_resp, DragonRunResponse) + assert run_resp.error_message == "Cannot satisfy request, server is shutting down." + step_id = run_resp.step_id + + assert dragon_backend.group_infos[step_id].status == SmartSimStatus.STATUS_FAILED + + +def test_udpate_status_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + + group_infos = set_mock_group_infos(monkeypatch, dragon_backend) + + status_update_request = DragonUpdateStatusRequest(step_ids=list(group_infos.keys())) + + status_update_response = dragon_backend.process_request(status_update_request) + + assert isinstance(status_update_response, DragonUpdateStatusResponse) + assert status_update_response.statuses == { + step_id: (grp_info.status, grp_info.return_codes) + for step_id, grp_info in group_infos.items() + } + + +def test_stop_request(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + group_infos = set_mock_group_infos(monkeypatch, dragon_backend) + + running_steps = [ + step_id + for step_id, group in group_infos.items() + if group.status == SmartSimStatus.STATUS_RUNNING + ] + + step_id_to_stop = running_steps[0] + + stop_request = DragonStopRequest(step_id=step_id_to_stop) + + stop_response = 
dragon_backend.process_request(stop_request) + + assert isinstance(stop_response, DragonStopResponse) + assert len(dragon_backend._stop_requests) == 1 + + dragon_backend._update() + + assert len(dragon_backend._stop_requests) == 0 + assert ( + dragon_backend._group_infos[step_id_to_stop].status + == SmartSimStatus.STATUS_CANCELLED + ) + + assert len(dragon_backend._allocated_hosts) == 0 + assert len(dragon_backend._free_hosts) == 3 + + +@pytest.mark.parametrize( + "immediate, kill_jobs, frontend_shutdown", + [ + [True, True, True], + [True, True, False], + [True, False, True], + [True, False, False], + [False, True, True], + [False, True, False], + ], +) +def test_shutdown_request( + monkeypatch: pytest.MonkeyPatch, + immediate: bool, + kill_jobs: bool, + frontend_shutdown: bool, +) -> None: + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "0") + dragon_backend = get_mock_backend(monkeypatch) + monkeypatch.setattr(dragon_backend, "_cooldown_period", 1) + set_mock_group_infos(monkeypatch, dragon_backend) + + if kill_jobs: + for group_info in dragon_backend.group_infos.values(): + if not group_info.status in TERMINAL_STATUSES: + group_info.status = SmartSimStatus.STATUS_FAILED + group_info.return_codes = [-9] + group_info.process_group = None + group_info.redir_workers = None + dragon_backend._running_steps.clear() + + shutdown_req = DragonShutdownRequest( + immediate=immediate, frontend_shutdown=frontend_shutdown + ) + shutdown_resp = dragon_backend.process_request(shutdown_req) + + if not kill_jobs: + stop_request_ids = ( + stop_request.step_id for stop_request in dragon_backend._stop_requests + ) + for step_id, group_info in dragon_backend.group_infos.items(): + if not group_info.status in TERMINAL_STATUSES: + assert step_id in stop_request_ids + + assert isinstance(shutdown_resp, DragonShutdownResponse) + assert dragon_backend._shutdown_requested + assert dragon_backend.frontend_shutdown == frontend_shutdown + + dragon_backend._update() + assert not 
dragon_backend.should_shutdown + time.sleep(dragon_backend._cooldown_period + 0.1) + dragon_backend._update() + + assert dragon_backend._can_shutdown == kill_jobs + assert dragon_backend.should_shutdown == kill_jobs + assert dragon_backend._has_cooled_down == kill_jobs + + +@pytest.mark.parametrize("telemetry_flag", ["0", "1"]) +def test_cooldown_is_set(monkeypatch: pytest.MonkeyPatch, telemetry_flag: str) -> None: + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", telemetry_flag) + dragon_backend = get_mock_backend(monkeypatch) + + expected_cooldown = ( + 2 * CONFIG.telemetry_frequency + 5 if int(telemetry_flag) > 0 else 5 + ) + + if telemetry_flag: + assert dragon_backend.cooldown_period == expected_cooldown + else: + assert dragon_backend.cooldown_period == expected_cooldown + + +def test_heartbeat_and_time(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + first_heartbeat = dragon_backend.last_heartbeat + assert dragon_backend.current_time > first_heartbeat + dragon_backend._heartbeat() + assert dragon_backend.last_heartbeat > first_heartbeat + + +@pytest.mark.parametrize("num_nodes", [1, 3, 100]) +def test_can_honor(monkeypatch: pytest.MonkeyPatch, num_nodes: int) -> None: + dragon_backend = get_mock_backend(monkeypatch) + run_req = DragonRunRequest( + exe="sleep", + exe_args=["5"], + path="/a/fake/path", + nodes=num_nodes, + tasks=1, + tasks_per_node=1, + env={}, + current_env={}, + pmi_enabled=False, + ) + + assert dragon_backend._can_honor(run_req)[0] == ( + num_nodes <= len(dragon_backend._hosts) + ) + + +def test_get_id(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + step_id = next(dragon_backend._step_ids) + + assert step_id.endswith("0") + assert step_id != next(dragon_backend._step_ids) + + +def test_view(monkeypatch: pytest.MonkeyPatch) -> None: + dragon_backend = get_mock_backend(monkeypatch) + set_mock_group_infos(monkeypatch, dragon_backend) + hosts = 
dragon_backend.hosts + + expected_message = textwrap.dedent(f"""\ + Dragon server backend update + | Host | Status | + |---------|----------| + | {hosts[0]} | Busy | + | {hosts[1]} | Free | + | {hosts[2]} | Free | + | Step | Status | Hosts | Return codes | Num procs | + |----------|--------------|-----------------|----------------|-------------| + | abc123-1 | Running | {hosts[0]} | | 1 | + | del999-2 | Cancelled | {hosts[1]} | -9 | 1 | + | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 | + | 0ghjk1-4 | Failed | {hosts[2]} | -1 | 1 | + | ljace0-5 | NeverStarted | | | 0 |""") + + assert dragon_backend.status_message == expected_message diff --git a/tests/test_dragon_installer.py b/tests/test_dragon_installer.py new file mode 100644 index 000000000..b23a1a7ef --- /dev/null +++ b/tests/test_dragon_installer.py @@ -0,0 +1,471 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pathlib +import sys +import tarfile +import typing as t +from collections import namedtuple + +import pytest +from github.GitReleaseAsset import GitReleaseAsset +from github.Requester import Requester + +import smartsim +import smartsim._core.utils.helpers as helpers +from smartsim._core._cli.scripts.dragon_install import ( + cleanup, + create_dotenv, + install_dragon, + install_package, + retrieve_asset, + retrieve_asset_info, +) +from smartsim.error.errors import SmartSimCLIActionCancelled + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +mock_archive_name = "dragon-0.8-py3.9.4.1-CRAYEX-ac132fe95.tar.gz" +_git_attr = namedtuple("_git_attr", "value") + + +@pytest.fixture +def test_archive(test_dir: str, archive_path: pathlib.Path) -> pathlib.Path: + """Fixture for returning a simple tarfile to test on""" + num_files = 10 + with tarfile.TarFile.open(archive_path, mode="w:gz") as tar: + mock_whl = pathlib.Path(test_dir) / "mock.whl" + mock_whl.touch() + + for i in range(num_files): + content = pathlib.Path(test_dir) / f"{i:04}.txt" + content.write_text(f"i am file {i}\n") + tar.add(content) + return archive_path + + +@pytest.fixture +def archive_path(test_dir: str) -> pathlib.Path: + """Fixture for returning a dir path based on the default mock asset archive name""" + path = pathlib.Path(test_dir) / mock_archive_name + return path + + +@pytest.fixture +def extraction_dir(test_dir: str) -> 
pathlib.Path: + """Fixture for returning a dir path based on the default mock asset archive name""" + path = pathlib.Path(test_dir) / mock_archive_name.replace(".tar.gz", "") + return path + + +@pytest.fixture +def test_assets(monkeypatch: pytest.MonkeyPatch) -> t.Dict[str, GitReleaseAsset]: + requester = Requester( + auth=None, + base_url="https://github.com", + user_agent="mozilla", + per_page=10, + verify=False, + timeout=1, + retry=1, + pool_size=1, + ) + headers = {"mock-header": "mock-value"} + attributes = {"mock-attr": "mock-attr-value"} + completed = True + + assets: t.List[GitReleaseAsset] = [] + mock_archive_name_tpl = "{}-{}.4.1-{}ac132fe95.tar.gz" + + for python_version in ["py3.9", "py3.10", "py3.11"]: + for dragon_version in ["dragon-0.8", "dragon-0.9", "dragon-0.10"]: + for platform in ["", "CRAYEX-"]: + + asset = GitReleaseAsset(requester, headers, attributes, completed) + + archive_name = mock_archive_name_tpl.format( + dragon_version, python_version, platform + ) + + monkeypatch.setattr( + asset, + "_browser_download_url", + _git_attr(value=f"http://foo/{archive_name}"), + ) + monkeypatch.setattr(asset, "_name", _git_attr(value=archive_name)) + assets.append(asset) + + return assets + + +def test_cleanup_no_op(archive_path: pathlib.Path) -> None: + """Ensure that the cleanup method doesn't bomb when called with + missing archive path; simulate a failed download""" + # confirm assets do not exist + assert not archive_path.exists() + + # call cleanup. any exceptions should break test... 
+ cleanup(archive_path) + + +def test_cleanup_archive_exists(test_archive: pathlib.Path) -> None: + """Ensure that the cleanup method removes the archive""" + assert test_archive.exists() + + cleanup(test_archive) + + # verify archive is gone after cleanup + assert not test_archive.exists() + + +def test_retrieve_cached( + test_dir: str, + # archive_path: pathlib.Path, + test_archive: pathlib.Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Verify that a previously retrieved asset archive is re-used""" + with tarfile.TarFile.open(test_archive) as tar: + tar.extractall(test_dir) + + ts1 = test_archive.parent.stat().st_ctime + + requester = Requester( + auth=None, + base_url="https://github.com", + user_agent="mozilla", + per_page=10, + verify=False, + timeout=1, + retry=1, + pool_size=1, + ) + headers = {"mock-header": "mock-value"} + attributes = {"mock-attr": "mock-attr-value"} + completed = True + + asset = GitReleaseAsset(requester, headers, attributes, completed) + + # ensure mocked asset has values that we use... + monkeypatch.setattr(asset, "_browser_download_url", _git_attr(value="http://foo")) + monkeypatch.setattr(asset, "_name", _git_attr(value=mock_archive_name)) + + asset_path = retrieve_asset(test_archive.parent, asset) + ts2 = asset_path.stat().st_ctime + + assert ( + asset_path == test_archive.parent + ) # show that the expected path matches the output path + assert ts1 == ts2 # show that the file wasn't changed... 
+ + +@pytest.mark.parametrize( + "dragon_pin,pyv,is_found,is_crayex", + [ + pytest.param("0.8", "py3.8", False, False, id="0.8,python 3.8"), + pytest.param("0.8", "py3.9", True, False, id="0.8,python 3.9"), + pytest.param("0.8", "py3.10", True, False, id="0.8,python 3.10"), + pytest.param("0.8", "py3.11", True, False, id="0.8,python 3.11"), + pytest.param("0.8", "py3.12", False, False, id="0.8,python 3.12"), + pytest.param("0.8", "py3.8", False, True, id="0.8,python 3.8,CrayEX"), + pytest.param("0.8", "py3.9", True, True, id="0.8,python 3.9,CrayEX"), + pytest.param("0.8", "py3.10", True, True, id="0.8,python 3.10,CrayEX"), + pytest.param("0.8", "py3.11", True, True, id="0.8,python 3.11,CrayEX"), + pytest.param("0.8", "py3.12", False, True, id="0.8,python 3.12,CrayEX"), + pytest.param("0.9", "py3.8", False, False, id="0.9,python 3.8"), + pytest.param("0.9", "py3.9", True, False, id="0.9,python 3.9"), + pytest.param("0.9", "py3.10", True, False, id="0.9,python 3.10"), + pytest.param("0.9", "py3.11", True, False, id="0.9,python 3.11"), + pytest.param("0.9", "py3.12", False, False, id="0.9,python 3.12"), + pytest.param("0.9", "py3.8", False, True, id="0.9,python 3.8,CrayEX"), + pytest.param("0.9", "py3.9", True, True, id="0.9,python 3.9,CrayEX"), + pytest.param("0.9", "py3.10", True, True, id="0.9,python 3.10,CrayEX"), + pytest.param("0.9", "py3.11", True, True, id="0.9,python 3.11,CrayEX"), + pytest.param("0.9", "py3.12", False, True, id="0.9,python 3.12,CrayEX"), + # add a couple variants for a dragon version that isn't in the asset list + pytest.param("0.7", "py3.9", False, False, id="0.7,python 3.9"), + pytest.param("0.7", "py3.9", False, True, id="0.7,python 3.9,CrayEX"), + ], +) +def test_retrieve_asset_info( + test_assets: t.Collection[GitReleaseAsset], + monkeypatch: pytest.MonkeyPatch, + dragon_pin: str, + pyv: str, + is_found: bool, + is_crayex: bool, +) -> None: + """Verify that an information is retrieved correctly based on the python + version, platform 
(e.g. CrayEX, !CrayEx), and target dragon pin""" + + with monkeypatch.context() as ctx: + ctx.setattr( + smartsim._core._cli.scripts.dragon_install, + "python_version", + lambda: pyv, + ) + ctx.setattr( + smartsim._core._cli.scripts.dragon_install, + "is_crayex_platform", + lambda: is_crayex, + ) + ctx.setattr( + smartsim._core._cli.scripts.dragon_install, + "dragon_pin", + lambda: dragon_pin, + ) + # avoid hitting github API + ctx.setattr( + smartsim._core._cli.scripts.dragon_install, + "_get_release_assets", + lambda: test_assets, + ) + + if is_found: + chosen_asset = retrieve_asset_info() + + assert chosen_asset + assert pyv in chosen_asset.name + assert dragon_pin in chosen_asset.name + + if is_crayex: + assert "crayex" in chosen_asset.name.lower() + else: + assert "crayex" not in chosen_asset.name.lower() + else: + with pytest.raises(SmartSimCLIActionCancelled): + retrieve_asset_info() + + +def test_check_for_utility_missing(test_dir: str) -> None: + """Ensure that looking for a missing utility doesn't raise an exception""" + ld_config = pathlib.Path(test_dir) / "ldconfig" + + utility = helpers.check_for_utility(ld_config) + + assert not utility + + +def test_check_for_utility_exists() -> None: + """Ensure that looking for an existing utility returns a non-empty path""" + utility = helpers.check_for_utility("ls") + assert utility + + +def test_is_crayex_missing_ldconfig(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure the cray ex platform check doesn't fail when ldconfig isn't + available for use""" + + def mock_util_check(util: str) -> str: + if util == "ldconfig": + return "" + return "w00t!" 
+ + with monkeypatch.context() as ctx: + # mock utility existence + ctx.setattr( + helpers, + "check_for_utility", + mock_util_check, + ) + + is_cray = helpers.is_crayex_platform() + assert not is_cray + + +def test_is_crayex_missing_fi_info(monkeypatch: pytest.MonkeyPatch) -> None: + """Ensure the cray ex platform check doesn't fail when fi_info isn't + available for use""" + + def mock_util_check(util: str) -> str: + if util == "fi_info": + return "" + return "w00t!" + + with monkeypatch.context() as ctx: + # mock utility existence + ctx.setattr( + helpers, + "check_for_utility", + mock_util_check, + ) + + is_cray = helpers.is_crayex_platform() + assert not is_cray + + +@pytest.mark.parametrize( + "is_cray,output,return_code", + [ + pytest.param(True, "cray pmi2.so\ncxi\ncray pmi.so\npni.so", 0, id="CrayEX"), + pytest.param(False, "cray pmi2.so\ncxi\npni.so", 0, id="No PMI"), + pytest.param(False, "cxi\ncray pmi.so\npni.so", 0, id="No PMI 2"), + pytest.param(False, "cray pmi2.so\ncray pmi.so\npni.so", 0, id="No CXI"), + pytest.param(False, "pmi.so\ncray pmi2.so\ncxi", 0, id="Non Cray PMI"), + pytest.param(False, "cray pmi.so\npmi2.so\ncxi", 0, id="Non Cray PMI2"), + ], +) +def test_is_cray_ex( + monkeypatch: pytest.MonkeyPatch, is_cray: bool, output: str, return_code: int +) -> None: + """Test that cray ex platform check result is returned as expected""" + + def mock_util_check(util: str) -> bool: + # mock that we have the necessary tools + return True + + with monkeypatch.context() as ctx: + # make it look like the utilies always exist + ctx.setattr( + helpers, + "check_for_utility", + mock_util_check, + ) + # mock + ctx.setattr( + helpers, + "execute_platform_cmd", + lambda x: (output, return_code), + ) + + platform_result = helpers.is_crayex_platform() + assert is_cray == platform_result + + +def test_install_package_no_wheel(extraction_dir: pathlib.Path): + """Verify that a missing wheel does not blow up and has a failure retcode""" + exp_path = 
extraction_dir + + result = install_package(exp_path) + assert result != 0 + + +def test_install_macos(monkeypatch: pytest.MonkeyPatch, extraction_dir: pathlib.Path): + """Verify that installation exits cleanly if installing on unsupported platform""" + with monkeypatch.context() as ctx: + ctx.setattr(sys, "platform", "darwin") + + result = install_dragon(extraction_dir) + assert result == 1 + + +def test_create_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Verify that attempting to create a .env file without any existing + file or container directory works""" + test_path = pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + exp_env_path = pathlib.Path(test_dir) / "dragon" / ".env" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + + # ensure no .env exists before trying to create it. + assert not exp_env_path.exists() + + create_dotenv(mock_dragon_root) + + # ensure the .env is created as side-effect of create_dotenv + assert exp_env_path.exists() + + +def test_create_dotenv_existing_dir(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Verify that attempting to create a .env file in an existing + target dir works""" + test_path = pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + exp_env_path = pathlib.Path(test_dir) / "dragon" / ".env" + + # set up the parent directory that will contain the .env + exp_env_path.parent.mkdir(parents=True) + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + + # ensure no .env exists before trying to create it. 
+ assert not exp_env_path.exists() + + create_dotenv(mock_dragon_root) + + # ensure the .env is created as side-effect of create_dotenv + assert exp_env_path.exists() + + +def test_create_dotenv_existing_dotenv(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Verify that attempting to create a .env file when one exists works as expected""" + test_path = pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + exp_env_path = pathlib.Path(test_dir) / "dragon" / ".env" + + # set up the parent directory that will contain the .env + exp_env_path.parent.mkdir(parents=True) + + # write something into file to verify it is overwritten + var_name = "DRAGON_BASE_DIR" + exp_env_path.write_text(f"{var_name}=/foo/bar") + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + + # ensure .env exists so we can update it + assert exp_env_path.exists() + + create_dotenv(mock_dragon_root) + + # ensure the .env is created as side-effect of create_dotenv + assert exp_env_path.exists() + + # ensure file was overwritten and env vars are not duplicated + dotenv_content = exp_env_path.read_text(encoding="utf-8") + split_content = dotenv_content.split(var_name) + + # split to confirm env var only appars once + assert len(split_content) == 2 + + +def test_create_dotenv_format(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Verify that created .env files are correctly formatted""" + test_path = pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + exp_env_path = pathlib.Path(test_dir) / "dragon" / ".env" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + + create_dotenv(mock_dragon_root) + + # ensure the .env is created as side-effect of create_dotenv + content = exp_env_path.read_text(encoding="utf-8") + + # ensure we have values written, but ignore empty lines + lines = [line for line in content.split("\n") if line] + assert 
lines + + # ensure each line is formatted as key=value + for line in lines: + line_split = line.split("=") + assert len(line_split) == 2 diff --git a/tests/test_dragon_launcher.py b/tests/test_dragon_launcher.py new file mode 100644 index 000000000..ee0fcb14b --- /dev/null +++ b/tests/test_dragon_launcher.py @@ -0,0 +1,523 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import logging +import multiprocessing as mp +import os +import pathlib +import sys +import time +import typing as t + +import pytest +import zmq + +import smartsim._core.config +from smartsim._core._cli.scripts.dragon_install import create_dotenv +from smartsim._core.config.config import get_config +from smartsim._core.launcher.dragon.dragonLauncher import DragonConnector +from smartsim._core.launcher.dragon.dragonSockets import ( + get_authenticator, + get_secure_socket, +) +from smartsim._core.schemas.dragonRequests import DragonBootstrapRequest +from smartsim._core.schemas.dragonResponses import DragonHandshakeResponse +from smartsim._core.utils.network import IFConfig, find_free_port +from smartsim._core.utils.security import KeyManager + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +is_mac = sys.platform == "darwin" + + +class MockPopen: + calls = [] + + def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: + self.args = args + self.kwargs = kwargs + + MockPopen.calls.append((args, kwargs)) + + @property + def pid(self) -> int: + return 99999 + + @property + def returncode(self) -> int: + return 0 + + @property + def stdout(self): + return None + + @property + def stderr(self): + return None + + def wait(self, timeout: float) -> None: + time.sleep(timeout) + + +class MockSocket: + def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: + self._bind_address = "" + + def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any: + return self + + def bind(self, addr: str) -> None: + self._bind_address = addr + + def recv_string(self, flags: int) -> str: + dbr = DragonBootstrapRequest(address=self._bind_address) + return f"bootstrap|{dbr.json()}" + + def close(self) -> None: ... + + def send(self, *args, **kwargs) -> None: ... + + def send_json(self, json: str) -> None: ... + + def send_string(*args, **kwargs) -> None: ... + + def connect(*args, **kwargs) -> None: ... 
+ + @property + def bind_address(self) -> str: + return self._bind_address + + +class MockAuthenticator: + def __init__(self, context: zmq.Context, log: t.Any) -> None: + self.num_starts: int = 0 + self.num_stops: int = 0 + self.num_configure_curves: int = 0 + self.context = context + self.thread = None + + def configure_curve(self, *args, **kwargs) -> None: + self.cfg_args = args + self.cfg_kwargs = kwargs + self.num_configure_curves += 1 + + def start(self) -> None: + self.num_starts += 1 + + def stop(self) -> None: + self.num_stops += 1 + + def is_alive(self) -> bool: + return self.num_starts > 0 and self.num_stops == 0 + + +def mock_dragon_env(test_dir, *args, **kwargs): + """Create a mock dragon environment that can talk to the launcher through ZMQ""" + logger = logging.getLogger(__name__) + config = get_config() + logging.basicConfig(level=logging.DEBUG) + try: + addr = "127.0.0.1" + callback_port = kwargs["port"] + head_port = find_free_port(start=callback_port + 1) + context = zmq.Context.instance() + context.setsockopt(zmq.SNDTIMEO, config.dragon_server_timeout) + context.setsockopt(zmq.RCVTIMEO, config.dragon_server_timeout) + authenticator = get_authenticator(context, -1) + + callback_socket = get_secure_socket(context, zmq.REQ, False) + dragon_head_socket = get_secure_socket(context, zmq.REP, True) + + full_addr = f"{addr}:{callback_port}" + callback_socket.connect(f"tcp://{full_addr}") + + full_head_addr = f"tcp://{addr}:{head_port}" + dragon_head_socket.bind(full_head_addr) + + req = DragonBootstrapRequest(address=full_head_addr) + + msg_sent = False + while not msg_sent: + logger.info("Sending bootstrap request to callback socket") + callback_socket.send_string("bootstrap|" + req.json()) + # hold until bootstrap response is received + logger.info("Receiving bootstrap response from callback socket") + _ = callback_socket.recv() + msg_sent = True + + hand_shaken = False + while not hand_shaken: + # other side should set up a socket and push me a 
`HandshakeRequest` + logger.info("Receiving handshake request through dragon head socket") + _ = dragon_head_socket.recv() + # acknowledge handshake success w/DragonHandshakeResponse + logger.info("Sending handshake response through dragon head socket") + handshake_ack = DragonHandshakeResponse(dragon_pid=os.getpid()) + dragon_head_socket.send_string(f"handshake|{handshake_ack.json()}") + + hand_shaken = True + + shutting_down = False + while not shutting_down: + logger.info("Waiting for shutdown request through dragon head socket") + # any incoming request at this point in test is my shutdown... + try: + message = dragon_head_socket.recv() + logger.info(f"Received final message {message}") + finally: + shutting_down = True + try: + logger.info("Handshake complete. Shutting down mock dragon env.") + authenticator.stop() + finally: + logger.info("Dragon mock env exiting...") + + except Exception as ex: + logger.info(f"exception occurred while configuring mock handshaker: {ex}") + raise ex from None + + +def test_dragon_connect_attributes(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Test the connection to a dragon environment dynamically selects an open port + in the range supplied and passes the correct environment""" + test_path = pathlib.Path(test_dir) + + with monkeypatch.context() as ctx: + # make sure we don't touch "real keys" during a test + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + mock_socket = MockSocket() + + # look at test_dir for dragon config + ctx.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) + # avoid finding real interface + ctx.setattr( + "smartsim._core.launcher.dragon.dragonConnector.get_best_interface_and_address", + lambda: IFConfig(interface="faux_interface", address="127.0.0.1"), + ) + # we need to set the socket value or is_connected returns False + ctx.setattr( + "smartsim._core.launcher.dragon.dragonLauncher.DragonConnector._handshake", + lambda self, address: ..., + ) + # avoid starting a real authenticator thread + 
ctx.setattr("zmq.auth.thread.ThreadAuthenticator", MockAuthenticator) + # avoid starting a real zmq socket + ctx.setattr("zmq.Context.socket", mock_socket) + # avoid starting a real process for dragon entrypoint + ctx.setattr( + "subprocess.Popen", lambda *args, **kwargs: MockPopen(*args, **kwargs) + ) + + # avoid reading "real" config in test... + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + dotenv_path = smartsim._core.config.CONFIG.dragon_dotenv + dotenv_path.parent.mkdir(parents=True) + dotenv_path.write_text("FOO=BAR\nBAZ=BOO") + + dragon_connector = DragonConnector() + dragon_connector.connect_to_dragon() + + chosen_port = int(mock_socket.bind_address.split(":")[-1]) + assert chosen_port >= 5995 + + # grab the kwargs env=xxx from the mocked popen to check what was passed + env = MockPopen.calls[0][1].get("env", None) + + # confirm the environment values were passed from .env file to dragon process + assert "PYTHONUNBUFFERED" in env + assert "FOO" in env + assert "BAZ" in env + + dragon_connector._authenticator.stop() + + +@pytest.mark.parametrize( + "socket_type, is_server", + [ + pytest.param(zmq.REQ, True, id="as-server"), + pytest.param(zmq.REP, False, id="as-client"), + ], +) +def test_secure_socket_authenticator_setup( + test_dir: str, monkeypatch: pytest.MonkeyPatch, socket_type: int, is_server: bool +): + """Ensure the authenticator created by the secure socket factory method + is fully configured and started when returned to a client""" + + with monkeypatch.context() as ctx: + # look at test dir for dragon config + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + # avoid starting a real authenticator thread + ctx.setattr("zmq.auth.thread.ThreadAuthenticator", MockAuthenticator) + + authenticator = get_authenticator(zmq.Context.instance()) + + km = KeyManager(get_config(), as_server=is_server) + + assert isinstance(authenticator, MockAuthenticator) + + # ensure authenticator was configured + assert authenticator.num_configure_curves > 
0 + # ensure authenticator was started + assert authenticator.num_starts > 0 + assert authenticator.context == zmq.Context.instance() + # ensure authenticator will accept any secured connection + assert authenticator.cfg_kwargs.get("domain", "") == "*" + # ensure authenticator is using the expected set of keys + assert authenticator.cfg_kwargs.get("location", "") == km.client_keys_dir + + authenticator.stop() + + +@pytest.mark.parametrize( + "as_server", + [ + pytest.param(True, id="server-socket"), + pytest.param(False, id="client-socket"), + ], +) +def test_secure_socket_setup( + test_dir: str, monkeypatch: pytest.MonkeyPatch, as_server: bool +): + """Ensure the authenticator created by the secure socket factory method + is fully configured and started when returned to a client""" + + with monkeypatch.context() as ctx: + # look at test dir for dragon config + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + # avoid starting a real authenticator thread + ctx.setattr("zmq.auth.thread.ThreadAuthenticator", MockAuthenticator) + + context = zmq.Context.instance() + + socket = get_secure_socket(context, zmq.REP, as_server) + + # verify the socket is correctly configured to use curve authentication + assert bool(socket.CURVE_SERVER) == as_server + assert not socket.closed + + socket.close() + + +def test_secure_socket(test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Ensure the authenticator created by the secure socket factory method + is fully configured and started when returned to a client""" + logger = logging.getLogger(__name__) + logging.basicConfig(level=logging.DEBUG) + with monkeypatch.context() as ctx: + # make sure we don't touch "real keys" during a test + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + context = zmq.Context.instance() + authenticator = get_authenticator(context) + server = get_secure_socket(context, zmq.REP, True) + + ip, port = "127.0.0.1", find_free_port(start=9999) + + try: + server.bind(f"tcp://*:{port}") + + client = 
get_secure_socket(context, zmq.REQ, False) + + client.connect(f"tcp://{ip}:{port}") + + to_send = "you get a foo! you get a foo! everybody gets a foo!" + client.send_string(to_send, flags=zmq.NOBLOCK) + + received_msg = server.recv_string() + assert received_msg == to_send + logger.debug(f"server received: {received_msg}") + finally: + if authenticator: + authenticator.stop() + if client: + client.close() + if server: + server.close() + + +@pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") +def test_dragon_launcher_handshake(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Test that a real handshake between a launcher & dragon environment + completes successfully using secure sockets""" + addr = "127.0.0.1" + bootstrap_port = find_free_port(start=5995) + + with monkeypatch.context() as ctx: + # make sure we don't touch "real keys" during a test + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + # look at test dir for dragon config + ctx.setenv("SMARTSIM_DRAGON_SERVER_PATH", test_dir) + # avoid finding real interface since we may not be on a super + ctx.setattr( + "smartsim._core.launcher.dragon.dragonConnector.get_best_interface_and_address", + lambda: IFConfig("faux_interface", addr), + ) + + ctx.setattr( + "smartsim._core.launcher.dragon.dragonConnector._dragon_cleanup", + lambda server_socket, server_process_pid, server_authenticator: server_authenticator.stop(), + ) + + # start up a faux dragon env that knows how to do the handshake process + # but uses secure sockets for all communication. 
+ mock_dragon = mp.Process( + target=mock_dragon_env, + daemon=True, + kwargs={"port": bootstrap_port, "test_dir": test_dir}, + ) + + def fn(*args, **kwargs): + mock_dragon.start() + return mock_dragon + + ctx.setattr("subprocess.Popen", fn) + + connector = DragonConnector() + + try: + # connect executes the complete handshake and raises an exception if comms fails + connector.connect_to_dragon() + finally: + connector.cleanup() + + +def test_load_env_no_file(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Ensure an empty dragon .env file doesn't break the launcher""" + test_path = pathlib.Path(test_dir) + # mock_dragon_root = pathlib.Path(test_dir) / "dragon" + # exp_env_path = pathlib.Path(test_dir) / "dragon" / ".env" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + + dragon_conf = smartsim._core.config.CONFIG.dragon_dotenv + # verify config doesn't exist + assert not dragon_conf.exists() + + connector = DragonConnector() + + loaded_env = connector.load_persisted_env() + assert not loaded_env + + +def test_load_env_env_file_created(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Ensure a populated dragon .env file is loaded correctly by the launcher""" + test_path = pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + create_dotenv(mock_dragon_root) + dragon_conf = smartsim._core.config.CONFIG.dragon_dotenv + + # verify config does exist + assert dragon_conf.exists() + + # load config w/launcher + connector = DragonConnector() + + loaded_env = connector.load_persisted_env() + assert loaded_env + + # confirm .env was parsed as expected by inspecting a key + assert "DRAGON_ROOT_DIR" in loaded_env + + +def test_load_env_cached_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Ensure repeated attempts to use dragon env don't hit file system""" + test_path = 
pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + create_dotenv(mock_dragon_root) + + # load config w/launcher + connector = DragonConnector() + + loaded_env = connector.load_persisted_env() + assert loaded_env + + # ensure attempting to reload would bomb + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", None) + + # attempt to load and if it doesn't blow up, it used the cached copy + + loaded_env = connector.load_persisted_env() + assert loaded_env + + +def test_merge_env(monkeypatch: pytest.MonkeyPatch, test_dir: str): + """Ensure that merging dragon .env file into current env has correct precedences""" + test_path = pathlib.Path(test_dir) + mock_dragon_root = pathlib.Path(test_dir) / "dragon" + + with monkeypatch.context() as ctx: + ctx.setattr(smartsim._core.config.CONFIG, "conf_dir", test_path) + create_dotenv(mock_dragon_root) + + # load config w/launcher + connector = DragonConnector() + loaded_env = {**connector.load_persisted_env()} + assert loaded_env + + curr_base_dir = "/foo" + curr_path = "/foo:/bar" + curr_only = "some-value" + + loaded_path = loaded_env.get("PATH", "") + + # ensure some non-dragon value exists in env; we want + # to see that it is in merged output without empty prepending + non_dragon_key = "NON_DRAGON_KEY" + non_dragon_value = "non_dragon_value" + connector._env_vars[non_dragon_key] = non_dragon_value + + curr_env = { + "DRAGON_BASE_DIR": curr_base_dir, # expect overwrite + "PATH": curr_path, # expect prepend + "ONLY_IN_CURRENT": curr_only, # expect pass-through + } + + merged_env = connector.merge_persisted_env(curr_env) + + # any dragon env vars should be overwritten + assert merged_env["DRAGON_BASE_DIR"] != curr_base_dir + + # any non-dragon collisions should result in prepending + assert merged_env["PATH"] == f"{loaded_path}:{curr_path}" + # ensure we actually see a change + assert 
merged_env["PATH"] != loaded_env["PATH"] + + # any keys that were in curr env should still exist, unchanged + assert merged_env["ONLY_IN_CURRENT"] == curr_only + + # any non-dragon keys that didn't exist avoid unnecessary prepending + assert merged_env[non_dragon_key] == non_dragon_value diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 12b2f1579..4bae09e68 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -24,22 +24,33 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os +import os.path as osp +import pathlib +import shutil +import typing as t import pytest from smartsim import Experiment from smartsim._core.config import CONFIG +from smartsim._core.config.config import Config +from smartsim._core.utils import serialize +from smartsim.database import Orchestrator from smartsim.entity import Model from smartsim.error import SmartSimError from smartsim.error.errors import SSUnsupportedError from smartsim.settings import RunSettings -from smartsim.status import STATUS_NEVER_STARTED +from smartsim.status import SmartSimStatus + +if t.TYPE_CHECKING: + import conftest + # The tests in this file belong to the slow_tests group pytestmark = pytest.mark.slow_tests -def test_model_prefix(test_dir): +def test_model_prefix(test_dir: str) -> None: exp_name = "test_prefix" exp = Experiment(exp_name) @@ -52,24 +63,38 @@ def test_model_prefix(test_dir): assert model._key_prefixing_enabled == True -def test_bad_exp_path(): +def test_model_no_name(): + exp = Experiment("test_model_no_name") + with pytest.raises(AttributeError): + _ = exp.create_model(name=None, run_settings=RunSettings("python")) + + +def test_ensemble_no_name(): + exp = Experiment("test_ensemble_no_name") + with pytest.raises(AttributeError): + _ = exp.create_ensemble( + name=None, run_settings=RunSettings("python"), replicas=2 + ) + + +def test_bad_exp_path() -> 
None: with pytest.raises(NotADirectoryError): exp = Experiment("test", "not-a-directory") -def test_type_exp_path(): +def test_type_exp_path() -> None: with pytest.raises(TypeError): exp = Experiment("test", ["this-is-a-list-dummy"]) -def test_stop_type(): +def test_stop_type() -> None: """Wrong argument type given to stop""" exp = Experiment("name") with pytest.raises(TypeError): exp.stop("model") -def test_finished_new_model(): +def test_finished_new_model() -> None: # finished should fail as this model hasn't been # launched yet. @@ -79,40 +104,40 @@ def test_finished_new_model(): exp.finished(model) -def test_status_typeerror(): +def test_status_typeerror() -> None: exp = Experiment("test") with pytest.raises(TypeError): exp.get_status([]) -def test_status_pre_launch(): +def test_status_pre_launch() -> None: model = Model("name", {}, "./", RunSettings("python")) exp = Experiment("test") - assert exp.get_status(model)[0] == STATUS_NEVER_STARTED + assert exp.get_status(model)[0] == SmartSimStatus.STATUS_NEVER_STARTED -def test_bad_ensemble_init_no_rs(): +def test_bad_ensemble_init_no_rs(test_dir: str) -> None: """params supplied without run settings""" - exp = Experiment("test") + exp = Experiment("test", exp_path=test_dir) with pytest.raises(SmartSimError): exp.create_ensemble("name", {"param1": 1}) -def test_bad_ensemble_init_no_params(): +def test_bad_ensemble_init_no_params(test_dir: str) -> None: """params supplied without run settings""" - exp = Experiment("test") + exp = Experiment("test", exp_path=test_dir) with pytest.raises(SmartSimError): exp.create_ensemble("name", run_settings=RunSettings("python")) -def test_bad_ensemble_init_no_rs_bs(): +def test_bad_ensemble_init_no_rs_bs(test_dir: str) -> None: """ensemble init without run settings or batch settings""" - exp = Experiment("test") + exp = Experiment("test", exp_path=test_dir) with pytest.raises(SmartSimError): exp.create_ensemble("name") -def test_stop_entity(test_dir): +def 
test_stop_entity(test_dir: str) -> None: exp_name = "test_stop_entity" exp = Experiment(exp_name, exp_path=test_dir) m = exp.create_model("model", path=test_dir, run_settings=RunSettings("sleep", "5")) @@ -122,7 +147,7 @@ def test_stop_entity(test_dir): assert exp.finished(m) == True -def test_poll(test_dir): +def test_poll(test_dir: str) -> None: # Ensure that a SmartSimError is not raised exp_name = "test_exp_poll" exp = Experiment(exp_name, exp_path=test_dir) @@ -134,7 +159,7 @@ def test_poll(test_dir): exp.stop(model) -def test_summary(test_dir): +def test_summary(test_dir: str) -> None: exp_name = "test_exp_summary" exp = Experiment(exp_name, exp_path=test_dir) m = exp.create_model( @@ -157,30 +182,189 @@ def test_summary(test_dir): assert 0 == int(row["Returncode"]) -def test_launcher_detection(wlmutils, monkeypatch): +def test_launcher_detection( + wlmutils: "conftest.WLMUtils", monkeypatch: pytest.MonkeyPatch +) -> None: if wlmutils.get_test_launcher() == "pals": pytest.skip(reason="Launcher detection cannot currently detect pbs vs pals") if wlmutils.get_test_launcher() == "local": monkeypatch.setenv("PATH", "") # Remove all WLMs from PATH + if wlmutils.get_test_launcher() == "dragon": + pytest.skip(reason="Launcher detection cannot currently detect dragon") exp = Experiment("test-launcher-detection", launcher="auto") assert exp._launcher == wlmutils.get_test_launcher() -def test_enable_disable_telemtery(monkeypatch): - # TODO: Currently these are implemented by setting an environment variable - # so that ALL experiments instanced in a driver script will begin - # producing telemetry data. 
In the future it is planned to have this - # work on a "per-instance" basis +def test_enable_disable_telemetry( + monkeypatch: pytest.MonkeyPatch, test_dir: str, config: Config +) -> None: + # Global telemetry defaults to `on` and can be modified by + # setting the value of env var SMARTSIM_FLAG_TELEMETRY to 0/1 monkeypatch.setattr(os, "environ", {}) - exp = Experiment("my-exp") - exp.enable_telemetry() - assert CONFIG.telemetry_enabled - exp.disable_telemetry() - assert not CONFIG.telemetry_enabled + exp = Experiment("my-exp", exp_path=test_dir) + exp.telemetry.enable() + assert exp.telemetry.is_enabled + + exp.telemetry.disable() + assert not exp.telemetry.is_enabled + + exp.telemetry.enable() + assert exp.telemetry.is_enabled + + exp.telemetry.disable() + assert not exp.telemetry.is_enabled + + exp.start() + mani_path = ( + pathlib.Path(test_dir) / config.telemetry_subdir / serialize.MANIFEST_FILENAME + ) + assert mani_path.exists() + +def test_telemetry_default( + monkeypatch: pytest.MonkeyPatch, test_dir: str, config: Config +) -> None: + """Ensure the default values for telemetry configuration match expectation + that experiment telemetry is on""" -def test_error_on_cobalt(): + # If env var related to telemetry doesn't exist, experiment should default to True + monkeypatch.setattr(os, "environ", {}) + exp = Experiment("my-exp", exp_path=test_dir) + assert exp.telemetry.is_enabled + + # If telemetry disabled in env, should get False + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "0") + exp = Experiment("my-exp", exp_path=test_dir) + assert not exp.telemetry.is_enabled + + # If telemetry enabled in env, should get True + monkeypatch.setenv("SMARTSIM_FLAG_TELEMETRY", "1") + exp = Experiment("my-exp", exp_path=test_dir) + assert exp.telemetry.is_enabled + + +def test_error_on_cobalt() -> None: with pytest.raises(SSUnsupportedError): exp = Experiment("cobalt_exp", launcher="cobalt") + + +def test_default_orch_path( + monkeypatch: pytest.MonkeyPatch, test_dir: 
str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure the default file structure is created for Orchestrator""" + + exp_name = "default-orch-path" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) + db = exp.create_database( + port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface() + ) + exp.start(db) + orch_path = pathlib.Path(test_dir) / db.name + assert orch_path.exists() + assert db.path == str(orch_path) + + +def test_default_model_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure the default file structure is created for Model""" + + exp_name = "default-model-path" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) + settings = exp.create_run_settings(exe="echo", exe_args="hello") + model = exp.create_model(name="model_name", run_settings=settings) + exp.start(model) + model_path = pathlib.Path(test_dir) / model.name + assert model_path.exists() + assert model.path == str(model_path) + + +def test_default_ensemble_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure the default file structure is created for Ensemble""" + + exp_name = "default-ensemble-path" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) 
+ settings = exp.create_run_settings(exe="echo", exe_args="hello") + ensemble = exp.create_ensemble( + name="ensemble_name", run_settings=settings, replicas=2 + ) + exp.start(ensemble) + ensemble_path = pathlib.Path(test_dir) / ensemble.name + assert ensemble_path.exists() + assert ensemble.path == str(ensemble_path) + for member in ensemble.models: + member_path = ensemble_path / member.name + assert member_path.exists() + assert member.path == str(ensemble_path / member.name) + + +def test_user_orch_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure a relative path is used to created Orchestrator folder""" + + exp_name = "default-orch-path" + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) + db = exp.create_database( + port=wlmutils.get_test_port(), + interface=wlmutils.get_test_interface(), + path="./testing_folder1234", + ) + exp.start(db) + orch_path = pathlib.Path(osp.abspath("./testing_folder1234")) + assert orch_path.exists() + assert db.path == str(orch_path) + shutil.rmtree(orch_path) + + +def test_default_model_with_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure a relative path is used to created Model folder""" + + exp_name = "default-ensemble-path" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) 
+ settings = exp.create_run_settings(exe="echo", exe_args="hello") + model = exp.create_model( + name="model_name", run_settings=settings, path="./testing_folder1234" + ) + exp.start(model) + model_path = pathlib.Path(osp.abspath("./testing_folder1234")) + assert model_path.exists() + assert model.path == str(model_path) + shutil.rmtree(model_path) + + +def test_default_ensemble_with_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure a relative path is used to created Ensemble folder""" + + exp_name = "default-ensemble-path" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) + settings = exp.create_run_settings(exe="echo", exe_args="hello") + ensemble = exp.create_ensemble( + name="ensemble_name", + run_settings=settings, + path="./testing_folder1234", + replicas=2, + ) + exp.start(ensemble) + ensemble_path = pathlib.Path(osp.abspath("./testing_folder1234")) + assert ensemble_path.exists() + assert ensemble.path == str(ensemble_path) + for member in ensemble.models: + member_path = ensemble_path / member.name + assert member_path.exists() + assert member.path == str(member_path) + shutil.rmtree(ensemble_path) diff --git a/tests/test_fixtures.py b/tests/test_fixtures.py new file mode 100644 index 000000000..ea753374e --- /dev/null +++ b/tests/test_fixtures.py @@ -0,0 +1,56 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os + +import psutil +import pytest + +from smartsim import Experiment +from smartsim.database import Orchestrator +from smartsim.error import SmartSimError +from smartsim.error.errors import SSUnsupportedError + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +def test_db_fixtures(local_experiment, local_db, prepare_db): + db = prepare_db(local_db).orchestrator + local_experiment.reconnect_orchestrator(db.checkpoint_file) + assert db.is_active() + local_experiment.stop(db) + + +def test_create_new_db_fixture_if_stopped(local_experiment, local_db, prepare_db): + # Run this twice to make sure that there is a stopped database + output = prepare_db(local_db) + local_experiment.reconnect_orchestrator(output.orchestrator.checkpoint_file) + local_experiment.stop(output.orchestrator) + + output = prepare_db(local_db) + assert output.new_db + local_experiment.reconnect_orchestrator(output.orchestrator.checkpoint_file) + assert 
output.orchestrator.is_active() diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 025f53d32..523ed7191 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -24,6 +24,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import collections +import signal + import pytest from smartsim._core.utils import helpers @@ -68,3 +71,115 @@ def test_encode_raises_on_empty(): def test_decode_raises_on_empty(): with pytest.raises(ValueError): helpers.decode_cmd("") + + +class MockSignal: + def __init__(self): + self.signal_handlers = collections.defaultdict(lambda: signal.SIG_IGN) + + def signal(self, signalnum, handler): + orig = self.getsignal(signalnum) + self.signal_handlers[signalnum] = handler + return orig + + def getsignal(self, signalnum): + return self.signal_handlers[signalnum] + + +@pytest.fixture +def mock_signal(monkeypatch): + mock_signal = MockSignal() + monkeypatch.setattr(helpers, "signal", mock_signal) + yield mock_signal + + +def test_signal_intercept_stack_will_register_itself_with_callback_fn(mock_signal): + callback = lambda num, frame: ... + stack = helpers.SignalInterceptionStack.get(signal.NSIG) + stack.push(callback) + assert isinstance(stack, helpers.SignalInterceptionStack) + assert stack is mock_signal.signal_handlers[signal.NSIG] + assert len(stack) == 1 + assert list(stack)[0] == callback + + +def test_signal_intercept_stack_keeps_track_of_previous_handlers(mock_signal): + default_handler = lambda num, frame: ... + mock_signal.signal_handlers[signal.NSIG] = default_handler + stack = helpers.SignalInterceptionStack.get(signal.NSIG) + stack.push(lambda n, f: ...) + assert stack._original is default_handler + + +def test_signal_intercept_stacks_are_registered_per_signal_number(mock_signal): + handler = lambda num, frame: ... 
+ stack_1 = helpers.SignalInterceptionStack.get(signal.NSIG) + stack_1.push(handler) + stack_2 = helpers.SignalInterceptionStack.get(signal.NSIG + 1) + stack_2.push(handler) + + assert mock_signal.signal_handlers[signal.NSIG] is stack_1 + assert mock_signal.signal_handlers[signal.NSIG + 1] is stack_2 + assert stack_1 is not stack_2 + assert list(stack_1) == list(stack_2) == [handler] + + +def test_signal_intercept_handlers_will_not_overwrite_if_handler_already_exists( + mock_signal, +): + handler_1 = lambda num, frame: ... + handler_2 = lambda num, frame: ... + stack_1 = helpers.SignalInterceptionStack.get(signal.NSIG) + stack_1.push(handler_1) + stack_2 = helpers.SignalInterceptionStack.get(signal.NSIG) + stack_2.push(handler_2) + assert stack_1 is stack_2 is mock_signal.signal_handlers[signal.NSIG] + assert list(stack_1) == [handler_2, handler_1] + + +def test_signal_intercept_stack_can_add_multiple_instances_of_the_same_handler( + mock_signal, +): + handler = lambda num, frame: ... + stack = helpers.SignalInterceptionStack.get(signal.NSIG) + stack.push(handler) + stack.push(handler) + assert list(stack) == [handler, handler] + + +def test_signal_intercept_stack_enforces_that_unique_push_handlers_are_unique( + mock_signal, +): + handler = lambda num, frame: ... + stack = helpers.SignalInterceptionStack.get(signal.NSIG) + assert stack.push_unique(handler) + assert not helpers.SignalInterceptionStack.get(signal.NSIG).push_unique(handler) + assert list(stack) == [handler] + + +def test_signal_intercept_stack_enforces_that_unique_push_method_handlers_are_unique( + mock_signal, +): + class C: + def fn(num, frame): ... 
+ + c1 = C() + c2 = C() + stack = helpers.SignalInterceptionStack.get(signal.NSIG) + stack.push_unique(c1.fn) + assert helpers.SignalInterceptionStack.get(signal.NSIG).push_unique(c2.fn) + assert not helpers.SignalInterceptionStack.get(signal.NSIG).push_unique(c1.fn) + assert list(stack) == [c2.fn, c1.fn] + + +def test_signal_handler_calls_functions_in_reverse_order(mock_signal): + called_list = [] + default = lambda num, frame: called_list.append("default") + handler_1 = lambda num, frame: called_list.append("handler_1") + handler_2 = lambda num, frame: called_list.append("handler_2") + + mock_signal.signal_handlers[signal.NSIG] = default + helpers.SignalInterceptionStack.get(signal.NSIG).push(handler_1) + helpers.SignalInterceptionStack.get(signal.NSIG).push(handler_2) + mock_signal.signal_handlers[signal.NSIG](signal.NSIG, None) + assert called_list == ["handler_2", "handler_1", "default"] diff --git a/tests/test_indirect.py b/tests/test_indirect.py index 73f381441..814302968 100644 --- a/tests/test_indirect.py +++ b/tests/test_indirect.py @@ -24,15 +24,15 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- import pathlib import sys import psutil import pytest +import conftest from smartsim._core.config import CONFIG -from smartsim._core.entrypoints.indirect import cleanup, get_parser, get_ts, main +from smartsim._core.entrypoints.indirect import cleanup, get_parser, get_ts_ms, main from smartsim._core.utils.helpers import encode_cmd ALL_ARGS = { @@ -141,7 +141,7 @@ def terminate(self) -> None: def test_ts(): """Ensure expected output type""" - ts = get_ts() + ts = get_ts_ms() assert isinstance(ts, int) @@ -182,24 +182,70 @@ def test_indirect_main_cmd_check(capsys, test_dir, monkeypatch): assert "Invalid cmd supplied" in ex.value.args[0] -def test_complete_process(fileutils, test_dir): - """Ensure the happy-path completes and returns a success return code""" +def test_process_failure(fileutils, test_dir: str, monkeypatch: pytest.MonkeyPatch): + """Ensure that a stop event is logged if the process unexpectedly terminates""" + mock_pid = 1122334455 + create_msg = "creating: {0}" + term_msg = "term: {0}" + wait_msg = "wait: {0}" + + class MockProc: + def __init__(self, *args, **kwargs): + print(create_msg.format(mock_pid)) + + @property + def pid(self): + return mock_pid + + def terminate(self): + print(term_msg.format(mock_pid)) + + def wait(self): + print(wait_msg.format(mock_pid)) + raise Exception("You shall not pass!") + script = fileutils.get_test_conf_path("sleep.py") exp_dir = pathlib.Path(test_dir) - raw_cmd = f"{sys.executable} {script} --time=1" + raw_cmd = f"{sys.executable} {script} --time=10" cmd = encode_cmd(raw_cmd.split()) - rc = main(cmd, "application", exp_dir, exp_dir / CONFIG.telemetry_subdir) - assert rc == 0 + mock_track = conftest.CountingCallable() + + with monkeypatch.context() as ctx: + ctx.setattr("smartsim._core.entrypoints.indirect.write_event", mock_track) + ctx.setattr("psutil.pid_exists", lambda pid: True) + ctx.setattr("psutil.Popen", MockProc) + ctx.setattr("psutil.Process", MockProc) # handle the proc.terminate() + 
ctx.setattr("smartsim._core.entrypoints.indirect.STEP_PID", mock_pid) + + rc = main(cmd, "application", exp_dir, exp_dir / CONFIG.telemetry_subdir) + assert rc == -1 + + (args1, _), (args2, kwargs2) = mock_track.details + assert "start" in args1 + assert "stop" in args2 + assert kwargs2.get("returncode", -1) + + +def test_complete_process( + fileutils: conftest.FileUtils, test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the happy-path completes and returns a success return code""" + script = fileutils.get_test_conf_path("sleep.py") - assert exp_dir.exists() + exp_dir = pathlib.Path(test_dir) - # NOTE: don't have a manifest so we're falling back to default event path - data_dir = exp_dir / CONFIG.telemetry_subdir - start_events = list(data_dir.rglob("start.json")) - stop_events = list(data_dir.rglob("stop.json")) + raw_cmd = f"{sys.executable} {script} --time=1" + cmd = encode_cmd(raw_cmd.split()) + + mock_track = conftest.CountingCallable() + with monkeypatch.context() as ctx: + ctx.setattr("smartsim._core.entrypoints.indirect.write_event", mock_track) + rc = main(cmd, "application", exp_dir, exp_dir / CONFIG.telemetry_subdir) + assert rc == 0 - assert start_events - assert stop_events + (args1, _), (args2, _) = mock_track.details + assert "start" in args1 + assert "stop" in args2 diff --git a/tests/test_interrupt.py b/tests/test_interrupt.py index 28c48e0db..c38ae0225 100644 --- a/tests/test_interrupt.py +++ b/tests/test_interrupt.py @@ -63,22 +63,22 @@ def test_interrupt_blocked_jobs(test_dir): replicas=2, run_settings=RunSettings("sleep", "100"), ) - ensemble.set_path(test_dir) num_jobs = 1 + len(ensemble) - try: - pid = os.getpid() - keyboard_interrupt_thread = Thread( - name="sigint_thread", target=keyboard_interrupt, args=(pid,) - ) - keyboard_interrupt_thread.start() + pid = os.getpid() + keyboard_interrupt_thread = Thread( + name="sigint_thread", target=keyboard_interrupt, args=(pid,) + ) + keyboard_interrupt_thread.start() + + with 
pytest.raises(KeyboardInterrupt): exp.start(model, ensemble, block=True, kill_on_interrupt=True) - except KeyboardInterrupt: - time.sleep(2) # allow time for jobs to be stopped - active_jobs = exp._control._jobs.jobs - active_db_jobs = exp._control._jobs.db_jobs - completed_jobs = exp._control._jobs.completed - assert len(active_jobs) + len(active_db_jobs) == 0 - assert len(completed_jobs) == num_jobs + + time.sleep(2) # allow time for jobs to be stopped + active_jobs = exp._control._jobs.jobs + active_db_jobs = exp._control._jobs.db_jobs + completed_jobs = exp._control._jobs.completed + assert len(active_jobs) + len(active_db_jobs) == 0 + assert len(completed_jobs) == num_jobs def test_interrupt_multi_experiment_unblocked_jobs(test_dir): @@ -104,22 +104,23 @@ def test_interrupt_multi_experiment_unblocked_jobs(test_dir): replicas=2, run_settings=RunSettings("sleep", "100"), ) - ensemble.set_path(test_dir) jobs_per_experiment[i] = 1 + len(ensemble) - try: - pid = os.getpid() - keyboard_interrupt_thread = Thread( - name="sigint_thread", target=keyboard_interrupt, args=(pid,) - ) - keyboard_interrupt_thread.start() + + pid = os.getpid() + keyboard_interrupt_thread = Thread( + name="sigint_thread", target=keyboard_interrupt, args=(pid,) + ) + keyboard_interrupt_thread.start() + + with pytest.raises(KeyboardInterrupt): for experiment in experiments: experiment.start(model, ensemble, block=False, kill_on_interrupt=True) - time.sleep(9) # since jobs aren't blocked, wait for SIGINT - except KeyboardInterrupt: - time.sleep(2) # allow time for jobs to be stopped - for i, experiment in enumerate(experiments): - active_jobs = experiment._control._jobs.jobs - active_db_jobs = experiment._control._jobs.db_jobs - completed_jobs = experiment._control._jobs.completed - assert len(active_jobs) + len(active_db_jobs) == 0 - assert len(completed_jobs) == jobs_per_experiment[i] + keyboard_interrupt_thread.join() # since jobs aren't blocked, wait for SIGINT + + time.sleep(2) # allow time 
for jobs to be stopped + for i, experiment in enumerate(experiments): + active_jobs = experiment._control._jobs.jobs + active_db_jobs = experiment._control._jobs.db_jobs + completed_jobs = experiment._control._jobs.completed + assert len(active_jobs) + len(active_db_jobs) == 0 + assert len(completed_jobs) == jobs_per_experiment[i] diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index 0557f3cf4..21b3184e5 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -27,18 +27,19 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.database import Orchestrator from smartsim.error import SSUnsupportedError from smartsim.settings import JsrunSettings, RunSettings +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a -def test_unsupported_run_settings(): +def test_unsupported_run_settings(test_dir): exp_name = "test-unsupported-run-settings" - exp = Experiment(exp_name, launcher="slurm") + exp = Experiment(exp_name, launcher="slurm", exp_path=test_dir) bad_settings = JsrunSettings("echo", "hello") model = exp.create_model("bad_rs", bad_settings) @@ -57,7 +58,7 @@ def test_model_failure(fileutils, test_dir): exp.start(M1, block=True) statuses = exp.get_status(M1) - assert all([stat == status.STATUS_FAILED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_FAILED for stat in statuses]) def test_orchestrator_relaunch(test_dir, wlmutils): @@ -65,9 +66,9 @@ def test_orchestrator_relaunch(test_dir, wlmutils): exp_name = "test-orc-on-relaunch" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - orc = Orchestrator(port=wlmutils.get_test_port()) + orc = Orchestrator(port=wlmutils.get_test_port(), db_identifier="orch_1") orc.set_path(test_dir) - orc_1 = Orchestrator(port=wlmutils.get_test_port() + 1) + orc_1 = Orchestrator(port=wlmutils.get_test_port() + 1, db_identifier="orch_2") 
orc_1.set_path(test_dir) try: exp.start(orc) diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index 7befff95e..85687e014 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -26,7 +26,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -49,7 +50,7 @@ def test_models(fileutils, test_dir): exp.start(M1, M2, block=True, summary=True) statuses = exp.get_status(M1, M2) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_ensemble(fileutils, test_dir): @@ -60,8 +61,7 @@ def test_ensemble(fileutils, test_dir): settings = exp.create_run_settings("python", f"{script} --time=3") ensemble = exp.create_ensemble("e1", run_settings=settings, replicas=2) - ensemble.set_path(test_dir) exp.start(ensemble, block=True, summary=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/test_local_multi_run.py b/tests/test_local_multi_run.py index 576e290ca..a2c1d70ee 100644 --- a/tests/test_local_multi_run.py +++ b/tests/test_local_multi_run.py @@ -26,7 +26,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a @@ -49,9 +50,9 @@ def test_models(fileutils, test_dir): exp.start(M1, block=False) statuses = exp.get_status(M1) - assert all([stat != status.STATUS_FAILED for stat in statuses]) + assert all([stat != SmartSimStatus.STATUS_FAILED for stat in statuses]) # start another while first model is running exp.start(M2, block=True) statuses = exp.get_status(M1, M2) 
- assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py index c59aebd7b..2556c5597 100644 --- a/tests/test_local_restart.py +++ b/tests/test_local_restart.py @@ -26,7 +26,8 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b @@ -48,12 +49,12 @@ def test_restart(fileutils, test_dir): exp.start(M1, block=True) statuses = exp.get_status(M1) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) # restart the model exp.start(M1, block=True) statuses = exp.get_status(M1) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) def test_ensemble(fileutils, test_dir): @@ -64,13 +65,12 @@ def test_ensemble(fileutils, test_dir): settings = exp.create_run_settings("python", f"{script} --time=3") ensemble = exp.create_ensemble("e1", run_settings=settings, replicas=2) - ensemble.set_path(test_dir) exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) # restart the ensemble exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/test_logs.py b/tests/test_logs.py index 88c6a738f..a187baa2a 100644 --- a/tests/test_logs.py +++ b/tests/test_logs.py @@ -27,6 +27,7 @@ import io import logging import pathlib +import socket import pytest @@ -91,8 +92,10 
@@ def test_add_exp_loggers(test_dir): logger = logging.getLogger("smartsim_test_add_exp_loggers") logger.addHandler(logging.StreamHandler(faux_out_stream)) - out_file = pathlib.Path(test_dir) / "smartsim.out" - err_file = pathlib.Path(test_dir) / "smartsim.err" + logger.addFilter(smartsim.log.HostnameFilter()) + + out_file = pathlib.Path(test_dir) / "logs/smartsim.out" + err_file = pathlib.Path(test_dir) / "logs/smartsim.err" filter_fn = lambda x: True @@ -210,3 +213,35 @@ def thrower(_self): assert ctx_var.get() == original_ctx_value ctx_var.reset(token) assert ctx_var.get() == "" + + +def test_hostname_filter_results() -> None: + """Ensure the hostname filter returns true for all records, even if not enriched""" + filter = smartsim.log.HostnameFilter("test-filter") + record = logging.LogRecord( + "name", logging.INFO, "/foo/bar", 42, "this is your message", None, None + ) + + # no hostname, will be enriched. + passes_filter = filter.filter(record) + assert passes_filter + + # has hostname, will NOT be enriched. 
+ passes_filter = filter.filter(record) + assert passes_filter + + +def test_hostname_filter() -> None: + """Ensure the hostname filter adds a hostname to the log record""" + filter = smartsim.log.HostnameFilter("test-filter") + + exp_name = socket.gethostname() + record = logging.LogRecord( + "name", logging.INFO, "/foo/bar", 42, "this is your message", None, None + ) + + filter.filter(record) + assert hasattr(record, "hostname") + + name = getattr(record, "hostname") + assert exp_name == name diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 33fc6b163..c26868ebb 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -27,6 +27,7 @@ import os.path from copy import deepcopy +from uuid import uuid4 import pytest @@ -60,7 +61,6 @@ orc = Orchestrator() orc_1 = deepcopy(orc) orc_1.name = "orc2" -model_no_name = exp.create_model(name=None, run_settings=rs) db_script = DBScript("some-script", "def main():\n print('hello world')\n") db_model = DBModel("some-model", "TORCH", b"some-model-bytes") @@ -75,11 +75,6 @@ def test_separate(): assert manifest.dbs[0] == orc -def test_no_name(): - with pytest.raises(AttributeError): - _ = Manifest(model_no_name) - - def test_separate_type(): with pytest.raises(TypeError): _ = Manifest([1, 2, 3]) @@ -159,7 +154,7 @@ def test_launched_manifest_transform_data(): def test_launched_manifest_builder_correctly_maps_data(): - lmb = LaunchedManifestBuilder("name", "path", "launcher name") + lmb = LaunchedManifestBuilder("name", "path", "launcher name", str(uuid4())) lmb.add_model(model, 1) lmb.add_model(model_2, 1) lmb.add_ensemble(ensemble, [i for i in range(len(ensemble.entities))]) @@ -172,7 +167,7 @@ def test_launched_manifest_builder_correctly_maps_data(): def test_launced_manifest_builder_raises_if_lens_do_not_match(): - lmb = LaunchedManifestBuilder("name", "path", "launcher name") + lmb = LaunchedManifestBuilder("name", "path", "launcher name", str(uuid4())) with pytest.raises(ValueError): 
lmb.add_ensemble(ensemble, list(range(123))) with pytest.raises(ValueError): @@ -182,7 +177,7 @@ def test_launced_manifest_builder_raises_if_lens_do_not_match(): def test_launched_manifest_builer_raises_if_attaching_data_to_empty_collection( monkeypatch, ): - lmb = LaunchedManifestBuilder("name", "path", "launcher") + lmb = LaunchedManifestBuilder("name", "path", "launcher", str(uuid4())) monkeypatch.setattr(ensemble, "entities", []) with pytest.raises(ValueError): lmb.add_ensemble(ensemble, []) @@ -190,7 +185,7 @@ def test_launched_manifest_builer_raises_if_attaching_data_to_empty_collection( def test_lmb_and_launched_manifest_have_same_paths_for_launched_metadata(): exp_path = "/path/to/some/exp" - lmb = LaunchedManifestBuilder("exp_name", exp_path, "launcher") + lmb = LaunchedManifestBuilder("exp_name", exp_path, "launcher", str(uuid4())) manifest = lmb.finalize() assert ( lmb.exp_telemetry_subdirectory == manifest.metadata.exp_telemetry_subdirectory diff --git a/tests/test_model.py b/tests/test_model.py index a1b5ba505..64a68b299 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -24,6 +24,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from uuid import uuid4 + import pytest from smartsim import Experiment @@ -114,8 +116,10 @@ def launch_step_nop(self, step, entity): return _monkeypatch_exp_controller -def test_model_with_batch_settings_makes_batch_step(monkeypatch_exp_controller): - exp = Experiment("experiment", launcher="slurm") +def test_model_with_batch_settings_makes_batch_step( + monkeypatch_exp_controller, test_dir +): + exp = Experiment("experiment", launcher="slurm", exp_path=test_dir) bs = SbatchSettings() rs = SrunSettings("python", exe_args="sleep.py") model = exp.create_model("test_model", run_settings=rs, batch_settings=bs) @@ -130,9 +134,9 @@ def test_model_with_batch_settings_makes_batch_step(monkeypatch_exp_controller): def test_model_without_batch_settings_makes_run_step( - monkeypatch, monkeypatch_exp_controller + monkeypatch, monkeypatch_exp_controller, test_dir ): - exp = Experiment("experiment", launcher="slurm") + exp = Experiment("experiment", launcher="slurm", exp_path=test_dir) rs = SrunSettings("python", exe_args="sleep.py") model = exp.create_model("test_model", run_settings=rs) @@ -148,8 +152,10 @@ def test_model_without_batch_settings_makes_run_step( assert isinstance(step, SrunStep) -def test_models_batch_settings_are_ignored_in_ensemble(monkeypatch_exp_controller): - exp = Experiment("experiment", launcher="slurm") +def test_models_batch_settings_are_ignored_in_ensemble( + monkeypatch_exp_controller, test_dir +): + exp = Experiment("experiment", launcher="slurm", exp_path=test_dir) bs_1 = SbatchSettings(nodes=5) rs = SrunSettings("python", exe_args="sleep.py") model = exp.create_model("test_model", run_settings=rs, batch_settings=bs_1) diff --git a/tests/test_multidb.py b/tests/test_multidb.py index af21f5a1e..81f21856a 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -27,11 +27,12 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim.database import Orchestrator from smartsim.entity.entity import 
SmartSimEntity from smartsim.error.errors import SSDBIDConflictError from smartsim.log import get_logger +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b @@ -51,7 +52,7 @@ def make_entity_context(exp: Experiment, entity: SmartSimEntity): try: yield entity finally: - if exp.get_status(entity)[0] == status.STATUS_RUNNING: + if exp.get_status(entity)[0] == SmartSimStatus.STATUS_RUNNING: exp.stop(entity) @@ -65,7 +66,7 @@ def choose_host(wlmutils, index=0): def check_not_failed(exp, *args): statuses = exp.get_status(*args) - assert all(stat is not status.STATUS_FAILED for stat in statuses) + assert all(stat is not SmartSimStatus.STATUS_FAILED for stat in statuses) @pytest.mark.parametrize("db_type", supported_dbs) @@ -152,7 +153,6 @@ def test_db_identifier_colo_then_standard( # Create the SmartSim Model smartsim_model = exp.create_model("colocated_model", colo_settings) - smartsim_model.set_path(test_dir) db_args = { "port": test_port, @@ -227,7 +227,7 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir): assert orc2.name == "my_db" # CREATE DATABASE with db_identifier - with make_entity_context(exp, orc), make_entity_context(exp, orc2): + with make_entity_context(exp, orc2), make_entity_context(exp, orc): exp.start(orc) with pytest.raises(SSDBIDConflictError) as ex: exp.start(orc2) @@ -325,7 +325,6 @@ def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) db_args = { "port": test_port + 1, @@ -404,7 +403,9 @@ def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() - test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") + test_script = fileutils.get_test_conf_path( + 
"smartredis/multidbid_colo_env_vars_only.py" + ) test_interface = wlmutils.get_test_interface() test_launcher = wlmutils.get_test_launcher() @@ -434,8 +435,9 @@ def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db ) with make_entity_context(exp, db), make_entity_context(exp, smartsim_model): + exp.start(smartsim_model, block=False) exp.start(db) - exp.start(smartsim_model, block=True) + exp.poll(smartsim_model) check_not_failed(exp, db, smartsim_model) diff --git a/tests/test_orc_config_settings.py b/tests/test_orc_config_settings.py index 365596496..74d0c1af2 100644 --- a/tests/test_orc_config_settings.py +++ b/tests/test_orc_config_settings.py @@ -27,6 +27,7 @@ import pytest +from smartsim.database import Orchestrator from smartsim.error import SmartSimError try: @@ -40,14 +41,15 @@ pytestmark = pytest.mark.group_b -def test_config_methods(dbutils, local_db): +def test_config_methods(dbutils, prepare_db, local_db): """Test all configuration file edit methods on an active db""" + db = prepare_db(local_db).orchestrator # test the happy path and ensure all configuration file edit methods # successfully execute when given correct key-value pairs configs = dbutils.get_db_configs() for setting, value in configs.items(): - config_set_method = dbutils.get_config_edit_method(local_db, setting) + config_set_method = dbutils.get_config_edit_method(db, setting) config_set_method(value) # ensure SmartSimError is raised when Orchestrator.set_db_conf @@ -56,7 +58,7 @@ def test_config_methods(dbutils, local_db): for key, value_list in ss_error_configs.items(): for value in value_list: with pytest.raises(SmartSimError): - local_db.set_db_conf(key, value) + db.set_db_conf(key, value) # ensure TypeError is raised when Orchestrator.set_db_conf # is given either a key or a value that is not a string @@ -64,14 +66,14 @@ def test_config_methods(dbutils, local_db): for key, value_list in type_error_configs.items(): for value in value_list: with 
pytest.raises(TypeError): - local_db.set_db_conf(key, value) + db.set_db_conf(key, value) -def test_config_methods_inactive(wlmutils, dbutils): +def test_config_methods_inactive(dbutils): """Ensure a SmartSimError is raised when trying to set configurations on an inactive database """ - db = wlmutils.get_orchestrator() + db = Orchestrator() configs = dbutils.get_db_configs() for setting, value in configs.items(): config_set_method = dbutils.get_config_edit_method(db, setting) diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py index f87aa9331..66fb894f7 100644 --- a/tests/test_orchestrator.py +++ b/tests/test_orchestrator.py @@ -25,6 +25,8 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import typing as t + import psutil import pytest @@ -37,7 +39,11 @@ pytestmark = pytest.mark.slow_tests -def test_orc_parameters(): +if t.TYPE_CHECKING: + import conftest + + +def test_orc_parameters() -> None: threads_per_queue = 2 inter_op_threads = 2 intra_op_threads = 2 @@ -57,45 +63,33 @@ def test_orc_parameters(): assert "INTER_OP_PARALLELISM" in module_str -def test_is_not_active(): +def test_is_not_active() -> None: db = Orchestrator(db_nodes=1) assert not db.is_active() -def test_inactive_orc_get_address(): +def test_inactive_orc_get_address() -> None: db = Orchestrator() with pytest.raises(SmartSimError): db.get_address() -def test_orc_active_functions(test_dir, wlmutils): - exp_name = "test_orc_active_functions" - exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - - db = Orchestrator(port=wlmutils.get_test_port()) - db.set_path(test_dir) - - exp.start(db) - - # check if the orchestrator is active +def test_orc_is_active_functions( + local_experiment, + prepare_db, + local_db, +) -> None: + db = prepare_db(local_db).orchestrator + db = local_experiment.reconnect_orchestrator(db.checkpoint_file) assert db.is_active() # check if the orchestrator can get the address - correct_address = db.get_address() == 
["127.0.0.1:" + str(wlmutils.get_test_port())] - if not correct_address: - exp.stop(db) - assert False + assert db.get_address() == [f"127.0.0.1:{db.ports[0]}"] - exp.stop(db) - assert not db.is_active() - - # check if orchestrator.get_address() raises an exception - with pytest.raises(SmartSimError): - db.get_address() - - -def test_multiple_interfaces(test_dir, wlmutils): +def test_multiple_interfaces( + test_dir: str, wlmutils: t.Type["conftest.WLMUtils"] +) -> None: exp_name = "test_multiple_interfaces" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) @@ -106,7 +100,8 @@ def test_multiple_interfaces(test_dir, wlmutils): net_if_addrs = ["lo", net_if_addrs[0]] - db = Orchestrator(port=wlmutils.get_test_port(), interface=net_if_addrs) + port = wlmutils.get_test_port() + db = Orchestrator(port=port, interface=net_if_addrs) db.set_path(test_dir) exp.start(db) @@ -115,15 +110,16 @@ def test_multiple_interfaces(test_dir, wlmutils): assert db.is_active() # check if the orchestrator can get the address - correct_address = db.get_address() == ["127.0.0.1:" + str(wlmutils.get_test_port())] - if not correct_address: + correct_address = [f"127.0.0.1:{port}"] + + if not correct_address == db.get_address(): exp.stop(db) assert False exp.stop(db) -def test_catch_local_db_errors(): +def test_catch_local_db_errors() -> None: # local database with more than one node not allowed with pytest.raises(SSUnsupportedError): db = Orchestrator(db_nodes=2) @@ -140,7 +136,7 @@ def test_catch_local_db_errors(): ##### PBS ###### -def test_pbs_set_run_arg(wlmutils): +def test_pbs_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: orc = Orchestrator( wlmutils.get_test_port(), db_nodes=3, @@ -159,7 +155,7 @@ def test_pbs_set_run_arg(wlmutils): ) -def test_pbs_set_batch_arg(wlmutils): +def test_pbs_set_batch_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: orc = Orchestrator( wlmutils.get_test_port(), db_nodes=3, @@ -188,7 +184,7 @@ def test_pbs_set_batch_arg(wlmutils): 
##### Slurm ###### -def test_slurm_set_run_arg(wlmutils): +def test_slurm_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: orc = Orchestrator( wlmutils.get_test_port(), db_nodes=3, @@ -203,7 +199,7 @@ def test_slurm_set_run_arg(wlmutils): ) -def test_slurm_set_batch_arg(wlmutils): +def test_slurm_set_batch_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: orc = Orchestrator( wlmutils.get_test_port(), db_nodes=3, @@ -234,7 +230,7 @@ def test_slurm_set_batch_arg(wlmutils): pytest.param(False, id="Multiple `srun`s"), ], ) -def test_orc_results_in_correct_number_of_shards(single_cmd): +def test_orc_results_in_correct_number_of_shards(single_cmd: bool) -> None: num_shards = 5 orc = Orchestrator( port=12345, @@ -259,7 +255,7 @@ def test_orc_results_in_correct_number_of_shards(single_cmd): ###### LSF ###### -def test_catch_orc_errors_lsf(wlmutils): +def test_catch_orc_errors_lsf(wlmutils: t.Type["conftest.WLMUtils"]) -> None: with pytest.raises(SSUnsupportedError): orc = Orchestrator( wlmutils.get_test_port(), @@ -282,7 +278,7 @@ def test_catch_orc_errors_lsf(wlmutils): orc.set_batch_arg("P", "MYPROJECT") -def test_lsf_set_run_args(wlmutils): +def test_lsf_set_run_args(wlmutils: t.Type["conftest.WLMUtils"]) -> None: orc = Orchestrator( wlmutils.get_test_port(), db_nodes=3, @@ -295,7 +291,7 @@ def test_lsf_set_run_args(wlmutils): assert all(["l" not in db.run_settings.run_args for db in orc.entities]) -def test_lsf_set_batch_args(wlmutils): +def test_lsf_set_batch_args(wlmutils: t.Type["conftest.WLMUtils"]) -> None: orc = Orchestrator( wlmutils.get_test_port(), db_nodes=3, @@ -308,3 +304,24 @@ def test_lsf_set_batch_args(wlmutils): assert orc.batch_settings.batch_args["m"] == '"batch host1 host2"' orc.set_batch_arg("D", "102400000") assert orc.batch_settings.batch_args["D"] == "102400000" + + +def test_orc_telemetry(test_dir: str, wlmutils: t.Type["conftest.WLMUtils"]) -> None: + """Ensure the default behavior for an orchestrator is to disable telemetry""" + 
db = Orchestrator(port=wlmutils.get_test_port()) + db.set_path(test_dir) + + # default is disabled + assert not db.telemetry.is_enabled + + # ensure updating value works as expected + db.telemetry.enable() + assert db.telemetry.is_enabled + + # toggle back + db.telemetry.disable() + assert not db.telemetry.is_enabled + + # toggle one more time + db.telemetry.enable() + assert db.telemetry.is_enabled diff --git a/tests/test_output_files.py b/tests/test_output_files.py new file mode 100644 index 000000000..f3830051c --- /dev/null +++ b/tests/test_output_files.py @@ -0,0 +1,169 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import pathlib + +import pytest + +from smartsim import Experiment +from smartsim._core.config import CONFIG +from smartsim._core.control.controller import Controller, _AnonymousBatchJob +from smartsim._core.launcher.step import Step +from smartsim.database.orchestrator import Orchestrator +from smartsim.entity.ensemble import Ensemble +from smartsim.entity.model import Model +from smartsim.settings.base import RunSettings +from smartsim.settings.slurmSettings import SbatchSettings, SrunSettings + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +controller = Controller() +slurm_controller = Controller(launcher="slurm") + +rs = RunSettings("echo", ["spam", "eggs"]) +bs = SbatchSettings() +batch_rs = SrunSettings("echo", ["spam", "eggs"]) + +ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) +orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +model = Model("test_model", params={}, path="", run_settings=rs) +batch_model = Model( + "batch_test_model", params={}, path="", run_settings=batch_rs, batch_settings=bs +) +anon_batch_model = _AnonymousBatchJob(batch_model) + + +def test_mutated_model_output(test_dir): + exp_name = "test-mutated-model-output" + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) + + test_model = exp.create_model("test_model", path=test_dir, run_settings=rs) + exp.generate(test_model) + exp.start(test_model, block=True) + + assert pathlib.Path(test_model.path).exists() + assert pathlib.Path(test_model.path, f"{test_model.name}.out").is_symlink() + assert pathlib.Path(test_model.path, f"{test_model.name}.err").is_symlink() + + with open(pathlib.Path(test_model.path, f"{test_model.name}.out"), "r") as file: + log_contents = file.read() + + assert "spam eggs" in log_contents + + first_link = os.readlink(pathlib.Path(test_model.path, f"{test_model.name}.out")) + + test_model.run_settings.exe_args = ["hello", "world"] + 
exp.generate(test_model, overwrite=True) + exp.start(test_model, block=True) + + assert pathlib.Path(test_model.path).exists() + assert pathlib.Path(test_model.path, f"{test_model.name}.out").is_symlink() + assert pathlib.Path(test_model.path, f"{test_model.name}.err").is_symlink() + + with open(pathlib.Path(test_model.path, f"{test_model.name}.out"), "r") as file: + log_contents = file.read() + + assert "hello world" in log_contents + + second_link = os.readlink(pathlib.Path(test_model.path, f"{test_model.name}.out")) + + with open(first_link, "r") as file: + first_historical_log = file.read() + + assert "spam eggs" in first_historical_log + + with open(second_link, "r") as file: + second_historical_log = file.read() + + assert "hello world" in second_historical_log + + +def test_get_output_files_with_create_job_step(test_dir): + """Testing output files through _create_job_step""" + exp_dir = pathlib.Path(test_dir) + status_dir = exp_dir / CONFIG.telemetry_subdir / model.type + step = controller._create_job_step(model, status_dir) + expected_out_path = status_dir / model.name / (model.name + ".out") + expected_err_path = status_dir / model.name / (model.name + ".err") + assert step.get_output_files() == (str(expected_out_path), str(expected_err_path)) + + +@pytest.mark.parametrize( + "entity", + [pytest.param(ens, id="ensemble"), pytest.param(orc, id="orchestrator")], +) +def test_get_output_files_with_create_batch_job_step(entity, test_dir): + """Testing output files through _create_batch_job_step""" + exp_dir = pathlib.Path(test_dir) + status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type + batch_step, substeps = slurm_controller._create_batch_job_step(entity, status_dir) + for step in substeps: + # example output path for a member of an Ensemble is + # .smartsim/telemetry/Ensemble/ens/ens_0/ens_0.out + expected_out_path = ( + status_dir / entity.name / step.entity_name / (step.entity_name + ".out") + ) + expected_err_path = ( + status_dir / entity.name / 
step.entity_name / (step.entity_name + ".err") + ) + assert step.get_output_files() == ( + str(expected_out_path), + str(expected_err_path), + ) + + +def test_model_get_output_files(test_dir): + """Testing model output files with manual step creation""" + exp_dir = pathlib.Path(test_dir) + step = Step(model.name, model.path, model.run_settings) + step.meta["status_dir"] = exp_dir / "output_dir" + expected_out_path = step.meta["status_dir"] / (model.name + ".out") + expected_err_path = step.meta["status_dir"] / (model.name + ".err") + assert step.get_output_files() == (str(expected_out_path), str(expected_err_path)) + + +def test_ensemble_get_output_files(test_dir): + """Testing ensemble output files with manual step creation""" + exp_dir = pathlib.Path(test_dir) + for member in ens.models: + step = Step(member.name, member.path, member.run_settings) + step.meta["status_dir"] = exp_dir / "output_dir" + expected_out_path = step.meta["status_dir"] / (member.name + ".out") + expected_err_path = step.meta["status_dir"] / (member.name + ".err") + assert step.get_output_files() == ( + str(expected_out_path), + str(expected_err_path), + ) + + +def test_get_output_files_no_status_dir(test_dir): + """Test that a step not having a status directory throws a KeyError""" + step_settings = RunSettings("echo") + step = Step("mock-step", test_dir, step_settings) + with pytest.raises(KeyError): + out, err = step.get_output_files() diff --git a/tests/test_pbs_parser.py b/tests/test_pbs_parser.py index f77eb7c93..ae01ffb19 100644 --- a/tests/test_pbs_parser.py +++ b/tests/test_pbs_parser.py @@ -72,3 +72,23 @@ def test_parse_qstat_status(): status = "R" parsed_status = pbsParser.parse_qstat_jobid(output, "1289903.sdb") assert status == parsed_status + + +def test_parse_qstat_status_not_found(): + output = ( + "Job id Name User Time Use S Queue\n" + "---------------- ---------------- ---------------- -------- - -----\n" + "1289903.sdb jobname username 00:00:00 R queue\n" + ) + 
parsed_status = pbsParser.parse_qstat_jobid(output, "9999999.sdb") + + assert parsed_status is None + + +def test_parse_qstat_status_json(fileutils): + """Parse nodes from qsub called with -f -F json""" + file_path = fileutils.get_test_conf_path("qstat.json") + output = Path(file_path).read_text() + status = "R" + parsed_status = pbsParser.parse_qstat_jobid_json(output, "16705.sdb") + assert status == parsed_status diff --git a/tests/test_preview.py b/tests/test_preview.py new file mode 100644 index 000000000..3c7bed6fe --- /dev/null +++ b/tests/test_preview.py @@ -0,0 +1,1330 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2023, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pathlib +import sys +import typing as t +from os import path as osp + +import jinja2 +import numpy as np +import pytest + +import smartsim +import smartsim._core._cli.utils as _utils +from smartsim import Experiment +from smartsim._core import Manifest, previewrenderer +from smartsim._core.config import CONFIG +from smartsim._core.control.controller import Controller +from smartsim._core.control.job import Job +from smartsim.database import Orchestrator +from smartsim.entity.entity import SmartSimEntity +from smartsim.error.errors import PreviewFormatError +from smartsim.settings import QsubBatchSettings, RunSettings + +pytestmark = pytest.mark.group_b + + +@pytest.fixture +def choose_host(): + def _choose_host(wlmutils, index: int = 0): + hosts = wlmutils.get_test_hostlist() + if hosts: + return hosts[index] + return None + + return _choose_host + + +@pytest.fixture +def preview_object(test_dir) -> t.Dict[str, Job]: + """ + Bare bones orch + """ + rs = RunSettings(exe="echo", exe_args="ifname=lo") + s = SmartSimEntity(name="faux-name", path=test_dir, run_settings=rs) + o = Orchestrator() + o.entity = s + s.db_identifier = "test_db_id" + s.ports = [1235] + s.num_shards = 1 + job = Job("faux-name", "faux-step-id", s, "slurm", True) + active_dbjobs: t.Dict[str, Job] = {"mock_job": job} + return active_dbjobs + + +@pytest.fixture +def preview_object_multidb(test_dir) -> t.Dict[str, Job]: + """ + Bare bones orch + """ + rs = RunSettings(exe="echo", exe_args="ifname=lo") + s = SmartSimEntity(name="faux-name", path=test_dir, run_settings=rs) + o = Orchestrator() + o.entity = s + s.db_identifier = "testdb_reg" + s.ports = [8750] + s.num_shards = 1 + job = Job("faux-name", "faux-step-id", s, "slurm", True) + + rs2 = RunSettings(exe="echo", exe_args="ifname=lo") + s2 = SmartSimEntity(name="faux-name_2", path=test_dir, run_settings=rs) + o2 = Orchestrator() + o2.entity = s2 + s2.db_identifier = "testdb_reg2" + s2.ports = [8752] + s2.num_shards = 1 + job2 = 
Job("faux-name_2", "faux-step-id_2", s2, "slurm", True) + + active_dbjobs: t.Dict[str, Job] = {"mock_job": job, "mock_job2": job2} + return active_dbjobs + + +def add_batch_resources(wlmutils, batch_settings): + if isinstance(batch_settings, QsubBatchSettings): + for key, value in wlmutils.get_batch_resources().items(): + batch_settings.set_resource(key, value) + + +def test_get_ifname_filter(): + """Test get_ifname filter""" + + # Test input and expected output + value_dict = ( + (["+ifname=ib0"], "ib0"), + ("", ""), + ("+ifnameib0", ""), + ("=ib0", ""), + (["_ifname=bad_if_key"], "bad_if_key"), + (["ifname=mock_if_name"], "mock_if_name"), + ("IFname=case_sensitive_key", ""), + ("xfname=not_splittable", ""), + (None, ""), + ) + + template_str = "{{ value | get_ifname }}" + template_dict = {"ts": template_str} + + loader = jinja2.DictLoader(template_dict) + env = jinja2.Environment(loader=loader, autoescape=True) + env.filters["get_ifname"] = previewrenderer.get_ifname + + t = env.get_template("ts") + + for input, expected_output in value_dict: + output = t.render(value=input) + # assert that that filter output matches expected output + assert output == expected_output + + +def test_get_dbtype_filter(): + """Test get_dbtype filter to extract database backend from config""" + + template_str = "{{ config | get_dbtype }}" + template_dict = {"ts": template_str} + loader = jinja2.DictLoader(template_dict) + env = jinja2.Environment(loader=loader, autoescape=True) + env.filters["get_dbtype"] = previewrenderer.get_dbtype + + t = env.get_template("ts") + output = t.render(config=CONFIG.database_cli) + + assert output in CONFIG.database_cli + # Test empty input + test_string = "" + output = t.render(config=test_string) + assert output == "" + # Test empty path + test_string = "SmartSim/smartsim/_core/bin/" + output = t.render(config=test_string) + assert output == "" + # Test no hyphen + test_string = "SmartSim/smartsim/_core/bin/rediscli" + output = 
t.render(config=test_string) + assert output == "" + # Test no LHS + test_string = "SmartSim/smartsim/_core/bin/redis-" + output = t.render(config=test_string) + assert output == "" + # Test no RHS + test_string = "SmartSim/smartsim/_core/bin/-cli" + output = t.render(config=test_string) + assert output == "" + + +def test_experiment_preview(test_dir, wlmutils): + """Test correct preview output fields for Experiment preview""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp_name = "test_experiment_preview" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Execute method for template rendering + output = previewrenderer.render(exp, verbosity_level="debug") + + # Evaluate output + summary_lines = output.split("\n") + summary_lines = [item.replace("\t", "").strip() for item in summary_lines[-3:]] + assert 3 == len(summary_lines) + summary_dict = dict(row.split(": ") for row in summary_lines) + assert set(["Experiment Name", "Experiment Path", "Launcher"]).issubset( + summary_dict + ) + + +def test_experiment_preview_properties(test_dir, wlmutils): + """Test correct preview output properties for Experiment preview""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp_name = "test_experiment_preview_properties" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Execute method for template rendering + output = previewrenderer.render(exp, verbosity_level="debug") + + # Evaluate output + summary_lines = output.split("\n") + summary_lines = [item.replace("\t", "").strip() for item in summary_lines[-3:]] + assert 3 == len(summary_lines) + summary_dict = dict(row.split(": ") for row in summary_lines) + assert exp.name == summary_dict["Experiment Name"] + assert exp.exp_path == summary_dict["Experiment Path"] + assert exp.launcher == summary_dict["Launcher"] + + +def test_orchestrator_preview_render(test_dir, wlmutils, choose_host): + """Test correct preview output 
properties for Orchestrator preview""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + exp_name = "test_orchestrator_preview_properties" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # create regular database + orc = exp.create_database( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + ) + preview_manifest = Manifest(orc) + + # Execute method for template rendering + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Database Identifier" in output + assert "Shards" in output + assert "TCP/IP Port(s)" in output + assert "Network Interface" in output + assert "Type" in output + assert "Executable" in output + + db_path = _utils.get_db_path() + if db_path: + db_type, _ = db_path.name.split("-", 1) + + assert orc.db_identifier in output + assert str(orc.num_shards) in output + assert orc._interfaces[0] in output + assert db_type in output + assert CONFIG.database_exe in output + assert orc.run_command in output + assert str(orc.db_nodes) in output + + +def test_preview_to_file(test_dir, wlmutils): + """ + Test that if an output_filename is given, a file + is rendered for Experiment preview" + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp_name = "test_preview_output_filename" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + filename = "test_preview_output_filename.txt" + path = pathlib.Path(test_dir) / filename + # Execute preview method + exp.preview( + output_format=previewrenderer.Format.PLAINTEXT, + output_filename=str(path), + verbosity_level="debug", + ) + + # Evaluate output + assert path.exists() + assert path.is_file() + + +def test_model_preview(test_dir, wlmutils): + """ + Test correct preview output fields for Model preview + """ + # Prepare entities + exp_name = 
"test_model_preview" + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + model_params = {"port": 6379, "password": "unbreakable_password"} + rs1 = RunSettings("bash", "multi_tags_template.sh") + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + hello_world_model = exp.create_model( + "echo-hello", run_settings=rs1, params=model_params + ) + + spam_eggs_model = exp.create_model("echo-spam", run_settings=rs2) + + preview_manifest = Manifest(hello_world_model, spam_eggs_model) + + # Execute preview method + rendered_preview = previewrenderer.render( + exp, preview_manifest, verbosity_level="debug" + ) + + # Evaluate output + assert "Model Name" in rendered_preview + assert "Executable" in rendered_preview + assert "Executable Arguments" in rendered_preview + assert "Model Parameters" in rendered_preview + + +def test_model_preview_properties(test_dir, wlmutils): + """ + Test correct preview output properties for Model preview + """ + # Prepare entities + exp_name = "test_model_preview_parameters" + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + hw_name = "echo-hello" + hw_port = 6379 + hw_password = "unbreakable_password" + hw_rs = "multi_tags_template.sh" + model_params = {"port": hw_port, "password": hw_password} + hw_param1 = "bash" + rs1 = RunSettings(hw_param1, hw_rs) + + se_name = "echo-spam" + se_param1 = "echo" + se_param2 = "spam" + se_param3 = "eggs" + rs2 = exp.create_run_settings(se_param1, [se_param2, se_param3]) + + hello_world_model = exp.create_model(hw_name, run_settings=rs1, params=model_params) + spam_eggs_model = exp.create_model(se_name, run_settings=rs2) + + preview_manifest = Manifest(hello_world_model, spam_eggs_model) + + # Execute preview method + rendered_preview = previewrenderer.render( + exp, preview_manifest, verbosity_level="debug" + ) + + # Evaluate output for hello world model + 
assert hw_name in rendered_preview + assert hw_param1 in rendered_preview + assert hw_rs in rendered_preview + assert "port" in rendered_preview + assert "password" in rendered_preview + assert str(hw_port) in rendered_preview + assert hw_password in rendered_preview + + assert hw_name == hello_world_model.name + assert hw_param1 in hello_world_model.run_settings.exe[0] + assert hw_rs == hello_world_model.run_settings.exe_args[0] + assert None == hello_world_model.batch_settings + assert "port" in list(hello_world_model.params.items())[0] + assert hw_port in list(hello_world_model.params.items())[0] + assert "password" in list(hello_world_model.params.items())[1] + assert hw_password in list(hello_world_model.params.items())[1] + + # Evaluate outputfor spam eggs model + assert se_name in rendered_preview + assert se_param1 in rendered_preview + assert se_param2 in rendered_preview + assert se_param3 in rendered_preview + + assert se_name == spam_eggs_model.name + assert se_param1 in spam_eggs_model.run_settings.exe[0] + assert se_param2 == spam_eggs_model.run_settings.exe_args[0] + assert se_param3 == spam_eggs_model.run_settings.exe_args[1] + + +def test_preview_model_tagged_files(fileutils, test_dir, wlmutils): + """ + Test model with tagged files in preview. 
+ """ + # Prepare entities + exp_name = "test_model_preview_parameters" + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + model_params = {"port": 6379, "password": "unbreakable_password"} + model_settings = RunSettings("bash", "multi_tags_template.sh") + + hello_world_model = exp.create_model( + "echo-hello", run_settings=model_settings, params=model_params + ) + + config = fileutils.get_test_conf_path( + osp.join("generator_files", "multi_tags_template.sh") + ) + hello_world_model.attach_generator_files(to_configure=[config]) + exp.generate(hello_world_model, overwrite=True) + + preview_manifest = Manifest(hello_world_model) + + # Execute preview method + rendered_preview = previewrenderer.render( + exp, preview_manifest, verbosity_level="debug" + ) + + # Evaluate output + assert "Tagged Files for Model Configuration" in rendered_preview + assert "generator_files/multi_tags_template.sh" in rendered_preview + assert "generator_files/multi_tags_template.sh" in hello_world_model.files.tagged[0] + + +def test_model_key_prefixing(test_dir, wlmutils): + """ + Test preview for enabling key prefixing for a Model + """ + # Prepare entities + exp_name = "test_model_key_prefixing" + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + db = exp.create_database(port=6780, interface="lo") + exp.generate(db, overwrite=True) + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + model = exp.create_model("model_test", run_settings=rs1) + + # enable key prefixing on model + model.enable_key_prefixing() + exp.generate(model, overwrite=True) + + preview_manifest = Manifest(db, model) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Key Prefix" in output + assert "model_test" in output + assert "Outgoing Key Collision Prevention (Key Prefixing)" 
in output + assert "Tensors: On" in output + assert "Datasets: On" in output + assert "ML Models/Torch Scripts: Off" in output + assert "Aggregation Lists: On" in output + + +def test_ensembles_preview(test_dir, wlmutils): + """ + Test ensemble preview fields are correct in template render + """ + test_launcher = wlmutils.get_test_launcher() + exp = Experiment( + "test-ensembles-preview", exp_path=test_dir, launcher=test_launcher + ) + + # setup ensemble parameter space + learning_rate = list(np.linspace(0.01, 0.5)) + train_params = {"LR": learning_rate} + + # define how each member should run + run = exp.create_run_settings(exe="python", exe_args="./train-model.py") + + ensemble = exp.create_ensemble( + "Training-Ensemble", + params=train_params, + params_as_args=["LR"], + run_settings=run, + perm_strategy="random", + n_models=4, + ) + + preview_manifest = Manifest(ensemble) + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Ensemble Name" in output + assert "Members" in output + assert "Ensemble Parameters" in output + + +def test_preview_models_and_ensembles(test_dir, wlmutils): + """ + Test preview of separate model entity and ensemble entity + """ + exp_name = "test-preview-model-and-ensemble" + test_dir = pathlib.Path(test_dir) / exp_name + test_dir.mkdir(parents=True) + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=str(test_dir), launcher=test_launcher) + + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + hw_name = "echo-hello" + se_name = "echo-spam" + ens_name = "echo-ensemble" + hello_world_model = exp.create_model(hw_name, run_settings=rs1) + spam_eggs_model = exp.create_model(se_name, run_settings=rs2) + hello_ensemble = exp.create_ensemble(ens_name, run_settings=rs1, replicas=3) + + exp.generate(hello_world_model, spam_eggs_model, hello_ensemble) + + preview_manifest = 
Manifest(hello_world_model, spam_eggs_model, hello_ensemble) + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Models" in output + assert hw_name in output + assert se_name in output + + assert "Ensembles" in output + assert ens_name + "_1" in output + assert ens_name + "_2" in output + + +def test_ensemble_preview_client_configuration(test_dir, wlmutils): + """ + Test preview of client configuration and key prefixing in Ensemble preview + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp = Experiment( + "test-preview-ensemble-clientconfig", exp_path=test_dir, launcher=test_launcher + ) + # Create Orchestrator + db = exp.create_database(port=6780, interface="lo") + exp.generate(db, overwrite=True) + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + # Create ensemble + ensemble = exp.create_ensemble("fd_simulation", run_settings=rs1, replicas=2) + # enable key prefixing on ensemble + ensemble.enable_key_prefixing() + exp.generate(ensemble, overwrite=True) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + # Create model + ml_model = exp.create_model("tf_training", rs2) + + for sim in ensemble.entities: + ml_model.register_incoming_entity(sim) + + exp.generate(ml_model, overwrite=True) + preview_manifest = Manifest(db, ml_model, ensemble) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Client Configuration" in output + assert "Database Identifier" in output + assert "Database Backend" in output + assert "Type" in output + + +def test_ensemble_preview_client_configuration_multidb(test_dir, wlmutils): + """ + Test preview of client configuration and key prefixing in Ensemble preview + with multiple databases + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp = Experiment( + "test-preview-multidb-clinet-config", 
exp_path=test_dir, launcher=test_launcher + ) + # Create Orchestrator + db1_dbid = "db_1" + db1 = exp.create_database(port=6780, interface="lo", db_identifier=db1_dbid) + exp.generate(db1, overwrite=True) + # Create another Orchestrator + db2_dbid = "db_2" + db2 = exp.create_database(port=6784, interface="lo", db_identifier=db2_dbid) + exp.generate(db2, overwrite=True) + + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + # Create ensemble + ensemble = exp.create_ensemble("fd_simulation", run_settings=rs1, replicas=2) + # enable key prefixing on ensemble + ensemble.enable_key_prefixing() + exp.generate(ensemble, overwrite=True) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + # Create model + ml_model = exp.create_model("tf_training", rs2) + for sim in ensemble.entities: + ml_model.register_incoming_entity(sim) + exp.generate(ml_model, overwrite=True) + preview_manifest = Manifest(db1, db2, ml_model, ensemble) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Client Configuration" in output + assert "Database Identifier" in output + assert "Database Backend" in output + assert "TCP/IP Port(s)" in output + assert "Type" in output + + assert db1_dbid in output + assert db2_dbid in output + + +def test_ensemble_preview_attached_files(fileutils, test_dir, wlmutils): + """ + Test the preview of tagged, copy, and symlink files attached + to an ensemble + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp = Experiment( + "test-preview-attached-files", exp_path=test_dir, launcher=test_launcher + ) + ensemble = exp.create_ensemble( + "dir_test", replicas=1, run_settings=RunSettings("python", exe_args="sleep.py") + ) + ensemble.entities = [] + params = {"THERMO": [10, 20], "STEPS": [20, 30]} + ensemble = exp.create_ensemble( + "dir_test", + params=params, + run_settings=RunSettings("python", exe_args="sleep.py"), + ) + 
gen_dir = fileutils.get_test_conf_path(osp.join("generator_files", "test_dir")) + symlink_dir = fileutils.get_test_conf_path( + osp.join("generator_files", "to_symlink_dir") + ) + copy_dir = fileutils.get_test_conf_path(osp.join("generator_files", "to_copy_dir")) + + ensemble.attach_generator_files() + ensemble.attach_generator_files( + to_configure=[gen_dir, copy_dir], to_copy=copy_dir, to_symlink=symlink_dir + ) + preview_manifest = Manifest(ensemble) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Tagged Files for Model Configuration" in output + assert "Copy Files" in output + assert "Symlink" in output + assert "Ensemble Parameters" in output + assert "Model Parameters" in output + + assert "generator_files/test_dir" in output + assert "generator_files/to_copy_dir" in output + assert "generator_files/to_symlink_dir" in output + + for model in ensemble: + assert "generator_files/test_dir" in model.files.tagged[0] + for copy in model.files.copy: + assert "generator_files/to_copy_dir" in copy + for link in model.files.link: + assert "generator_files/to_symlink_dir" in link + + +def test_preview_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): + """ + Test preview of DBModel on colocated ensembles + """ + + exp_name = "test-preview-colocated-db-model-ensemble" + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = 1 + + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + colo_settings.set_nodes(1) + colo_settings.set_tasks(1) + + # Create the ensemble of two identical SmartSim Model + colo_ensemble = 
exp.create_ensemble( + "colocated_ens", run_settings=colo_settings, replicas=2 + ) + + # Create colocated SmartSim Model + colo_model = exp.create_model("colocated_model", colo_settings) + + # Create and save ML model to filesystem + content = "empty test" + model_path = pathlib.Path(test_dir) / "model1.pt" + model_path.write_text(content) + + # Test adding a model from ensemble + colo_ensemble.add_ml_model( + "cnn", + "TF", + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs="args_0", + outputs="Identity", + ) + + # Colocate a database with the first ensemble members + for i, entity in enumerate(colo_ensemble): + entity.colocate_db_tcp( + port=test_port + i, db_cpus=1, debug=True, ifname=test_interface + ) + # Add ML models to each ensemble member to make sure they + # do not conflict with other ML models + entity.add_ml_model( + "cnn2", + "TF", + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs="args_0", + outputs="Identity", + ) + entity.disable_key_prefixing() + + # Add another ensemble member + colo_ensemble.add_model(colo_model) + + # Colocate a database with the new ensemble member + colo_model.colocate_db_tcp( + port=test_port + len(colo_ensemble) - 1, + db_cpus=1, + debug=True, + ifname=test_interface, + ) + # Add a ML model to the new ensemble member + model_inputs = "args_0" + model_outputs = "Identity" + model_name = "cnn2" + model_backend = "TF" + colo_model.add_ml_model( + model_name, + model_backend, + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=model_inputs, + outputs=model_outputs, + ) + + exp.generate(colo_ensemble) + + preview_manifest = Manifest(colo_ensemble) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Models" in output + assert "Name" in output + assert "Backend" in output + 
assert "Path" in output + assert "Device" in output + assert "Devices Per Node" in output + assert "Inputs" in output + assert "Outputs" in output + + assert model_name in output + assert model_backend in output + assert "Path" in output + assert "/model1.pt" in output + assert "CPU" in output + assert model_inputs in output + assert model_outputs in output + + +def test_preview_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): + """ + Test preview of DB Scripts on colocated DB from ensemble + """ + + exp_name = "test-preview-colocated-db-script" + + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 + + expected_torch_script = "torchscript.py" + test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + torch_script = fileutils.get_test_conf_path(expected_torch_script) + + # Create SmartSim Experiment + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + colo_settings.set_nodes(1) + colo_settings.set_tasks(1) + + # Create SmartSim Ensemble with two identical models + colo_ensemble = exp.create_ensemble( + "colocated_ensemble", run_settings=colo_settings, replicas=2 + ) + + # Create a SmartSim model + colo_model = exp.create_model("colocated_model", colo_settings) + + # Colocate a db with each ensemble entity and add a script + # to each entity via file + for i, entity in enumerate(colo_ensemble): + entity.disable_key_prefixing() + entity.colocate_db_tcp( + port=test_port + i, + db_cpus=1, + debug=True, + ifname=test_interface, + ) + + entity.add_script( + "test_script1", + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + # Colocate a db with the non-ensemble 
Model + colo_model.colocate_db_tcp( + port=test_port + len(colo_ensemble), + db_cpus=1, + debug=True, + ifname=test_interface, + ) + + # Add a script to the non-ensemble model + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + cm_name2 = "test_script2" + colo_ensemble.add_script( + cm_name2, + script=torch_script_str, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + # Add the third SmartSim model to the ensemble + colo_ensemble.add_model(colo_model) + + # Add another script via file to the entire ensemble + cm_name1 = "test_script1" + colo_model.add_script( + cm_name1, + script_path=torch_script, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + ) + + # Assert we have added one model to the ensemble + assert len(colo_ensemble._db_scripts) == 1 + # Assert we have added both models to each entity + assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) + + exp.generate(colo_ensemble) + + preview_manifest = Manifest(colo_ensemble) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Torch Scripts" in output + assert "Name" in output + assert "Path" in output + assert "Devices Per Node" in output + + assert cm_name2 in output + assert expected_torch_script in output + assert test_device in output + assert cm_name1 in output + + +def test_preview_active_infrastructure(wlmutils, test_dir, preview_object): + """Test active infrastructure without other orchestrators""" + + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp_name = "test_active_infrastructure_preview" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + # Execute method for template rendering + output = previewrenderer.render( + exp, active_dbjobs=preview_object, verbosity_level="debug" + ) + + assert "Active Infrastructure" in output + assert "Database Identifier" in output + assert 
"Shards" in output + assert "Network Interface" in output + assert "Type" in output + assert "TCP/IP" in output + + +def test_preview_orch_active_infrastructure( + wlmutils, test_dir, choose_host, preview_object +): + """ + Test correct preview output properties for active infrastructure preview + with other orchestrators + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + exp_name = "test_orchestrator_active_infrastructure_preview" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + orc2 = exp.create_database( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + db_identifier="orc_2", + ) + + orc3 = exp.create_database( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + db_identifier="orc_3", + ) + + preview_manifest = Manifest(orc2, orc3) + + # Execute method for template rendering + output = previewrenderer.render( + exp, preview_manifest, active_dbjobs=preview_object, verbosity_level="debug" + ) + + assert "Active Infrastructure" in output + assert "Database Identifier" in output + assert "Shards" in output + assert "Network Interface" in output + assert "Type" in output + assert "TCP/IP" in output + + +def test_preview_multidb_active_infrastructure( + wlmutils, test_dir, choose_host, preview_object_multidb +): + """multiple started databases active infrastructure""" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + + # start a new Experiment for this section + exp = Experiment( + "test_preview_multidb_active_infrastructure", + exp_path=test_dir, + launcher=test_launcher, + ) + + # Execute method for template rendering + output = previewrenderer.render( + exp, active_dbjobs=preview_object_multidb, verbosity_level="debug" + ) + + assert 
"Active Infrastructure" in output + assert "Database Identifier" in output + assert "Shards" in output + assert "Network Interface" in output + assert "Type" in output + assert "TCP/IP" in output + + assert "testdb_reg" in output + assert "testdb_reg2" in output + assert "Ochestrators" not in output + + +def test_preview_active_infrastructure_orchestrator_error( + wlmutils, test_dir, choose_host, monkeypatch: pytest.MonkeyPatch +): + """Demo error when trying to preview a started orchestrator""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + exp_name = "test_active_infrastructure_preview_orch_error" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + + monkeypatch.setattr( + smartsim.database.orchestrator.Orchestrator, "is_active", lambda x: True + ) + + orc = exp.create_database( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + db_identifier="orc_1", + ) + + # Retrieve any active jobs + active_dbjobs = exp._control.active_orchestrator_jobs + + preview_manifest = Manifest(orc) + + # Execute method for template rendering + output = previewrenderer.render( + exp, preview_manifest, active_dbjobs=active_dbjobs, verbosity_level="debug" + ) + + assert "WARNING: Cannot preview orc_1, because it is already started" in output + + +def test_active_orchestrator_jobs_property( + wlmutils, + test_dir, + preview_object, +): + """Ensure db_jobs remaines unchanged after deletion + of active_orchestrator_jobs property stays intact when retrieving db_jobs""" + + # Retrieve parameters from testing environment + test_launcher = wlmutils.get_test_launcher() + + # start a new Experiment for this section + exp = Experiment( + "test-active_orchestrator_jobs-property", + exp_path=test_dir, + launcher=test_launcher, + ) + + controller = Controller() + controller._jobs.db_jobs = preview_object + + # Modify the returned job 
collection + active_orchestrator_jobs = exp._control.active_orchestrator_jobs + active_orchestrator_jobs["test"] = "test_value" + + # Verify original collection is not also modified + assert not exp._control.active_orchestrator_jobs.get("test", None) + + +def test_verbosity_info_ensemble(test_dir, wlmutils): + """ + Test preview of separate model entity and ensemble entity + with verbosity level set to info + """ + exp_name = "test-model-and-ensemble" + test_dir = pathlib.Path(test_dir) / exp_name + test_dir.mkdir(parents=True) + test_launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, exp_path=str(test_dir), launcher=test_launcher) + + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + + hw_name = "echo-hello" + se_name = "echo-spam" + ens_name = "echo-ensemble" + hello_world_model = exp.create_model(hw_name, run_settings=rs1) + spam_eggs_model = exp.create_model(se_name, run_settings=rs2) + hello_ensemble = exp.create_ensemble(ens_name, run_settings=rs1, replicas=3) + + exp.generate(hello_world_model, spam_eggs_model, hello_ensemble) + + preview_manifest = Manifest(hello_world_model, spam_eggs_model, hello_ensemble) + output = previewrenderer.render(exp, preview_manifest, verbosity_level="info") + + assert "Executable" not in output + assert "Executable Arguments" not in output + + assert "echo_ensemble_1" not in output + + +def test_verbosity_info_colocated_db_model_ensemble( + fileutils, test_dir, wlmutils, mlutils +): + """Test preview of DBModel on colocated ensembles, first adding the DBModel to the + ensemble, then colocating DB. 
+ """ + + exp_name = "test-colocated-db-model-ensemble-reordered" + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + test_device = mlutils.get_test_device() + test_num_gpus = 1 + + test_script = fileutils.get_test_conf_path("run_tf_dbmodel_smartredis.py") + + exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) + colo_settings = exp.create_run_settings(exe=sys.executable, exe_args=test_script) + colo_settings.set_nodes(1) + colo_settings.set_tasks(1) + + # Create the ensemble of two identical SmartSim Model + colo_ensemble = exp.create_ensemble( + "colocated_ens", run_settings=colo_settings, replicas=2 + ) + + # Create colocated SmartSim Model + colo_model = exp.create_model("colocated_model", colo_settings) + + # Create and save ML model to filesystem + content = "empty test" + model_path = pathlib.Path(test_dir) / "model1.pt" + model_path.write_text(content) + + # Test adding a model from ensemble + colo_ensemble.add_ml_model( + "cnn", + "TF", + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs="args_0", + outputs="Identity", + ) + + # Colocate a database with the first ensemble members + for i, entity in enumerate(colo_ensemble): + entity.colocate_db_tcp( + port=test_port + i, db_cpus=1, debug=True, ifname=test_interface + ) + # Add ML models to each ensemble member to make sure they + # do not conflict with other ML models + entity.add_ml_model( + "cnn2", + "TF", + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs="args_0", + outputs="Identity", + ) + entity.disable_key_prefixing() + + # Add another ensemble member + colo_ensemble.add_model(colo_model) + + # Colocate a database with the new ensemble member + colo_model.colocate_db_tcp( + port=test_port + len(colo_ensemble) - 1, + db_cpus=1, + debug=True, + ifname=test_interface, + ) + # Add a ML 
model to the new ensemble member + model_inputs = "args_0" + model_outputs = "Identity" + model_name = "cnn2" + model_backend = "TF" + colo_model.add_ml_model( + model_name, + model_backend, + model_path=model_path, + device=test_device, + devices_per_node=test_num_gpus, + first_device=0, + inputs=model_inputs, + outputs=model_outputs, + ) + + exp.generate(colo_ensemble) + + preview_manifest = Manifest(colo_ensemble) + + # Execute preview method + output = previewrenderer.render(exp, preview_manifest, verbosity_level="info") + + assert "Outgoing Key Collision Prevention (Key Prefixing)" not in output + assert "Devices Per Node" not in output + + +def test_verbosity_info_orchestrator(test_dir, wlmutils, choose_host): + """Test correct preview output properties for Orchestrator preview""" + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + test_interface = wlmutils.get_test_interface() + test_port = wlmutils.get_test_port() + exp_name = "test_orchestrator_preview_properties" + exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) + # create regular database + orc = exp.create_database( + port=test_port, + interface=test_interface, + hosts=choose_host(wlmutils), + ) + preview_manifest = Manifest(orc) + + # Execute method for template rendering + output = previewrenderer.render(exp, preview_manifest, verbosity_level="info") + + # Evaluate output + assert "Executable" not in output + assert "Run Command" not in output + + +def test_verbosity_info_ensemble(test_dir, wlmutils): + """ + Test client configuration and key prefixing in Ensemble preview + """ + # Prepare entities + test_launcher = wlmutils.get_test_launcher() + exp = Experiment("key_prefix_test", exp_path=test_dir, launcher=test_launcher) + # Create Orchestrator + db = exp.create_database(port=6780, interface="lo") + exp.generate(db, overwrite=True) + rs1 = exp.create_run_settings("echo", ["hello", "world"]) + # Create ensemble + ensemble = exp.create_ensemble("fd_simulation", 
run_settings=rs1, replicas=2) + # enable key prefixing on ensemble + ensemble.enable_key_prefixing() + exp.generate(ensemble, overwrite=True) + rs2 = exp.create_run_settings("echo", ["spam", "eggs"]) + # Create model + ml_model = exp.create_model("tf_training", rs2) + + for sim in ensemble.entities: + ml_model.register_incoming_entity(sim) + + exp.generate(ml_model, overwrite=True) + preview_manifest = Manifest(db, ml_model, ensemble) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="info") + + # Evaluate output + assert "Outgoing Key Collision Prevention (Key Prefixing)" in output + + +def test_check_output_format_error(): + """ + Test error when invalid ouput format is given. + """ + # Prepare entities + exp_name = "test_output_format" + exp = Experiment(exp_name) + + # Execute preview method + with pytest.raises(PreviewFormatError) as ex: + exp.preview(output_format="hello") + assert ( + "The only valid output format currently available is plain_text" + in ex.value.args[0] + ) + + +def test_check_verbosity_level_error(): + """ + Testing that an error does occur when a string verbosity is passed + """ + # Prepare entities + exp_name = "test_verbosity_level_error" + exp = Experiment(exp_name) + + # Execute preview method + with pytest.raises(ValueError) as ex: + exp.preview(verbosity_level="hello") + + +def test_check_verbosity_level(): + """ + Testing that an error doesnt occur when a string verbosity is passed + """ + # Prepare entities + exp_name = "test_verbosity_level" + exp = Experiment(exp_name) + + # Execute preview method + exp.preview(verbosity_level="info") + + +def test_preview_colocated_db_singular_model(wlmutils, test_dir): + """Test preview behavior when a colocated db is only added to + one model. 
The expected behviour is that both models are colocated + """ + + test_launcher = wlmutils.get_test_launcher() + + exp = Experiment("colocated test", exp_path=test_dir, launcher=test_launcher) + + rs = exp.create_run_settings("sleep", ["100"]) + + model_1 = exp.create_model("model_1", run_settings=rs) + model_2 = exp.create_model("model_2", run_settings=rs) + + model_1.colocate_db() + + exp.generate(model_1, model_2, overwrite=True) + + preview_manifest = Manifest(model_1, model_2) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + assert "model_1" in output + assert "model_2" in output + assert "Client Configuration" in output + + +def test_preview_db_script(wlmutils, test_dir): + """ + Test preview of model instance with a torch script. + """ + test_launcher = wlmutils.get_test_launcher() + # Initialize the Experiment and set the launcher to auto + + exp = Experiment("getting-started", launcher=test_launcher) + + # Initialize a RunSettings object + model_settings = exp.create_run_settings(exe="python", exe_args="params.py") + + # Initialize a Model object + model_instance = exp.create_model("model_name", model_settings) + model_instance.colocate_db_tcp() + + # TorchScript string + torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" + + # Attach TorchScript to Model + model_instance.add_script( + name="example_script", + script=torch_script_str, + device="GPU", + devices_per_node=2, + first_device=0, + ) + preview_manifest = Manifest(model_instance) + + # Call preview renderer for testing output + output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") + + # Evaluate output + assert "Torch Script" in output diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index 554e42cbd..6ce93c6f9 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -29,8 +29,9 @@ import pytest -from 
smartsim import Experiment, status +from smartsim import Experiment from smartsim.database import Orchestrator +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b @@ -50,11 +51,11 @@ def test_local_orchestrator(test_dir, wlmutils): first_dir = test_dir orc = Orchestrator(port=wlmutils.get_test_port()) - orc.set_path(test_dir) + orc.set_path(osp.join(test_dir, "orchestrator")) exp.start(orc) statuses = exp.get_status(orc) - assert [stat != status.STATUS_FAILED for stat in statuses] + assert [stat != SmartSimStatus.STATUS_FAILED for stat in statuses] # simulate user shutting down main thread exp._control._jobs.actively_monitoring = False @@ -68,7 +69,7 @@ def test_reconnect_local_orc(test_dir): exp_name = "test-orc-local-reconnect-2nd" exp_2 = Experiment(exp_name, launcher="local", exp_path=test_dir) - checkpoint = osp.join(first_dir, "smartsim_db.dat") + checkpoint = osp.join(first_dir, "orchestrator", "smartsim_db.dat") reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) # let statuses update once @@ -76,7 +77,7 @@ def test_reconnect_local_orc(test_dir): statuses = exp_2.get_status(reloaded_orc) for stat in statuses: - if stat == status.STATUS_FAILED: + if stat == SmartSimStatus.STATUS_FAILED: exp_2.stop(reloaded_orc) assert False exp_2.stop(reloaded_orc) diff --git a/tests/test_run_settings.py b/tests/test_run_settings.py index b9439f41a..056dad64b 100644 --- a/tests/test_run_settings.py +++ b/tests/test_run_settings.py @@ -185,55 +185,34 @@ def test_add_exe_args_list_of_mixed(): settings.add_exe_args(["1", "2", 3]) -def test_add_exe_args_space_delimited_string(): +def test_add_exe_args_list_of_lists(): """Ensure that any non-string exe arg fails validation for all""" settings = RunSettings("python") - expected = ["1", "2", "3"] - settings.add_exe_args("1 2 3") - - assert settings.exe_args == expected - - -def test_add_exe_args_list_of_mixed_lists(): - """Ensure that any non-string 
exe arg fails validation for all""" - settings = RunSettings("python") - with pytest.raises(TypeError) as type_error: - settings.add_exe_args([["1", "2", 3], ["4", "5", 6]]) - - assert "Executable arguments should be a list of str" in type_error.value.args - - -def test_add_exe_args_list_of_mixed_lists_init(): - """Ensure that any non-string exe arg fails validation for all""" - exe_args = [["1", "2", 3], ["4", "5", 6]] - - with pytest.raises(TypeError) as type_error: - settings = RunSettings("python", exe_args=exe_args) - - assert "Executable arguments were not list of str or str" in type_error.value.args + with pytest.raises(TypeError): + settings.add_exe_args(["1", "2", "3"], ["1", "2", "3"]) -def test_add_exe_args_list_of_str_lists_init(): - """Ensure that list[list[str]] pass validation""" +def test_init_exe_args_list_of_lists(): + """Ensure that a list of lists exe arg fails validation""" exe_args = [["1", "2", "3"], ["4", "5", "6"]] + with pytest.raises(TypeError): + _ = RunSettings("python", exe_args=exe_args) - settings = RunSettings("python", exe_args=exe_args) - assert settings.exe_args == exe_args +def test_init_exe_args_list_of_lists_mixed(): + """Ensure that a list of lists exe arg fails validation""" + exe_args = [["1", "2", 3], ["4", "5", 6]] + with pytest.raises(TypeError): + _ = RunSettings("python", exe_args=exe_args) -def test_add_exe_args_list_of_str_lists(): - """Ensure that list[list[str]] fail validation when added via method""" - exe_args = [["1", "2", "3"], ["4", "5", "6"]] - +def test_add_exe_args_space_delimited_string(): + """Ensure that any non-string exe arg fails validation for all""" settings = RunSettings("python") + expected = ["1", "2", "3"] + settings.add_exe_args("1 2 3") - with pytest.raises(TypeError) as type_error: - settings.add_exe_args(exe_args) - - # NOTE that this behavior differs from sending constructor args like - # tested in test_add_exe_args_list_of_str_lists_init where it's allowed - assert "Executable arguments 
should be a list of str" in type_error.value.args + assert settings.exe_args == expected def test_format_run_args(): @@ -360,6 +339,7 @@ def test_set_format_args(set_str, val, key): pytest.param("set_task_map", (3,), id="set_task_map"), pytest.param("set_cpus_per_task", (4,), id="set_cpus_per_task"), pytest.param("set_hostlist", ("hostlist",), id="set_hostlist"), + pytest.param("set_node_feature", ("P100",), id="set_node_feature"), pytest.param( "set_hostlist_from_file", ("~/hostfile",), id="set_hostlist_from_file" ), diff --git a/tests/test_schema_utils.py b/tests/test_schema_utils.py new file mode 100644 index 000000000..78789f8ef --- /dev/null +++ b/tests/test_schema_utils.py @@ -0,0 +1,217 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import collections +import json + +import pydantic +import pytest + +from smartsim._core.schemas.utils import ( + _DEFAULT_MSG_DELIM, + SchemaRegistry, + SocketSchemaTranslator, + _Message, +) + +# The tests in this file belong to the group_b group +pytestmark = pytest.mark.group_b + + +class Person(pydantic.BaseModel): + name: str + age: int + + +class Dog(pydantic.BaseModel): + name: str + age: int + + +class Book(pydantic.BaseModel): + title: str + num_pages: int + + +def test_equivalent_messages_are_equivalent(): + book = Book(title="A Story", num_pages=250) + msg_1 = _Message(book, "header") + msg_2 = _Message(book, "header") + + assert msg_1 is not msg_2 + assert msg_1 == msg_2 + assert str(msg_1) == str(msg_2) + assert msg_1 == _Message.from_str(str(msg_1), Book) + + +def test_schema_registrartion(): + registry = SchemaRegistry() + assert registry._map == {} + + registry.register("person")(Person) + assert registry._map == {"person": Person} + + registry.register("book")(Book) + assert registry._map == {"person": Person, "book": Book} + + +def test_cannot_register_a_schema_under_an_empty_str(): + registry = SchemaRegistry() + with pytest.raises(KeyError, match="Key cannot be the empty string"): + registry.register("") + + +def test_schema_to_string(): + registry = SchemaRegistry() + registry.register("person")(Person) + registry.register("book")(Book) + person = Person(name="Bob", age=36) + book = Book(title="The Greatest Story of 
All Time", num_pages=10_000) + assert registry.to_string(person) == str(_Message(person, "person")) + assert registry.to_string(book) == str(_Message(book, "book")) + + +def test_schemas_with_same_shape_are_mapped_correctly(): + registry = SchemaRegistry() + registry.register("person")(Person) + registry.register("dog")(Dog) + + person = Person(name="Mark", age=34) + dog = Dog(name="Fido", age=5) + + parsed_person = registry.from_string(registry.to_string(person)) + parsed_dog = registry.from_string(registry.to_string(dog)) + + assert isinstance(parsed_person, Person) + assert isinstance(parsed_dog, Dog) + + assert parsed_person == person + assert parsed_dog == dog + + +def test_registry_errors_if_types_overloaded(): + registry = SchemaRegistry() + registry.register("schema")(Person) + + with pytest.raises(KeyError): + registry.register("schema")(Book) + + +def test_registry_errors_if_msg_type_registered_with_delim_present(): + registry = SchemaRegistry() + with pytest.raises(ValueError, match="cannot contain delimiter"): + registry.register(f"some_key_with_the_{_DEFAULT_MSG_DELIM}_as_a_substring") + + +def test_registry_errors_on_unknown_schema(): + registry = SchemaRegistry() + registry.register("person")(Person) + + with pytest.raises(TypeError): + registry.to_string(Book(title="The Shortest Story of All Time", num_pages=1)) + + +def test_registry_correctly_maps_to_expected_type(): + registry = SchemaRegistry() + registry.register("person")(Person) + registry.register("book")(Book) + person = Person(name="Bob", age=36) + book = Book(title="The Most Average Story of All Time", num_pages=500) + assert registry.from_string(str(_Message(person, "person"))) == person + assert registry.from_string(str(_Message(book, "book"))) == book + + +def test_registery_errors_if_type_key_not_recognized(): + registry = SchemaRegistry() + registry.register("person")(Person) + + with pytest.raises(ValueError, match="^No type of value .* registered$"): + 
registry.from_string(str(_Message(Person(name="Grunk", age=5_000), "alien"))) + + +def test_registry_errors_if_type_key_is_missing(): + registry = SchemaRegistry() + registry.register("person")(Person) + + with pytest.raises(ValueError, match="Failed to determine schema type"): + registry.from_string("This string does not contain a delimiter") + + +class MockSocket: + def __init__(self, send_queue, recv_queue): + self.send_queue = send_queue + self.recv_queue = recv_queue + + def send_string(self, str_, *_args, **_kwargs): + assert isinstance(str_, str) + self.send_queue.append(str_) + + def recv_string(self, *_args, **_kwargs): + str_ = self.recv_queue.popleft() + assert isinstance(str_, str) + return str_ + + +class Request(pydantic.BaseModel): ... + + +class Response(pydantic.BaseModel): ... + + +def test_socket_schema_translator_uses_schema_registries(): + server_to_client = collections.deque() + client_to_server = collections.deque() + + server_socket = MockSocket(server_to_client, client_to_server) + client_socket = MockSocket(client_to_server, server_to_client) + + req_reg = SchemaRegistry() + res_reg = SchemaRegistry() + + req_reg.register("message")(Request) + res_reg.register("message")(Response) + + server = SocketSchemaTranslator(server_socket, res_reg, req_reg) + client = SocketSchemaTranslator(client_socket, req_reg, res_reg) + + # Check sockets are able to communicate seamlessly with schemas only + client.send(Request()) + assert len(client_to_server) == 1 + req = server.recv() + assert len(client_to_server) == 0 + assert isinstance(req, Request) + + server.send(Response()) + assert len(server_to_client) == 1 + res = client.recv() + assert len(server_to_client) == 0 + assert isinstance(res, Response) + + # Ensure users cannot send unexpected schemas + with pytest.raises(TypeError, match="Unregistered schema"): + client.send(Response()) + with pytest.raises(TypeError, match="Unregistered schema"): + server.send(Request()) diff --git 
a/tests/test_serialize.py b/tests/test_serialize.py index 9e92a4866..b2dc0b7a7 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -27,6 +27,7 @@ import json import logging from pathlib import Path +from uuid import uuid4 import pytest @@ -59,7 +60,7 @@ def manifest_json(test_dir, config) -> str: def test_serialize_creates_a_manifest_json_file_if_dne(test_dir, manifest_json): - lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") + lmb = LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())) serialize.save_launch_manifest(lmb.finalize()) assert manifest_json.is_file() @@ -71,28 +72,29 @@ def test_serialize_creates_a_manifest_json_file_if_dne(test_dir, manifest_json): assert len(manifest["runs"]) == 1 -def test_serialize_does_not_write_manifest_json_if_telemetry_monitor_is_off( +def test_serialize_does_write_manifest_json_if_telemetry_monitor_is_off( test_dir, monkeypatch, manifest_json ): + """Ensure that the manifest is written even if telemetry is not collected""" monkeypatch.setattr( smartsim._core.config.config.Config, _CFG_TM_ENABLED_ATTR, property(lambda self: False), ) - lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") + lmb = LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())) serialize.save_launch_manifest(lmb.finalize()) - assert not manifest_json.exists() + assert manifest_json.exists() def test_serialize_appends_a_manifest_json_exists(test_dir, manifest_json): serialize.save_launch_manifest( - LaunchedManifestBuilder("exp", test_dir, "launcher").finalize() + LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())).finalize() ) serialize.save_launch_manifest( - LaunchedManifestBuilder("exp", test_dir, "launcher").finalize() + LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())).finalize() ) serialize.save_launch_manifest( - LaunchedManifestBuilder("exp", test_dir, "launcher").finalize() + LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())).finalize() ) 
assert manifest_json.is_file() @@ -108,7 +110,7 @@ def test_serialize_overwites_file_if_not_json(test_dir, manifest_json): with open(manifest_json, "w") as f: f.write("This is not a json\n") - lmb = LaunchedManifestBuilder("exp", test_dir, "launcher") + lmb = LaunchedManifestBuilder("exp", test_dir, "launcher", str(uuid4())) serialize.save_launch_manifest(lmb.finalize()) with open(manifest_json, "r") as f: assert isinstance(json.load(f), dict) diff --git a/tests/test_slurm_get_alloc.py b/tests/test_slurm_get_alloc.py index aa12ce362..561e3d984 100644 --- a/tests/test_slurm_get_alloc.py +++ b/tests/test_slurm_get_alloc.py @@ -26,7 +26,7 @@ import pytest -from smartsim.slurm import _get_alloc_cmd +from smartsim.wlm.slurm import _get_alloc_cmd # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b diff --git a/tests/test_slurm_settings.py b/tests/test_slurm_settings.py index aa5b2be11..d9d820244 100644 --- a/tests/test_slurm_settings.py +++ b/tests/test_slurm_settings.py @@ -105,6 +105,7 @@ def test_mpmd_compound_env_exports(): step = SrunStep("teststep", "./", srun) + step.meta["status_dir"] = "" launch_cmd = step.get_launch_cmd() env_cmds = [v for v in launch_cmd if v == "env"] assert "env" in launch_cmd and len(env_cmds) == 1 @@ -164,6 +165,7 @@ def test_mpmd_non_compound_env_exports(): step = SrunStep("teststep", "./", srun) + step.meta["status_dir"] = "" launch_cmd = step.get_launch_cmd() env_cmds = [v for v in launch_cmd if v == "env"] assert "env" not in launch_cmd and len(env_cmds) == 0 @@ -223,6 +225,7 @@ def test_mpmd_non_compound_no_exports(): step = SrunStep("teststep", "./", srun) + step.meta["status_dir"] = "" launch_cmd = step.get_launch_cmd() env_cmds = [v for v in launch_cmd if v == "env"] assert "env" not in launch_cmd and len(env_cmds) == 0 @@ -338,6 +341,21 @@ def test_set_hostlist(): rs.set_hostlist([5]) +def test_set_node_feature(): + rs = SrunSettings("python") + rs.set_node_feature(["P100", "V100"]) + assert 
rs.run_args["C"] == "P100,V100" + + rs.set_node_feature("P100") + assert rs.run_args["C"] == "P100" + + with pytest.raises(TypeError): + rs.set_node_feature(5) + + with pytest.raises(TypeError): + rs.set_node_feature(["P100", 5]) + + def test_set_hostlist_from_file(): rs = SrunSettings("python") rs.set_hostlist_from_file("./path/to/hostfile") diff --git a/tests/test_slurm_validation.py b/tests/test_slurm_validation.py index 02baddce6..fbb6406c6 100644 --- a/tests/test_slurm_validation.py +++ b/tests/test_slurm_validation.py @@ -28,7 +28,11 @@ import pytest from smartsim.error.errors import LauncherError -from smartsim.slurm import _get_system_partition_info, get_default_partition, validate +from smartsim.wlm.slurm import ( + _get_system_partition_info, + get_default_partition, + validate, +) # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b diff --git a/tests/test_smartredis.py b/tests/test_smartredis.py index 282e708cc..6f7b19934 100644 --- a/tests/test_smartredis.py +++ b/tests/test_smartredis.py @@ -27,10 +27,11 @@ import pytest -from smartsim import Experiment, status +from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends from smartsim.database import Orchestrator from smartsim.entity import Ensemble, Model +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b @@ -59,22 +60,17 @@ ) -def test_exchange(fileutils, test_dir, wlmutils): +def test_exchange(local_experiment, local_db, prepare_db, fileutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a model. 
""" - exp = Experiment( - "smartredis_ensemble_exchange", exp_path=test_dir, launcher="local" - ) - + db = prepare_db(local_db).orchestrator # create and start a database - orc = Orchestrator(port=wlmutils.get_test_port()) - exp.generate(orc) - exp.start(orc, block=False) + local_experiment.reconnect_orchestrator(db.checkpoint_file) - rs = exp.create_run_settings("python", "producer.py --exchange") + rs = local_experiment.create_run_settings("python", "producer.py --exchange") params = {"mult": [1, -10]} ensemble = Ensemble( name="producer", @@ -89,21 +85,17 @@ def test_exchange(fileutils, test_dir, wlmutils): config = fileutils.get_test_conf_path("smartredis") ensemble.attach_generator_files(to_copy=[config]) - exp.generate(ensemble) + local_experiment.generate(ensemble) # start the models - exp.start(ensemble, summary=False) + local_experiment.start(ensemble, summary=False) # get and confirm statuses - statuses = exp.get_status(ensemble) - try: - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) - finally: - # stop the orchestrator - exp.stop(orc) + statuses = local_experiment.get_status(ensemble) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) -def test_consumer(fileutils, test_dir, wlmutils): +def test_consumer(local_experiment, local_db, prepare_db, fileutils): """Run three processes, each one of the first two processes puts a tensor on the DB; the third process accesses the tensors put by the two producers. @@ -111,17 +103,11 @@ def test_consumer(fileutils, test_dir, wlmutils): and the consumer accesses the two results. 
""" - exp = Experiment( - "smartredis_ensemble_consumer", exp_path=test_dir, launcher="local" - ) - - # create and start a database - orc = Orchestrator(port=wlmutils.get_test_port()) - exp.generate(orc) - exp.start(orc, block=False) + db = prepare_db(local_db).orchestrator + local_experiment.reconnect_orchestrator(db.checkpoint_file) - rs_prod = exp.create_run_settings("python", "producer.py") - rs_consumer = exp.create_run_settings("python", "consumer.py") + rs_prod = local_experiment.create_run_settings("python", "producer.py") + rs_consumer = local_experiment.create_run_settings("python", "consumer.py") params = {"mult": [1, -10]} ensemble = Ensemble( name="producer", params=params, run_settings=rs_prod, perm_strat="step" @@ -138,15 +124,11 @@ def test_consumer(fileutils, test_dir, wlmutils): config = fileutils.get_test_conf_path("smartredis") ensemble.attach_generator_files(to_copy=[config]) - exp.generate(ensemble) + local_experiment.generate(ensemble) # start the models - exp.start(ensemble, summary=False) + local_experiment.start(ensemble, summary=False) # get and confirm statuses - statuses = exp.get_status(ensemble) - try: - assert all([stat == status.STATUS_COMPLETED for stat in statuses]) - finally: - # stop the orchestrator - exp.stop(orc) + statuses = local_experiment.get_status(ensemble) + assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) diff --git a/tests/test_step_info.py b/tests/test_step_info.py index ec589ae76..fcccaa9cd 100644 --- a/tests/test_step_info.py +++ b/tests/test_step_info.py @@ -26,8 +26,8 @@ import pytest -from smartsim import status from smartsim._core.launcher.stepInfo import * +from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group pytestmark = pytest.mark.group_b @@ -35,7 +35,9 @@ def test_str(): step_info = StepInfo( - status=status.STATUS_COMPLETED, launcher_status="COMPLETED", returncode=0 + status=SmartSimStatus.STATUS_COMPLETED, + 
launcher_status="COMPLETED", + returncode=0, ) expected_output = "Status: Completed | Launcher Status COMPLETED | Returncode 0" @@ -45,4 +47,4 @@ def test_str(): def test_default(): step_info = UnmanagedStepInfo() - assert step_info._get_smartsim_status(None) == status.STATUS_FAILED + assert step_info._get_smartsim_status(None) == SmartSimStatus.STATUS_FAILED diff --git a/tests/test_symlinking.py b/tests/test_symlinking.py new file mode 100644 index 000000000..2b70e3e9f --- /dev/null +++ b/tests/test_symlinking.py @@ -0,0 +1,254 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import pathlib + +import pytest + +from smartsim import Experiment +from smartsim._core.config import CONFIG +from smartsim._core.control.controller import Controller, _AnonymousBatchJob +from smartsim.database.orchestrator import Orchestrator +from smartsim.entity.ensemble import Ensemble +from smartsim.entity.model import Model +from smartsim.settings.base import RunSettings +from smartsim.settings.slurmSettings import SbatchSettings, SrunSettings + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +controller = Controller() +slurm_controller = Controller(launcher="slurm") + +rs = RunSettings("echo", ["spam", "eggs"]) +bs = SbatchSettings() +batch_rs = SrunSettings("echo", ["spam", "eggs"]) + +ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) +orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +model = Model("test_model", params={}, path="", run_settings=rs) +batch_model = Model( + "batch_test_model", params={}, path="", run_settings=batch_rs, batch_settings=bs +) +anon_batch_model = _AnonymousBatchJob(batch_model) + + +@pytest.mark.parametrize( + "entity", + [pytest.param(ens, id="ensemble"), pytest.param(model, id="model")], +) +def test_symlink(test_dir, entity): + """Test symlinking historical output files""" + entity.path = test_dir + if entity.type == Ensemble: + for member in ens.models: + symlink_with_create_job_step(test_dir, member) + else: + symlink_with_create_job_step(test_dir, entity) + + +def symlink_with_create_job_step(test_dir, entity): + """Function that helps cut down on repeated testing code""" + exp_dir = pathlib.Path(test_dir) + entity.path = test_dir + status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type + step = controller._create_job_step(entity, status_dir) + controller.symlink_output_files(step, entity) + assert pathlib.Path(entity.path, f"{entity.name}.out").is_symlink() + assert pathlib.Path(entity.path, 
f"{entity.name}.err").is_symlink() + assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.out")) == str( + status_dir / entity.name / (entity.name + ".out") + ) + assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.err")) == str( + status_dir / entity.name / (entity.name + ".err") + ) + + +@pytest.mark.parametrize( + "entity", + [ + pytest.param(ens, id="ensemble"), + pytest.param(orc, id="orchestrator"), + pytest.param(anon_batch_model, id="model"), + ], +) +def test_batch_symlink(entity, test_dir): + """Test symlinking historical output files""" + exp_dir = pathlib.Path(test_dir) + entity.path = test_dir + status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type + batch_step, substeps = slurm_controller._create_batch_job_step(entity, status_dir) + for step in substeps: + slurm_controller.symlink_output_files(step, entity) + assert pathlib.Path(entity.path, f"{entity.name}.out").is_symlink() + assert pathlib.Path(entity.path, f"{entity.name}.err").is_symlink() + assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.out")) == str( + status_dir / entity.name / step.entity_name / (step.entity_name + ".out") + ) + assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.err")) == str( + status_dir / entity.name / step.entity_name / (step.entity_name + ".err") + ) + + +def test_symlink_error(test_dir): + """Ensure FileNotFoundError is thrown""" + bad_model = Model( + "bad_model", + params={}, + path=pathlib.Path(test_dir, "badpath"), + run_settings=RunSettings("echo"), + ) + telem_dir = pathlib.Path(test_dir, "bad_model_telemetry") + bad_step = controller._create_job_step(bad_model, telem_dir) + with pytest.raises(FileNotFoundError): + controller.symlink_output_files(bad_step, bad_model) + + +def test_failed_model_launch_symlinks(test_dir): + exp_name = "failed-exp" + exp = Experiment(exp_name, exp_path=test_dir) + test_model = exp.create_model( + "test_model", run_settings=batch_rs, batch_settings=bs + ) + exp.generate(test_model) + 
with pytest.raises(TypeError): + exp.start(test_model) + + _should_not_be_symlinked(pathlib.Path(test_model.path)) + assert not pathlib.Path(test_model.path, f"{test_model.name}.out").is_symlink() + assert not pathlib.Path(test_model.path, f"{test_model.name}.err").is_symlink() + + +def test_failed_ensemble_launch_symlinks(test_dir): + exp_name = "failed-exp" + exp = Experiment(exp_name, exp_path=test_dir) + test_ensemble = exp.create_ensemble( + "test_ensemble", params={}, batch_settings=bs, run_settings=batch_rs, replicas=3 + ) + exp.generate(test_ensemble) + with pytest.raises(TypeError): + exp.start(test_ensemble) + + _should_not_be_symlinked(pathlib.Path(test_ensemble.path)) + assert not pathlib.Path( + test_ensemble.path, f"{test_ensemble.name}.out" + ).is_symlink() + assert not pathlib.Path( + test_ensemble.path, f"{test_ensemble.name}.err" + ).is_symlink() + + for i in range(len(test_ensemble.models)): + assert not pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.out", + ).is_symlink() + assert not pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.err", + ).is_symlink() + + +def test_non_batch_ensemble_symlinks(test_dir): + exp_name = "test-non-batch-ensemble" + rs = RunSettings("echo", ["spam", "eggs"]) + exp = Experiment(exp_name, exp_path=test_dir) + test_ensemble = exp.create_ensemble( + "test_ensemble", params={}, run_settings=rs, replicas=3 + ) + exp.generate(test_ensemble) + exp.start(test_ensemble, block=True) + + for i in range(len(test_ensemble.models)): + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.out", + ), + True, + ) + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.err", + ), + False, + ) + + _should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) + + +def 
test_non_batch_model_symlinks(test_dir): + exp_name = "test-non-batch-model" + exp = Experiment(exp_name, exp_path=test_dir) + rs = RunSettings("echo", ["spam", "eggs"]) + + test_model = exp.create_model("test_model", path=test_dir, run_settings=rs) + exp.generate(test_model) + exp.start(test_model, block=True) + + assert pathlib.Path(test_model.path).exists() + + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.out"), True) + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.err"), False) + _should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) + + +def test_non_batch_orchestrator_symlinks(test_dir): + exp = Experiment("test-non-batch-orc", exp_path=test_dir) + + db = exp.create_database(interface="lo") + exp.generate(db) + exp.start(db, block=True) + exp.stop(db) + + for i in range(db.db_nodes): + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.out"), False) + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.err"), False) + + _should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) + + +def _should_not_be_symlinked(non_linked_path: pathlib.Path): + """Helper function for assertions about paths that should NOT be symlinked""" + assert non_linked_path.exists() + assert not non_linked_path.is_symlink() + + +def _should_be_symlinked(linked_path: pathlib.Path, open_file: bool): + """Helper function for assertions about paths that SHOULD be symlinked""" + assert linked_path.exists() + assert linked_path.is_symlink() + # ensure the source file exists + assert pathlib.Path(os.readlink(linked_path)).exists() + if open_file: + with open(pathlib.Path(os.readlink(linked_path)), "r") as file: + log_contents = file.read() + assert "spam eggs" in log_contents diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index ac3599d7d..c1bfe2719 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -26,6 +26,7 @@ import 
logging +import multiprocessing as mp import pathlib import sys import time @@ -39,32 +40,23 @@ from smartsim import Experiment from smartsim._core.control.job import Job, JobEntity from smartsim._core.control.jobmanager import JobManager -from smartsim._core.entrypoints.telemetrymonitor import ( - ManifestEventHandler, - can_shutdown, - event_loop, - faux_return_code, - get_parser, - get_ts, - hydrate_persistable, - load_manifest, - track_event, -) +from smartsim._core.entrypoints.telemetrymonitor import get_parser from smartsim._core.launcher.launcher import WLMLauncher from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher from smartsim._core.launcher.step.step import Step, proxyable_launch_cmd from smartsim._core.launcher.stepInfo import StepInfo from smartsim._core.utils import serialize +from smartsim._core.utils.helpers import get_ts_ms +from smartsim._core.utils.telemetry.manifest import Run, RuntimeManifest +from smartsim._core.utils.telemetry.telemetry import ( + ManifestEventHandler, + TelemetryMonitor, + TelemetryMonitorArgs, +) +from smartsim._core.utils.telemetry.util import map_return_code, write_event from smartsim.error.errors import UnproxyableStepError from smartsim.settings.base import RunSettings -from smartsim.status import ( - STATUS_CANCELLED, - STATUS_COMPLETED, - STATUS_FAILED, - STATUS_NEW, - STATUS_PAUSED, - STATUS_RUNNING, -) +from smartsim.status import SmartSimStatus ALL_ARGS = {"-exp_dir", "-frequency"} PROXY_ENTRY_POINT = "smartsim._core.entrypoints.indirect" @@ -80,8 +72,7 @@ pytest.test_launcher == "local", reason="Test requires WLM" ) - -logger = logging.getLogger() +logger = logging.getLogger(__name__) # The tests in this file belong to the slow_tests group pytestmark = pytest.mark.slow_tests @@ -93,7 +84,21 @@ def turn_on_tm(monkeypatch): yield -def snooze_nonblocking( +def write_stop_file(entity: JobEntity, test_dir: pathlib.Path, duration: int): + time.sleep(duration) + write_event( + get_ts_ms(), + 
entity.task_id, + entity.step_id, + entity.type, + "stop", + test_dir, + "mock stop event", + 0, + ) + + +def snooze_blocking( test_dir: pathlib.Path, max_delay: int = 20, post_data_delay: int = 2 ): # let the non-blocking experiment complete. @@ -151,18 +156,56 @@ def test_parser(): def test_ts(): """Ensure expected output type""" - ts = get_ts() + ts = get_ts_ms() assert isinstance(ts, int) +@pytest.mark.parametrize( + ["freq"], + [ + pytest.param("1", id="1s delay"), + pytest.param("1.0", id="1s (float) freq"), + pytest.param("1.5", id="1.5s (float) freq"), + pytest.param("60", id="upper bound freq"), + pytest.param("60.0", id="upper bound (float) freq"), + ], +) +def test_valid_frequencies(freq: t.Union[int, float], test_dir: str): + """Ensure validation does not raise an exception on values in valid range""" + # check_frequency(float(freq)) + telmon_args = TelemetryMonitorArgs(test_dir, float(freq), 30, logging.DEBUG) + # telmon_args raises ValueError on bad inputs + assert telmon_args is not None + + +@pytest.mark.parametrize( + ["freq"], + [ + pytest.param("-1", id="negative freq"), + pytest.param("0", id="0s freq"), + pytest.param("0.9", id="0.9s freq"), + pytest.param("0.9999", id="lower bound"), + pytest.param("600.0001", id="just over upper"), + pytest.param("3600", id="too high"), + pytest.param("100000", id="bonkers high"), + ], +) +def test_invalid_frequencies(freq: t.Union[int, float], test_dir: str): + """Ensure validation raises an exception on values outside valid range""" + exp_err_msg = "in the range" + with pytest.raises(ValueError) as ex: + TelemetryMonitorArgs(test_dir, float(freq), 30, logging.DEBUG) + assert exp_err_msg in "".join(ex.value.args) + + @pytest.mark.parametrize( ["etype", "task_id", "step_id", "timestamp", "evt_type"], [ - pytest.param("ensemble", "", "123", get_ts(), "start", id="start event"), - pytest.param("ensemble", "", "123", get_ts(), "stop", id="stop event"), + pytest.param("ensemble", "", "123", get_ts_ms(), "start", 
id="start event"), + pytest.param("ensemble", "", "123", get_ts_ms(), "stop", id="stop event"), ], ) -def test_track_event( +def test_write_event( etype: str, task_id: str, step_id: str, @@ -172,7 +215,7 @@ def test_track_event( ): """Ensure that track event writes a file to the expected location""" exp_path = pathlib.Path(test_dir) - track_event(timestamp, task_id, step_id, etype, evt_type, exp_path, logger) + write_event(timestamp, task_id, step_id, etype, evt_type, exp_path) expected_output = exp_path / f"{evt_type}.json" @@ -180,6 +223,59 @@ def test_track_event( assert expected_output.is_file() +@pytest.mark.parametrize( + ["entity_type", "task_id", "step_id", "timestamp", "evt_type"], + [ + pytest.param("ensemble", "", "123", get_ts_ms(), "start", id="start event"), + pytest.param("ensemble", "", "123", get_ts_ms(), "stop", id="stop event"), + ], +) +def test_write_event_overwrite( + entity_type: str, + task_id: str, + step_id: str, + timestamp: int, + evt_type: str, + test_dir: str, +): + """Ensure that `write_event` does not overwrite an existing file if called more than once""" + exp_path = pathlib.Path(test_dir) + write_event(timestamp, task_id, step_id, entity_type, evt_type, exp_path) + + expected_output = exp_path / f"{evt_type}.json" + + assert expected_output.exists() + assert expected_output.is_file() + + # grab whatever is in the file now to compare against + original_content = expected_output.read_text() + + updated_timestamp = get_ts_ms() + updated_task_id = task_id + "xxx" + updated_step_id = step_id + "xxx" + updated_entity = entity_type + "xxx" + + # write to the same location + write_event( + updated_timestamp, + updated_task_id, + updated_step_id, + updated_entity, + evt_type, + exp_path, + ) + + # read in file content after attempted overwrite + with open(expected_output, "r") as validate_fp: + validate_output = validate_fp.read() + + # verify the content matches the old content + assert str(timestamp) in validate_output + assert 
str(updated_timestamp) not in validate_output + assert "xxx" not in validate_output + assert validate_output == original_content + + def test_load_manifest(fileutils: FileUtils, test_dir: str, config: cfg.Config): """Ensure that the runtime manifest loads correctly""" sample_manifest_path = fileutils.get_test_conf_path("telemetry/telemetry.json") @@ -194,7 +290,7 @@ def test_load_manifest(fileutils: FileUtils, test_dir: str, config: cfg.Config): test_manifest = pathlib.Path(test_manifest_path) assert test_manifest.exists() - manifest = load_manifest(test_manifest_path) + manifest = RuntimeManifest.load_manifest(test_manifest_path) assert manifest.name == "my-exp" assert str(manifest.path) == "/path/to/my-exp" assert manifest.launcher == "Slurm" @@ -213,7 +309,7 @@ def test_load_manifest_colo_model(fileutils: FileUtils): sample_manifest = pathlib.Path(sample_manifest_path) assert sample_manifest.exists() - manifest = load_manifest(sample_manifest_path) + manifest = RuntimeManifest.load_manifest(sample_manifest_path) assert manifest.name == "my-exp" assert str(manifest.path) == "/tmp/my-exp" assert manifest.launcher == "Slurm" @@ -229,7 +325,7 @@ def test_load_manifest_serial_models(fileutils: FileUtils): sample_manifest = pathlib.Path(sample_manifest_path) assert sample_manifest.exists() - manifest = load_manifest(sample_manifest_path) + manifest = RuntimeManifest.load_manifest(sample_manifest_path) assert manifest.name == "my-exp" assert str(manifest.path) == "/tmp/my-exp" assert manifest.launcher == "Slurm" @@ -246,7 +342,7 @@ def test_load_manifest_db_and_models(fileutils: FileUtils): sample_manifest = pathlib.Path(sample_manifest_path) assert sample_manifest.exists() - manifest = load_manifest(sample_manifest_path) + manifest = RuntimeManifest.load_manifest(sample_manifest_path) assert manifest.name == "my-exp" assert str(manifest.path) == "/tmp/my-exp" assert manifest.launcher == "Slurm" @@ -255,6 +351,12 @@ def test_load_manifest_db_and_models(fileutils: 
FileUtils): assert len(manifest.runs[0].orchestrators) == 1 assert len(manifest.runs[1].models) == 1 + # verify collector paths from manifest are deserialized to collector config + assert manifest.runs[0].orchestrators[0].collectors["client"] + assert manifest.runs[0].orchestrators[0].collectors["memory"] + # verify collector paths missing from manifest are empty + assert not manifest.runs[0].orchestrators[0].collectors["client_count"] + def test_load_manifest_db_and_models_1run(fileutils: FileUtils): """Ensure that the runtime manifest loads correctly when containing models & @@ -266,7 +368,7 @@ def test_load_manifest_db_and_models_1run(fileutils: FileUtils): sample_manifest = pathlib.Path(sample_manifest_path) assert sample_manifest.exists() - manifest = load_manifest(sample_manifest_path) + manifest = RuntimeManifest.load_manifest(sample_manifest_path) assert manifest.name == "my-exp" assert str(manifest.path) == "/tmp/my-exp" assert manifest.launcher == "Slurm" @@ -289,7 +391,7 @@ def test_persistable_computed_properties( task_id: str, step_id: str, etype: str, exp_isorch: bool, exp_ismanaged: bool ): name = f"test-{etype}-{uuid.uuid4()}" - timestamp = get_ts() + timestamp = get_ts_ms() exp_dir = pathlib.Path("/foo/bar") stored = { "name": name, @@ -300,7 +402,8 @@ def test_persistable_computed_properties( "step_id": step_id, }, } - persistables = hydrate_persistable(etype, stored, exp_dir) + faux_experiment = {"launcher": "local"} + persistables = Run.load_entity(etype, stored, exp_dir, faux_experiment) persistable = persistables[0] if persistables else None assert persistable.is_managed == exp_ismanaged @@ -314,7 +417,7 @@ def test_deserialize_ensemble(fileutils: FileUtils): sample_manifest = pathlib.Path(sample_manifest_path) assert sample_manifest.exists() - manifest = load_manifest(sample_manifest_path) + manifest = RuntimeManifest.load_manifest(sample_manifest_path) assert manifest assert len(manifest.runs) == 1 @@ -324,70 +427,85 @@ def 
test_deserialize_ensemble(fileutils: FileUtils): assert len(manifest.runs[0].models) == 8 -def test_shutdown_conditions(): - """Ensure conditions to shutdown telemetry monitor are correctly evaluated""" +def test_shutdown_conditions__no_monitored_jobs(test_dir: str): + """Show that an event handler w/no monitored jobs can shutdown""" job_entity1 = JobEntity() job_entity1.name = "xyz" job_entity1.step_id = "123" job_entity1.task_id = "" - logger = logging.getLogger() + mani_handler = ManifestEventHandler("xyz") - # show that an event handler w/no monitored jobs can shutdown - mani_handler = ManifestEventHandler("xyz", logger) - assert can_shutdown(mani_handler, logger) + tm_args = TelemetryMonitorArgs(test_dir, 1, 10, logging.DEBUG) + telmon = TelemetryMonitor(tm_args) + telmon._action_handler = mani_handler # replace w/mock handler - # show that an event handler w/a monitored job cannot shutdown - mani_handler = ManifestEventHandler("xyz", logger) + assert telmon._can_shutdown() + + +def test_shutdown_conditions__has_monitored_job(test_dir: str): + """Show that an event handler w/a monitored job cannot shutdown""" + job_entity1 = JobEntity() + job_entity1.name = "xyz" + job_entity1.step_id = "123" + job_entity1.task_id = "" + + mani_handler = ManifestEventHandler("xyz") mani_handler.job_manager.add_job( job_entity1.name, job_entity1.step_id, job_entity1, False ) - assert not can_shutdown(mani_handler, logger) + tm_args = TelemetryMonitorArgs(test_dir, 1, 10, logging.DEBUG) + telmon = TelemetryMonitor(tm_args) + telmon._action_handler = mani_handler + + assert not telmon._can_shutdown() assert not bool(mani_handler.job_manager.db_jobs) assert bool(mani_handler.job_manager.jobs) - # show that an event handler w/a monitored db cannot shutdown - mani_handler = ManifestEventHandler("xyz", logger) - job_entity1.type = "orchestrator" - mani_handler.job_manager.add_job( - job_entity1.name, job_entity1.step_id, job_entity1, False - ) - assert not can_shutdown(mani_handler, 
logger) - assert bool(mani_handler.job_manager.db_jobs) - assert not bool(mani_handler.job_manager.jobs) - # show that an event handler w/a dbs & tasks cannot shutdown - job_entity2 = JobEntity() - job_entity2.name = "xyz" - job_entity2.step_id = "123" - job_entity2.task_id = "" +def test_shutdown_conditions__has_db(test_dir: str): + """Show that an event handler w/a monitored db cannot shutdown""" + job_entity1 = JobEntity() + job_entity1.name = "xyz" + job_entity1.step_id = "123" + job_entity1.task_id = "" + job_entity1.type = "orchestrator" # <---- make entity appear as db - mani_handler = ManifestEventHandler("xyz", logger) - job_entity1.type = "orchestrator" + mani_handler = ManifestEventHandler("xyz") + ## TODO: see next comment and combine an add_job method on manieventhandler + # and _use within_ manieventhandler + # PROBABLY just encapsulating the body of for run in runs: for entity in run.flatten()... mani_handler.job_manager.add_job( job_entity1.name, job_entity1.step_id, job_entity1, False ) - - mani_handler.job_manager.add_job( - job_entity2.name, job_entity2.step_id, job_entity2, False - ) - assert not can_shutdown(mani_handler, logger) - assert bool(mani_handler.job_manager.db_jobs) - assert bool(mani_handler.job_manager.jobs) - - # ... now, show that removing 1 of 2 jobs still doesn't shutdown - mani_handler.job_manager.db_jobs.popitem() - assert not can_shutdown(mani_handler, logger) - - # ... now, show that removing final job will allow shutdown - mani_handler.job_manager.jobs.popitem() - assert can_shutdown(mani_handler, logger) + ## TODO: !!!!!! shouldn't add_job (or something on mani_handler) + # allow me to add a job to "all the places" in one call... even a private one? 
+ mani_handler._tracked_jobs[job_entity1.key] = job_entity1 + tm_args = TelemetryMonitorArgs(test_dir, 1, 10, logging.DEBUG) + telmon = TelemetryMonitor(tm_args) + telmon._action_handler = mani_handler # replace w/mock handler + + assert not telmon._can_shutdown() + assert bool([j for j in mani_handler._tracked_jobs.values() if j.is_db]) + assert not bool(mani_handler.job_manager.jobs) -def test_auto_shutdown(): +@pytest.mark.parametrize( + "expected_duration", + [ + pytest.param(2000, id="2s cooldown"), + pytest.param(3000, id="3s cooldown"), + pytest.param(5000, id="5s cooldown"), + pytest.param(10000, id="10s cooldown"), + ], +) +@pytest.mark.asyncio +async def test_auto_shutdown__no_jobs(test_dir: str, expected_duration: int): """Ensure that the cooldown timer is respected""" class FauxObserver: + """Mock for the watchdog file system event listener""" + def __init__(self): self.stop_count = 0 @@ -400,35 +518,96 @@ def is_alive(self) -> bool: return True - job_entity1 = JobEntity() - job_entity1.name = "xyz" - job_entity1.step_id = "123" - job_entity1.task_id = "" - - frequency = 1 + frequency = 1000 + # monitor_pattern = f"{test_dir}/mock_mani.json" # show that an event handler w/out a monitored task will automatically stop mani_handler = ManifestEventHandler("xyz", logger) observer = FauxObserver() - duration = 2 + expected_duration = 2000 - ts0 = get_ts() - event_loop(observer, mani_handler, frequency, logger, cooldown_duration=duration) - ts1 = get_ts() + ts0 = get_ts_ms() + tm_args = TelemetryMonitorArgs( + test_dir, frequency / 1000, expected_duration / 1000, logging.DEBUG + ) + telmon = TelemetryMonitor(tm_args) + telmon._observer = observer # replace w/mock observer + telmon._action_handler = mani_handler # replace w/mock handler + + # with NO jobs registered, monitor should notice that it can + # shutdown immediately but wait for the cooldown period + await telmon.monitor() # observer, mani_handler, frequency, duration) + ts1 = get_ts_ms() - assert ts1 
- ts0 >= duration + test_duration = ts1 - ts0 + assert test_duration >= expected_duration assert observer.stop_count == 1 - # show that the new cooldown duration is respected + +@pytest.mark.parametrize( + "cooldown_ms, task_duration_ms", + [ + pytest.param(2000, 2000, id="2s task + 2s cooldown"), + pytest.param(3000, 4000, id="3s task + 4s cooldown"), + pytest.param(5000, 5000, id="5s task + 5s cooldown"), + pytest.param(5000, 10000, id="5s task + 10s cooldown"), + ], +) +@pytest.mark.asyncio +async def test_auto_shutdown__has_db( + test_dir: str, cooldown_ms: int, task_duration_ms: int +): + """Ensure that the cooldown timer is respected with a running db""" + + class FauxObserver: + """Mock for the watchdog file system event listener""" + + def __init__(self): + self.stop_count = 0 + + def stop(self): + self.stop_count += 1 + + def is_alive(self) -> bool: + if self.stop_count > 0: + return False + + return True + + entity = JobEntity() + entity.name = "db_0" + entity.step_id = "123" + entity.task_id = "" + entity.type = "orchestrator" + entity.telemetry_on = True + entity.status_dir = test_dir + + p = mp.Process( + target=write_stop_file, + args=(entity, pathlib.Path(test_dir), (task_duration_ms / 1000)), + ) + + frequency = 1000 + + # show that when a monitored task completes,the telmon automatically stops mani_handler = ManifestEventHandler("xyz", logger) observer = FauxObserver() - duration = 5 + expected_duration = (cooldown_ms / 1000) + (task_duration_ms / 1000) - ts0 = get_ts() - event_loop(observer, mani_handler, frequency, logger, cooldown_duration=duration) - ts1 = get_ts() + tm_args = TelemetryMonitorArgs( + test_dir, frequency / 1000, (cooldown_ms / 1000), logging.DEBUG + ) + telmon = TelemetryMonitor(tm_args) + telmon._observer = observer # replace w/mock observer + telmon._action_handler = mani_handler # replace w/mock handler + + ts0 = get_ts_ms() + p.start() # another process write the stop.json and telmon picks it up + await telmon.monitor() + 
ts1 = get_ts_ms() - assert ts1 - ts0 >= duration + test_duration = ts1 - ts0 + assert test_duration >= expected_duration assert observer.stop_count == 1 @@ -455,7 +634,7 @@ def test_telemetry_single_model(fileutils, test_dir, wlmutils, config): smartsim_model = exp.create_model("perroquet", app_settings) exp.generate(smartsim_model) exp.start(smartsim_model, block=True) - assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + assert exp.get_status(smartsim_model)[0] == SmartSimStatus.STATUS_COMPLETED telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir start_events = list(telemetry_output_path.rglob("start.json")) @@ -494,9 +673,9 @@ def test_telemetry_single_model_nonblocking( exp.start(smartsim_model) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir - snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) + snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + assert exp.get_status(smartsim_model)[0] == SmartSimStatus.STATUS_COMPLETED start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -534,7 +713,10 @@ def test_telemetry_serial_models(fileutils, test_dir, wlmutils, monkeypatch, con exp.generate(*smartsim_models) exp.start(*smartsim_models, block=True) assert all( - [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] + [ + status == SmartSimStatus.STATUS_COMPLETED + for status in exp.get_status(*smartsim_models) + ] ) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir @@ -578,10 +760,13 @@ def test_telemetry_serial_models_nonblocking( exp.start(*smartsim_models) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir - snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) + snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) assert 
all( - [status == STATUS_COMPLETED for status in exp.get_status(*smartsim_models)] + [ + status == SmartSimStatus.STATUS_COMPLETED + for status in exp.get_status(*smartsim_models) + ] ) start_events = list(telemetry_output_path.rglob("start.json")) @@ -618,7 +803,7 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config try: exp.start(orc, block=True) - snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) + snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -627,9 +812,9 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config assert len(stop_events) <= 1 finally: exp.stop(orc) - snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) + snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - assert exp.get_status(orc)[0] == STATUS_CANCELLED + assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED stop_events = list(telemetry_output_path.rglob("stop.json")) assert len(stop_events) == 1 @@ -655,13 +840,12 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, con # create regular database orc = exp.create_database(port=test_port, interface=test_interface) - orc.set_path(test_dir) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir try: exp.start(orc) - snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) + snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -671,8 +855,8 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, con finally: exp.stop(orc) - snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=10) - assert exp.get_status(orc)[0] == 
STATUS_CANCELLED + snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) + assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED stop_events = list(telemetry_output_path.rglob("stop.json")) assert len(stop_events) == 1 @@ -717,10 +901,10 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch, conf exp.stop(orc) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir - snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) + snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - assert exp.get_status(orc)[0] == STATUS_CANCELLED - assert exp.get_status(smartsim_model)[0] == STATUS_COMPLETED + assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED + assert exp.get_status(smartsim_model)[0] == SmartSimStatus.STATUS_COMPLETED start_events = list(telemetry_output_path.rglob("database/**/start.json")) stop_events = list(telemetry_output_path.rglob("database/**/stop.json")) @@ -759,10 +943,15 @@ def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch, config): ens = exp.create_ensemble("troupeau", run_settings=app_settings, replicas=5) exp.generate(ens) exp.start(ens, block=True) - assert all([status == STATUS_COMPLETED for status in exp.get_status(ens)]) + assert all( + [ + status == SmartSimStatus.STATUS_COMPLETED + for status in exp.get_status(ens) + ] + ) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir - snooze_nonblocking(telemetry_output_path, max_delay=60, post_data_delay=30) + snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) @@ -798,7 +987,10 @@ def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch, c exp.generate(smartsim_model) exp.start(smartsim_model, block=True) assert all( - [status == STATUS_COMPLETED for status in 
exp.get_status(smartsim_model)] + [ + status == SmartSimStatus.STATUS_COMPLETED + for status in exp.get_status(smartsim_model) + ] ) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir @@ -819,7 +1011,12 @@ def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch, c ], ) def test_telemetry_autoshutdown( - test_dir, wlmutils, monkeypatch, frequency, cooldown, config + test_dir: str, + wlmutils, + monkeypatch: pytest.MonkeyPatch, + frequency: int, + cooldown: int, + config: cfg.Config, ): """ Ensure that the telemetry monitor process shuts down after the desired @@ -830,6 +1027,8 @@ def test_telemetry_autoshutdown( ctx.setattr(cfg.Config, "telemetry_frequency", frequency) ctx.setattr(cfg.Config, "telemetry_cooldown", cooldown) + cooldown_ms = cooldown * 1000 + # Set experiment name exp_name = "telemetry_ensemble" @@ -839,9 +1038,11 @@ def test_telemetry_autoshutdown( # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - start_time = get_ts() - stop_time = start_time - exp.start(block=False) + rs = RunSettings("python", exe_args=["sleep.py", "1"]) + model = exp.create_model("model", run_settings=rs) + + start_time = get_ts_ms() + exp.start(model, block=True) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir empty_mani = list(telemetry_output_path.rglob("manifest.json")) @@ -854,13 +1055,15 @@ def test_telemetry_autoshutdown( # give some leeway during testing for the cooldown to get hit for i in range(10): if popen.poll() is not None: - stop_time = get_ts() print(f"Completed polling for telemetry shutdown after {i} attempts") break - time.sleep(3) + time.sleep(2) + + stop_time = get_ts_ms() + duration = stop_time - start_time assert popen.returncode is not None - assert stop_time >= (start_time + cooldown) + assert duration >= cooldown_ms class MockStep(Step): @@ -935,7 +1138,7 @@ def test_unmanaged_steps_are_proxyed_through_indirect( @for_all_wlm_launchers 
-def test_unmanaged_steps_are_not_proxied_if_the_telemetry_monitor_is_disabled( +def test_unmanaged_steps_are_not_proxyed_if_the_telemetry_monitor_is_disabled( wlm_launcher, mock_step_meta_dict, test_dir, monkeypatch ): monkeypatch.setattr(cfg.Config, CFG_TM_ENABLED_ATTR, False) @@ -1026,52 +1229,53 @@ def test_multistart_experiment( telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir db_start_events = list(telemetry_output_path.rglob("database/**/start.json")) - db_stop_events = list(telemetry_output_path.rglob("database/**/stop.json")) assert len(db_start_events) == 1 - assert len(db_stop_events) == 1 m_start_events = list(telemetry_output_path.rglob("model/**/start.json")) - m_stop_events = list(telemetry_output_path.rglob("model/**/stop.json")) assert len(m_start_events) == 1 - assert len(m_stop_events) == 1 e_start_events = list(telemetry_output_path.rglob("ensemble/**/start.json")) - e_stop_events = list(telemetry_output_path.rglob("ensemble/**/stop.json")) assert len(e_start_events) == 2 - assert len(e_stop_events) == 2 @pytest.mark.parametrize( "status_in, expected_out", [ - pytest.param(STATUS_CANCELLED, 1, id="failure on cancellation"), - pytest.param(STATUS_COMPLETED, 0, id="success on completion"), - pytest.param(STATUS_FAILED, 1, id="failure on failed"), - pytest.param(STATUS_NEW, None, id="failure on new"), - pytest.param(STATUS_PAUSED, None, id="failure on paused"), - pytest.param(STATUS_RUNNING, None, id="failure on running"), + pytest.param(SmartSimStatus.STATUS_CANCELLED, 1, id="failure on cancellation"), + pytest.param(SmartSimStatus.STATUS_COMPLETED, 0, id="success on completion"), + pytest.param(SmartSimStatus.STATUS_FAILED, 1, id="failure on failed"), + pytest.param(SmartSimStatus.STATUS_NEW, None, id="failure on new"), + pytest.param(SmartSimStatus.STATUS_PAUSED, None, id="failure on paused"), + pytest.param(SmartSimStatus.STATUS_RUNNING, None, id="failure on running"), ], ) def test_faux_rc(status_in: str, expected_out: 
t.Optional[int]): """Ensure faux response codes match expectations.""" step_info = StepInfo(status=status_in) - rc = faux_return_code(step_info) + rc = map_return_code(step_info) assert rc == expected_out @pytest.mark.parametrize( "status_in, expected_out, expected_has_jobs", [ - pytest.param(STATUS_CANCELLED, 1, False, id="failure on cancellation"), - pytest.param(STATUS_COMPLETED, 0, False, id="success on completion"), - pytest.param(STATUS_FAILED, 1, False, id="failure on failed"), - pytest.param(STATUS_NEW, None, True, id="failure on new"), - pytest.param(STATUS_PAUSED, None, True, id="failure on paused"), - pytest.param(STATUS_RUNNING, None, True, id="failure on running"), + pytest.param( + SmartSimStatus.STATUS_CANCELLED, 1, False, id="failure on cancellation" + ), + pytest.param( + SmartSimStatus.STATUS_COMPLETED, 0, False, id="success on completion" + ), + pytest.param(SmartSimStatus.STATUS_FAILED, 1, False, id="failure on failed"), + pytest.param(SmartSimStatus.STATUS_NEW, None, True, id="failure on new"), + pytest.param(SmartSimStatus.STATUS_PAUSED, None, True, id="failure on paused"), + pytest.param( + SmartSimStatus.STATUS_RUNNING, None, True, id="failure on running" + ), ], ) -def test_wlm_completion_handling( +@pytest.mark.asyncio +async def test_wlm_completion_handling( test_dir: str, monkeypatch: pytest.MonkeyPatch, status_in: str, @@ -1084,7 +1288,7 @@ def _faux_updates(_self: WLMLauncher, _names: t.List[str]) -> t.List[StepInfo]: return _faux_updates - ts = get_ts() + ts = get_ts_ms() with monkeypatch.context() as ctx: # don't actually start a job manager ctx.setattr(JobManager, "start", lambda x: ...) 
@@ -1107,7 +1311,7 @@ def _faux_updates(_self: WLMLauncher, _names: t.List[str]) -> t.List[StepInfo]: mani_handler._tracked_jobs = {job_entity.key: job_entity} mani_handler.job_manager.jobs[job.name] = job - mani_handler.on_timestep(ts) + await mani_handler.on_timestep(ts) # see that the job queue was properly manipulated has_jobs = bool(mani_handler._tracked_jobs) diff --git a/tests/utils/test_network.py b/tests/utils/test_network.py new file mode 100644 index 000000000..cdc3168ef --- /dev/null +++ b/tests/utils/test_network.py @@ -0,0 +1,30 @@ +import pytest + +from smartsim._core.utils.network import find_free_port + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +def test_find_free_port_no_start(): + """Test that a free port is identified and returned when no + starting port number is specified""" + port = find_free_port() + assert port > 0 + + +@pytest.mark.parametrize( + "start_at", + [ + pytest.param(1000, id="start at 1000"), + pytest.param(2000, id="start at 2000"), + pytest.param(5000, id="start at 5000"), + pytest.param(10000, id="start at 10000"), + pytest.param(16000, id="start at 16000"), + ], +) +def test_find_free_port_range_specified(start_at): + """Test that a free port greater than or equal to the specified + starting port number is identified and returned""" + port = find_free_port(start_at) + assert port >= start_at diff --git a/tests/utils/test_security.py b/tests/utils/test_security.py new file mode 100644 index 000000000..1a7a9586b --- /dev/null +++ b/tests/utils/test_security.py @@ -0,0 +1,234 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import pathlib +import stat + +import pytest +from sympy import public + +from smartsim._core.config.config import get_config +from smartsim._core.utils.security import KeyManager, _KeyLocator, _KeyPermissions + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + + +def test_keylocator_filename_resolution(test_dir: str) -> None: + """Ensure the key locator resolves filenames as expected.""" + key_path = pathlib.Path(test_dir) + key_category = "mycategory" + key_file = "mykey" + locator = _KeyLocator(key_path, key_file, key_category) + + assert locator.public_filename == f"{key_file}.key", "public mismatch" + assert locator.private_filename == f"{key_file}.key_secret", "private mismatch" + + +def test_keylocator_dir_resolution(test_dir: str) -> None: + """Ensure the key locator resolves paths as expected.""" + key_path = pathlib.Path(test_dir) + key_name = "test" + key_category = "mycategory" + + locator = _KeyLocator(key_path, key_name, 
key_category) + + # we expect a category and pub/priv subdirectory + exp_pub = pathlib.Path(f"{test_dir}/{key_category}/pub").resolve() + assert str(locator.public_dir) == str(exp_pub) + + exp_priv = pathlib.Path(f"{test_dir}/{key_category}/priv").resolve() + assert str(locator.private_dir) == str(exp_priv) + + # and to be explicit... prove pub & priv are not same directory + assert str(locator.private_dir) != str(locator.public_dir) + + +def test_key_manager_dir_preparation( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the KeyManager creates the appropriate directory + structure required for public/private key pairs.""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + km = KeyManager(cfg) + + km.create_directories() + + # verify the expected paths are created + server_locator = _KeyLocator(pathlib.Path(test_dir), "curve", "server") + client_locator = _KeyLocator(pathlib.Path(test_dir), "curve", "client") + + locators = [server_locator, client_locator] + + for locator in locators: + assert locator.public_dir.exists() + assert locator.private_dir.exists() + + +def test_key_manager_get_existing_keys_only_no_keys_found( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the key manager cannot load keys when + directed not to create missing keys.""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + km = KeyManager(cfg) + + # use create=False to only load pre-existing keys + server_keys, client_keys = km.get_keys(create=False) + + assert server_keys.empty + assert client_keys.empty + + +def test_key_manager_get_existing_keys_only_existing( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the key manager can load keys when + they exist from a previous call.""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + + # use a KeyManager to 
create some keys + km = KeyManager(cfg, as_server=True, as_client=True) + old_server_keys, old_client_keys = km.get_keys(create=True) + + # create a new KM to verify keys reload + km = KeyManager(cfg, as_server=True, as_client=True) + + # use create=True to manifest any bugs missing existing keys + server_keys, client_keys = km.get_keys(create=True) + + # ensure we loaded something + assert not server_keys.empty + assert not client_keys.empty + + # and show the old keys were reloaded from disk + assert server_keys == old_server_keys + assert client_keys == old_client_keys + + +def test_key_manager_get_or_create_keys_default( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the key manager creates keys when none can be loaded""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + km = KeyManager(cfg) + + key_set = km.get_keys() + + # public keys are returned by default + assert key_set[0].public != b"" + assert key_set[1].public != b"" + + # default behavior will only return public keys + assert not key_set[0].private + assert not key_set[1].private + + +@pytest.mark.parametrize( + "as_server, as_client", + [ + pytest.param(False, True, id="as-client"), + pytest.param(True, False, id="as-server"), + pytest.param(True, True, id="as-both"), + pytest.param(False, False, id="public-only"), + ], +) +def test_key_manager_as_context( + as_server: bool, + as_client: bool, + test_dir: str, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Ensure the key manager loads the correct keys + when passed `as_server=True` and `as_client=True`""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + km = KeyManager(cfg, as_server=as_server, as_client=as_client) + + server_keyset, client_keyset = km.get_keys() + + assert bool(server_keyset.public) == True + assert bool(server_keyset.private) == as_server + + assert bool(client_keyset.public) == True + 
assert bool(client_keyset.private) == as_client + + +def test_key_manager_applied_permissions( + test_dir: str, monkeypatch: pytest.MonkeyPatch +) -> None: + """Ensure the key manager applies the appropriate file-system + permissions to the keys and directories""" + with monkeypatch.context() as ctx: + ctx.setenv("SMARTSIM_KEY_PATH", test_dir) + + cfg = get_config() + km = KeyManager(cfg, as_client=True, as_server=True) + + server_keys, client_keys = km.get_keys() + + # ensure public dirs are open for reading by others + s_pub_stat = km._server_locator.public_dir.stat() + c_pub_stat = km._client_locator.public_dir.stat() + + assert stat.S_IMODE(s_pub_stat.st_mode) == _KeyPermissions.PUBLIC_DIR + assert stat.S_IMODE(c_pub_stat.st_mode) == _KeyPermissions.PUBLIC_DIR + + # ensure private dirs are open only to owner + s_priv_stat = km._server_locator.private_dir.stat() + c_priv_stat = km._client_locator.private_dir.stat() + + assert stat.S_IMODE(s_priv_stat.st_mode) == _KeyPermissions.PRIVATE_DIR + assert stat.S_IMODE(c_priv_stat.st_mode) == _KeyPermissions.PRIVATE_DIR + + # ensure public files are open for reading by others + s_pub_stat = km._server_locator.public.stat() + c_pub_stat = km._client_locator.public.stat() + + assert stat.S_IMODE(s_pub_stat.st_mode) == _KeyPermissions.PUBLIC_KEY + assert stat.S_IMODE(c_pub_stat.st_mode) == _KeyPermissions.PUBLIC_KEY + + # ensure private files are read-only for owner + s_priv_stat = km._server_locator.private.stat() + c_priv_stat = km._client_locator.private.stat() + + assert stat.S_IMODE(s_priv_stat.st_mode) == _KeyPermissions.PRIVATE_KEY + assert stat.S_IMODE(c_priv_stat.st_mode) == _KeyPermissions.PRIVATE_KEY