Skip to content

Commit

Permalink
Merge branch 'develop' into 612
Browse files Browse the repository at this point in the history
  • Loading branch information
ankona authored Mar 27, 2024
2 parents 5da35ef + 4b35cc9 commit 104d189
Show file tree
Hide file tree
Showing 8 changed files with 79 additions and 12 deletions.
1 change: 1 addition & 0 deletions doc/api/smartsim_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ steps to a batch.
.. autosummary::

SrunSettings.set_nodes
SrunSettings.set_node_feature
SrunSettings.set_tasks
SrunSettings.set_tasks_per_node
SrunSettings.set_walltime
Expand Down
12 changes: 12 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ To be released at some future point in time
Description

- Update telemetry monitor, add telemetry collectors
- Add method to specify node features for a Slurm job
- Colo Orchestrator setup now blocks application start until setup finished
- ExecArgs handling correction
- ReadTheDocs config file added and enabled on PRs
- Enforce changelog updates
Expand All @@ -36,6 +38,14 @@ Detailed Notes
database metric collectors. Improve telemetry monitor logging. Create
telemetry subpackage at `smartsim._core.utils.telemetry`. Refactor
telemetry monitor entrypoint. (SmartSim-PR460_)
- Users can now specify node features for a Slurm job through
``SrunSettings.set_node_feature``. The method accepts a string
or list of strings. (SmartSim-PR529_)
- The request to the colocated entrypoints file within the shell script
is now a blocking process. Once the Orchestrator is setup, it returns
which moves the process to the background and allows the application to
start. This prevents the application from requesting a ML model or
script that has not been uploaded to the Orchestrator yet. (SmartSim-PR522_)
- Add checks and tests to ensure SmartSim users cannot initialize run settings
with a list of lists as the exe_args argument. (SmartSim-PR517_)
- Add readthedocs configuration file and enable readthedocs builds
Expand All @@ -62,6 +72,8 @@ Detailed Notes

.. _SmartSim-PR460: https://github.com/CrayLabs/SmartSim/pull/460
.. _SmartSim-PR512: https://github.com/CrayLabs/SmartSim/pull/512
.. _SmartSim-PR529: https://github.com/CrayLabs/SmartSim/pull/529
.. _SmartSim-PR522: https://github.com/CrayLabs/SmartSim/pull/522
.. _SmartSim-PR524: https://github.com/CrayLabs/SmartSim/pull/524
.. _SmartSim-PR520: https://github.com/CrayLabs/SmartSim/pull/520
.. _SmartSim-PR518: https://github.com/CrayLabs/SmartSim/pull/518
Expand Down
19 changes: 13 additions & 6 deletions smartsim/_core/entrypoints/colocated.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
import tempfile
import typing as t
from pathlib import Path
from subprocess import PIPE, STDOUT
from subprocess import STDOUT
from types import FrameType

import filelock
Expand Down Expand Up @@ -177,6 +177,7 @@ def main(
db_scripts: t.List[t.List[str]],
db_identifier: str,
) -> None:
# pylint: disable=too-many-statements
global DBPID # pylint: disable=global-statement

lo_address = current_ip("lo")
Expand All @@ -201,8 +202,17 @@ def main(
# we generally want to catch all exceptions here as
# if this process dies, the application will most likely fail
try:
process = psutil.Popen(cmd, stdout=PIPE, stderr=STDOUT)
DBPID = process.pid
hostname = socket.gethostname()
filename = (
f"colo_orc_{hostname}.log"
if os.getenv("SMARTSIM_LOG_LEVEL") == "debug"
else os.devnull
)
with open(filename, "w", encoding="utf-8") as file:
process = psutil.Popen(cmd, stdout=file.fileno(), stderr=STDOUT)
DBPID = process.pid
# printing to stdout shell file for extraction
print(f"__PID__{DBPID}__PID__", flush=True)

except Exception as e:
cleanup()
Expand Down Expand Up @@ -249,9 +259,6 @@ def launch_db_scripts(client: Client, db_scripts: t.List[t.List[str]]) -> None:
# Make sure we don't keep this around
del client

for line in iter(process.stdout.readline, b""):
print(line.decode("utf-8").rstrip(), flush=True)

except Exception as e:
cleanup()
logger.error(f"Colocated database process failed: {str(e)}")
Expand Down
15 changes: 9 additions & 6 deletions smartsim/_core/launcher/colocated.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,14 @@ def write_colocated_launch_script(
# STDOUT of the job
if colocated_settings["debug"]:
script_file.write("export SMARTSIM_LOG_LEVEL=debug\n")

script_file.write(f"{colocated_cmd}\n")
script_file.write("DBPID=$!\n\n")
script_file.write(f"db_stdout=$({colocated_cmd})\n")
# extract and set DBPID within the shell script that is
# enclosed between __PID__ and sent to stdout by the colocated
# entrypoints file
script_file.write(
"DBPID=$(echo $db_stdout | sed -n "
"'s/.*__PID__\\([0-9]*\\)__PID__.*/\\1/p')\n"
)

# Write the actual launch command for the app
script_file.write("$@\n\n")
Expand Down Expand Up @@ -190,10 +195,8 @@ def _build_colocated_wrapper_cmd(
db_script_cmd = _build_db_script_cmd(db_scripts)
db_cmd.extend(db_script_cmd)

# run colocated db in the background
db_cmd.append("&")

cmd.extend(db_cmd)

return " ".join(cmd)


Expand Down
13 changes: 13 additions & 0 deletions smartsim/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,19 @@ def set_time(self, hours: int = 0, minutes: int = 0, seconds: int = 0) -> None:
self._fmt_walltime(int(hours), int(minutes), int(seconds))
)

def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None:
"""Specify the node feature for this job
:param feature_list: node feature to launch on
:type feature_list: str | list[str]
"""
logger.warning(
(
"Feature specification not implemented for this "
f"RunSettings type: {type(self)}"
)
)

@staticmethod
def _fmt_walltime(hours: int, minutes: int, seconds: int) -> str:
"""Convert hours, minutes, and seconds into valid walltime format
Expand Down
15 changes: 15 additions & 0 deletions smartsim/settings/slurmSettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,21 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None:
"""
self.run_args["bcast"] = dest_path

def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None:
"""Specify the node feature for this job
This sets ``-C``
:param feature_list: node feature to launch on
:type feature_list: str | list[str]
:raises TypeError: if not str or list of str
"""
if isinstance(feature_list, str):
feature_list = [feature_list.strip()]
elif not all(isinstance(feature, str) for feature in feature_list):
raise TypeError("node_feature argument must be string or list of strings")
self.run_args["C"] = ",".join(feature_list)

@staticmethod
def _fmt_walltime(hours: int, minutes: int, seconds: int) -> str:
"""Convert hours, minutes, and seconds into valid walltime format
Expand Down
1 change: 1 addition & 0 deletions tests/test_run_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ def test_set_format_args(set_str, val, key):
pytest.param("set_task_map", (3,), id="set_task_map"),
pytest.param("set_cpus_per_task", (4,), id="set_cpus_per_task"),
pytest.param("set_hostlist", ("hostlist",), id="set_hostlist"),
pytest.param("set_node_feature", ("P100",), id="set_node_feature"),
pytest.param(
"set_hostlist_from_file", ("~/hostfile",), id="set_hostlist_from_file"
),
Expand Down
15 changes: 15 additions & 0 deletions tests/test_slurm_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,21 @@ def test_set_hostlist():
rs.set_hostlist([5])


def test_set_node_feature():
rs = SrunSettings("python")
rs.set_node_feature(["P100", "V100"])
assert rs.run_args["C"] == "P100,V100"

rs.set_node_feature("P100")
assert rs.run_args["C"] == "P100"

with pytest.raises(TypeError):
rs.set_node_feature(5)

with pytest.raises(TypeError):
rs.set_node_feature(["P100", 5])


def test_set_hostlist_from_file():
rs = SrunSettings("python")
rs.set_hostlist_from_file("./path/to/hostfile")
Expand Down

0 comments on commit 104d189

Please sign in to comment.