Skip to content

More changes to docs #73

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 35 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
3bd7f1b
Update launcher.py
pmcurtin Oct 19, 2024
2ce3576
Merge branch 'main' into docs-2
apoorvkh Oct 19, 2024
2731d31
Merge branch 'main' into docs-2
apoorvkh Oct 20, 2024
0b9c1df
moved log_handlers into .run()
apoorvkh Oct 20, 2024
af8c829
update contributing
apoorvkh Oct 20, 2024
4ac384e
add tyro, remove setuptools from extras
apoorvkh Oct 20, 2024
cbf40b9
enabled linting for docs; clarified public/private functions
apoorvkh Oct 20, 2024
76aa20f
docs for utils.py
apoorvkh Oct 20, 2024
de93aaf
docs for logging_utils
apoorvkh Oct 20, 2024
e4977fd
Merge branch 'docs-2' of github.com:apoorvkh/torchrunx into worker-ex…
apoorvkh Oct 20, 2024
e697257
advanced docs
apoorvkh Oct 20, 2024
748c2b7
adding napoleon for google docs
apoorvkh Oct 21, 2024
24f4a98
linkcode
apoorvkh Oct 21, 2024
cb6620c
update linkcode
apoorvkh Oct 21, 2024
3eb297c
try again
apoorvkh Oct 21, 2024
e609f54
fix?
apoorvkh Oct 21, 2024
e88e320
now linkcode works
apoorvkh Oct 21, 2024
bef8b28
updates
apoorvkh Oct 21, 2024
86bb67b
automethod run for launcher
apoorvkh Oct 21, 2024
d80d822
maximum_signature_line_length
apoorvkh Oct 21, 2024
9950e96
switch to members?
apoorvkh Oct 21, 2024
8276abc
Merge branch 'main' of github.com:apoorvkh/torchrunx into docs-2
apoorvkh Oct 29, 2024
f335140
created utils/
apoorvkh Oct 29, 2024
0b5e316
moved functions to worker.py
apoorvkh Oct 29, 2024
084061f
renamed to worker_entrypoint
apoorvkh Oct 29, 2024
6cc9311
completed docs for utils
apoorvkh Oct 29, 2024
490f2a8
more launcher docs
apoorvkh Oct 29, 2024
e54a533
more updates to docs
apoorvkh Oct 29, 2024
455c3f3
switched LaunchResult to get
apoorvkh Oct 29, 2024
f967218
bump hash in pixi lock
apoorvkh Oct 29, 2024
3a68eb6
removed overloading from LaunchResult
apoorvkh Oct 29, 2024
9e2d5f4
update all docs
apoorvkh Oct 30, 2024
a29212e
fix
apoorvkh Oct 30, 2024
7bf9222
small edits
apoorvkh Oct 30, 2024
122febc
how it works
apoorvkh Oct 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
more updates to docs
  • Loading branch information
apoorvkh committed Oct 29, 2024
commit e54a5338450192e40ef385b08dd882e59315d1ad
22 changes: 21 additions & 1 deletion src/torchrunx/agent.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Primary logic for agent processes."""

from __future__ import annotations

__all__ = ["main"]
Expand All @@ -22,8 +24,21 @@


def main(launcher_agent_group: LauncherAgentGroup, logger_hostname: str, logger_port: int) -> None:
"""Main function for agent processes (started on each node).

This function spawns local worker processes (which run the target function). All agents monitor
their worker statuses (including returned objects and raised exceptions) and communicate these
with each other (and launcher). All agents terminate if failure occurs in any agent.

Arguments:
launcher_agent_group: The communication group between launcher and all agents.
logger_hostname: The hostname of the launcher (for logging).
logger_port: The port of the launcher (for logging).
"""
agent_rank = launcher_agent_group.rank - 1

# Communicate initial payloads between launcher/agents

payload = AgentPayload(
hostname=socket.getfqdn(),
port=get_open_port(),
Expand All @@ -38,6 +53,8 @@ def main(launcher_agent_group: LauncherAgentGroup, logger_hostname: str, logger_
worker_global_ranks = launcher_payload.worker_global_ranks[agent_rank]
num_workers = len(worker_global_ranks)

# Stream logs to logging server

logger = logging.getLogger()

log_records_to_socket(
Expand All @@ -50,7 +67,7 @@ def main(launcher_agent_group: LauncherAgentGroup, logger_hostname: str, logger_

redirect_stdio_to_logger(logger)

# spawn workers
# Spawn worker processes

ctx = dist_mp.start_processes(
name=f"{hostname}_",
Expand Down Expand Up @@ -84,6 +101,9 @@ def main(launcher_agent_group: LauncherAgentGroup, logger_hostname: str, logger_
), # pyright: ignore [reportArgumentType]
)

# Monitor and communicate agent statuses
# Terminate gracefully upon failure

try:
status = None
while True:
Expand Down
54 changes: 32 additions & 22 deletions src/torchrunx/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def run( # noqa: C901, PLR0912
agent_payloads = None

try:
# start logging server
# Start logging server (receives LogRecords from agents/workers)

log_receiver = _build_logging_server(
log_handlers=log_handlers,
Expand All @@ -105,7 +105,7 @@ def run( # noqa: C901, PLR0912

log_process.start()

# start agents on each node
# Start agents on each node

for i, hostname in enumerate(hostnames):
_execute_command(
Expand All @@ -122,7 +122,7 @@ def run( # noqa: C901, PLR0912
ssh_config_file=self.ssh_config_file,
)

# initialize launcher-agent process group
# Initialize launcher-agent process group
# ranks = (launcher, agent_{hostnames[0]}, ..., agent[-1])

launcher_agent_group = LauncherAgentGroup(
Expand All @@ -132,7 +132,7 @@ def run( # noqa: C901, PLR0912
rank=0,
)

# build and sync payloads between launcher and agents
# Sync initial payloads between launcher and agents

_cumulative_workers = [0, *itertools.accumulate(workers_per_host)]

Expand All @@ -152,7 +152,7 @@ def run( # noqa: C901, PLR0912

launcher_payload, agent_payloads = launcher_agent_group.sync_payloads(payload=payload)

# loop to monitor agent statuses (until failed or done)
# Monitor agent statuses (until failed or done)

while True:
# could raise AgentFailedError
Expand Down Expand Up @@ -187,6 +187,7 @@ def run( # noqa: C901, PLR0912
ssh_config_file=self.ssh_config_file,
)

# if launch is successful: return objects from workers
return_values = [s.return_values for s in agent_statuses]
return LaunchResult(hostnames=hostnames, return_values=return_values)

Expand Down Expand Up @@ -216,23 +217,32 @@ def launch(
) -> LaunchResult:
"""Launch a distributed PyTorch function on the specified nodes.

:param func:
:param func_args:
:param func_kwargs:
:param hostnames: Nodes to launch the function on. Default infers from a SLURM environment or runs on localhost.
:param workers_per_host: Number of processes to run per node. Can define per node with :type:`list[int]`.
:param ssh_config_file: An SSH configuration file for connecting to nodes, by default loads ``~/.ssh/config`` or ``/etc/ssh/ssh_config``.
:param backend: `Backend <https://pytorch.org/docs/stable/distributed.html#torch.distributed.Backend>`_ to initialize worker process group with. Default uses NCCL (if GPUs available) or GLOO. Disabled by ``None``.
:param timeout: Worker process group timeout (seconds).
:param default_env_vars: A list of environmental variables to be copied from the launcher process to workers. Allows for bash pattern matching syntax.
:param extra_env_vars: Additional, user-specified variables to copy.
:param env_file: A file (like ``.env``) with additional environment variables to copy.
:param log_handlers: A list of handlers to manage agent and worker logs. Default uses an automatic basic logging scheme.
:raises RuntimeError: Due to various misconfigurations.
:raises AgentFailedError: If any agent fails (e.g. due to signal from OS).
:raises WorkerFailedError: If any worker fails (e.g. due to segmentation faults).
:raises Exception: Propagates exceptions raised in worker processes.
""" # noqa: E501
Arguments:
func: Function to run on each worker.
func_args: Positional arguments for ``func``.
func_kwargs: Keyword arguments for ``func``.
hostnames: Nodes on which to launch the function.
Defaults to nodes inferred from a SLURM environment or localhost.
workers_per_host: Number of processes to run per node.
Can specify different counts per node with a list.
ssh_config_file: Path to an SSH configuration file for connecting to nodes.
Defaults to ``~/.ssh/config`` or ``/etc/ssh/ssh_config``.
backend: `Backend <https://pytorch.org/docs/stable/distributed.html#torch.distributed.Backend>`_
for worker process group. Defaults to NCCL (GPU) or GLOO (CPU). Set `None` to disable.
timeout: Worker process group timeout (seconds).
default_env_vars: Environment variables to copy from the launcher process to workers.
Supports bash pattern matching syntax.
extra_env_vars: Additional user-specified environment variables to copy.
env_file: Path to a file (e.g., `.env`) with additional environment variables to copy.
log_handlers: Handlers to manage agent and worker logs.
Defaults to an automatic basic logging scheme.

Raises:
RuntimeError: If there are configuration issues.
AgentFailedError: If an agent fails, e.g. from an OS signal.
WorkerFailedError: If a worker fails, e.g. from a segmentation fault.
Exception: Any exception raised in a worker process is propagated.
"""
return Launcher(
hostnames=hostnames,
workers_per_host=workers_per_host,
Expand Down
17 changes: 12 additions & 5 deletions src/torchrunx/utils/comm.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __post_init__(self) -> None:
"""Initialize process group.

Raises:
torch.distributed.DistStoreError: if group initialization times out.
torch.distributed.DistStoreError: if group initialization times out.
"""
self.group = dist.init_process_group(
backend="gloo",
Expand All @@ -69,7 +69,11 @@ def _deserialize(self, serialized: bytes) -> Any:
return cloudpickle.loads(serialized)

def _all_gather(self, obj: Any) -> list:
"""Gather object from every rank to list on every rank."""
"""Gather object from every rank to list on every rank.

Raises:
AgentFailedError: if any agent fails (observed by this communication).
"""
try:
object_bytes = self._serialize(obj)
object_list = [b""] * self.world_size
Expand Down Expand Up @@ -125,8 +129,8 @@ class AgentStatus:
"""Status of each agent (to be synchronized in LauncherAgentGroup).

Attributes:
state: Whether the agent is running, failed, or done.
return_values: Objects returned (or exceptions raised) by workers (indexed by local rank).
state: Whether the agent is running, failed, or done.
return_values: Objects returned (or exceptions raised) by workers (indexed by local rank).
"""

state: Literal["running", "failed", "done"]
Expand All @@ -139,10 +143,13 @@ def from_result(cls, result: RunProcsResult | None) -> Self:
"""Convert RunProcsResult (from polling worker process context) to AgentStatus."""
if result is None:
return cls(state="running")

for local_rank, failure in result.failures.items():
result.return_values[local_rank] = WorkerFailedError(failure.message)

return_values = list(result.return_values.values())
failed = any(isinstance(v, ExceptionFromWorker) for v in return_values)

failed = any(isinstance(v, (ExceptionFromWorker, WorkerFailedError)) for v in return_values)
state = "failed" if failed else "done"

return cls(
Expand Down
6 changes: 1 addition & 5 deletions src/torchrunx/utils/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,7 @@ def in_slurm_job() -> bool:


def slurm_hosts() -> list[str]:
"""Retrieves hostnames of Slurm-allocated nodes.

:return: Hostnames of nodes in current Slurm allocation
:rtype: list[str]
"""
"""Retrieves hostnames of Slurm-allocated nodes."""
# TODO: sanity check SLURM variables, commands
if not in_slurm_job():
msg = "Not in a SLURM job"
Expand Down
8 changes: 4 additions & 4 deletions src/torchrunx/utils/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ def add_filter_to_handler(
"""A filter for ``logging.Handler`` such that only specific agent/worker logs are handled.

Args:
handler: ``logging.Handler`` to be modified.
hostname: Name of specified host.
local_rank: Rank of specified worker (or ``None`` for agent).
log_level: Minimum log level to capture.
handler: ``logging.Handler`` to be modified.
hostname: Name of specified host.
local_rank: Rank of specified worker (or ``None`` for agent).
log_level: Minimum log level to capture.
"""

def _filter(record: WorkerLogRecord) -> bool:
Expand Down
21 changes: 21 additions & 0 deletions src/torchrunx/worker.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Arguments and entrypoint for the worker processes."""

from __future__ import annotations

import datetime
Expand All @@ -20,6 +22,8 @@

@dataclass
class WorkerArgs:
"""Arguments passed from agent to spawned workers."""

function: Callable
logger_hostname: str
logger_port: int
Expand All @@ -34,10 +38,13 @@ class WorkerArgs:
timeout: int

def serialize(self) -> SerializedWorkerArgs:
"""Arguments must be serialized (to bytes) before passed to spawned workers."""
return SerializedWorkerArgs(worker_args=self)


class SerializedWorkerArgs:
"""We use cloudpickle as a serialization backend (as it supports nearly all Python types)."""

def __init__(self, worker_args: WorkerArgs) -> None:
self.bytes = cloudpickle.dumps(worker_args)

Expand All @@ -46,8 +53,16 @@ def deserialize(self) -> WorkerArgs:


def worker_entrypoint(serialized_worker_args: SerializedWorkerArgs) -> Any | ExceptionFromWorker:
"""Function called by spawned worker processes.

Workers first prepare a process group (for communicating with all other workers).
They then invoke the user-provided function.
Logs are transmitted to the launcher process.
"""
worker_args: WorkerArgs = serialized_worker_args.deserialize()

# Start logging to the logging server (i.e. the launcher)

logger = logging.getLogger()

log_records_to_socket(
Expand All @@ -60,13 +75,17 @@ def worker_entrypoint(serialized_worker_args: SerializedWorkerArgs) -> Any | Exc

redirect_stdio_to_logger(logger)

# Set rank/world environment variables

os.environ["RANK"] = str(worker_args.rank)
os.environ["LOCAL_RANK"] = str(worker_args.local_rank)
os.environ["LOCAL_WORLD_SIZE"] = str(worker_args.local_world_size)
os.environ["WORLD_SIZE"] = str(worker_args.world_size)
os.environ["MASTER_ADDR"] = worker_args.main_agent_hostname
os.environ["MASTER_PORT"] = str(worker_args.main_agent_port)

# Prepare the process group (e.g. for communication within the user's function)

if worker_args.backend is not None:
backend = worker_args.backend
if backend == "auto":
Expand All @@ -85,6 +104,8 @@ def worker_entrypoint(serialized_worker_args: SerializedWorkerArgs) -> Any | Exc
timeout=datetime.timedelta(seconds=worker_args.timeout),
)

# Invoke the user's function on this worker

try:
return worker_args.function()
except Exception as e:
Expand Down
Loading