
More changes to docs #73


Merged
merged 35 commits into from Oct 30, 2024
Changes from 1 commit
Commits
35 commits
3bd7f1b
Update launcher.py
pmcurtin Oct 19, 2024
2ce3576
Merge branch 'main' into docs-2
apoorvkh Oct 19, 2024
2731d31
Merge branch 'main' into docs-2
apoorvkh Oct 20, 2024
0b9c1df
moved log_handlers into .run()
apoorvkh Oct 20, 2024
af8c829
update contributing
apoorvkh Oct 20, 2024
4ac384e
add tyro, remove setuptools from extras
apoorvkh Oct 20, 2024
cbf40b9
enabled linting for docs; clarified public/private functions
apoorvkh Oct 20, 2024
76aa20f
docs for utils.py
apoorvkh Oct 20, 2024
de93aaf
docs for logging_utils
apoorvkh Oct 20, 2024
e4977fd
Merge branch 'docs-2' of github.com:apoorvkh/torchrunx into worker-ex…
apoorvkh Oct 20, 2024
e697257
advanced docs
apoorvkh Oct 20, 2024
748c2b7
adding napoleon for google docs
apoorvkh Oct 21, 2024
24f4a98
linkcode
apoorvkh Oct 21, 2024
cb6620c
update linkcode
apoorvkh Oct 21, 2024
3eb297c
try again
apoorvkh Oct 21, 2024
e609f54
fix?
apoorvkh Oct 21, 2024
e88e320
now linkcode works
apoorvkh Oct 21, 2024
bef8b28
updates
apoorvkh Oct 21, 2024
86bb67b
automethod run for launcher
apoorvkh Oct 21, 2024
d80d822
maximum_signature_line_length
apoorvkh Oct 21, 2024
9950e96
switch to members?
apoorvkh Oct 21, 2024
8276abc
Merge branch 'main' of github.com:apoorvkh/torchrunx into docs-2
apoorvkh Oct 29, 2024
f335140
created utils/
apoorvkh Oct 29, 2024
0b5e316
moved functions to worker.py
apoorvkh Oct 29, 2024
084061f
renamed to worker_entrypoint
apoorvkh Oct 29, 2024
6cc9311
completed docs for utils
apoorvkh Oct 29, 2024
490f2a8
more launcher docs
apoorvkh Oct 29, 2024
e54a533
more updates to docs
apoorvkh Oct 29, 2024
455c3f3
switched LaunchResult to get
apoorvkh Oct 29, 2024
f967218
bump hash in pixi lock
apoorvkh Oct 29, 2024
3a68eb6
removed overloading from LaunchResult
apoorvkh Oct 29, 2024
9e2d5f4
update all docs
apoorvkh Oct 30, 2024
a29212e
fix
apoorvkh Oct 30, 2024
7bf9222
small edits
apoorvkh Oct 30, 2024
122febc
how it works
apoorvkh Oct 30, 2024
enabled linting for docs; clarified public/private functions
apoorvkh committed Oct 20, 2024
commit cbf40b9f0c4c547a5a56e1e5e2fe0121dce1afbd
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -41,7 +41,6 @@ src = ["src", "tests"]
[tool.ruff.lint]
select = ["ALL"]
ignore = [
"D", # documentation
"ANN101", "ANN102", "ANN401", # self / cls / Any annotations
"BLE001", # blind exceptions
"TD", # todo syntax
@@ -54,9 +53,12 @@ ignore = [
]
[tool.ruff.lint.per-file-ignores]
"tests/**/*.py" = [
"D",
"S101", # allow asserts
"T201" # allow prints
]
[tool.ruff.lint.pydocstyle]
convention = "google"

[tool.pyright]
include = ["src", "tests"]
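With "D" no longer ignored and the pydocstyle convention set to google, public functions are now linted for Google-style docstrings. A minimal sketch of what ruff accepts under this config (the function below is illustrative only, not part of this diff):

```python
import os


def slurm_node_count(default: int = 1) -> int:
    """Return the number of nodes in the current Slurm allocation.

    Args:
        default: Value returned when not running inside a Slurm job.

    Returns:
        The node count from ``SLURM_JOB_NUM_NODES``, or ``default``.
    """
    return int(os.environ.get("SLURM_JOB_NUM_NODES", default))
```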
158 changes: 80 additions & 78 deletions src/torchrunx/agent.py
@@ -1,5 +1,7 @@
from __future__ import annotations

__all__ = ["main"]

import datetime
import logging
import os
@@ -25,83 +27,6 @@
)


@dataclass
class WorkerArgs:
function: Callable
logger_hostname: str
logger_port: int
main_agent_hostname: str
main_agent_port: int
backend: Literal["nccl", "gloo", "mpi", "ucc", "auto"] | None
rank: int
local_rank: int
local_world_size: int
world_size: int
hostname: str
timeout: int

def serialize(self) -> SerializedWorkerArgs:
return SerializedWorkerArgs(worker_args=self)


class SerializedWorkerArgs:
def __init__(self, worker_args: WorkerArgs) -> None:
self.bytes = cloudpickle.dumps(worker_args)

def deserialize(self) -> WorkerArgs:
return cloudpickle.loads(self.bytes)


def entrypoint(serialized_worker_args: SerializedWorkerArgs) -> Any | WorkerException:
worker_args: WorkerArgs = serialized_worker_args.deserialize()

logger = logging.getLogger()

log_records_to_socket(
logger=logger,
hostname=worker_args.hostname,
worker_rank=worker_args.local_rank,
logger_hostname=worker_args.logger_hostname,
logger_port=worker_args.logger_port,
)

redirect_stdio_to_logger(logger)

os.environ["RANK"] = str(worker_args.rank)
os.environ["LOCAL_RANK"] = str(worker_args.local_rank)
os.environ["LOCAL_WORLD_SIZE"] = str(worker_args.local_world_size)
os.environ["WORLD_SIZE"] = str(worker_args.world_size)
os.environ["MASTER_ADDR"] = worker_args.main_agent_hostname
os.environ["MASTER_PORT"] = str(worker_args.main_agent_port)

if worker_args.backend is not None:
backend = worker_args.backend
if backend == "auto":
backend = "nccl" if torch.cuda.is_available() else "gloo"

dist.init_process_group(
backend=backend,
world_size=worker_args.world_size,
rank=worker_args.rank,
store=dist.TCPStore( # pyright: ignore [reportPrivateImportUsage]
host_name=worker_args.main_agent_hostname,
port=worker_args.main_agent_port,
world_size=worker_args.world_size,
is_master=(worker_args.rank == 0),
),
timeout=datetime.timedelta(seconds=worker_args.timeout),
)

try:
return worker_args.function()
except Exception as e:
traceback.print_exc()
return WorkerException(exception=e)
finally:
sys.stdout.flush()
sys.stderr.flush()


def main(launcher_agent_group: LauncherAgentGroup, logger_hostname: str, logger_port: int) -> None:
agent_rank = launcher_agent_group.rank - 1

@@ -135,7 +60,7 @@ def main(launcher_agent_group: LauncherAgentGroup, logger_hostname: str, logger_

ctx = dist_mp.start_processes(
name=f"{hostname}_",
entrypoint=entrypoint,
entrypoint=_entrypoint,
args={
i: (
WorkerArgs(
@@ -179,3 +104,80 @@ def main(launcher_agent_group: LauncherAgentGroup, logger_hostname: str, logger_
ctx.close()
sys.stdout.flush()
sys.stderr.flush()


@dataclass
class WorkerArgs:
function: Callable
logger_hostname: str
logger_port: int
main_agent_hostname: str
main_agent_port: int
backend: Literal["nccl", "gloo", "mpi", "ucc", "auto"] | None
rank: int
local_rank: int
local_world_size: int
world_size: int
hostname: str
timeout: int

def serialize(self) -> SerializedWorkerArgs:
return SerializedWorkerArgs(worker_args=self)


class SerializedWorkerArgs:
def __init__(self, worker_args: WorkerArgs) -> None:
self.bytes = cloudpickle.dumps(worker_args)

def deserialize(self) -> WorkerArgs:
return cloudpickle.loads(self.bytes)


def _entrypoint(serialized_worker_args: SerializedWorkerArgs) -> Any | WorkerException:
worker_args: WorkerArgs = serialized_worker_args.deserialize()

logger = logging.getLogger()

log_records_to_socket(
logger=logger,
hostname=worker_args.hostname,
worker_rank=worker_args.local_rank,
logger_hostname=worker_args.logger_hostname,
logger_port=worker_args.logger_port,
)

redirect_stdio_to_logger(logger)

os.environ["RANK"] = str(worker_args.rank)
os.environ["LOCAL_RANK"] = str(worker_args.local_rank)
os.environ["LOCAL_WORLD_SIZE"] = str(worker_args.local_world_size)
os.environ["WORLD_SIZE"] = str(worker_args.world_size)
os.environ["MASTER_ADDR"] = worker_args.main_agent_hostname
os.environ["MASTER_PORT"] = str(worker_args.main_agent_port)

if worker_args.backend is not None:
backend = worker_args.backend
if backend == "auto":
backend = "nccl" if torch.cuda.is_available() else "gloo"

dist.init_process_group(
backend=backend,
world_size=worker_args.world_size,
rank=worker_args.rank,
store=dist.TCPStore( # pyright: ignore [reportPrivateImportUsage]
host_name=worker_args.main_agent_hostname,
port=worker_args.main_agent_port,
world_size=worker_args.world_size,
is_master=(worker_args.rank == 0),
),
timeout=datetime.timedelta(seconds=worker_args.timeout),
)

try:
return worker_args.function()
except Exception as e:
traceback.print_exc()
return WorkerException(exception=e)
finally:
sys.stdout.flush()
sys.stderr.flush()
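agent.py now declares `__all__ = ["main"]`, and the worker bootstrap was moved below `main` and renamed to `_entrypoint`, marking it private. A small sketch of the convention applied here (the names are illustrative, not the module's real contents):

```python
"""Example module following the public/private convention used in this PR."""

__all__ = ["run_agent"]  # only names listed here are documented public API


def run_agent() -> None:
    """Public entry point: appears in the docs and is linted for docstrings."""
    _bootstrap()


def _bootstrap() -> None:
    # Leading underscore: private helper, omitted from ``__all__`` and the docs.
    pass
```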
11 changes: 5 additions & 6 deletions src/torchrunx/environment.py
@@ -1,5 +1,7 @@
from __future__ import annotations

__all__ = ["in_slurm_job", "slurm_hosts", "slurm_workers", "auto_hosts", "auto_workers"]

import os
import subprocess

@@ -29,8 +31,7 @@ def slurm_hosts() -> list[str]:


def slurm_workers() -> int:
"""
| Determines number of workers per node in current Slurm allocation using
"""| Determines number of workers per node in current Slurm allocation using
| the ``SLURM_JOB_GPUS`` or ``SLURM_CPUS_ON_NODE`` environmental variables.

:return: The implied number of workers per node
@@ -52,8 +53,7 @@ def slurm_workers() -> int:


def auto_hosts() -> list[str]:
"""
Automatically determine hostname list
"""Automatically determine hostname list

:return: Hostnames in Slurm allocation, or ['localhost']
:rtype: list[str]
@@ -65,8 +65,7 @@ def auto_hosts() -> list[str]:


def auto_workers() -> int:
"""
Automatically determine number of workers per host
"""Automatically determine number of workers per host

:return: Workers per host
:rtype: int
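These helpers inspect Slurm environment variables to size a job automatically. A hedged usage sketch, assuming the functions are importable from `torchrunx.environment` exactly as declared in `__all__` above:

```python
from torchrunx.environment import auto_hosts, auto_workers, in_slurm_job

# Inside a Slurm allocation, hosts come from the job's node list and workers
# from SLURM_JOB_GPUS / SLURM_CPUS_ON_NODE; outside Slurm, hosts fall back to
# ["localhost"] and workers are inferred from the local machine.
hosts = auto_hosts()      # e.g. ["node001", "node002"], or ["localhost"]
workers = auto_workers()  # implied number of workers per host

print(f"in_slurm_job={in_slurm_job()}, hosts={hosts}, workers_per_host={workers}")
```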
38 changes: 18 additions & 20 deletions src/torchrunx/launcher.py
@@ -1,5 +1,7 @@
from __future__ import annotations

__all__ = ["AgentKilledError", "Launcher", "launch", "LaunchResult"]

import fnmatch
import ipaddress
import itertools
@@ -54,14 +56,14 @@ def run(  # noqa: C901, PLR0912
func: Callable,
func_args: tuple[Any] | None = None,
func_kwargs: dict[str, Any] | None = None,
log_handlers: list[Handler] | Literal["auto"] | None = "auto"
log_handlers: list[Handler] | Literal["auto"] | None = "auto",
) -> LaunchResult:
if not dist.is_available():
msg = "The torch.distributed package is not available."
raise RuntimeError(msg)

hostnames = resolve_hostnames(self.hostnames)
workers_per_host = resolve_workers_per_host(self.workers_per_host, len(hostnames))
hostnames = _resolve_hostnames(self.hostnames)
workers_per_host = _resolve_workers_per_host(self.workers_per_host, len(hostnames))

launcher_hostname = socket.getfqdn()
launcher_port = get_open_port()
@@ -75,7 +77,7 @@ def run(  # noqa: C901, PLR0912
try:
# start logging server

log_receiver = build_logging_server(
log_receiver = _build_logging_server(
log_handlers=log_handlers,
launcher_hostname=launcher_hostname,
hostnames=hostnames,
@@ -94,8 +96,8 @@ def run(  # noqa: C901, PLR0912
# start agents on each node

for i, hostname in enumerate(hostnames):
execute_command(
command=build_launch_command(
_execute_command(
command=_build_launch_command(
launcher_hostname=launcher_hostname,
launcher_port=launcher_port,
logger_port=log_receiver.port,
@@ -168,7 +170,7 @@ def run(  # noqa: C901, PLR0912
# cleanup: SIGTERM all agents
if agent_payloads is not None:
for agent_payload, agent_hostname in zip(agent_payloads, hostnames):
execute_command(
_execute_command(
command=f"kill {agent_payload.process_id}",
hostname=agent_hostname,
ssh_config_file=self.ssh_config_file,
@@ -200,8 +202,7 @@ def launch(
env_file: str | os.PathLike | None = None,
log_handlers: list[Handler] | Literal["auto"] | None = "auto",
) -> LaunchResult:
"""
Launch a distributed PyTorch function on the specified nodes.
"""Launch a distributed PyTorch function on the specified nodes.

:param func:
:param func_args:
@@ -249,8 +250,7 @@ def all(self, by: Literal["rank"]) -> list[Any]:
pass

def all(self, by: Literal["hostname", "rank"] = "hostname") -> dict[str, list[Any]] | list[Any]:
"""
Get all worker return values by rank or hostname.
"""Get all worker return values by rank or hostname.

:param by: Whether to aggregate all return values by hostname, or just output all of them \
in order of rank, defaults to ``'hostname'``
@@ -264,17 +264,15 @@ def all(self, by: Literal["hostname", "rank"] = "hostname") -> dict[str, list[An
raise TypeError(msg)

def values(self, hostname: str) -> list[Any]:
"""
Get worker return values for host ``hostname``.
"""Get worker return values for host ``hostname``.

:param hostname: The host to get return values from
"""
host_idx = self.hostnames.index(hostname)
return self.return_values[host_idx]

def value(self, rank: int) -> Any:
"""
Get worker return value from global rank ``rank``.
"""Get worker return value from global rank ``rank``.

:param rank: Global worker rank to get return value from
"""
@@ -292,15 +290,15 @@ def value(self, rank: int) -> Any:
raise ValueError(msg)


def resolve_hostnames(hostnames: list[str] | Literal["auto", "slurm"]) -> list[str]:
def _resolve_hostnames(hostnames: list[str] | Literal["auto", "slurm"]) -> list[str]:
if hostnames == "auto":
return auto_hosts()
if hostnames == "slurm":
return slurm_hosts()
return hostnames


def resolve_workers_per_host(
def _resolve_workers_per_host(
workers_per_host: int | list[int] | Literal["auto", "slurm"],
num_hosts: int,
) -> list[int]:
@@ -318,7 +316,7 @@ def resolve_workers_per_host(
return workers_per_host


def build_logging_server(
def _build_logging_server(
log_handlers: list[Handler] | Literal["auto"] | None,
launcher_hostname: str,
hostnames: list[str],
@@ -343,7 +341,7 @@ def build_logging_server(
)


def build_launch_command(
def _build_launch_command(
launcher_hostname: str,
launcher_port: int,
logger_port: int,
@@ -385,7 +383,7 @@ def build_launch_command(
return " && ".join(commands)


def execute_command(
def _execute_command(
command: str,
hostname: str,
ssh_config_file: str | os.PathLike | None = None,
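Taken together, launcher.py now exposes `launch()`, `Launcher`, and `LaunchResult` via `__all__`, while the resolver and command helpers become private. An end-to-end sketch of that documented surface; the training function, hostnames, and keyword names below are assumptions drawn from the signatures in this diff, not a verbatim torchrunx example:

```python
import os

import torchrunx


def train() -> int:
    # launch() sets RANK, LOCAL_RANK, WORLD_SIZE, etc. in each worker process.
    return int(os.environ["RANK"])


result = torchrunx.launch(
    func=train,
    hostnames=["localhost"],   # or "auto" / "slurm", per _resolve_hostnames
    workers_per_host=2,
    log_handlers="auto",       # default: logging server is built automatically
)

print(result.all(by="rank"))                # all return values, ordered by global rank
print(result.value(rank=0))                 # a single worker's return value
print(result.values(hostname="localhost"))  # return values for one host
```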