Skip to content
This repository was archived by the owner on Jan 6, 2023. It is now read-only.

Commit 9550c08

Browse files
Kiuk Chung authored and facebook-github-bot committed
add support for jetter to Role (base_image) for mast launches (#58252)
Summary: Pull Request resolved: pytorch/pytorch#58252 Pull Request resolved: #149 1. Adds `ml_image` buck macro 2. Adds `--run_path` option to `torch.distributed.run` 3. Adds `tsm/driver/fb/test/patched/foo` (for unittesting) 4. Changes to `distributed_sum` to use `ml_image` (see Test plan for how this was tested in local and mast) NOTE: need to enable jetter for flow and local schedulers (will do this on a separate diff since this diff is already really big) Reviewed By: tierex Differential Revision: D28421033 fbshipit-source-id: 96edcecf639143e31ec6c86ec713a2e2d7790f3d
1 parent b52d62c commit 9550c08

File tree

1 file changed

+35
-7
lines changed

1 file changed

+35
-7
lines changed

torchelastic/tsm/driver/api.py

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,17 @@ class Container:
9292
A ``Resource`` can be bound to a specific scheduler backend or ``SchedulerBackend.ALL`` (default)
9393
to specify that the same ``Resource`` is to be used for all schedulers.
9494
95+
An optional ``base_image`` can be specified if the scheduler supports a
96+
concept of base images. For schedulers that run Docker containers the
97+
base image is not useful since the application image itself can be
98+
built from a base image (using the ``FROM base/image:latest`` construct in
99+
the Dockerfile). However, the base image is useful for schedulers that
100+
work with simple image artifacts (e.g. ``*.tar.gz``) that do not have a built-in
101+
concept of base images. For these schedulers, specifying a base image that
102+
includes dependencies while the main image is the actual application code
103+
makes it possible to make changes to the application code without incurring
104+
the cost of re-building the uber artifact.
105+
95106
Usage:
96107
97108
::
@@ -106,9 +117,13 @@ class Container:
106117
.require(Resource(cpu=1, gpu=1, memMB=500), "custom_scheduler")
107118
.ports(tcp_store=8080, tensorboard=8081)
108119
120+
# for schedulers that support base_images
121+
my_container = Container(image="my/trainer:1", base_image="common/ml-tools:latest")
122+
.require(...)
109123
"""
110124

111125
image: str
126+
base_image: Optional[str] = None
112127
resources: Resource = NULL_RESOURCE
113128
port_map: Dict[str, int] = field(default_factory=dict)
114129

@@ -130,6 +145,9 @@ def ports(self, **kwargs: int) -> "Container":
130145
# sentinel value used to represent missing string attributes, such as image or entrypoint
131146
MISSING: str = "<MISSING>"
132147

148+
# sentinel value used to represent "unset" optional string attributes
149+
NONE: str = "<NONE>"
150+
133151
# sentinel value used as the "zero" element in the container group
134152
NULL_CONTAINER: Container = Container(image=MISSING)
135153

@@ -141,9 +159,11 @@ class macros:
141159
142160
Available macros:
143161
144-
1. ``img_root`` - root directory of the pulled image on the container
145-
2. ``app_id`` - application id as assigned by the scheduler
146-
3. ``replica_id`` - unique id for each instance of a replica of a Role,
162+
1. ``img_root`` - root directory of the pulled container.image
163+
2. ``base_img_root`` - root directory of the pulled container.base_image
164+
(resolves to "<NONE>" if no base_image set)
165+
3. ``app_id`` - application id as assigned by the scheduler
166+
4. ``replica_id`` - unique id for each instance of a replica of a Role,
147167
for instance a role with 3 replicas could have the 0, 1, 2
148168
as replica ids. Note that when the container fails and is
149169
replaced, the new container will have the same ``replica_id``
@@ -163,15 +183,25 @@ class macros:
163183
"""
164184

165185
img_root = "${img_root}"
186+
base_img_root = "${base_img_root}"
166187
app_id = "${app_id}"
167188
replica_id = "${replica_id}"
168189

169190
@staticmethod
170-
def substitute(args: List[str], img_root: str, app_id: str, replica_id: str):
191+
def substitute(
192+
args: List[str],
193+
img_root: str,
194+
app_id: str,
195+
replica_id: str,
196+
base_img_root: str = NONE,
197+
):
171198
args_sub = []
172199
for arg in args:
173200
sub = Template(arg).safe_substitute(
174-
img_root=img_root, app_id=app_id, replica_id=replica_id
201+
img_root=img_root,
202+
app_id=app_id,
203+
replica_id=replica_id,
204+
base_img_root=base_img_root,
175205
)
176206
args_sub.append(sub)
177207
return args_sub
@@ -419,8 +449,6 @@ def is_terminal(state: AppState) -> bool:
419449
return state in _TERMINAL_STATES
420450

421451

422-
NONE: str = "<NONE>"
423-
424452
# =======================
425453
# ==== Status API =======
426454
# =======================

0 commit comments

Comments
 (0)