Skip to content

Commit db9f5bb

Browse files
authored
Prevent user from defining NCCL_TOPO_FILE when topologyFileConfigMap is set (#189)
1 parent 8fbb2a7 commit db9f5bb

File tree

3 files changed

+179
-0
lines changed

3 files changed

+179
-0
lines changed

tools/pytorchjob-generator/chart/templates/_helpers.tpl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,12 @@ env:
9090
fieldPath: metadata.labels['sakkara.member.rank']
9191
{{- end }}
9292
{{- if .Values.topologyFileConfigMap }}
93+
{{- range $variable := .Values.environmentVariables }}
94+
{{- if eq $variable.name "NCCL_TOPO_FILE" }}
95+
{{ required "If topologyFileConfigMap is defined, environment variables must not define NCCL_TOPO_FILE" nil }}
96+
{{- end }}
97+
{{- end }}
98+
# Put the path to the volume-mounted virtualTopology.xml file into the environment variable that NCCL reads its topology file from
9399
- name: NCCL_TOPO_FILE
94100
value: /var/run/nvidia-topologyd/virtualTopology.xml
95101
{{- end }}

tools/pytorchjob-generator/chart/tests/__snapshot__/helloworld_test.yaml.snap

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1362,6 +1362,160 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
13621362
- emptyDir:
13631363
medium: Memory
13641364
name: dshm
1365+
Harmless environment variables can be set when topologyFileConfigMap is provided:
1366+
1: |
1367+
apiVersion: workload.codeflare.dev/v1beta2
1368+
kind: AppWrapper
1369+
metadata:
1370+
annotations:
1371+
workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9
1372+
labels:
1373+
kueue.x-k8s.io/queue-name: default-queue
1374+
name: my-job
1375+
namespace: my-namespace
1376+
spec:
1377+
components:
1378+
- template:
1379+
apiVersion: kubeflow.org/v1
1380+
kind: PyTorchJob
1381+
metadata:
1382+
name: my-job
1383+
spec:
1384+
pytorchReplicaSpecs:
1385+
Master:
1386+
replicas: 1
1387+
restartPolicy: Never
1388+
template:
1389+
spec:
1390+
affinity:
1391+
nodeAffinity:
1392+
requiredDuringSchedulingIgnoredDuringExecution:
1393+
nodeSelectorTerms:
1394+
- matchExpressions:
1395+
- key: autopilot.ibm.com/gpuhealth
1396+
operator: NotIn
1397+
values:
1398+
- ERR
1399+
- TESTING
1400+
- EVICT
1401+
containers:
1402+
- command:
1403+
- sh
1404+
- -c
1405+
- |
1406+
echo "Environment variables set by the kubeflow training operator:"
1407+
echo ${MASTER_ADDR}:${MASTER_PORT}
1408+
echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
1409+
echo My global rank is ${RANK} / ${WORLD_SIZE}
1410+
echo "Other injected environment variables:"
1411+
echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
1412+
#
1413+
# User commands
1414+
#
1415+
git clone https://github.com/dbarnett/python-helloworld
1416+
cd python-helloworld
1417+
echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
1418+
torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
1419+
env:
1420+
- name: NCCL_TOPO_FILE
1421+
value: /var/run/nvidia-topologyd/virtualTopology.xml
1422+
- name: EXAMPLE_VAR1
1423+
value: "42"
1424+
image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
1425+
imagePullPolicy: IfNotPresent
1426+
name: pytorch
1427+
resources:
1428+
limits:
1429+
cpu: 500m
1430+
memory: 1Gi
1431+
nvidia.com/gpu: 8
1432+
nvidia.com/roce_gdr: 0
1433+
requests:
1434+
cpu: 500m
1435+
memory: 1Gi
1436+
nvidia.com/gpu: 8
1437+
nvidia.com/roce_gdr: 0
1438+
volumeMounts:
1439+
- mountPath: /var/run/nvidia-topologyd
1440+
name: topology-volume
1441+
- mountPath: /dev/shm
1442+
name: dshm
1443+
imagePullSecrets: []
1444+
priorityClassName: default-priority
1445+
volumes:
1446+
- configMap:
1447+
name: nvidia-topo-gdr
1448+
name: topology-volume
1449+
- emptyDir:
1450+
medium: Memory
1451+
name: dshm
1452+
Worker:
1453+
replicas: 3
1454+
restartPolicy: Never
1455+
template:
1456+
spec:
1457+
affinity:
1458+
nodeAffinity:
1459+
requiredDuringSchedulingIgnoredDuringExecution:
1460+
nodeSelectorTerms:
1461+
- matchExpressions:
1462+
- key: autopilot.ibm.com/gpuhealth
1463+
operator: NotIn
1464+
values:
1465+
- ERR
1466+
- TESTING
1467+
- EVICT
1468+
containers:
1469+
- command:
1470+
- sh
1471+
- -c
1472+
- |
1473+
echo "Environment variables set by the kubeflow training operator:"
1474+
echo ${MASTER_ADDR}:${MASTER_PORT}
1475+
echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
1476+
echo My global rank is ${RANK} / ${WORLD_SIZE}
1477+
echo "Other injected environment variables:"
1478+
echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
1479+
#
1480+
# User commands
1481+
#
1482+
git clone https://github.com/dbarnett/python-helloworld
1483+
cd python-helloworld
1484+
echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
1485+
torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
1486+
env:
1487+
- name: NCCL_TOPO_FILE
1488+
value: /var/run/nvidia-topologyd/virtualTopology.xml
1489+
- name: EXAMPLE_VAR1
1490+
value: "42"
1491+
image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
1492+
imagePullPolicy: IfNotPresent
1493+
name: pytorch
1494+
resources:
1495+
limits:
1496+
cpu: 500m
1497+
memory: 1Gi
1498+
nvidia.com/gpu: 8
1499+
nvidia.com/roce_gdr: 0
1500+
requests:
1501+
cpu: 500m
1502+
memory: 1Gi
1503+
nvidia.com/gpu: 8
1504+
nvidia.com/roce_gdr: 0
1505+
volumeMounts:
1506+
- mountPath: /var/run/nvidia-topologyd
1507+
name: topology-volume
1508+
- mountPath: /dev/shm
1509+
name: dshm
1510+
imagePullSecrets: []
1511+
priorityClassName: default-priority
1512+
volumes:
1513+
- configMap:
1514+
name: nvidia-topo-gdr
1515+
name: topology-volume
1516+
- emptyDir:
1517+
medium: Memory
1518+
name: dshm
13651519
scheduler can be set:
13661520
1: |
13671521
apiVersion: workload.codeflare.dev/v1beta2

tools/pytorchjob-generator/chart/tests/helloworld_test.yaml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,3 +270,22 @@ tests:
270270
asserts:
271271
- matchSnapshot:
272272
path: spec.components[0].template
273+
274+
- it: Harmless environment variables can be set when topologyFileConfigMap is provided
275+
set:
276+
topologyFileConfigMap: nvidia-topo-gdr
277+
environmentVariables:
278+
- name: EXAMPLE_VAR1
279+
value: 42
280+
asserts:
281+
- matchSnapshot:
282+
path: spec.components[0].template
283+
284+
- it: NCCL_TOPO_FILE environment variables cannot be set when topologyFileConfigMap is provided
285+
set:
286+
topologyFileConfigMap: nvidia-topo-gdr
287+
environmentVariables:
288+
- name: NCCL_TOPO_FILE
289+
value: myFile
290+
asserts:
291+
- failedTemplate: {}

0 commit comments

Comments
 (0)