@@ -1362,6 +1362,160 @@ Enabling sshGitConfig injects the envvars, volumes, and volumeMounts:
                     - emptyDir:
                         medium: Memory
                       name: dshm
+Harmless environment variables can be set when topologyFileConfigMap is provided:
+  1: |
+    apiVersion: workload.codeflare.dev/v1beta2
+    kind: AppWrapper
+    metadata:
+      annotations:
+        workload.codeflare.dev.mlbatch/pytorchGeneratorVersion: 1.1.9
+      labels:
+        kueue.x-k8s.io/queue-name: default-queue
+      name: my-job
+      namespace: my-namespace
+    spec:
+      components:
+      - template:
+          apiVersion: kubeflow.org/v1
+          kind: PyTorchJob
+          metadata:
+            name: my-job
+          spec:
+            pytorchReplicaSpecs:
+              Master:
+                replicas: 1
+                restartPolicy: Never
+                template:
+                  spec:
+                    affinity:
+                      nodeAffinity:
+                        requiredDuringSchedulingIgnoredDuringExecution:
+                          nodeSelectorTerms:
+                          - matchExpressions:
+                            - key: autopilot.ibm.com/gpuhealth
+                              operator: NotIn
+                              values:
+                              - ERR
+                              - TESTING
+                              - EVICT
+                    containers:
+                    - command:
+                      - sh
+                      - -c
+                      - |
+                        echo "Environment variables set by the kubeflow training operator:"
+                        echo ${MASTER_ADDR}:${MASTER_PORT}
+                        echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
+                        echo My global rank is ${RANK} / ${WORLD_SIZE}
+                        echo "Other injected environment variables:"
+                        echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
+                        #
+                        # User commands
+                        #
+                        git clone https://github.com/dbarnett/python-helloworld
+                        cd python-helloworld
+                        echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                        torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                      env:
+                      - name: NCCL_TOPO_FILE
+                        value: /var/run/nvidia-topologyd/virtualTopology.xml
+                      - name: EXAMPLE_VAR1
+                        value: "42"
+                      image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+                      imagePullPolicy: IfNotPresent
+                      name: pytorch
+                      resources:
+                        limits:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                        requests:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                      volumeMounts:
+                      - mountPath: /var/run/nvidia-topologyd
+                        name: topology-volume
+                      - mountPath: /dev/shm
+                        name: dshm
+                    imagePullSecrets: []
+                    priorityClassName: default-priority
+                    volumes:
+                    - configMap:
+                        name: nvidia-topo-gdr
+                      name: topology-volume
+                    - emptyDir:
+                        medium: Memory
+                      name: dshm
+              Worker:
+                replicas: 3
+                restartPolicy: Never
+                template:
+                  spec:
+                    affinity:
+                      nodeAffinity:
+                        requiredDuringSchedulingIgnoredDuringExecution:
+                          nodeSelectorTerms:
+                          - matchExpressions:
+                            - key: autopilot.ibm.com/gpuhealth
+                              operator: NotIn
+                              values:
+                              - ERR
+                              - TESTING
+                              - EVICT
+                    containers:
+                    - command:
+                      - sh
+                      - -c
+                      - |
+                        echo "Environment variables set by the kubeflow training operator:"
+                        echo ${MASTER_ADDR}:${MASTER_PORT}
+                        echo "PYTHONUNBUFFERED:"${PYTHONUNBUFFERED}
+                        echo My global rank is ${RANK} / ${WORLD_SIZE}
+                        echo "Other injected environment variables:"
+                        echo "NVME_MOUNT_PATH: "${NVME_MOUNT_PATH}
+                        #
+                        # User commands
+                        #
+                        git clone https://github.com/dbarnett/python-helloworld
+                        cd python-helloworld
+                        echo executing: torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                        torchrun --nnodes=${WORLD_SIZE} --node_rank=${RANK} --nproc_per_node=8 --rdzv_id=101 --rdzv_endpoint="${MASTER_ADDR}:${MASTER_PORT}" helloworld.py
+                      env:
+                      - name: NCCL_TOPO_FILE
+                        value: /var/run/nvidia-topologyd/virtualTopology.xml
+                      - name: EXAMPLE_VAR1
+                        value: "42"
+                      image: ghcr.io/foundation-model-stack/base:pytorch-latest-nightly-20230126
+                      imagePullPolicy: IfNotPresent
+                      name: pytorch
+                      resources:
+                        limits:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                        requests:
+                          cpu: 500m
+                          memory: 1Gi
+                          nvidia.com/gpu: 8
+                          nvidia.com/roce_gdr: 0
+                      volumeMounts:
+                      - mountPath: /var/run/nvidia-topologyd
+                        name: topology-volume
+                      - mountPath: /dev/shm
+                        name: dshm
+                    imagePullSecrets: []
+                    priorityClassName: default-priority
+                    volumes:
+                    - configMap:
+                        name: nvidia-topo-gdr
+                      name: topology-volume
+                    - emptyDir:
+                        medium: Memory
+                      name: dshm
 scheduler can be set:
   1: |
     apiVersion: workload.codeflare.dev/v1beta2
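Each top-level key in this snapshot ("Harmless environment variables can be set when topologyFileConfigMap is provided", "scheduler can be set") is a test name, and each "1: |" block is the manifest that test rendered. A minimal sketch of a matching test case, assuming the snapshot belongs to a helm-unittest suite; the template path and all value names other than topologyFileConfigMap are inferred from the rendered output and may not match the chart's actual schema:

suite: appwrapper generation                   # hypothetical suite name
templates:
  - templates/appwrapper.yaml                  # assumed template path
tests:
  - it: Harmless environment variables can be set when topologyFileConfigMap is provided
    set:
      namespace: my-namespace                  # inferred from metadata.namespace in the snapshot
      queueName: default-queue                 # inferred from the kueue.x-k8s.io/queue-name label
      topologyFileConfigMap: nvidia-topo-gdr   # named in the test title; mounted as topology-volume
      environmentVariables:                    # assumed name for the extra env list (EXAMPLE_VAR1)
        - name: EXAMPLE_VAR1
          value: "42"
    asserts:
      - matchSnapshot: {}                      # helm-unittest compares the rendered manifest to the .snap entry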