[LF scale-configs] Expect LF scale configs to exist in test-infra (#5745)

This is the first PR in a series to move the LF scale config files into test-infra. It updates the scale config validator to expect the LF scale configs to exist in test-infra as well as in the original pytorch/pytorch.

Migration plan:
1. Update the scale config validator to expect LF scale configs to exist in both test-infra and pytorch/pytorch (this PR)
2. Update the LF autoscaler scripts to read the LF scale config from test-infra
3. Update this validator to no longer expect any scale-config files in pytorch/pytorch
4. Remove the scale-config files from pytorch/pytorch
pytorch · Oct 9, 2024 · 3838b36 · 3838b36
1 parent 40ac0e2
commit 3838b36
Show file tree

Hide file tree

Showing 3 changed files with 579 additions and 56 deletions.
diff --git a/.github/lf-canary-scale-config.yml b/.github/lf-canary-scale-config.yml
@@ -0,0 +1,251 @@
+
+# This file is generated by .github/scripts/validate_scale_config.py in test-infra
+# It defines runner types that will be provisioned by LF self-hosted runners
+
+# scale-config.yml:
+#   Defines which instance types are available for GHA auto-scaled
+#   runners. Runners listed here will be available as self-hosted
+#   runners; the configuration is pulled directly from the main branch.
+#
+#
+# NOTES:
+#  - Linux runners are by default non-ephemeral to reduce the number of CreateInstances calls
+#    and avoid RequestLimitExceeded issues
+#  - When updating this file, run the following command to validate the YAML and to generate
+#    corresponding versions of scale-config for the pytorch/pytorch repo and merge the
+#    pytorch/pytorch changes before merging these changes.
+#    `python .github/scripts/validate_scale_config.py --test-infra-repo-root [path_to_test-infra_root] --pytorch-repo-root [path_to_pytorch_root]`
+#
+# TODO: Add some documentation on how the auto-scaling works
+#
+# NOTE: Default values:
+#
+# runner_types:
+#   runner_label:
+#     instance_type: m4.large
+#     os: linux
+#     max_available: 20
+#     disk_size: 50
+#     is_ephemeral: true
+
+runner_types:
+  lf.c.linux.12xlarge:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.10xlarge.avx2:
+    disk_size: 200
+    instance_type: m4.10xlarge
+    is_ephemeral: false
+    max_available: 450
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.24xl.spr-metal:
+    disk_size: 200
+    instance_type: c7i.metal-24xl
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.16xlarge.spr:
+    disk_size: 200
+    instance_type: c7i.16xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.9xlarge.ephemeral:
+    disk_size: 200
+    instance_type: c5.9xlarge
+    is_ephemeral: true
+    max_available: 50
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+    variants:
+      am2:
+        ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs
+  lf.c.linux.12xlarge.ephemeral:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: true
+    max_available: 300
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.16xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.16xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.24xlarge:
+    disk_size: 150
+    instance_type: c5.24xlarge
+    is_ephemeral: false
+    max_available: 500
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.24xlarge.ephemeral:
+    disk_size: 150
+    instance_type: c5.24xlarge
+    is_ephemeral: true
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.2xlarge:
+    disk_size: 150
+    instance_type: c5.2xlarge
+    is_ephemeral: false
+    max_available: 3120
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.4xlarge:
+    disk_size: 150
+    instance_type: c5.4xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.4xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.8xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.8xlarge
+    is_ephemeral: false
+    max_available: 400
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g4dn.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.12xlarge
+    is_ephemeral: false
+    max_available: 250
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g4dn.metal.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.metal
+    is_ephemeral: false
+    max_available: 300
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g5.48xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.48xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g5.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.12xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g5.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    max_available: 2400
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g6.4xlarge.experimental.nvidia.gpu:
+    disk_size: 150
+    instance_type: g6.4xlarge
+    is_ephemeral: false
+    max_available: 50
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.large:
+    max_available: 1200
+    disk_size: 15
+    instance_type: c5.large
+    is_ephemeral: false
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.arm64.2xlarge:
+    disk_size: 256
+    instance_type: t4g.2xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.c.linux.arm64.m7g.4xlarge:
+    disk_size: 256
+    instance_type: m7g.4xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.c.linux.arm64.2xlarge.ephemeral:
+    disk_size: 256
+    instance_type: t4g.2xlarge
+    is_ephemeral: true
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.c.linux.arm64.m7g.4xlarge.ephemeral:
+    disk_size: 256
+    instance_type: m7g.4xlarge
+    is_ephemeral: true
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.c.linux.arm64.m7g.metal:
+    disk_size: 256
+    instance_type: m7g.metal
+    is_ephemeral: false
+    max_available: 100
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.c.windows.g4dn.xlarge:
+    disk_size: 256
+    instance_type: g4dn.xlarge
+    is_ephemeral: true
+    max_available: 100
+    os: windows
+  lf.c.windows.g4dn.xlarge.nonephemeral:
+    disk_size: 256
+    instance_type: g4dn.xlarge
+    is_ephemeral: false
+    max_available: 100
+    os: windows
+  lf.c.windows.4xlarge:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: true
+    max_available: 420
+    os: windows
+  lf.c.windows.4xlarge.nonephemeral:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: false
+    max_available: 420
+    os: windows
+  lf.c.windows.8xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: true
+    max_available: 300
+    os: windows
+  lf.c.windows.8xlarge.nvidia.gpu.nonephemeral:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: windows
+  lf.c.windows.g5.4xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    max_available: 250
+    os: windows