diff --git a/.github/lf-canary-scale-config.yml b/.github/lf-canary-scale-config.yml
new file mode 100644
index 0000000000..26ac07d190
--- /dev/null
+++ b/.github/lf-canary-scale-config.yml
@@ -0,0 +1,251 @@
+
+# This file is generated by .github/scripts/validate_scale_config.py in test-infra
+# It defines runner types that will be provisioned by LF Self-hosted runners
+
+# scale-config.yml:
+#   Powers what instance types are available for GHA auto-scaled
+#   runners. Runners listed here will be available as self hosted
+#   runners, configuration is directly pulled from the main branch.
+#
+#
+# NOTES:
+#  - Linux runners are by default non-ephemeral to reduce the amount of CreateInstances calls
+#    to avoid RequestLimitExceeded issues
+#  - When updating this file, run the following command to validate the YAML and to generate
+#    corresponding versions of scale-config for the pytorch/pytorch repo, and merge the
+#    pytorch/pytorch changes before merging these changes.
+#    `python .github/scripts/validate_scale_config.py --test-infra-repo-root [path_to_test-infra_root] --pytorch-repo-root [path_to_pytorch_root]`
+#
+# TODO: Add some documentation on how the auto-scaling works
+#
+# NOTE: Default values:
+#
+# runner_types:
+#   runner_label:
+#     instance_type: m4.large
+#     os: linux
+#     max_available: 20
+#     disk_size: 50
+#     is_ephemeral: true
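+#
+# For illustration only (linux.example is a hypothetical label, not a real
+# entry): with the defaults above, a minimal definition such as
+#
+# runner_types:
+#   linux.example:
+#     instance_type: c5.large
+#
+# would presumably resolve to an ephemeral Linux runner with
+# max_available: 20 and disk_size: 50, since every omitted field falls
+# back to its default.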
+
+runner_types:
+  lf.c.linux.12xlarge:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.10xlarge.avx2:
+    disk_size: 200
+    instance_type: m4.10xlarge
+    is_ephemeral: false
+    max_available: 450
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.24xl.spr-metal:
+    disk_size: 200
+    instance_type: c7i.metal-24xl
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.16xlarge.spr:
+    disk_size: 200
+    instance_type: c7i.16xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.9xlarge.ephemeral:
+    disk_size: 200
+    instance_type: c5.9xlarge
+    is_ephemeral: true
+    max_available: 50
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+    variants:
+      am2:
+        ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs
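+  # The am2 variant above swaps in an Amazon Linux 2 AMI for this runner
+  # type; assuming the usual test-infra variant naming, it would be
+  # requested with a label like lf.c.am2.linux.9xlarge.ephemeral (the
+  # exact label format is an assumption, not defined in this file).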
+  lf.c.linux.12xlarge.ephemeral:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: true
+    max_available: 300
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.16xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.16xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.24xlarge:
+    disk_size: 150
+    instance_type: c5.24xlarge
+    is_ephemeral: false
+    max_available: 500
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.24xlarge.ephemeral:
+    disk_size: 150
+    instance_type: c5.24xlarge
+    is_ephemeral: true
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.2xlarge:
+    disk_size: 150
+    instance_type: c5.2xlarge
+    is_ephemeral: false
+    max_available: 3120
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.4xlarge:
+    disk_size: 150
+    instance_type: c5.4xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.4xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.8xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.8xlarge
+    is_ephemeral: false
+    max_available: 400
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g4dn.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.12xlarge
+    is_ephemeral: false
+    max_available: 250
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g4dn.metal.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.metal
+    is_ephemeral: false
+    max_available: 300
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g5.48xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.48xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g5.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.12xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g5.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    max_available: 2400
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.g6.4xlarge.experimental.nvidia.gpu:
+    disk_size: 150
+    instance_type: g6.4xlarge
+    is_ephemeral: false
+    max_available: 50
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.large:
+    max_available: 1200
+    disk_size: 15
+    instance_type: c5.large
+    is_ephemeral: false
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.c.linux.arm64.2xlarge:
+    disk_size: 256
+    instance_type: t4g.2xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.c.linux.arm64.m7g.4xlarge:
+    disk_size: 256
+    instance_type: m7g.4xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.c.linux.arm64.2xlarge.ephemeral:
+    disk_size: 256
+    instance_type: t4g.2xlarge
+    is_ephemeral: true
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.c.linux.arm64.m7g.4xlarge.ephemeral:
+    disk_size: 256
+    instance_type: m7g.4xlarge
+    is_ephemeral: true
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.c.linux.arm64.m7g.metal:
+    disk_size: 256
+    instance_type: m7g.metal
+    is_ephemeral: false
+    max_available: 100
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.c.windows.g4dn.xlarge:
+    disk_size: 256
+    instance_type: g4dn.xlarge
+    is_ephemeral: true
+    max_available: 100
+    os: windows
+  lf.c.windows.g4dn.xlarge.nonephemeral:
+    disk_size: 256
+    instance_type: g4dn.xlarge
+    is_ephemeral: false
+    max_available: 100
+    os: windows
+  lf.c.windows.4xlarge:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: true
+    max_available: 420
+    os: windows
+  lf.c.windows.4xlarge.nonephemeral:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: false
+    max_available: 420
+    os: windows
+  lf.c.windows.8xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: true
+    max_available: 300
+    os: windows
+  lf.c.windows.8xlarge.nvidia.gpu.nonephemeral:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: windows
+  lf.c.windows.g5.4xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    max_available: 250
+    os: windows
diff --git a/.github/lf-scale-config.yml b/.github/lf-scale-config.yml
new file mode 100644
index 0000000000..cd2ee5fee2
--- /dev/null
+++ b/.github/lf-scale-config.yml
@@ -0,0 +1,251 @@
+
+# This file is generated by .github/scripts/validate_scale_config.py in test-infra
+# It defines runner types that will be provisioned by LF Self-hosted runners
+
+# scale-config.yml:
+#   Powers what instance types are available for GHA auto-scaled
+#   runners. Runners listed here will be available as self hosted
+#   runners, configuration is directly pulled from the main branch.
+#
+#
+# NOTES:
+#  - Linux runners are by default non-ephemeral to reduce the amount of CreateInstances calls
+#    to avoid RequestLimitExceeded issues
+#  - When updating this file, run the following command to validate the YAML and to generate
+#    corresponding versions of scale-config for the pytorch/pytorch repo, and merge the
+#    pytorch/pytorch changes before merging these changes.
+#    `python .github/scripts/validate_scale_config.py --test-infra-repo-root [path_to_test-infra_root] --pytorch-repo-root [path_to_pytorch_root]`
+#
+# TODO: Add some documentation on how the auto-scaling works
+#
+# NOTE: Default values:
+#
+# runner_types:
+#   runner_label:
+#     instance_type: m4.large
+#     os: linux
+#     max_available: 20
+#     disk_size: 50
+#     is_ephemeral: true
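+#
+# NOTE: the ami values below use a shell-style wildcard, e.g.
+# al2023-ami-2023.5.202*-kernel-6.1-x86_64; the autoscaler presumably
+# resolves this to the newest matching AMI at launch time (the resolution
+# behavior is an assumption; it is not defined in this file).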
+
+runner_types:
+  lf.linux.12xlarge:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.10xlarge.avx2:
+    disk_size: 200
+    instance_type: m4.10xlarge
+    is_ephemeral: false
+    max_available: 450
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.24xl.spr-metal:
+    disk_size: 200
+    instance_type: c7i.metal-24xl
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.16xlarge.spr:
+    disk_size: 200
+    instance_type: c7i.16xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.9xlarge.ephemeral:
+    disk_size: 200
+    instance_type: c5.9xlarge
+    is_ephemeral: true
+    max_available: 50
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+    variants:
+      am2:
+        ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs
+  lf.linux.12xlarge.ephemeral:
+    disk_size: 200
+    instance_type: c5.12xlarge
+    is_ephemeral: true
+    max_available: 300
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.16xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.16xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.24xlarge:
+    disk_size: 150
+    instance_type: c5.24xlarge
+    is_ephemeral: false
+    max_available: 500
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.24xlarge.ephemeral:
+    disk_size: 150
+    instance_type: c5.24xlarge
+    is_ephemeral: true
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.2xlarge:
+    disk_size: 150
+    instance_type: c5.2xlarge
+    is_ephemeral: false
+    max_available: 3120
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.4xlarge:
+    disk_size: 150
+    instance_type: c5.4xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.4xlarge
+    is_ephemeral: false
+    max_available: 1000
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.8xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g3.8xlarge
+    is_ephemeral: false
+    max_available: 400
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.g4dn.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.12xlarge
+    is_ephemeral: false
+    max_available: 250
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.g4dn.metal.nvidia.gpu:
+    disk_size: 150
+    instance_type: g4dn.metal
+    is_ephemeral: false
+    max_available: 300
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.g5.48xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.48xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.g5.12xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.12xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.g5.4xlarge.nvidia.gpu:
+    disk_size: 150
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    max_available: 2400
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.g6.4xlarge.experimental.nvidia.gpu:
+    disk_size: 150
+    instance_type: g6.4xlarge
+    is_ephemeral: false
+    max_available: 50
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.large:
+    max_available: 1200
+    disk_size: 15
+    instance_type: c5.large
+    is_ephemeral: false
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
+  lf.linux.arm64.2xlarge:
+    disk_size: 256
+    instance_type: t4g.2xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.linux.arm64.m7g.4xlarge:
+    disk_size: 256
+    instance_type: m7g.4xlarge
+    is_ephemeral: false
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.linux.arm64.2xlarge.ephemeral:
+    disk_size: 256
+    instance_type: t4g.2xlarge
+    is_ephemeral: true
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.linux.arm64.m7g.4xlarge.ephemeral:
+    disk_size: 256
+    instance_type: m7g.4xlarge
+    is_ephemeral: true
+    max_available: 200
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.linux.arm64.m7g.metal:
+    disk_size: 256
+    instance_type: m7g.metal
+    is_ephemeral: false
+    max_available: 100
+    os: linux
+    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
+  lf.windows.g4dn.xlarge:
+    disk_size: 256
+    instance_type: g4dn.xlarge
+    is_ephemeral: true
+    max_available: 100
+    os: windows
+  lf.windows.g4dn.xlarge.nonephemeral:
+    disk_size: 256
+    instance_type: g4dn.xlarge
+    is_ephemeral: false
+    max_available: 100
+    os: windows
+  lf.windows.4xlarge:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: true
+    max_available: 420
+    os: windows
+  lf.windows.4xlarge.nonephemeral:
+    disk_size: 256
+    instance_type: c5d.4xlarge
+    is_ephemeral: false
+    max_available: 420
+    os: windows
+  lf.windows.8xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: true
+    max_available: 300
+    os: windows
+  lf.windows.8xlarge.nvidia.gpu.nonephemeral:
+    disk_size: 256
+    instance_type: p3.2xlarge
+    is_ephemeral: false
+    max_available: 150
+    os: windows
+  lf.windows.g5.4xlarge.nvidia.gpu:
+    disk_size: 256
+    instance_type: g5.4xlarge
+    is_ephemeral: false
+    max_available: 250
+    os: windows
diff --git a/.github/scripts/validate_scale_config.py b/.github/scripts/validate_scale_config.py
index 79d806de8f..bb8098c9c2 100644
--- a/.github/scripts/validate_scale_config.py
+++ b/.github/scripts/validate_scale_config.py
@@ -14,7 +14,7 @@
 
 import urllib.request
 
-from typing import Any, cast, Dict
+from typing import Any, cast, Dict, List, NamedTuple
 
 import jsonschema
 
@@ -23,14 +23,15 @@
 MAX_AVAILABLE_MINIMUM = 50
 
 # Paths relative to their respective repositories
-SCALE_CONFIG_PATH = ".github/scale-config.yml"
-PYTORCH_LF_SCALE_CONFIG_PATH = ".github/lf-scale-config.yml"
-PYTORCH_LF_CANARY_SCALE_CONFIG_PATH = ".github/lf-canary-scale-config.yml"
+META_SCALE_CONFIG_PATH = ".github/scale-config.yml"
+LF_SCALE_CONFIG_PATH = ".github/lf-scale-config.yml"
+LF_CANARY_SCALE_CONFIG_PATH = ".github/lf-canary-scale-config.yml"
 
 RUNNER_TYPE_CONFIG_KEY = "runner_types"
 
 GITHUB_PYTORCH_REPO_RAW_URL = "https://raw.githubusercontent.com/pytorch/pytorch/main/"
 
+PREFIX_META = ""
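+# Each prefix is prepended to every runner label when the source config is
+# cloned for a fleet: e.g. a source entry linux.2xlarge becomes
+# lf.linux.2xlarge in the LF fleet and lf.c.linux.2xlarge in the LF canary
+# fleet, while the Meta fleet keeps the bare name (empty prefix).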
 PREFIX_LF = "lf."
 PREFIX_LF_CANARY = "lf.c."
 
@@ -277,84 +278,104 @@ def pull_temp_config_from_github_repo(config_path: str) -> str:
     return config_path
 
+
+class ScaleConfigInfo(NamedTuple):
+    path: str  # full path to the scale config file
+    prefix: str  # prefix this fleet's runner types should have
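+    # For example (values mirror the constants above): the LF fleet is
+    # described by ScaleConfigInfo(path=<repo_root>/.github/lf-scale-config.yml,
+    # prefix="lf."), pairing a generated config file with its label prefix.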
+
+
 def main() -> None:
     args = parse_args()
 
-    generate_files = False
+    source_scale_config_info = ScaleConfigInfo(
+        path=os.path.join(args.test_infra_repo_root, META_SCALE_CONFIG_PATH),
+        prefix=PREFIX_META,
+    )
+
+    # Contains scale configs that are generated from the source scale config
+    generated_scale_config_infos: List[ScaleConfigInfo] = [
+        ScaleConfigInfo(
+            path=os.path.join(args.test_infra_repo_root, LF_SCALE_CONFIG_PATH),
+            prefix=PREFIX_LF,
+        ),
+        ScaleConfigInfo(
+            path=os.path.join(args.test_infra_repo_root, LF_CANARY_SCALE_CONFIG_PATH),
+            prefix=PREFIX_LF_CANARY,
+        ),
+    ]
+
+    generate_files = True
     if args.pytorch_repo_root is None:
+        # This is expected during a CI run
+        generate_files = False
         print(
             "Using github's pytorch/pytorch repository as the source for the pytorch scale config files"
         )
-        pt_lf_scale_config_path = pull_temp_config_from_github_repo(
-            PYTORCH_LF_SCALE_CONFIG_PATH
+        generated_scale_config_infos.append(
+            ScaleConfigInfo(
+                path=pull_temp_config_from_github_repo(LF_SCALE_CONFIG_PATH),
+                prefix=PREFIX_LF,
+            )
         )
-        pt_lf_canary_scale_config_path = pull_temp_config_from_github_repo(
-            PYTORCH_LF_CANARY_SCALE_CONFIG_PATH
+        generated_scale_config_infos.append(
+            ScaleConfigInfo(
+                path=pull_temp_config_from_github_repo(LF_CANARY_SCALE_CONFIG_PATH),
+                prefix=PREFIX_LF_CANARY,
+            )
         )
     else:
-        # Running locally
-        generate_files = True
-        pt_lf_scale_config_path = os.path.join(
-            args.pytorch_repo_root, PYTORCH_LF_SCALE_CONFIG_PATH
+        # This is expected during a local run
+        generated_scale_config_infos.append(
+            ScaleConfigInfo(
+                path=os.path.join(args.pytorch_repo_root, LF_SCALE_CONFIG_PATH),
+                prefix=PREFIX_LF,
+            )
         )
-        pt_lf_canary_scale_config_path = os.path.join(
-            args.pytorch_repo_root, PYTORCH_LF_CANARY_SCALE_CONFIG_PATH
+        generated_scale_config_infos.append(
+            ScaleConfigInfo(
+                path=os.path.join(args.pytorch_repo_root, LF_CANARY_SCALE_CONFIG_PATH),
+                prefix=PREFIX_LF_CANARY,
+            )
         )
 
-    scale_config_path = os.path.join(args.test_infra_repo_root, SCALE_CONFIG_PATH)
-
-    scale_config = load_yaml_file(scale_config_path)
+    source_scale_config = load_yaml_file(source_scale_config_info.path)
 
     validation_success = True
-    if not is_config_consistent_internally(scale_config[RUNNER_TYPE_CONFIG_KEY]):
+    if not is_config_consistent_internally(source_scale_config[RUNNER_TYPE_CONFIG_KEY]):
         validation_success = False
         print("scale-config.yml is not internally consistent\n")
     else:
         print("scale-config.yml is internally consistent\n")
 
-    if generate_files:
-        generate_repo_scale_config(
-            scale_config_path, pt_lf_scale_config_path, PREFIX_LF
-        )
-
-        generate_repo_scale_config(
-            scale_config_path, pt_lf_canary_scale_config_path, PREFIX_LF_CANARY
-        )
-        print("Generated updated pytorch/pytorch scale config files\n")
-
-    pt_scale_config = load_yaml_file(pt_lf_scale_config_path)
-    pytorch_canary_scale_config = load_yaml_file(pt_lf_canary_scale_config_path)
-
-    if not is_consistent_across_configs(
-        scale_config[RUNNER_TYPE_CONFIG_KEY],
-        pt_scale_config[RUNNER_TYPE_CONFIG_KEY],
-        PREFIX_LF,
-    ):
-        print(
-            f"Consistency validation failed between {scale_config_path} and {pt_lf_scale_config_path}\n"
-        )
-        validation_success = False
-    else:
-        print("scale-config.yml is consistent with pytorch/pytorch scale config\n")
-
-    if not is_consistent_across_configs(
-        scale_config[RUNNER_TYPE_CONFIG_KEY],
-        pytorch_canary_scale_config[RUNNER_TYPE_CONFIG_KEY],
-        PREFIX_LF_CANARY,
-    ):
-        print(
-            f"Consistency validation failed between {scale_config_path} and {pt_lf_canary_scale_config_path}\n"
-        )
-        validation_success = False
-    else:
-        print(
-            "scale-config.yml is consistent with pytorch/pytorch canary scale config\n"
-        )
-
-    # # Delete the temp dir, if it was created
-    # if temp_dir and os.path.exists(temp_dir):
-    #     os.rmdir(temp_dir)
+    def validate_config(generated_config_info: ScaleConfigInfo) -> bool:
+        if generate_files:
+            generate_repo_scale_config(
+                source_scale_config_info.path,
+                generated_config_info.path,
+                generated_config_info.prefix,
+            )
+
+            print(
+                f"Generated updated pytorch/pytorch scale config file at {generated_config_info.path}\n"
+            )
+
+        cloned_scale_config = load_yaml_file(generated_config_info.path)
+
+        if not is_consistent_across_configs(
+            source_scale_config[RUNNER_TYPE_CONFIG_KEY],
+            cloned_scale_config[RUNNER_TYPE_CONFIG_KEY],
+            generated_config_info.prefix,
+        ):
+            print(
+                f"Consistency validation failed between {source_scale_config_info.path} and {generated_config_info.path}\n"
+            )
+            return False
+        else:
+            print(f"scale-config.yml is consistent with {generated_config_info.path}\n")
+            return True
+
+    for cloned_config_info in generated_scale_config_infos:
+        validation_success &= validate_config(cloned_config_info)
 
     if not validation_success:
         print(