[autoscaler] make 0 default min/max workers for head node #17757

Merged · 15 commits · Aug 25, 2021

Changes from 8 commits
4 changes: 4 additions & 0 deletions deploy/charts/ray/templates/raycluster.yaml
@@ -19,8 +19,12 @@ spec:
  podTypes:
  {{- range $key, $val := .Values.podTypes }}
  - name: {{ $key }}
    {{- if $val.minWorkers }}
    minWorkers: {{ $val.minWorkers }}
    {{- end }}
    {{- if $val.maxWorkers }}
    maxWorkers: {{ $val.maxWorkers }}
    {{- end }}
    {{- if $val.rayResources }}
    rayResources:
    {{- toYaml $val.rayResources | nindent 8 }}
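A minimal sketch of the effect of the conditionals above, assuming PyYAML and a hypothetical pod type whose `values.yaml` entry sets neither key: the template simply omits `minWorkers`/`maxWorkers`, leaving the operator to apply its own defaults (0 for the head pod type).

```python
import yaml

# Hypothetical rendered output for a pod type that sets neither
# minWorkers nor maxWorkers in values.yaml.
rendered = yaml.safe_load("""
podTypes:
  - name: rayHeadType
""")

head = rendered["podTypes"][0]
# The keys are absent (rather than rendered as 0), so downstream
# defaulting in the operator/autoscaler kicks in.
assert "minWorkers" not in head and "maxWorkers" not in head
```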
3 changes: 0 additions & 3 deletions deploy/charts/ray/values.yaml
@@ -12,9 +12,6 @@ podTypes:
  # Since we set headPodType: rayHeadType, the Ray head pod will use the configuration
  # defined in this entry of podTypes:
  rayHeadType:
    # No worker pods of this pod type (just the head). Thus, we set minWorkers and maxWorkers to 0.
    minWorkers: 0
    maxWorkers: 0
    # CPU is the number of CPUs used by this pod type.
    # (Used for both requests and limits. Must be an integer, as Ray does not support fractional CPUs.)
    CPU: 1
4 changes: 0 additions & 4 deletions deploy/components/example_cluster.yaml
@@ -29,10 +29,6 @@ spec:
  # Specify the allowed pod types for this ray cluster and the resources they provide.
  podTypes:
  - name: head-node
    # Minimum number of Ray workers of this Pod type.
    minWorkers: 0
    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
    maxWorkers: 0
    podConfig:
      apiVersion: v1
      kind: Pod
4 changes: 2 additions & 2 deletions doc/source/cluster/config.rst
@@ -349,8 +349,6 @@ Each node type is identified by a user-specified key.
          Ebs:
            VolumeSize: 100
      resources: {"CPU": 2}
      min_workers: 0
      max_workers: 0
    ray.worker.default:
      node_config:
        InstanceType: m5.large
@@ -971,6 +969,8 @@ The minimum number of workers to maintain for this node type regardless of utilization.

The maximum number of workers to have in the cluster for this node type regardless of utilization. This takes precedence over :ref:`minimum workers <cluster-configuration-node-min-workers>`. By default, the number of workers of a node type is unbounded, constrained only by the cluster-wide :ref:`max_workers <cluster-configuration-max-workers>`. (Prior to Ray 1.3.0, the default value for this field was 0.)

Note that for the head node type (the node type named by ``head_node_type``), the default value of ``max_workers`` is 0.

* **Required:** No
* **Importance:** High
* **Type:** Integer
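A sketch of the documented defaulting rules in plain Python, not the Ray API; the config shape and field names follow the YAML above, and the head node type is the one named by ``head_node_type``.

```python
# Sketch of the per-node-type worker bounds described above.
def effective_bounds(node_type_name, node_type, config):
    # min_workers defaults to 0 for every node type.
    min_w = node_type.get("min_workers", 0)
    if node_type_name == config["head_node_type"]:
        # Head node type: max_workers defaults to 0
        # (no workers of the head's own type).
        max_w = node_type.get("max_workers", 0)
    else:
        # Other node types: unbounded by default, constrained only by
        # the cluster-wide max_workers.
        max_w = node_type.get("max_workers", config["max_workers"])
    # max_workers takes precedence over min_workers.
    return min(min_w, max_w), max_w
```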
27 changes: 14 additions & 13 deletions python/ray/autoscaler/_private/util.py
@@ -157,7 +157,7 @@ def prepare_config(config: Dict[str, Any]) -> Dict[str, Any]:
    with_defaults = fillout_defaults(config)
    merge_setup_commands(with_defaults)
    validate_docker_config(with_defaults)
    fill_node_type_max_workers(with_defaults)
    fill_node_type_min_max_workers(with_defaults)
    return with_defaults


@@ -257,27 +257,28 @@ def merge_setup_commands(config):
    return config


def fill_node_type_max_workers(config):
def fill_node_type_min_max_workers(config):
    """Sets default per-node max workers to global max_workers.
    This is equivalent to setting the default per-node max workers to
    infinity, with the only upper constraint coming from the global
    max_workers.
    Sets default per-node min workers to zero.
    Also sets default max_workers for the head node to zero.
    """
    assert "max_workers" in config, "Global max workers should be set."
    node_types = config["available_node_types"]
    for node_type_name in node_types:
        node_type_data = node_types[node_type_name]

        # Log a warning if the head node type's max_workers is absent.
        if (node_type_name == config["head_node_type"]
                and "max_workers" not in node_type_data):
            cli_logger.warning(
                HEAD_TYPE_MAX_WORKERS_WARN_TEMPLATE.format(
                    node_type=node_type_name,
                    max_workers=config["max_workers"],
                    version=ray.__version__))

        # The key part of this function:
        node_type_data.setdefault("max_workers", config["max_workers"])
        node_type_data.setdefault("min_workers", 0)
        if "max_workers" not in node_type_data:
            if node_type_name == config["head_node_type"]:
                logger.info("setting max workers for head node to 0")
                node_type_data.setdefault("max_workers", 0)
            else:
                global_max_workers = config["max_workers"]
                logger.info(f"setting max workers for {node_type_name} to "
                            f"{global_max_workers}")
                node_type_data.setdefault("max_workers", global_max_workers)


def with_head_node_ip(cmds, head_ip=None):
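A usage sketch of the renamed helper, with hypothetical node type names; it assumes this PR's branch of `ray.autoscaler._private.util`.

```python
from ray.autoscaler._private.util import fill_node_type_min_max_workers

# Hypothetical config exercising the new defaults.
config = {
    "max_workers": 10,
    "head_node_type": "head",
    "available_node_types": {
        "head": {"resources": {"CPU": 1}},
        "worker": {"resources": {"CPU": 4}},
    },
}
fill_node_type_min_max_workers(config)

node_types = config["available_node_types"]
assert node_types["head"]["min_workers"] == 0
assert node_types["head"]["max_workers"] == 0     # head type defaults to 0
assert node_types["worker"]["max_workers"] == 10  # global max_workers
```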
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aliyun/defaults.yaml
@@ -47,12 +47,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on aliyun instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aliyun/example-full.yaml
@@ -53,12 +53,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on aliyun instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aliyun/example-linux.yaml
@@ -54,12 +54,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on aliyun instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aws/defaults.yaml
@@ -44,12 +44,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
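One way to sanity-check a trimmed YAML like the one above is to round-trip it through `prepare_config`; a sketch, assuming a local `cluster.yaml` copied from one of these examples.

```python
import yaml
from ray.autoscaler._private.util import prepare_config

# Hypothetical path to a trimmed cluster YAML like the one above.
with open("cluster.yaml") as f:
    config = yaml.safe_load(f)

prepared = prepare_config(config)
head_type = prepared["head_node_type"]
head = prepared["available_node_types"][head_type]
# prepare_config backfills the defaults this PR removes from the YAMLs.
print(head["min_workers"], head["max_workers"])  # expected: 0 0
```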
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aws/example-full.yaml
@@ -59,12 +59,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aws/example-gpu-docker.yaml
@@ -47,12 +47,6 @@ available_node_types:
    # GPU head node.
    ray.head.gpu:
        # worker_image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
1 change: 0 additions & 1 deletion python/ray/autoscaler/aws/example-launch-templates.yaml
@@ -27,7 +27,6 @@ auth:

available_node_types:
    ray.head.default:
        max_workers: 0
        resources: {}
        node_config:
            # The launch template to use to launch the instances. Any parameters that
1 change: 0 additions & 1 deletion python/ray/autoscaler/aws/example-network-interfaces.yaml
@@ -42,7 +42,6 @@ auth:

available_node_types:
    ray.head.default:
        max_workers: 0
        resources: {}
        node_config:
            NetworkInterfaces:
7 changes: 0 additions & 7 deletions python/ray/autoscaler/azure/defaults.yaml
@@ -47,13 +47,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config, e.g. instance type.
        node_config:
6 changes: 0 additions & 6 deletions python/ray/autoscaler/azure/example-full.yaml
@@ -62,12 +62,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config, e.g. instance type.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/azure/example-gpu-docker.yaml
@@ -49,12 +49,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.gpu:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 6, "GPU": 1}
        # Provider-specific config, e.g. instance type.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/gcp/defaults.yaml
@@ -40,12 +40,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for this node type, e.g. instance type. By default
6 changes: 0 additions & 6 deletions python/ray/autoscaler/gcp/example-full.yaml
@@ -55,12 +55,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for the head node, e.g. instance type. By default
6 changes: 0 additions & 6 deletions python/ray/autoscaler/gcp/example-gpu-docker.yaml
@@ -49,12 +49,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_gpu:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 6, "GPU": 1}
        # Provider-specific config for the head node, e.g. instance type. By default
2 changes: 0 additions & 2 deletions python/ray/autoscaler/gcp/tpu.yaml
@@ -13,8 +13,6 @@ max_workers: 7

available_node_types:
    ray_head_default:
        min_workers: 0
        max_workers: 0
        resources: {"CPU": 2}
        node_config:
            machineType: n2-standard-2
6 changes: 0 additions & 6 deletions python/ray/autoscaler/kubernetes/defaults.yaml
@@ -138,12 +138,6 @@ available_node_types:
                        # cause problems for other pods.
                        memory: 512Mi
    head_node:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        node_config:
            apiVersion: v1
            kind: Pod
6 changes: 0 additions & 6 deletions python/ray/autoscaler/kubernetes/example-full.yaml
@@ -143,12 +143,6 @@ available_node_types:
                        # cause problems for other pods.
                        memory: 512Mi
    head_node:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        node_config:
            apiVersion: v1
            kind: Pod
10 changes: 8 additions & 2 deletions python/ray/tests/test_autoscaler.py
@@ -313,8 +313,6 @@ def finish_starting_nodes(self):
    },
    "available_node_types": {
        "ray.head.default": {
            "min_workers": 0,
            "max_workers": 0,
            "resources": {},
            "node_config": {
                "head_default_prop": 4
@@ -2881,6 +2879,14 @@ def metrics_incremented():
        self.waitFor(
            metrics_incremented, fail_msg="Expected metrics to update")

    def testDefaultMinMaxWorkers(self):
        config = copy.deepcopy(MOCK_DEFAULT_CONFIG)
        config = prepare_config(config)
        node_types = config["available_node_types"]
        head_node_config = node_types["ray.head.default"]
        assert head_node_config["min_workers"] == 0
        assert head_node_config["max_workers"] == 0


if __name__ == "__main__":
    import sys
6 changes: 4 additions & 2 deletions python/ray/tests/test_autoscaler_yaml.py
@@ -15,7 +15,7 @@
    _azure_configure_key_pair)
from ray.autoscaler._private.gcp import config as gcp_config
from ray.autoscaler._private.util import prepare_config, validate_config,\
    _get_default_config, merge_setup_commands
    _get_default_config, merge_setup_commands, fill_node_type_min_max_workers
from ray.autoscaler._private.providers import _NODE_PROVIDERS
from ray.autoscaler._private._kubernetes.node_provider import\
    KubernetesNodeProvider
@@ -416,7 +416,8 @@ def testFillEdgeLegacyConfigs(self):
    def testExampleFull(self):
        """
        Test that example-full yamls are unmodified by prepare_config,
        except possibly by having setup_commands merged.
        except possibly by having setup_commands merged and
        default per-node max/min workers set.
        """
        providers = ["aws", "gcp", "azure"]
        for provider in providers:
@@ -425,6 +426,7 @@ def testExampleFull(self):
            config = yaml.safe_load(open(path).read())
            config_copy = copy.deepcopy(config)
            merge_setup_commands(config_copy)
            fill_node_type_min_max_workers(config_copy)
            assert config_copy == prepare_config(config)

    @pytest.mark.skipif(
30 changes: 0 additions & 30 deletions python/ray/tests/test_cli.py
@@ -275,36 +275,6 @@ def commands_mock(command, stdin):
        _check_output_via_pattern("test_ray_up.txt", result)


@pytest.mark.skipif(
    sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
    reason=("Mac builds don't provide proper locale support"))
@mock_ec2
@mock_iam
def test_ray_up_no_head_max_workers(configure_lang, _unlink_test_ssh_key,
                                    configure_aws):
    def commands_mock(command, stdin):
        # if we want to have e.g. some commands fail,
        # we can have overrides happen here.
        # unfortunately, cutting out SSH prefixes and such
        # is, to put it lightly, non-trivial
        if "uptime" in command:
            return PopenBehaviour(stdout=b"MOCKED uptime")
        if "rsync" in command:
            return PopenBehaviour(stdout=b"MOCKED rsync")
        if "ray" in command:
            return PopenBehaviour(stdout=b"MOCKED ray")
        return PopenBehaviour(stdout=b"MOCKED GENERIC")

    with _setup_popen_mock(commands_mock):
        # config cache does not work with mocks
        runner = CliRunner()
        result = runner.invoke(scripts.up, [
            MISSING_MAX_WORKER_CONFIG_PATH, "--no-config-cache", "-y",
            "--log-style=pretty", "--log-color", "False"
        ])
        _check_output_via_pattern("test_ray_up_no_max_worker.txt", result)


@pytest.mark.skipif(
    sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
    reason=("Mac builds don't provide proper locale support"))