[autoscaler] make 0 default min/max workers for head node #17757

Merged · 15 commits · Aug 25, 2021

Changes from 8 commits
4 changes: 4 additions & 0 deletions deploy/charts/ray/templates/raycluster.yaml
@@ -19,8 +19,12 @@ spec:
  podTypes:
  {{- range $key, $val := .Values.podTypes }}
  - name: {{ $key }}
    {{- if $val.minWorkers }}
    minWorkers: {{ $val.minWorkers }}
    {{- end }}
    {{- if $val.maxWorkers }}
    maxWorkers: {{ $val.maxWorkers }}
    {{- end }}
    {{- if $val.rayResources }}
    rayResources:
    {{- toYaml $val.rayResources | nindent 8 }}
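A minimal sketch of the effect of the conditionals above, assuming PyYAML and a hypothetical pod type whose `values.yaml` entry sets neither key: the template simply omits `minWorkers`/`maxWorkers`, leaving the operator to apply its own defaults (0 for the head pod type).

```python
import yaml

# Hypothetical rendered output for a pod type that sets neither
# minWorkers nor maxWorkers in values.yaml.
rendered = yaml.safe_load("""
podTypes:
  - name: rayHeadType
""")

head = rendered["podTypes"][0]
# The keys are absent (rather than rendered as 0), so downstream
# defaulting in the operator/autoscaler kicks in.
assert "minWorkers" not in head and "maxWorkers" not in head
```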
3 changes: 0 additions & 3 deletions deploy/charts/ray/values.yaml
@@ -12,9 +12,6 @@ podTypes:
  # Since we set headPodType: rayHeadType, the Ray head pod will use the configuration
  # defined in this entry of podTypes:
  rayHeadType:
    # No worker pods of this pod type (just the head). Thus, we set minWorkers and maxWorkers to 0.
    minWorkers: 0
    maxWorkers: 0
    # CPU is the number of CPUs used by this pod type.
    # (Used for both requests and limits. Must be an integer, as Ray does not support fractional CPUs.)
    CPU: 1
4 changes: 0 additions & 4 deletions deploy/components/example_cluster.yaml
@@ -29,10 +29,6 @@ spec:
  # Specify the allowed pod types for this ray cluster and the resources they provide.
  podTypes:
  - name: head-node
    # Minimum number of Ray workers of this Pod type.
    minWorkers: 0
    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
    maxWorkers: 0
    podConfig:
      apiVersion: v1
      kind: Pod
4 changes: 2 additions & 2 deletions doc/source/cluster/config.rst
@@ -349,8 +349,6 @@ Each node type is identified by a user-specified key.
          Ebs:
            VolumeSize: 100
      resources: {"CPU": 2}
      min_workers: 0
      max_workers: 0
    ray.worker.default:
      node_config:
        InstanceType: m5.large
@@ -971,6 +969,8 @@ The minimum number of workers to maintain for this node type regardless of utilization.

The maximum number of workers to have in the cluster for this node type regardless of utilization. This takes precedence over :ref:`minimum workers <cluster-configuration-node-min-workers>`. By default, the number of workers of a node type is unbounded, constrained only by the cluster-wide :ref:`max_workers <cluster-configuration-max-workers>`. (Prior to Ray 1.3.0, the default value for this field was 0.)

Note that for the head node type (the node type named by ``head_node_type``), the default value of ``max_workers`` is 0.

* **Required:** No
* **Importance:** High
* **Type:** Integer
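A sketch of the documented defaulting rules in plain Python, not the Ray API; the config shape and field names follow the YAML above, and the head node type is the one named by ``head_node_type``.

```python
# Sketch of the per-node-type worker bounds described above.
def effective_bounds(node_type_name, node_type, config):
    # min_workers defaults to 0 for every node type.
    min_w = node_type.get("min_workers", 0)
    if node_type_name == config["head_node_type"]:
        # Head node type: max_workers defaults to 0
        # (no workers of the head's own type).
        max_w = node_type.get("max_workers", 0)
    else:
        # Other node types: unbounded by default, constrained only by
        # the cluster-wide max_workers.
        max_w = node_type.get("max_workers", config["max_workers"])
    # max_workers takes precedence over min_workers.
    return min(min_w, max_w), max_w
```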
27 changes: 14 additions & 13 deletions python/ray/autoscaler/_private/util.py
@@ -157,7 +157,7 @@ def prepare_config(config: Dict[str, Any]) -> Dict[str, Any]:
    with_defaults = fillout_defaults(config)
    merge_setup_commands(with_defaults)
    validate_docker_config(with_defaults)
    fill_node_type_max_workers(with_defaults)
    fill_node_type_min_max_workers(with_defaults)
    return with_defaults


@@ -257,27 +257,28 @@ def merge_setup_commands(config):
    return config


def fill_node_type_max_workers(config):
def fill_node_type_min_max_workers(config):
    """Sets default per-node max workers to global max_workers.
    This is equivalent to setting the default per-node max workers to
    infinity, with the only upper constraint coming from the global
    max_workers.
    Sets default per-node min workers to zero.
    Also sets default max_workers for the head node to zero.
    """
    assert "max_workers" in config, "Global max workers should be set."
    node_types = config["available_node_types"]
    for node_type_name in node_types:
        node_type_data = node_types[node_type_name]

        # Log a warning if the head node type's max_workers is absent.
        if (node_type_name == config["head_node_type"]
                and "max_workers" not in node_type_data):
            cli_logger.warning(
                HEAD_TYPE_MAX_WORKERS_WARN_TEMPLATE.format(
                    node_type=node_type_name,
                    max_workers=config["max_workers"],
                    version=ray.__version__))

        # The key part of this function:
        node_type_data.setdefault("max_workers", config["max_workers"])
        node_type_data.setdefault("min_workers", 0)
        if "max_workers" not in node_type_data:
            if node_type_name == config["head_node_type"]:
                logger.info("setting max workers for head node to 0")
                node_type_data.setdefault("max_workers", 0)
            else:
                global_max_workers = config["max_workers"]
                logger.info(f"setting max workers for {node_type_name} to "
                            f"{global_max_workers}")
                node_type_data.setdefault("max_workers", global_max_workers)


def with_head_node_ip(cmds, head_ip=None):
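A usage sketch of the renamed helper, with hypothetical node type names; it assumes this PR's branch of `ray.autoscaler._private.util`.

```python
from ray.autoscaler._private.util import fill_node_type_min_max_workers

# Hypothetical config exercising the new defaults.
config = {
    "max_workers": 10,
    "head_node_type": "head",
    "available_node_types": {
        "head": {"resources": {"CPU": 1}},
        "worker": {"resources": {"CPU": 4}},
    },
}
fill_node_type_min_max_workers(config)

node_types = config["available_node_types"]
assert node_types["head"]["min_workers"] == 0
assert node_types["head"]["max_workers"] == 0     # head type defaults to 0
assert node_types["worker"]["max_workers"] == 10  # global max_workers
```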
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aliyun/defaults.yaml
@@ -47,12 +47,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on aliyun instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aliyun/example-full.yaml
@@ -53,12 +53,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on aliyun instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aliyun/example-linux.yaml
@@ -54,12 +54,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on aliyun instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aws/defaults.yaml
@@ -44,12 +44,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
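One way to sanity-check a trimmed YAML like the one above is to round-trip it through `prepare_config`; a sketch, assuming a local `cluster.yaml` copied from one of these examples.

```python
import yaml
from ray.autoscaler._private.util import prepare_config

# Hypothetical path to a trimmed cluster YAML like the one above.
with open("cluster.yaml") as f:
    config = yaml.safe_load(f)

prepared = prepare_config(config)
head_type = prepared["head_node_type"]
head = prepared["available_node_types"][head_type]
# prepare_config backfills the defaults this PR removes from the YAMLs.
print(head["min_workers"], head["max_workers"])  # expected: 0 0
```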
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aws/example-full.yaml
@@ -59,12 +59,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/aws/example-gpu-docker.yaml
@@ -47,12 +47,6 @@ available_node_types:
    # GPU head node.
    ray.head.gpu:
        # worker_image: rayproject/ray:latest-gpu # use this one if you don't need ML dependencies, it's faster to pull
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The node type's CPU and GPU resources are auto-detected based on AWS instance type.
        # If desired, you can override the autodetected CPU and GPU resources advertised to the autoscaler.
        # You can also set custom resources.
1 change: 0 additions & 1 deletion python/ray/autoscaler/aws/example-launch-templates.yaml
@@ -27,7 +27,6 @@ auth:

available_node_types:
    ray.head.default:
        max_workers: 0
        resources: {}
        node_config:
            # The launch template to use to launch the instances. Any parameters that
1 change: 0 additions & 1 deletion python/ray/autoscaler/aws/example-network-interfaces.yaml
@@ -42,7 +42,6 @@ auth:

available_node_types:
    ray.head.default:
        max_workers: 0
        resources: {}
        node_config:
            NetworkInterfaces:
7 changes: 0 additions & 7 deletions python/ray/autoscaler/azure/defaults.yaml
@@ -47,13 +47,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config, e.g. instance type.
        node_config:
6 changes: 0 additions & 6 deletions python/ray/autoscaler/azure/example-full.yaml
@@ -62,12 +62,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config, e.g. instance type.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/azure/example-gpu-docker.yaml
@@ -49,12 +49,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray.head.gpu:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 6, "GPU": 1}
        # Provider-specific config, e.g. instance type.
6 changes: 0 additions & 6 deletions python/ray/autoscaler/gcp/defaults.yaml
@@ -40,12 +40,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for this node type, e.g. instance type. By default
6 changes: 0 additions & 6 deletions python/ray/autoscaler/gcp/example-full.yaml
@@ -55,12 +55,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_default:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 2}
        # Provider-specific config for the head node, e.g. instance type. By default
6 changes: 0 additions & 6 deletions python/ray/autoscaler/gcp/example-gpu-docker.yaml
@@ -49,12 +49,6 @@ auth:
# The node config specifies the launch config and physical instance type.
available_node_types:
    ray_head_gpu:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        # The resources provided by this node type.
        resources: {"CPU": 6, "GPU": 1}
        # Provider-specific config for the head node, e.g. instance type. By default
2 changes: 0 additions & 2 deletions python/ray/autoscaler/gcp/tpu.yaml
@@ -13,8 +13,6 @@ max_workers: 7

available_node_types:
    ray_head_default:
        min_workers: 0
        max_workers: 0
        resources: {"CPU": 2}
        node_config:
            machineType: n2-standard-2
6 changes: 0 additions & 6 deletions python/ray/autoscaler/kubernetes/defaults.yaml
@@ -138,12 +138,6 @@ available_node_types:
                        # cause problems for other pods.
                        memory: 512Mi
    head_node:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        node_config:
            apiVersion: v1
            kind: Pod
6 changes: 0 additions & 6 deletions python/ray/autoscaler/kubernetes/example-full.yaml
@@ -143,12 +143,6 @@ available_node_types:
                        # cause problems for other pods.
                        memory: 512Mi
    head_node:
        # The minimum number of worker nodes of this type to launch.
        # This number should be >= 0.
        min_workers: 0
        # The maximum number of worker nodes of this type to launch.
        # This takes precedence over min_workers.
        max_workers: 0
        node_config:
            apiVersion: v1
            kind: Pod
10 changes: 8 additions & 2 deletions python/ray/tests/test_autoscaler.py
@@ -313,8 +313,6 @@ def finish_starting_nodes(self):
    },
    "available_node_types": {
        "ray.head.default": {
            "min_workers": 0,
            "max_workers": 0,
            "resources": {},
            "node_config": {
                "head_default_prop": 4
@@ -2881,6 +2879,14 @@ def metrics_incremented():
        self.waitFor(
            metrics_incremented, fail_msg="Expected metrics to update")

    def testDefaultMinMaxWorkers(self):
        config = copy.deepcopy(MOCK_DEFAULT_CONFIG)
        config = prepare_config(config)
        node_types = config["available_node_types"]
        head_node_config = node_types["ray.head.default"]
        assert head_node_config["min_workers"] == 0
        assert head_node_config["max_workers"] == 0


if __name__ == "__main__":
    import sys
6 changes: 4 additions & 2 deletions python/ray/tests/test_autoscaler_yaml.py
@@ -15,7 +15,7 @@
    _azure_configure_key_pair)
from ray.autoscaler._private.gcp import config as gcp_config
from ray.autoscaler._private.util import prepare_config, validate_config,\
    _get_default_config, merge_setup_commands
    _get_default_config, merge_setup_commands, fill_node_type_min_max_workers
from ray.autoscaler._private.providers import _NODE_PROVIDERS
from ray.autoscaler._private._kubernetes.node_provider import\
    KubernetesNodeProvider
@@ -416,7 +416,8 @@ def testFillEdgeLegacyConfigs(self):
    def testExampleFull(self):
        """
        Test that example-full yamls are unmodified by prepare_config,
        except possibly by having setup_commands merged.
        except possibly by having setup_commands merged and
        default per-node max/min workers set.
        """
        providers = ["aws", "gcp", "azure"]
        for provider in providers:
@@ -425,6 +426,7 @@ def testExampleFull(self):
            config = yaml.safe_load(open(path).read())
            config_copy = copy.deepcopy(config)
            merge_setup_commands(config_copy)
            fill_node_type_min_max_workers(config_copy)
            assert config_copy == prepare_config(config)

    @pytest.mark.skipif(
30 changes: 0 additions & 30 deletions python/ray/tests/test_cli.py
@@ -275,36 +275,6 @@ def commands_mock(command, stdin):
        _check_output_via_pattern("test_ray_up.txt", result)


@pytest.mark.skipif(
    sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
    reason=("Mac builds don't provide proper locale support"))
@mock_ec2
@mock_iam
def test_ray_up_no_head_max_workers(configure_lang, _unlink_test_ssh_key,
                                    configure_aws):
    def commands_mock(command, stdin):
        # if we want to have e.g. some commands fail,
        # we can have overrides happen here.
        # unfortunately, cutting out SSH prefixes and such
        # is, to put it lightly, non-trivial
        if "uptime" in command:
            return PopenBehaviour(stdout=b"MOCKED uptime")
        if "rsync" in command:
            return PopenBehaviour(stdout=b"MOCKED rsync")
        if "ray" in command:
            return PopenBehaviour(stdout=b"MOCKED ray")
        return PopenBehaviour(stdout=b"MOCKED GENERIC")

    with _setup_popen_mock(commands_mock):
        # config cache does not work with mocks
        runner = CliRunner()
        result = runner.invoke(scripts.up, [
            MISSING_MAX_WORKER_CONFIG_PATH, "--no-config-cache", "-y",
            "--log-style=pretty", "--log-color", "False"
        ])
        _check_output_via_pattern("test_ray_up_no_max_worker.txt", result)


@pytest.mark.skipif(
    sys.platform == "darwin" and "travis" in os.environ.get("USER", ""),
    reason=("Mac builds don't provide proper locale support"))