Commit 4248f1b

RobertLucian authored and vishalbollu committed
Docs for multi-instance type clusters (#1968)
(cherry picked from commit fce0df9)
1 parent 4f5732a commit 4248f1b

File tree

8 files changed: 250 additions, 87 deletions


CONTRIBUTING.md

Lines changed: 13 additions & 7 deletions
@@ -173,9 +173,12 @@ Create `dev/config/cluster-aws.yaml`. Paste the following config, and update `re
 cluster_name: cortex
 provider: aws
 region: <region> # e.g. us-west-2
-instance_type: m5.large
-min_instances: 1
-max_instances: 5
+
+node_groups:
+  - name: worker-ng
+    instance_type: m5.large
+    min_instances: 1
+    max_instances: 5
 
 image_operator: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/operator:master
 image_manager: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/manager:master
@@ -210,10 +213,13 @@ project: <project_id>
 zone: <zone> # e.g. us-east1-c
 cluster_name: cortex
 provider: gcp
-instance_type: n1-standard-2
-min_instances: 1
-max_instances: 5
-# accelerator_type: nvidia-tesla-k80 # optional
+
+node_pools:
+  - name: worker-np
+    instance_type: n1-standard-2
+    min_instances: 1
+    max_instances: 5
+    # accelerator_type: nvidia-tesla-k80 # optional
 
 image_operator: /cortexlabs/operator:master
 image_manager: gcr.io/<project_id>/cortexlabs/manager:master

docs/clusters/aws/install.md

Lines changed: 29 additions & 20 deletions
@@ -31,23 +31,35 @@ region: us-east-1
 # list of availability zones for your region
 availability_zones: # default: 3 random availability zones in your region, e.g. [us-east-1a, us-east-1b, us-east-1c]
 
-# instance type
-instance_type: m5.large
-
-# minimum number of instances
-min_instances: 1
-
-# maximum number of instances
-max_instances: 5
-
-# disk storage size per instance (GB)
-instance_volume_size: 50
-
-# instance volume type [gp2 | io1 | st1 | sc1]
-instance_volume_type: gp2
-
-# instance volume iops (only applicable to io1)
-# instance_volume_iops: 3000
+# list of cluster node groups; the smaller the index, the higher the priority of the node group
+node_groups:
+  - name: ng-cpu # name of the node group
+    instance_type: m5.large # instance type
+    min_instances: 1 # minimum number of instances
+    max_instances: 5 # maximum number of instances
+    instance_volume_size: 50 # disk storage size per instance (GB)
+    instance_volume_type: gp2 # instance volume type [gp2 | io1 | st1 | sc1]
+    # instance_volume_iops: 3000 # instance volume iops (only applicable to io1)
+    spot: false # enable spot instances
+
+  - name: ng-gpu
+    instance_type: g4dn.xlarge
+    min_instances: 1
+    max_instances: 5
+    instance_volume_size: 50
+    instance_volume_type: gp2
+    # instance_volume_iops: 3000
+    spot: false
+
+  - name: ng-inferentia
+    instance_type: inf1.xlarge
+    min_instances: 1
+    max_instances: 5
+    instance_volume_size: 50
+    instance_volume_type: gp2
+    # instance_volume_iops: 3000
+    spot: false
+  ...
 
 # subnet visibility [public (instances will have public IPs) | private (instances will not have public IPs)]
 subnet_visibility: public
@@ -75,9 +87,6 @@ operator_load_balancer_scheme: internet-facing
 # additional tags to assign to AWS resources (all resources will automatically be tagged with cortex.dev/cluster-name: <cluster_name>)
 tags: # <string>: <string> map of key/value pairs
 
-# enable spot instances
-spot: false
-
 # SSL certificate ARN (only necessary when using a custom domain)
 ssl_certificate_arn:

Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+# Multi-instance type clusters
+
+The cluster can be configured to provision different instance types depending on what resources the APIs request. A multi-instance type cluster has the following advantages over a single-instance type cluster:
+
+* **Lower costs**: Reduced overall compute costs by using the most economical instance for the given workloads.
+* **Simpler logistics**: Managing multiple clusters on your own is no longer required.
+* **Multi-purpose cluster**: The cluster can now handle any range of workloads. One cluster for everything: just add the node groups you need to the cluster config, and you're set.
+
+## Best practices
+
+When specifying the node groups in your `cluster.yaml` config, keep in mind that node groups with lower indexes have higher priority than those that follow. With that in mind, the resulting best practices are:
+
+1. Node groups with smaller instances should have higher priority.
+1. Node groups with CPU-only instances should come before node groups equipped with GPU/Inferentia instances.
+1. Spot node groups should always come before node groups with on-demand instances.
+
+## Example node groups
+
+### CPU spot/on-demand with GPU on-demand
+
+```yaml
+# cluster.yaml
+
+node_groups:
+  - name: cpu-spot
+    instance_type: m5.large
+    spot: true
+  - name: cpu
+    instance_type: m5.large
+  - name: gpu
+    instance_type: g4dn.xlarge
+```
+
+### CPU on-demand, GPU on-demand, and Inferentia on-demand
+
+```yaml
+# cluster.yaml
+
+node_groups:
+  - name: cpu
+    instance_type: m5.large
+  - name: gpu
+    instance_type: g4dn.xlarge
+  - name: inferentia
+    instance_type: inf1.xlarge
+```
+
+### 3 spot CPU node groups with 1 on-demand CPU node group
+
+```yaml
+# cluster.yaml
+
+node_groups:
+  - name: cpu-0
+    instance_type: t3.medium
+    spot: true
+  - name: cpu-1
+    instance_type: m5.2xlarge
+    spot: true
+  - name: cpu-2
+    instance_type: m5.8xlarge
+    spot: true
+  - name: cpu-3
+    instance_type: m5.24xlarge
+```
+
+The above can also be achieved with the following config:
+
+```yaml
+# cluster.yaml
+
+node_groups:
+  - name: cpu-0
+    instance_type: t3.medium
+    spot: true
+    spot_config:
+      instance_distribution: [m5.2xlarge, m5.8xlarge]
+      max_price: 3.27
+  - name: cpu-1
+    instance_type: m5.24xlarge
+```
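For reference, the three best practices listed in the new doc can be combined in a single node group list. The sketch below is illustrative only and not part of this commit; it uses only fields and instance types that appear elsewhere in this diff, and the node group names are hypothetical:

```yaml
# cluster.yaml

node_groups:
  # spot CPU node group first (highest priority)
  - name: cpu-spot
    instance_type: m5.large
    spot: true
  # on-demand CPU node group as a fallback for the spot group above
  - name: cpu-on-demand
    instance_type: m5.large
  # GPU and Inferentia node groups last (lowest priority)
  - name: gpu-on-demand
    instance_type: g4dn.xlarge
  - name: inf-on-demand
    instance_type: inf1.xlarge
```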

docs/clusters/aws/spot.md

Lines changed: 37 additions & 38 deletions
@@ -3,62 +3,60 @@
 ```yaml
 # cluster.yaml
 
-# whether to use spot instances in the cluster (default: false)
-spot: false
+node_groups:
+  - name: node-group-0
 
-spot_config:
-  # additional instance types with identical or better specs than the primary cluster instance type (defaults to only the primary instance type)
-  instance_distribution: # [similar_instance_type_1, similar_instance_type_2]
+    # whether to use spot instances for this node group (default: false)
+    spot: false
 
-  # minimum number of on demand instances (default: 0)
-  on_demand_base_capacity: 0
+    spot_config:
+      # additional instance types with identical or better specs than the primary cluster instance type (defaults to only the primary instance type)
+      instance_distribution: # [similar_instance_type_1, similar_instance_type_2]
 
-  # percentage of on demand instances to use after the on demand base capacity has been met [0, 100] (default: 50)
-  # note: setting this to 0 may hinder cluster scale up when spot instances are not available
-  on_demand_percentage_above_base_capacity: 0
+      # minimum number of on demand instances (default: 0)
+      on_demand_base_capacity: 0
 
-  # max price for spot instances (default: the on-demand price of the primary instance type)
-  max_price: # <float>
+      # percentage of on demand instances to use after the on demand base capacity has been met [0, 100] (default: 50)
+      # note: setting this to 0 may hinder cluster scale up when spot instances are not available
+      on_demand_percentage_above_base_capacity: 0
 
-  # number of spot instance pools across which to allocate spot instances [1, 20] (default: number of instances in instance distribution)
-  instance_pools: 3
+      # max price for spot instances (default: the on-demand price of the primary instance type)
+      max_price: # <float>
 
-  # fallback to on-demand instances if spot instances were unable to be allocated (default: true)
-  on_demand_backup: true
+      # number of spot instance pools across which to allocate spot instances [1, 20] (default: number of instances in instance distribution)
+      instance_pools: 3
 ```
 
 Spot instances are not guaranteed to be available. The chances of getting spot instances can be improved by providing `instance_distribution`, a list of alternative instance types to the primary `instance_type` you specified. If left blank, Cortex will only include the primary instance type in the `instance_distribution`. When using `instance_distribution`, use the instance type with the fewest compute resources as your primary `instance_type`. Note that the default value for `max_price` is the on-demand price of the primary instance type, but you may wish to set this to the on-demand price of the most expensive instance type in your `instance_distribution`.
 
 Spot instances can be mixed with on-demand instances by configuring `on_demand_base_capacity` and `on_demand_percentage_above_base_capacity`. `on_demand_base_capacity` enforces the minimum number of nodes that will be fulfilled by on-demand instances as your cluster is scaling up. `on_demand_percentage_above_base_capacity` defines the percentage of instances that will be on-demand after the base capacity has been fulfilled (the rest being spot instances). `instance_pools` is the number of pools per availability zone to allocate your instances from. See [here](https://docs.aws.amazon.com/autoscaling/ec2/APIReference/API_InstancesDistribution.html) for more details.
 
-Even if multiple instances are specified in your `instance_distribution` on-demand instances are mixed, there is still a possibility of running into scale up issues when attempting to spin up spot instances. Spot instance requests may not be fulfilled for several reasons. Spot instance pricing fluctuates, therefore the `max_price` may be lower than the current spot pricing rate. Another possibility could be that the availability zones of the cluster ran out of spot instances. `on_demand_backup` can be used mitigate the impact of unfulfilled spot requests by enabling the cluster to spin up on-demand instances if spot instance requests are not fulfilled within 5 minutes.
+Even if multiple instance types are specified in your `instance_distribution` and on-demand instances are mixed in, there is still a possibility of running into scale-up issues when attempting to spin up spot instances. Spot instance requests may not be fulfilled for several reasons: spot instance pricing fluctuates, so the `max_price` may be lower than the current spot pricing rate, or the cluster's availability zones may have run out of spot instances. Adding another on-demand node group to `node_groups` with a lower priority (i.e. a higher index in the `node_groups` list) can mitigate the impact of unfulfilled spot requests by enabling the cluster to spin up on-demand instances if spot instance requests are not fulfilled within 5 minutes.
 
 There is a spot instance limit associated with your AWS account for each instance family in each region. You can check your current limit and request an increase [here](https://console.aws.amazon.com/servicequotas/home?#!/services/ec2/quotas) (set the region in the upper right corner to your desired region, type "spot" in the search bar, and click on the quota that matches your instance type). Note that the quota values indicate the number of vCPUs available, not the number of instances; different instance types have different numbers of vCPUs, which can be seen [here](https://aws.amazon.com/ec2/instance-types/).
 
 ## Example spot configuration
 
-### Only spot instances with backup
+### Only spot instances
 
 ```yaml
-
-spot: true
-
-spot_config:
-  on_demand_base_capacity: 0
-  on_demand_percentage_above_base_capacity: 0
-  on_demand_backup: true # recommended for production clusters
+node_groups:
+  - name: node-group-1
+    spot: true
 ```
 
 ### 3 on-demand base capacity with 0% on-demand above base capacity
 
 ```yaml
-min_instances: 0
-max_instances: 5
 
-spot: true
-spot_config:
-  on_demand_base_capacity: 3
-  on_demand_percentage_above_base_capacity: 0
+node_groups:
+  - name: node-group-1
+    min_instances: 0
+    max_instances: 5
+    spot: true
+    spot_config:
+      on_demand_base_capacity: 3
+      on_demand_percentage_above_base_capacity: 0
 
 # instance 1-3: on-demand
 # instance 4-5: spot
@@ -67,13 +65,14 @@ spot_config:
 ### 0 on-demand base capacity with 50% on-demand above base capacity
 
 ```yaml
-min_instances: 0
-max_instances: 4
-
-spot: true
-spot_config:
-  on_demand_base_capacity: 0
-  on_demand_percentage_above_base_capacity: 50
+node_groups:
+  - name: node-group-2
+    min_instances: 0
+    max_instances: 4
+    spot: true
+    spot_config:
+      on_demand_base_capacity: 0
+      on_demand_percentage_above_base_capacity: 50
 
 # instance 1: on-demand
 # instance 2: spot
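To make the updated fallback behavior concrete, a spot node group can be paired with a lower-priority on-demand node group of the same instance type. This is only a sketch and not part of this commit; the node group names are hypothetical and the `instance_distribution` entries are illustrative alternatives to `m5.large`:

```yaml
# cluster.yaml

node_groups:
  # index 0: spot node group (higher priority)
  - name: cpu-spot
    instance_type: m5.large
    spot: true
    spot_config:
      instance_distribution: [m5a.large, m5d.large]
  # index 1: on-demand node group (lower priority); the cluster falls back to it
  # when spot requests are not fulfilled
  - name: cpu-on-demand
    instance_type: m5.large
```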

docs/clusters/gcp/install.md

Lines changed: 18 additions & 21 deletions
@@ -31,27 +31,24 @@ project: <your GCP project ID>
 # GCP zone for your cluster
 zone: us-east1-c
 
-# instance type
-instance_type: n1-standard-2
-
-# minimum number of instances
-min_instances: 1
-
-# maximum number of instances
-max_instances: 5
-
-# enable the use of preemptible instances
-preemptible: false
-
-# enable the use of on-demand backup instances which will be used when preemptible capacity runs out
-# default is true when preemptible instances are used
-# on_demand_backup: true
-
-# GPU to attach to your instance (optional)
-# accelerator_type: nvidia-tesla-t4
-
-# the number of GPUs to attach to each instance (optional)
-# accelerators_per_instance: 1
+# list of cluster node pools; the smaller the index, the higher the priority of the node pool
+node_pools:
+  - name: np-cpu # name of the node pool
+    instance_type: n1-standard-2 # instance type
+    # accelerator_type: nvidia-tesla-t4 # GPU to attach to your instance (optional)
+    # accelerators_per_instance: 1 # the number of GPUs to attach to each instance (optional)
+    min_instances: 1 # minimum number of instances
+    max_instances: 5 # maximum number of instances
+    preemptible: false # enable the use of preemptible instances
+
+  - name: np-gpu
+    instance_type: n1-standard-2
+    accelerator_type: nvidia-tesla-t4
+    accelerators_per_instance: 1
+    min_instances: 1
+    max_instances: 5
+    preemptible: false
+  ...
 
 # the name of the network in which to create your cluster
 # network: default
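By analogy with the AWS spot fallback pattern, a GCP cluster could pair a preemptible node pool with a lower-priority on-demand pool. This sketch is not part of this commit; it assumes the priority ordering described in the node pool comment above and uses only fields shown in this diff (pool names are hypothetical):

```yaml
# cluster.yaml

node_pools:
  # index 0: preemptible pool (higher priority)
  - name: np-preemptible
    instance_type: n1-standard-2
    min_instances: 1
    max_instances: 5
    preemptible: true
  # index 1: on-demand pool, used when preemptible capacity runs out
  - name: np-on-demand
    instance_type: n1-standard-2
    min_instances: 1
    max_instances: 5
    preemptible: false
```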
