OCI provider based on instance-pools and instance-configurations.
jlamillan committed Jan 25, 2022
1 parent d378407 commit 65ebd8d
Showing 22 changed files with 3,101 additions and 4 deletions.
2 changes: 2 additions & 0 deletions cluster-autoscaler/README.md
@@ -25,6 +25,7 @@ You should also take a look at the notes and "gotchas" for your specific cloud p
* [IonosCloud](./cloudprovider/ionoscloud/README.md)
* [OVHcloud](./cloudprovider/ovhcloud/README.md)
* [Linode](./cloudprovider/linode/README.md)
* [OracleCloud](./cloudprovider/oci/README.md)
* [ClusterAPI](./cloudprovider/clusterapi/README.md)
* [BizflyCloud](./cloudprovider/bizflycloud/README.md)

@@ -165,5 +166,6 @@ Supported cloud providers:
* Equinix Metal https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/packet/README.md
* OVHcloud https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/ovhcloud/README.md
* Linode https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/linode/README.md
* OCI https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/oci/README.md
* Hetzner https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/hetzner/README.md
* Cluster API https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/clusterapi/README.md
8 changes: 6 additions & 2 deletions cluster-autoscaler/cloudprovider/builder/builder_all.go
@@ -1,5 +1,5 @@
-//go:build !gce && !aws && !azure && !kubemark && !alicloud && !magnum && !digitalocean && !clusterapi && !huaweicloud && !ionoscloud && !linode && !hetzner && !bizflycloud && !brightbox && !packet
-// +build !gce,!aws,!azure,!kubemark,!alicloud,!magnum,!digitalocean,!clusterapi,!huaweicloud,!ionoscloud,!linode,!hetzner,!bizflycloud,!brightbox,!packet
+//go:build !gce && !aws && !azure && !kubemark && !alicloud && !magnum && !digitalocean && !clusterapi && !huaweicloud && !ionoscloud && !linode && !hetzner && !bizflycloud && !brightbox && !packet && !oci
+// +build !gce,!aws,!azure,!kubemark,!alicloud,!magnum,!digitalocean,!clusterapi,!huaweicloud,!ionoscloud,!linode,!hetzner,!bizflycloud,!brightbox,!packet,!oci

/*
Copyright 2018 The Kubernetes Authors.
@@ -37,6 +37,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/ionoscloud"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/linode"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/magnum"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/ovhcloud"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/packet"
"k8s.io/autoscaler/cluster-autoscaler/config"
@@ -55,6 +56,7 @@ var AvailableCloudProviders = []string{
cloudprovider.ExoscaleProviderName,
cloudprovider.HuaweicloudProviderName,
cloudprovider.HetznerProviderName,
cloudprovider.OracleCloudProviderName,
cloudprovider.OVHcloudProviderName,
cloudprovider.ClusterAPIProviderName,
cloudprovider.IonoscloudProviderName,
@@ -105,6 +107,8 @@ func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGro
return ionoscloud.BuildIonosCloud(opts, do, rl)
case cloudprovider.LinodeProviderName:
return linode.BuildLinode(opts, do, rl)
case cloudprovider.OracleCloudProviderName:
return oci.BuildOCI(opts, do, rl)
}
return nil
}
43 changes: 43 additions & 0 deletions cluster-autoscaler/cloudprovider/builder/builder_oci.go
@@ -0,0 +1,43 @@
//go:build oci
// +build oci

/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package builder

import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci"
"k8s.io/autoscaler/cluster-autoscaler/config"
)

// AvailableCloudProviders supported by the cloud provider builder.
var AvailableCloudProviders = []string{
cloudprovider.OracleCloudProviderName,
}

// DefaultCloudProvider for oci-only build is oci.
const DefaultCloudProvider = cloudprovider.OracleCloudProviderName

func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider {
switch opts.CloudProviderName {
case cloudprovider.OracleCloudProviderName:
return oci.BuildOCI(opts, do, rl)
}

return nil
}
2 changes: 2 additions & 0 deletions cluster-autoscaler/cloudprovider/cloud_provider.go
@@ -60,6 +60,8 @@ const (
HuaweicloudProviderName = "huaweicloud"
// IonoscloudProviderName gets the provider name of ionoscloud
IonoscloudProviderName = "ionoscloud"
// OracleCloudProviderName gets the provider name of oci
OracleCloudProviderName = "oci"
// OVHcloudProviderName gets the provider name of ovhcloud
OVHcloudProviderName = "ovhcloud"
// LinodeProviderName gets the provider name of linode
5 changes: 5 additions & 0 deletions cluster-autoscaler/cloudprovider/oci/OWNERS
@@ -0,0 +1,5 @@
approvers:
#- jlamillan
reviewers:
#- jlamillan
#- ericrrath
220 changes: 220 additions & 0 deletions cluster-autoscaler/cloudprovider/oci/README.md
@@ -0,0 +1,220 @@
# Cluster Autoscaler for Oracle Cloud Infrastructure (OCI)

On OCI, the cluster-autoscaler utilizes [Instance Pools](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/creatinginstancepool.htm)
combined with [Instance Configurations](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/creatinginstanceconfig.htm) to
automatically resize a cluster's nodes based on application workload demands by:

- adding nodes to static instance-pool(s) when a pod cannot be scheduled in the cluster due to insufficient resources.
- removing nodes from instance-pool(s) when the nodes have been underutilized for an extended time, and when their pods can be placed on other existing nodes.

The cluster-autoscaler works on a per-instance-pool basis. You configure it with the instance pools to target
for expansion and contraction, the minimum and maximum sizes for each pool, and how the autoscaling should take place.
Instance pools not referenced in the configuration file are not managed by the cluster-autoscaler.

## Create Required OCI Resources

### IAM Policy (if using Instance Principals)

We recommend setting up and configuring the cluster-autoscaler to use
[Instance Principals](https://docs.oracle.com/en-us/iaas/Content/Identity/Tasks/callingservicesfrominstances.htm)
to authenticate to the OCI APIs.

The following policy provides the minimum privileges necessary for Cluster Autoscaler to run:

1: Create a compartment-level dynamic group containing the nodes (compute instances) in the cluster:

```
All {instance.compartment.id = 'ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf'}
```
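
For reference, a dynamic group with this matching rule can also be created with the OCI CLI. A minimal sketch, reusing the group name from the policy below (the description is illustrative):

```bash
# Sketch: create the dynamic group via the OCI CLI.
# The group name and description are placeholders; substitute your own
# compartment OCID in the matching rule.
oci iam dynamic-group create \
  --name acme-oci-cluster-autoscaler-dyn-grp \
  --description "Instances allowed to run the cluster-autoscaler" \
  --matching-rule "All {instance.compartment.id = 'ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf'}"
```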

2: Create a *tenancy-level* policy to allow nodes to manage instance-pools:

```
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-pools in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-configurations in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-family in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to use subnets in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to read virtual-network-family in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to use vnics in compartment <compartment-name>
Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to inspect compartments in compartment <compartment-name>
```
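
These statements can likewise be applied from the CLI. A sketch, assuming a policy name of your choosing (only the first two statements are shown; pass all seven in practice):

```bash
# Sketch: create the tenancy-level policy via the OCI CLI.
# The policy name is a placeholder; --statements takes a JSON array of
# the policy statements listed above.
oci iam policy create \
  --compartment-id <tenancy-ocid> \
  --name acme-oci-cluster-autoscaler-policy \
  --description "Cluster-autoscaler instance-pool management" \
  --statements '["Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-pools in compartment <compartment-name>","Allow dynamic-group acme-oci-cluster-autoscaler-dyn-grp to manage instance-configurations in compartment <compartment-name>"]'
```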

### Instance Pool and Instance Configurations

Before you deploy the cluster-autoscaler on OCI, you need to create one or more static instance-pools and an instance
configuration with `cloud-init` specified in the launch details, so that new nodes automatically join the existing cluster on
startup.

Advanced instance-pool and instance-configuration setup is out of scope for this document. However, a
working [instance-details.json](./examples/instance-details.json) and [placement-config.json](./examples/placement-config.json)
(based on Rancher [RKE](https://rancher.com/products/rke/)) using [cloud-init](https://cloudinit.readthedocs.io/en/latest/) are
included in the examples, and can be applied using the [OCI CLI](https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/cliinstall.htm).

Modify the `user_data` in the example [instance-details.json](./examples/instance-details.json) to suit your needs, re-encode it as base64, and apply:

```bash
# e.g. cloud-init. Modify, re-encode, and update user_data in instance-details.json to suit your needs:

$ echo IyEvYmluL2Jhc2gKdG91hci9saWIvYXB0L....1yZXRyeSAzIGhG91Y2ggL3RtcC9jbG91ZC1pbml0LWZpbmlzaGVkCg== | base64 -D

#!/bin/bash
groupadd docker
usermod -aG docker ubuntu
curl --retry 3 https://releases.rancher.com/install-docker/20.10.sh | sh
docker run -d --privileged --restart=unless-stopped --net=host -v /etc/kubernetes:/etc/kubernetes -v /var/run:/var/run rancher/rancher-agent:v2.5.5 --server https://my-rancher.com --token xxxxxx --worker
```
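
(Note `base64 -D` is the macOS flag; on Linux use `base64 -d`.) To splice an edited script back in, one approach is a small `jq` one-liner; a sketch, assuming the edited script is saved as `my-cloud-init.sh` (a hypothetical file name):

```bash
# Sketch: re-encode the edited cloud-init script and update the
# user_data field of instance-details.json in place. Requires jq.
ENCODED=$(base64 < my-cloud-init.sh | tr -d '\n')
jq --arg ud "$ENCODED" '.launchDetails.metadata.user_data = $ud' \
  instance-details.json > instance-details.json.tmp \
  && mv instance-details.json.tmp instance-details.json
```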

```bash
$ oci compute-management instance-configuration create --instance-details file://./cluster-autoscaler/cloudprovider/oci/examples/instance-details.json --compartment-id ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf --query 'data.id' --raw-output

ocid1.instanceconfiguration.oc1.phx.aaaaaaaa3neul67zb3goz43lybosc2o3fv67gj3zazexbb3vfcbypmpznhtq

$ oci compute-management instance-pool create --compartment-id ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf --instance-configuration-id ocid1.instanceconfiguration.oc1.phx.aaaaaaaa3neul67zb3goz43lybosc2o3fv67gj3zazexbb3vfcbypmpznhtq --placement-configurations file://./cluster-autoscaler/cloudprovider/oci/examples/placement-config.json --size 0 --wait-for-state RUNNING --query 'data.id' --raw-output

Action completed. Waiting until the resource has entered state: ('RUNNING',)
ocid1.instancepool.oc1.phx.aaaaaaaayd5bxwrzomzr2b2enchm4mof7uhw7do5hc2afkhks576syikk2ca
```
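
Once the pool exists, you can confirm its instance membership (initially empty, given `--size 0`); a sketch, reusing the example OCIDs from above:

```bash
# Sketch: list the instances currently in the pool. The OCIDs are the
# example values from the commands above; substitute your own.
oci compute-management instance-pool list-instances \
  --compartment-id ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf \
  --instance-pool-id ocid1.instancepool.oc1.phx.aaaaaaaayd5bxwrzomzr2b2enchm4mof7uhw7do5hc2afkhks576syikk2ca
```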
## Configure Autoscaler

Use the `--nodes=<min-nodes>:<max-nodes>:<instancepool-ocid>` parameter to specify which pre-existing instance
pools to target for automatic expansion and contraction, the minimum and maximum sizes for each pool, and how you
want the autoscaling to take place, where:

- `<min-nodes>` is the minimum number of nodes allowed in the instance-pool.
- `<max-nodes>` is the maximum number of nodes allowed in the instance-pool. Make sure the maximum number of nodes you specify does not exceed the tenancy limits for the node shape defined for the pool.
- `<instancepool-ocid>` is the OCID of a pre-existing instance-pool.

Instance pools not referenced in the configuration are not managed by the cluster-autoscaler.

If you are authenticating via instance principals, be sure the `OCI_REGION` environment variable is set to the correct
value in the deployment, e.g.:
```yaml
env:
- name: OCI_REGION
value: "us-phoenix-1"
```
### Optional cloud-config file

An _optional_ cloud-config file can be mounted in the container at the path specified by `--cloud-config`.

The following example `cloud-config` file configures the cluster-autoscaler to authenticate via instance principals and to manage only the configured instance-pools in a single compartment:
```ini
[Global]
compartment-id = ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf
region = uk-london-1
use-instance-principals = true
```
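
A sketch of wiring this file into the deployment; the volume name, ConfigMap name, and mount path below are illustrative, not taken from the bundled examples:

```yaml
# Sketch: mount a cloud-config file from a ConfigMap and point the
# autoscaler at it via --cloud-config. Names and paths are placeholders.
containers:
  - name: cluster-autoscaler
    command:
      - ./cluster-autoscaler
      - --cloud-provider=oci
      - --cloud-config=/etc/oci/cloud-config
    volumeMounts:
      - name: oci-cloud-config
        mountPath: /etc/oci
        readOnly: true
volumes:
  - name: oci-cloud-config
    configMap:
      name: oci-cloud-config
```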
### Environment variables

Configuration via environment variables:

- `OCI_USE_INSTANCE_PRINCIPAL` - Whether to use instance principals for authentication rather than expecting an OCI config file to be mounted in the container. Defaults to false.
- `OCI_REGION` - **Required** when using instance principals, e.g. `OCI_REGION=us-phoenix-1`. See the [region list](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/regions.htm) for identifiers.
- `OCI_COMPARTMENT_ID` - Restrict the cluster-autoscaler to instance-pools in a single compartment. When unset, the cluster-autoscaler manages each specified instance-pool no matter which compartment it is in.
- `OCI_REFRESH_INTERVAL` - Optional refresh interval used to sync the internal cache with the OCI API. Defaults to `2m`.
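
A sketch of setting these in the container spec of the deployment (the values below are illustrative):

```yaml
# Sketch: environment-variable configuration for the cluster-autoscaler
# container. All values below are placeholders.
env:
  - name: OCI_USE_INSTANCE_PRINCIPAL
    value: "true"
  - name: OCI_REGION
    value: "us-phoenix-1"
  - name: OCI_COMPARTMENT_ID
    value: "ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf"
```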
## Deployment

### Create OCI config secret (only if _not_ using Instance Principals)

If you are opting for a file-based OCI configuration (as opposed to instance principals), the OCI config file and private key need to be mounted into the container filesystem using a secret volume.

The following policy is required when the specified user is not an administrator:
```
Allow group acme-oci-cluster-autoscaler-user-grp to manage instance-pools in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to manage instance-configurations in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to manage instance-family in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to use subnets in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to read virtual-network-family in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to use vnics in compartment <compartment-name>
Allow group acme-oci-cluster-autoscaler-user-grp to inspect compartments in compartment <compartment-name>
```
Example OCI config file (note `key_file` is the expected path and filename of the OCI API private-key from the perspective of the container):
```bash
$ cat ~/.oci/config
[DEFAULT]
user=ocid1.user.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
fingerprint=xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx:xx
key_file=/root/.oci/api_key.pem
tenancy=ocid1.tenancy.oc1..aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
pass_phrase=
region=us-phoenix-1
```
Create the secret (`api_key.pem` key name is required):
```bash
kubectl create secret generic oci-config -n kube-system --from-file=/Users/me/.oci/config --from-file=api_key.pem=/Users/me/.oci/my_api_key.pem
```
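
The file-based example deployment mounts this secret for you; for reference, a minimal sketch of the volume wiring it relies on (the volume name is illustrative; the mount path must match `key_file` in the config above):

```yaml
# Sketch: mount the oci-config secret so the container sees
# /root/.oci/config and /root/.oci/api_key.pem.
containers:
  - name: cluster-autoscaler
    volumeMounts:
      - name: oci-config-vol
        mountPath: /root/.oci
        readOnly: true
volumes:
  - name: oci-config-vol
    secret:
      secretName: oci-config
```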
### Example Deployment

Two example deployments of the cluster-autoscaler that manage instance-pools are located in the [examples](./examples/) directory.
[oci-ip-cluster-autoscaler-w-principals.yaml](./examples/oci-ip-cluster-autoscaler-w-principals.yaml) uses
instance principals, and [oci-ip-cluster-autoscaler-w-config.yaml](./examples/oci-ip-cluster-autoscaler-w-config.yaml) uses file-based
authentication.

Note the three specified instance-pools are intended to correspond to different availability domains in the Phoenix (`us-phoenix-1`) region:
```yaml
...
containers:
- image: docker.io/jlamillan/autoscaler:oci-pr-rc6
name: cluster-autoscaler
command:
- ./cluster-autoscaler
- --cloud-provider=oci
- --nodes=1:10:ocid1.instancepool.oc1.phx.aaaaaaaaqdxy35acq32zjfvkybjmvlbdgj6q3m55qkwwctxhsprmz633k62q
- --nodes=0:10:ocid1.instancepool.oc1.phx.aaaaaaaazldzcu4mi5spz56upbtwnsynz2nk6jvmx7zi4hsta4uggxbulbua
- --nodes=0:20:ocid1.instancepool.oc1.phx.aaaaaaaal3jhoc32ljsfaeif4x2ssfa2a63oehjgqryiueivieee6yaqbkia
```
For the instance-principal based deployment, substitute the OCIDs of _your_ instance pool(s) and set the `OCI_REGION` environment
variable to the region where your instance pool(s) reside before applying:

```
kubectl apply -f ./cloudprovider/oci/examples/oci-ip-cluster-autoscaler-w-principals.yaml
```

OCI config file based authentication deployment:

```
kubectl apply -f ./cloudprovider/oci/examples/oci-ip-cluster-autoscaler-w-config.yaml
```
## Common Notes and Gotchas:

- You must configure new compute instances (via their instance configuration) to join the existing cluster when they start. This can
be accomplished with `cloud-init` / `user-data` in the instance launch configuration ([example](./examples/instance-details.json)).
- If opting for a file-based OCI configuration (as opposed to instance principals), ensure the OCI config and private-key
PEM files are mounted into the container filesystem at the [expected path](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/sdkconfig.htm). Note the `key_file` option in the example `~/.oci/config` above references a private-key file mounted into the container by the example [volumeMount](./examples/oci-ip-cluster-autoscaler-w-config.yaml#L165).
- Make sure the maximum number of nodes you specify does not exceed the limit for the instance-pool or the tenancy.
- We recommend creating multiple instance-pools, each with a single availability domain specified, so new nodes can be created to meet
affinity requirements across availability domains.
- If you are authenticating via instance principals, be sure the `OCI_REGION` environment variable is set to the correct
value in the deployment.
- The cluster-autoscaler will not automatically remove scaled-down (terminated) `Node` objects from the Kubernetes API
without assistance from the [OCI Cloud Controller Manager](https://github.com/oracle/oci-cloud-controller-manager) (CCM).
If scaled-down nodes are lingering in your cluster in the `NotReady` status, ensure the OCI CCM is installed and running
correctly (`oci-cloud-controller-manager`); see the check sketched after this list.
- Avoid manually changing node pools that are managed by the cluster-autoscaler. For example, do not add or remove nodes
using kubectl, or using the Console (or the Oracle Cloud Infrastructure CLI or API).
- `--node-group-auto-discovery` and `--node-autoprovisioning-enabled=true` are not supported.
- We set a `nvidia.com/gpu:NoSchedule` taint on nodes in a GPU-enabled instance-pool.
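
A quick way to check the CCM mentioned above is to look for its pods; a sketch, assuming it runs in `kube-system` with a `component` label (both the namespace and the label selector are assumptions; adjust for your install):

```bash
# Sketch: verify the OCI Cloud Controller Manager is running.
# The namespace and label selector are assumptions about your install.
kubectl -n kube-system get pods -l component=oci-cloud-controller-manager
```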
## Helpful links
- [Oracle Cloud Infrastructure home](https://cloud.oracle.com)
- [OCI instance configuration documentation](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/creatinginstanceconfig.htm)
- [instance principals](https://docs.oracle.com/en-us/iaas/Content/Identity/Tasks/callingservicesfrominstances.htm)
- [OCI Cloud Controller Manager](https://github.com/oracle/oci-cloud-controller-manager)
- [OCI Container Storage Interface driver](https://github.com/oracle/oci-cloud-controller-manager/blob/master/container-storage-interface.md)
cluster-autoscaler/cloudprovider/oci/examples/instance-details.json
@@ -0,0 +1,19 @@
{
"instanceType": "compute",
"launchDetails": {
"compartmentId": "ocid1.compartment.oc1..aaaaaaaa7ey4sg3a6b5wnv5hlkjlkjadslkfjalskfjalsadfadsf",
"shape": "VM.Standard2.8",
"sourceDetails":
{
"imageId": "ocid1.image.oc1.phx.aaaaaaaa55tzajot4gbiw2p7gquwjnvfzrasosbrq4h6wywkff4zjosp2fia",
"sourceType": "image",
"bootVolumeSizeInGBs": 100
},
"metadata": {
"user_data": "IyEvYmluL2Jhc2gKdG91Y2ggL3RtcC9jbG91ZC1pbml0LXN0YXJ0ZWQKaXB0YWJsZXMgLUYKZ3JvdXBhZGQgZG9ja2VyCnVzZXJtb2QgLWFHIGRvY2tlciB1YnVudHUKcm0gL3Zhci9saWIvYXB0L2xpc3RzL2xvY2sKcGtpbGwgLTkgLWYgYXB0CmN1cmwgLS1yZXRyeSAzIGh0dHBzOi8vcmVsZWFzZXMucmFuY2hlci5jb20vaW5zdGFsbC1kb2NrZXIvMjAuMTAuc2ggfCBzaApkb2NrZXIgcnVuIC1kIC0tcHJpdmlsZWdlZCAtLXJlc3RhcnQ9dW5sZXNzLXN0b3BwZWQgLS1uZXQ9aG9zdCAtdiAvZXRjL2t1YmVybmV0ZXM6L2V0Yy9rdWJlcm5ldGVzIC12IC92YXIvcnVuOi92YXIvcnVuIHJhbmNoZXIvcmFuY2hlci1hZ2VudDp2Mi41LjUgLS1zZXJ2ZXIgaHR0cHM6Ly9teS1yYW5jaGVyLmNvbSAtLXRva2VuIHh4eHh4eCAgLS13b3JrZXIKdG91Y2ggL3RtcC9jbG91ZC1pbml0LWZpbmlzaGVkCg=="
},
"createVnicDetails": {
"assignPublicIp": true
}
}
}