
[autoscaler] Kubernetes autoscaler backend #5492

Merged · 48 commits · Oct 3, 2019
Changes from 1 commit
Commits (48)
86d9c6f
Add Kubernetes NodeProvider to autoscaler
edoakes Aug 12, 2019
be1b96b
Split off SSHCommandRunner
edoakes Aug 19, 2019
ffe00a6
Add KubernetesCommandRunner
edoakes Aug 20, 2019
0915df9
Cleanup
edoakes Aug 22, 2019
4558a76
More config options
edoakes Aug 22, 2019
15d1856
Check if auth present
edoakes Aug 22, 2019
8ed1ed9
More auth checks
edoakes Aug 22, 2019
8ec5f23
Better output
edoakes Aug 22, 2019
3c454c2
Always bootstrap config
edoakes Aug 22, 2019
56e3bf0
All working
edoakes Aug 23, 2019
4379355
Add k8s-rsync comment
edoakes Aug 23, 2019
44327bb
Clean up manual k8s examples
edoakes Aug 23, 2019
2d6fcc3
Fix up submit.yaml
edoakes Aug 23, 2019
9c98da7
Automatically configure permissions
edoakes Aug 24, 2019
a7265ad
Fix get_node_provider arg
edoakes Aug 24, 2019
3174aa1
Fix permissions
edoakes Aug 24, 2019
aa13c22
Fill in empty auth
edoakes Aug 26, 2019
158876f
Merge branch 'master' into k8s
edoakes Aug 26, 2019
6ff3c2a
Remove ray-cluster from this PR
edoakes Aug 26, 2019
c218965
No hard dep on kubernetes library
edoakes Aug 26, 2019
f904982
Move permissions into autoscaler config
edoakes Aug 26, 2019
2452e1b
lint
edoakes Aug 26, 2019
a4e16d6
Fix indentation
edoakes Aug 26, 2019
e13f4a1
namespace validation
edoakes Aug 28, 2019
0fc7a0f
Use cluster name tag
edoakes Aug 28, 2019
9dd310f
Remove kubernetes from setup.py
edoakes Aug 28, 2019
da0ad01
Comment in example configs
edoakes Aug 28, 2019
34b6592
Same default autoscaling config as aws
edoakes Aug 28, 2019
8a0bb24
Add Kubernetes quickstart
edoakes Aug 28, 2019
6df676c
lint
edoakes Aug 28, 2019
15d3573
Revert changes to submit.yaml (other PR)
edoakes Aug 28, 2019
c610f34
Install kubernetes in travis
edoakes Aug 29, 2019
0cc97d6
address comments
edoakes Aug 29, 2019
e1c6fa2
Improve autoscaling doc
edoakes Aug 29, 2019
3bb44d6
Merge remote-tracking branch 'upstream/master' into k8s
edoakes Aug 29, 2019
2e449ea
kubectl command in setup
edoakes Sep 3, 2019
8fa6126
Force use_internal_ips
edoakes Sep 3, 2019
6559635
Merge remote-tracking branch 'upstream/master' into k8s
edoakes Sep 3, 2019
3fdb850
comments
edoakes Sep 3, 2019
c01d471
backend env in docs
edoakes Sep 4, 2019
670d819
Merge remote-tracking branch 'upstream/master' into k8s
edoakes Sep 4, 2019
86d5cc4
Change namespace config
edoakes Sep 4, 2019
1b80017
comments
edoakes Sep 18, 2019
16adb26
Merge remote-tracking branch 'upstream/master' into k8s
edoakes Sep 18, 2019
e88ec5f
comments
edoakes Sep 18, 2019
fee41e4
Merge branch 'master' into k8s
edoakes Sep 20, 2019
734320a
Merge branch 'master' into k8s
edoakes Oct 2, 2019
b29d172
Fix yaml test
edoakes Oct 3, 2019
Add KubernetesCommandRunner
edoakes committed Aug 20, 2019
commit ffe00a69524617513c05fa2aa5d90a38c7d3a5a0
10 changes: 10 additions & 0 deletions kubernetes/autoscaler-role.yaml
@@ -0,0 +1,10 @@
# TODO: change to Role and only ray namespace
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  namespace: ray
  name: autoscaler
rules:
- apiGroups: [""]
  resources: ["pods", "pods/status", "pods/exec"]
  verbs: ["get", "watch", "list", "create", "delete", "update", "patch"]
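
This ClusterRole grants the autoscaler the pod verbs it needs, but it only takes effect once bound to the service account the head pod runs under. A minimal sketch of that binding with the kubernetes Python client follows; the binding name and the "default" service account are assumptions for illustration, not part of this commit:

# Hypothetical sketch: bind the "autoscaler" ClusterRole above to the
# "default" service account in the "ray" namespace. Names are assumptions.
from kubernetes import client, config

config.load_kube_config()
rbac = client.RbacAuthorizationV1Api()
binding = client.V1ClusterRoleBinding(
    metadata=client.V1ObjectMeta(name="autoscaler"),
    role_ref=client.V1RoleRef(
        api_group="rbac.authorization.k8s.io",
        kind="ClusterRole",
        name="autoscaler"),
    subjects=[client.V1Subject(
        kind="ServiceAccount", name="default", namespace="ray")])
rbac.create_cluster_role_binding(binding)
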
12 changes: 0 additions & 12 deletions kubernetes/aws-auth-cm.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion kubernetes/ray-pod.yaml
@@ -20,7 +20,7 @@ spec:
      - name: ray-worker
        image: eoakes/ray-test
        command: ["/bin/bash", "-c", "--"]
-       args: ["echo \"export KUBERNETES_SERVICE_HOST=$KUBERNETES_SERVICE_HOST\" >> /root/.bashrc && echo \"export KUBERNETES_SERVICE_PORT=$KUBERNETES_SERVICE_PORT\" >> /root/.bashrc && cd ray && git fetch && git checkout k8s && git reset --hard origin/k8s && cd .. && apt-get install -y rsync && pip3 install kubernetes && service ssh start && trap : TERM INT; sleep infinity & wait;"]
+       args: ["cd ray && git fetch && git checkout k8s && git reset --hard origin/k8s && cd .. && apt-get install -y rsync && pip3 install kubernetes && trap : TERM INT; sleep infinity & wait;"]
        ports:
        - containerPort: 12345
        - containerPort: 12346
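
The deleted args show why this commit exists: the pod previously started sshd and exported the Kubernetes service variables so an SSH-based runner could reach it. With a KubernetesCommandRunner, commands go through the exec API instead. A minimal sketch of what such a runner reduces to, assuming kubectl is on PATH (the pod name, namespace, and command are illustrative):

# Minimal sketch of running a command in a pod over kubectl exec instead of
# SSH; pod name, namespace, and command are illustrative only.
import subprocess

def run_in_pod(pod, namespace, cmd):
    # Equivalent in spirit to an SSH runner's run(), but via the exec API.
    subprocess.check_call(
        ["kubectl", "-n", namespace, "exec", pod, "--",
         "/bin/bash", "-c", cmd])

run_in_pod("ray-head-example", "ray", "uptime")
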
9 changes: 3 additions & 6 deletions python/ray/autoscaler/autoscaler.py
@@ -84,8 +84,9 @@
    # How Ray will authenticate with newly launched nodes.
    "auth": (
        {
-           "ssh_user": (str, REQUIRED),  # e.g. ubuntu
+           "ssh_user": (str, OPTIONAL),  # e.g. ubuntu
            "ssh_private_key": (str, OPTIONAL),
+           "kubernetes_config": (dict, OPTIONAL),
        },
        REQUIRED),

@@ -806,11 +807,7 @@ def with_head_node_ip(cmds):
    head_ip = services.get_node_ip_address()
    out = []
    for cmd in cmds:
-       out.append("export RAY_HEAD_IP={}; " +
-                  "export KUBERNETES_SERVICE_HOST={}; " +
-                  "export KUBERNETES_SERVICE_PORT={} {}".format(
-                      head_ip, os.environ["KUBERNETES_SERVICE_HOST"],
-                      os.environ["KUBERNETES_SERVICE_PORT"], cmd))
+       out.append("export RAY_HEAD_IP={}; {}".format(head_ip, cmd))
    return out


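
Worth noting: the deleted version concatenated three strings with + but called .format() only on the last one, so the first two placeholders were never filled. The replacement sidesteps that entirely. A standalone sketch of the simplified helper (head_ip is hardcoded here for illustration; the real code gets it from ray.services):

# Standalone sketch of the simplified helper; head_ip is a made-up value.
def with_head_node_ip(cmds, head_ip="10.0.0.1"):
    return ["export RAY_HEAD_IP={}; {}".format(head_ip, cmd) for cmd in cmds]

print(with_head_node_ip(["ray stop"]))
# ['export RAY_HEAD_IP=10.0.0.1; ray stop']
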
10 changes: 7 additions & 3 deletions python/ray/autoscaler/commands.py
@@ -211,9 +211,10 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
    logger.info("get_or_create_head_node: Updating files on head node...")

    # Rewrite the auth config so that the head node can update the workers
-   remote_key_path = "~/ray_bootstrap_key.pem"
    remote_config = copy.deepcopy(config)
-   remote_config["auth"]["ssh_private_key"] = remote_key_path
+   if config["provider"]["type"] != "kubernetes":
+       remote_key_path = "~/ray_bootstrap_key.pem"
+       remote_config["auth"]["ssh_private_key"] = remote_key_path

    # Adjust for new file locations
    new_mounts = {}
@@ -228,9 +229,12 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
        remote_config_file.write(json.dumps(remote_config))
        remote_config_file.flush()
        config["file_mounts"].update({
-           remote_key_path: config["auth"]["ssh_private_key"],
            "~/ray_bootstrap_config.yaml": remote_config_file.name
        })
+       if config["provider"]["type"] != "kubernetes":
+           config["file_mounts"].update({
+               remote_key_path: config["auth"]["ssh_private_key"],
+           })

    if restart_only:
        init_commands = config["head_start_ray_commands"]
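
The guard keeps SSH bootstrap material out of Kubernetes clusters, where pods are reached through the exec API and there is no private key to rewrite or mount. A condensed sketch of the resulting flow, with an illustrative config dict standing in for a real cluster config:

# Condensed sketch of the provider check above; the config dict is made up.
import copy

config = {
    "provider": {"type": "kubernetes"},
    "auth": {},
    "file_mounts": {},
}
remote_config = copy.deepcopy(config)
if config["provider"]["type"] != "kubernetes":
    # Only non-Kubernetes providers ship the bootstrap key to the head node.
    remote_key_path = "~/ray_bootstrap_key.pem"
    remote_config["auth"]["ssh_private_key"] = remote_key_path
    config["file_mounts"][remote_key_path] = config["auth"]["ssh_private_key"]
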
1 change: 1 addition & 0 deletions python/ray/autoscaler/kubernetes/__init__.py
@@ -0,0 +1 @@
RAY_NAMESPACE = "ray"
18 changes: 7 additions & 11 deletions python/ray/autoscaler/kubernetes/example-full.yaml
@@ -7,7 +7,7 @@ min_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
-max_workers: 5
+max_workers: 10

# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
@@ -17,7 +17,7 @@ initial_workers: 1
# Whether or not to autoscale aggressively. If this is enabled, if at any point
# we would start more workers, we start at least enough to bring us to
# initial_workers.
-autoscaling_mode: default
+autoscaling_mode: aggressive

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
@@ -42,14 +42,11 @@ provider:
    type: kubernetes
    use_internal_ips: true

-# TODO
# How Ray will authenticate with newly launched nodes.
auth:
    ssh_user: root
-   # TODO - does this work in k8s?
-   # By default Ray creates a new private keypair, but you can also use your own.
-   # If you do so, make sure to also set "KeyName" in the head and worker node
-   # configurations below.
-   ssh_private_key: /root/.ssh/id_rsa
+   ssh_private_key: ~/.ssh/id_rsa

# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
@@ -67,8 +64,7 @@ worker_nodes:
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
"/test": "/tmp/test",
}

# List of commands that will be run before `setup_commands`. If docker is
@@ -88,9 +84,9 @@ worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
    - ray stop
-   - ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+   - ulimit -n 65536; ray start --head --num-cpus=1 --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
    - ray stop
-   - ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+   - ulimit -n 65536; ray start --num-cpus=1 --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
21 changes: 21 additions & 0 deletions python/ray/autoscaler/kubernetes/k8s-rsync.sh
@@ -0,0 +1,21 @@
#!/bin/bash

if [ -z "$KRSYNC_STARTED" ]; then
    export KRSYNC_STARTED=true
    exec rsync --blocking-io --rsh "$0" "$@"
fi

# Running as --rsh
namespace=''
pod=$1
shift

# If the user uses pod@namespace, rsync passes args as: {us} -l pod namespace ...
if [ "X$pod" = "X-l" ]; then
    pod=$1
    shift
    namespace="-n $1"
    shift
fi

exec kubectl $namespace exec -i "$pod" -- "$@"
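
The script is its own remote shell: the first invocation re-execs rsync with the script as --rsh, and the second invocation (with KRSYNC_STARTED set) translates rsync's remote-shell call into kubectl exec. A usage sketch, driven from Python for consistency with the other examples (the pod name and paths are made up):

# Hypothetical invocation of k8s-rsync.sh: sync a local directory into a
# pod using pod@namespace addressing. Pod name and paths are made up.
import subprocess

subprocess.check_call([
    "./k8s-rsync.sh", "-avz",
    "/tmp/project/",                      # local source
    "ray-worker-example@ray:/project/",   # pod@namespace:remote-path
])
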
46 changes: 26 additions & 20 deletions python/ray/autoscaler/kubernetes/node_provider.py
@@ -5,21 +5,31 @@
import logging

from kubernetes import client, config
+from kubernetes.config.config_exception import ConfigException

from ray.autoscaler.node_provider import NodeProvider
-# from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
+from ray.autoscaler.kubernetes import RAY_NAMESPACE
from ray.autoscaler.kubernetes.pod import default_pod_config

-TAG_RAY_CLUSTER_NAME = "ray"

logger = logging.getLogger(__name__)


+def to_label_selector(tags):
+    label_selector = ""
+    for k, v in tags.items():
+        if label_selector != "":
+            label_selector += ","
+        label_selector += "{}={}".format(k, v)
+    return label_selector


class KubernetesNodeProvider(NodeProvider):
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
-       # config.load_kube_config()
-       config.load_incluster_config()
+       try:
+           config.load_incluster_config()
+       except ConfigException:
+           config.load_kube_config()
        self.core_api = client.CoreV1Api()
        self.pod_spec = default_pod_config

@@ -29,46 +39,42 @@ def non_terminated_nodes(self, tag_filters):
        # have to match on NOT any of the other phases.
        field_selector = ",".join([
            "status.phase!=Failed", "status.phase!=Unknown",
-           "status.phase!=Succeeded"
+           "status.phase!=Succeeded",
        ])
-       label_selector = ""
-       for k, v in tag_filters.items():
-           if label_selector != "":
-               label_selector += ","
-           label_selector += "{}={}".format(k, v)

+       label_selector = to_label_selector(tag_filters)
        pod_list = self.core_api.list_namespaced_pod(
-           TAG_RAY_CLUSTER_NAME,
+           RAY_NAMESPACE,
            field_selector=field_selector,
            label_selector=label_selector)
        return [pod.metadata.name for pod in pod_list.items]

    def is_running(self, node_id):
        pod = self.core_api.read_namespaced_pod_status(node_id,
-                                                      TAG_RAY_CLUSTER_NAME)
+                                                      RAY_NAMESPACE)
        return pod.status.phase == "Running"

    def is_terminated(self, node_id):
        pod = self.core_api.read_namespaced_pod_status(node_id,
-                                                      TAG_RAY_CLUSTER_NAME)
-       return pod.status.phase == ["Running", "Pending"]
+                                                      RAY_NAMESPACE)
+       return pod.status.phase not in ["Running", "Pending"]

    def node_tags(self, node_id):
        pod = self.core_api.read_namespaced_pod_status(node_id,
-                                                      TAG_RAY_CLUSTER_NAME)
+                                                      RAY_NAMESPACE)
        return pod.metadata.labels

    def external_ip(self, node_id):
        raise NotImplementedError("No external IPs for kubernetes pods")

    def internal_ip(self, node_id):
        pod = self.core_api.read_namespaced_pod_status(node_id,
-                                                      TAG_RAY_CLUSTER_NAME)
+                                                      RAY_NAMESPACE)
        return pod.status.pod_ip

    def set_node_tags(self, node_id, tags):
        body = {"metadata": {"labels": tags}}
-       self.core_api.patch_namespaced_pod(node_id, TAG_RAY_CLUSTER_NAME, body)
+       self.core_api.patch_namespaced_pod(node_id, RAY_NAMESPACE, body)

    def create_node(self, node_config, tags, count):
        conf = node_config.copy()
@@ -91,10 +97,10 @@ def create_node(self, node_config, tags, count):
        logger.info("NodeProvider: calling create_namespaced_pod "
                    "(count={}).".format(count))
        for _ in range(count):
-           self.core_api.create_namespaced_pod(TAG_RAY_CLUSTER_NAME, body)
+           self.core_api.create_namespaced_pod(RAY_NAMESPACE, body)

    def terminate_node(self, node_id):
-       self.core_api.delete_namespaced_pod(node_id, TAG_RAY_CLUSTER_NAME)
+       self.core_api.delete_namespaced_pod(node_id, RAY_NAMESPACE)

    def terminate_nodes(self, node_ids):
        for node_id in node_ids:
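
One fix here is easy to miss in the rename noise: the old is_terminated compared a string to a list with ==, which is False for every phase, so pods were never reported as terminated. A two-line demonstration:

# Why the is_terminated fix matters: "==" against a list is always False.
phase = "Succeeded"
print(phase == ["Running", "Pending"])      # False for any phase (old code)
print(phase not in ["Running", "Pending"])  # True only once the pod is done
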
2 changes: 1 addition & 1 deletion python/ray/autoscaler/kubernetes/pod.py
@@ -23,7 +23,7 @@
"image": "eoakes/ray-test",
"command": ["/bin/bash", "-c", "--"],
"args": [
"echo \"export KUBERNETES_SERVICE_HOST=$KUBERNETES_SERVICE_HOST\" >> /root/.bashrc && echo \"export KUBERNETES_SERVICE_PORT=$KUBERNETES_SERVICE_PORT\" >> /root/.bashrc && cd ray && git fetch && git checkout k8s && git reset --hard origin/k8s && cd .. && apt-get install -y rsync && pip3 install kubernetes && service ssh start && trap : TERM INT; sleep infinity & wait;"
"cd ray && git fetch && git checkout k8s && git reset --hard origin/k8s && cd .. && apt-get install -y rsync && trap : TERM INT; sleep infinity & wait;"
],
"ports": [{
"containerPort": 12345