Skip to content

Add readiness/liveness probes to k8s CaaS resources #2187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
May 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
634b17e
Add probe validation
RobertLucian May 19, 2021
efaa2cb
API spec validation for the probes
RobertLucian May 20, 2021
d513e4b
Glueing the probes with the task/realtime deploys
RobertLucian May 20, 2021
ac48fa7
Move encoding to pkg/lib/k8s
RobertLucian May 20, 2021
79446bf
Create job configmaps for Task API
RobertLucian May 20, 2021
1f397f1
Add Task job from configmap
RobertLucian May 20, 2021
56ec14b
Add job/probes spec to Batch API
RobertLucian May 20, 2021
6520dac
Remove image downloader (for batch too)
RobertLucian May 20, 2021
2c84224
Add comments on how to configure the volume/mount
RobertLucian May 20, 2021
591ebf3
Address some PR comments
RobertLucian May 21, 2021
198a17a
Address some PR comments
RobertLucian May 21, 2021
6512274
Address some more PR comments
RobertLucian May 21, 2021
91e9040
Configmap config generator
RobertLucian May 21, 2021
5ad562d
Convert UserPodContainers to a private function
RobertLucian May 21, 2021
f7f5c6a
Add configmap to async workloads
RobertLucian May 21, 2021
e6df5f0
Some fixes
RobertLucian May 21, 2021
752f6a7
Add configmaps to the list of allowed resources
RobertLucian May 21, 2021
0947333
Address PR comments
RobertLucian May 21, 2021
59dcbb6
Fix bug from merge conflict
RobertLucian May 21, 2021
57712ef
Add configmaps resource as kubebuilder annotation
RobertLucian May 22, 2021
1145fd0
Nits
deliahu May 22, 2021
46ec973
Address nits
RobertLucian May 24, 2021
2397b17
Separate verbs for configmap resource
RobertLucian May 24, 2021
9f5291a
Fixes
RobertLucian May 24, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion build/images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ api_images=(
)

dev_images=(
"downloader"
"manager"
"proxy"
"async-gateway"
Expand Down
23 changes: 2 additions & 21 deletions cmd/proxy/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package main
import (
"context"
"flag"
"io/ioutil"
"net/http"
"os"
"os/signal"
Expand Down Expand Up @@ -49,7 +48,6 @@ func main() {
userContainerPort int
maxConcurrency int
maxQueueLength int
probeDefPath string
clusterConfigPath string
)

Expand All @@ -59,7 +57,6 @@ func main() {
flag.IntVar(&maxConcurrency, "max-concurrency", 0, "max concurrency allowed for user container")
flag.IntVar(&maxQueueLength, "max-queue-length", 0, "max request queue length for user container")
flag.StringVar(&clusterConfigPath, "cluster-config", "", "cluster config path")
flag.StringVar(&probeDefPath, "probe", "", "path to the desired probe json definition")
flag.Parse()

log := logging.GetLogger()
Expand Down Expand Up @@ -119,23 +116,7 @@ func main() {
)

promStats := proxy.NewPrometheusStatsReporter()

var readinessProbe *probe.Probe
if probeDefPath != "" {
jsonProbe, err := ioutil.ReadFile(probeDefPath)
if err != nil {
log.Fatal(err)
}

probeDef, err := probe.DecodeJSON(string(jsonProbe))
if err != nil {
log.Fatal(err)
}

readinessProbe = probe.NewProbe(probeDef, log)
} else {
readinessProbe = probe.NewDefaultProbe(target, log)
}
readinessProbe := probe.NewDefaultProbe(target, log)

go func() {
reportTicker := time.NewTicker(_reportInterval)
Expand Down Expand Up @@ -165,7 +146,7 @@ func main() {

servers := map[string]*http.Server{
"proxy": {
Addr: ":" + strconv.Itoa(userContainerPort),
Addr: ":" + strconv.Itoa(port),
Handler: proxy.Handler(breaker, httpProxy),
},
"admin": {
Expand Down
1 change: 0 additions & 1 deletion docs/clusters/management/create.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ The docker images used by the cluster can also be overridden. They can be config
image_operator: quay.io/cortexlabs/operator:master
image_controller_manager: quay.io/cortexlabs/controller-manager:master
image_manager: quay.io/cortexlabs/manager:master
image_downloader: quay.io/cortexlabs/downloader:master
image_proxy: quay.io/cortexlabs/proxy:master
image_async_gateway: quay.io/cortexlabs/async-gateway:master
image_cluster_autoscaler: quay.io/cortexlabs/cluster-autoscaler:master
Expand Down
26 changes: 0 additions & 26 deletions images/downloader/Dockerfile

This file was deleted.

63 changes: 0 additions & 63 deletions manager/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ function main() {
function cluster_up() {
create_eks

start_pre_download_images

echo -n "○ updating cluster configuration "
setup_configmap
echo "✓"
Expand Down Expand Up @@ -76,8 +74,6 @@ function cluster_up() {

validate_cortex

await_pre_download_images

echo -e "\ncortex is ready!"
if [ "$CORTEX_OPERATOR_LOAD_BALANCER_SCHEME" == "internal" ]; then
echo -e "\nnote: you will need to configure VPC Peering to connect to your cluster: https://docs.cortex.dev/v/${CORTEX_VERSION_MINOR}/"
Expand Down Expand Up @@ -324,65 +320,6 @@ function setup_istio() {
output_if_error istio-${ISTIO_VERSION}/bin/istioctl install -f /workspace/istio.yaml
}

# Applies the image-downloader manifests to the cluster: the CPU manifest
# always, and the GPU / Inferentia manifests only when the cluster config lists
# a node group with a matching instance type.
#
# Reads:   CORTEX_DEV_DEFAULT_IMAGE_REGISTRY (optional registry override),
#          CORTEX_VERSION, CORTEX_CLUSTER_NAME, CORTEX_REGION, and
#          /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml
# Exports: the CORTEX_IMAGE_* variables below, which envsubst substitutes into
#          the manifests.
# Note:    every variable assigned here is a shell global (no `local`).
function start_pre_download_images() {
# Default registry, replaced when the dev override is set to a non-empty value.
registry="quay.io/cortexlabs"
if [ -n "$CORTEX_DEV_DEFAULT_IMAGE_REGISTRY" ]; then
registry="$CORTEX_DEV_DEFAULT_IMAGE_REGISTRY"
fi
# Image references are tagged with CORTEX_VERSION; the GPU python handler tag
# additionally carries a "-cuda10.2-cudnn8" suffix.
export CORTEX_IMAGE_PYTHON_HANDLER_CPU="${registry}/python-handler-cpu:${CORTEX_VERSION}"
export CORTEX_IMAGE_PYTHON_HANDLER_GPU="${registry}/python-handler-gpu:${CORTEX_VERSION}-cuda10.2-cudnn8"
export CORTEX_IMAGE_PYTHON_HANDLER_INF="${registry}/python-handler-inf:${CORTEX_VERSION}"
export CORTEX_IMAGE_TENSORFLOW_SERVING_CPU="${registry}/tensorflow-serving-cpu:${CORTEX_VERSION}"
export CORTEX_IMAGE_TENSORFLOW_SERVING_GPU="${registry}/tensorflow-serving-gpu:${CORTEX_VERSION}"
export CORTEX_IMAGE_TENSORFLOW_SERVING_INF="${registry}/tensorflow-serving-inf:${CORTEX_VERSION}"
export CORTEX_IMAGE_TENSORFLOW_HANDLER="${registry}/tensorflow-handler:${CORTEX_VERSION}"

# The CPU manifest is applied unconditionally; kubectl output (including
# errors) is discarded.
envsubst < manifests/image-downloader-cpu.yaml | kubectl apply -f - &>/dev/null

has_gpu="false"
has_inf="false"

# Scan every node group in the cluster config and record which accelerator
# families are present, based on the instance type prefix.
cluster_config_len=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups | yq -r length)
for idx in $(seq 0 $(($cluster_config_len-1))); do
ng_instance_type=$(cat /in/cluster_${CORTEX_CLUSTER_NAME}_${CORTEX_REGION}.yaml | yq -r .node_groups[$idx].instance_type)
# Instance types beginning with "p" or "g" are treated as GPU instances.
if [[ "$ng_instance_type" == p* || "$ng_instance_type" == g* ]]; then
has_gpu="true"
fi
# Instance types beginning with "inf" are treated as Inferentia instances.
if [[ "$ng_instance_type" == inf* ]]; then
has_inf="true"
fi
done

if [ "$has_gpu" == "true" ]; then
envsubst < manifests/image-downloader-gpu.yaml | kubectl apply -f - &>/dev/null
fi

if [ "$has_inf" == "true" ]; then
envsubst < manifests/image-downloader-inf.yaml | kubectl apply -f - &>/dev/null
fi
}

# Waits for each image-downloader daemonset (cpu, gpu, inf) that exists in the
# "default" namespace to report all of its desired pods as ready, then deletes
# that daemonset. Daemonsets that do not exist are skipped.
#
# Prints a progress line: one "." per poll, followed by a check mark.
# Each daemonset is polled every 3 seconds and abandoned after 120 polls
# (about 6 minutes); it is deleted either way.
#
# Note: ds_name, i and printed_dot are shell globals (no `local`), as in the
# rest of this function's original form.
function await_pre_download_images() {
  echo -n "○ downloading docker images "
  printed_dot="false"
  for ds_name in image-downloader-cpu image-downloader-gpu image-downloader-inf; do
    # Check for existence in the same namespace that the readiness poll and the
    # delete below use; otherwise a non-default context namespace would make
    # the daemonset look absent and it would never be awaited or cleaned up.
    if ! kubectl get daemonset "$ds_name" -n=default > /dev/null 2>&1; then
      continue
    fi
    i=0
    # Ready once numberReady equals desiredNumberScheduled.
    until [ "$(kubectl get daemonset "$ds_name" -n=default -o 'jsonpath={.status.numberReady}')" == "$(kubectl get daemonset "$ds_name" -n=default -o 'jsonpath={.status.desiredNumberScheduled}')" ]; do
      if [ "$i" -eq 120 ]; then break; fi # give up after 6 minutes
      echo -n "."
      printed_dot="true"
      ((i=i+1))
      sleep 3
    done
    # Remove the daemonset whether it became ready or the wait timed out.
    kubectl -n=default delete --ignore-not-found=true daemonset "$ds_name" &>/dev/null
  done

  # Separate the check mark from the dots only when dots were printed.
  if [ "$printed_dot" == "true" ]; then echo " ✓"; else echo "✓"; fi
}

function validate_cortex() {
set +e

Expand Down
60 changes: 0 additions & 60 deletions manager/manifests/image-downloader-cpu.yaml

This file was deleted.

49 changes: 0 additions & 49 deletions manager/manifests/image-downloader-gpu.yaml

This file was deleted.

54 changes: 0 additions & 54 deletions manager/manifests/image-downloader-inf.yaml

This file was deleted.

4 changes: 2 additions & 2 deletions manager/manifests/prometheus-monitoring.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ spec:
- path: /metrics
scheme: http
interval: 10s
port: metrics
port: admin
relabelings:
- action: keep
sourceLabels: [ __meta_kubernetes_pod_container_name ]
Expand Down Expand Up @@ -221,7 +221,7 @@ metadata:
spec:
jobLabel: "statsd-exporter"
podMetricsEndpoints:
- port: metrics
- port: admin
scheme: http
path: /metrics
interval: 20s
Expand Down
4 changes: 2 additions & 2 deletions pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ var (
ProxyListeningPortStr = "8888"
ProxyListeningPortInt32 = int32(8888)

MetricsPortStr = "15000"
MetricsPortInt32 = int32(15000)
AdminPortStr = "15000"
AdminPortInt32 = int32(15000)

AuthHeader = "X-Cortex-Authorization"

Expand Down
5 changes: 5 additions & 0 deletions pkg/crds/apis/batch/v1alpha1/batchjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ type BatchJobSpec struct {
// Node groups selector
NodeGroups []string `json:"node_groups"`

// +kubebuilder:validation:Optional
// +nullable
// Readiness probes for the job (container name -> probe)
Probes map[string]kcore.Probe `json:"probes"`

// +kubebuilder:validation:Optional
// Time to live for the resource. The controller will clean-up resources
// that reached a final state when the TTL time is exceeded.
Expand Down
Loading