Skip to content

Commit 2203a92

Browse files
authored
Use a shared async gateway between all async apis (#2380)
1 parent 81845ae commit 2203a92

File tree

19 files changed

+237
-381
lines changed

19 files changed

+237
-381
lines changed

cli/cmd/cluster.go

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -837,6 +837,7 @@ var _clusterHealthCmd = &cobra.Command{
837837
{"prometheus", console.BoolColor(clusterHealth.Prometheus), clusterWarnings.Prometheus},
838838
{"autoscaler", console.BoolColor(clusterHealth.Autoscaler), ""},
839839
{"activator", console.BoolColor(clusterHealth.Activator), ""},
840+
{"async gateway", console.BoolColor(clusterHealth.AsyncGateway), ""},
840841
{"grafana", console.BoolColor(clusterHealth.Grafana), ""},
841842
{"controller manager", console.BoolColor(clusterHealth.ControllerManager), ""},
842843
{"apis gateway", console.BoolColor(clusterHealth.APIsGateway), ""},
@@ -1057,7 +1058,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
10571058
numAPIInstances := len(infoResponse.WorkerNodeInfos)
10581059

10591060
var totalReplicas int
1060-
var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool
1061+
var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveEnqueuers bool
10611062
for _, nodeInfo := range infoResponse.WorkerNodeInfos {
10621063
totalReplicas += nodeInfo.NumReplicas
10631064
if nodeInfo.ComputeUserCapacity.GPU > 0 {
@@ -1066,9 +1067,6 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
10661067
if nodeInfo.ComputeUserCapacity.Inf > 0 {
10671068
doesClusterHaveInfs = true
10681069
}
1069-
if nodeInfo.NumAsyncGatewayReplicas > 0 {
1070-
doesClusterHaveAsyncGateways = true
1071-
}
10721070
if nodeInfo.NumEnqueuerReplicas > 0 {
10731071
doesClusterHaveEnqueuers = true
10741072
}
@@ -1089,7 +1087,6 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
10891087
{Title: "instance type"},
10901088
{Title: "lifecycle"},
10911089
{Title: "replicas"},
1092-
{Title: "async gateway replicas", Hidden: !doesClusterHaveAsyncGateways},
10931090
{Title: "batch enqueuer replicas", Hidden: !doesClusterHaveEnqueuers},
10941091
{Title: "CPU (requested / total allocatable)"},
10951092
{Title: "memory (requested / total allocatable)"},
@@ -1108,7 +1105,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
11081105
memStr := nodeInfo.ComputeUserRequested.Mem.String() + " / " + nodeInfo.ComputeUserCapacity.Mem.String()
11091106
gpuStr := s.Int64(nodeInfo.ComputeUserRequested.GPU) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.GPU)
11101107
infStr := s.Int64(nodeInfo.ComputeUserRequested.Inf) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.Inf)
1111-
rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumAsyncGatewayReplicas, nodeInfo.NumEnqueuerReplicas, cpuStr, memStr, gpuStr, infStr})
1108+
rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumEnqueuerReplicas, cpuStr, memStr, gpuStr, infStr})
11121109
}
11131110

11141111
t := table.Table{

cmd/async-gateway/main.go

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,13 @@ import (
2020
"flag"
2121
"net/http"
2222
"os"
23+
"strings"
2324

2425
gateway "github.com/cortexlabs/cortex/pkg/async-gateway"
2526
"github.com/cortexlabs/cortex/pkg/lib/aws"
2627
"github.com/cortexlabs/cortex/pkg/lib/errors"
2728
"github.com/cortexlabs/cortex/pkg/lib/logging"
2829
"github.com/cortexlabs/cortex/pkg/lib/telemetry"
29-
"github.com/cortexlabs/cortex/pkg/types/clusterconfig"
3030
"github.com/cortexlabs/cortex/pkg/types/userconfig"
3131
"github.com/gorilla/handlers"
3232
"github.com/gorilla/mux"
@@ -37,38 +37,28 @@ const (
3737
_defaultPort = "8080"
3838
)
3939

40-
// usage: ./gateway -bucket <bucket> -region <region> -port <port> -queue queue <apiName>
40+
// usage: ./gateway -bucket <bucket> -cluster-uid <cluster-uid> -port <port>
4141
func main() {
4242
log := logging.GetLogger()
4343
defer func() {
4444
_ = log.Sync()
4545
}()
4646

4747
var (
48-
clusterConfigPath = flag.String("cluster-config", "", "cluster config path")
49-
port = flag.String("port", _defaultPort, "port on which the gateway server runs on")
50-
queueURL = flag.String("queue", "", "SQS queue URL")
48+
bucket = flag.String("bucket", "", "bucket")
49+
clusterUID = flag.String("cluster-uid", "", "cluster uid")
50+
port = flag.String("port", _defaultPort, "port on which the gateway server runs on")
5151
)
5252
flag.Parse()
5353

5454
switch {
55-
case *queueURL == "":
56-
log.Fatal("missing required option: -queue")
57-
case *clusterConfigPath == "":
58-
log.Fatal("missing required option: -cluster-config")
55+
case *bucket == "":
56+
log.Fatal("missing required option: -bucket")
57+
case *clusterUID == "":
58+
log.Fatal("missing required option: -cluster-uid")
5959
}
6060

61-
apiName := flag.Arg(0)
62-
if apiName == "" {
63-
log.Fatal("apiName argument was not provided")
64-
}
65-
66-
clusterConfig, err := clusterconfig.NewForFile(*clusterConfigPath)
67-
if err != nil {
68-
exit(log, err)
69-
}
70-
71-
awsClient, err := aws.NewForRegion(clusterConfig.Region)
61+
awsClient, err := aws.New()
7262
if err != nil {
7363
exit(log, err)
7464
}
@@ -78,8 +68,9 @@ func main() {
7868
exit(log, err)
7969
}
8070

71+
telemetryEnabled := strings.ToLower(os.Getenv("CORTEX_TELEMETRY_DISABLE")) != "true"
8172
err = telemetry.Init(telemetry.Config{
82-
Enabled: clusterConfig.Telemetry,
73+
Enabled: telemetryEnabled,
8374
UserID: userID,
8475
Properties: map[string]string{
8576
"kind": userconfig.AsyncAPIKind.String(),
@@ -95,10 +86,9 @@ func main() {
9586
defer telemetry.Close()
9687

9788
sess := awsClient.Session()
98-
s3Storage := gateway.NewS3(sess, clusterConfig.Bucket)
99-
sqsQueue := gateway.NewSQS(*queueURL, sess)
89+
s3Storage := gateway.NewS3(sess, *bucket)
10090

101-
svc := gateway.NewService(clusterConfig.ClusterUID, apiName, sqsQueue, s3Storage, log)
91+
svc := gateway.NewService(*clusterUID, s3Storage, log, *sess)
10292
ep := gateway.NewEndpoint(svc, log)
10393

10494
router := mux.NewRouter()

cmd/operator/main.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,8 @@ func main() {
6767
apiKind := deployment.Labels["apiKind"]
6868
switch apiKind {
6969
case userconfig.AsyncAPIKind.String():
70-
if deployment.Labels["cortex.dev/async"] == "api" {
71-
if err := asyncapi.UpdateAPIMetricsCron(&deployment); err != nil {
72-
operatorLogger.Fatal(errors.Wrap(err, "init"))
73-
}
70+
if err := asyncapi.UpdateAPIMetricsCron(&deployment); err != nil {
71+
operatorLogger.Fatal(errors.Wrap(err, "init"))
7472
}
7573
}
7674
}

manager/generate_eks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -319,9 +319,9 @@ def generate_eks(
319319
"ami": get_ami(ami_map, "t3.medium"),
320320
"name": "cx-operator",
321321
"instanceType": "t3.medium",
322-
"minSize": 1,
322+
"minSize": 2,
323323
"maxSize": 25,
324-
"desiredCapacity": 1,
324+
"desiredCapacity": 2,
325325
"volumeType": "gp3",
326326
"volumeSize": 20,
327327
"volumeIOPS": 3000,

manager/install.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ function cluster_up() {
5252
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/cluster-autoscaler.yaml.j2 | kubectl apply -f - >/dev/null
5353
echo ""
5454

55+
echo -n "○ configuring async gateway "
56+
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/async-gateway.yaml.j2 | kubectl apply -f - >/dev/null
57+
echo ""
58+
5559
echo -n "○ configuring logging "
5660
python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/fluent-bit.yaml.j2 | kubectl apply -f - >/dev/null
5761
envsubst < manifests/event-exporter.yaml | kubectl apply -f - >/dev/null
manager/manifests/async-gateway.yaml.j2

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# Copyright 2021 Cortex Labs, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v1
16+
kind: ServiceAccount
17+
metadata:
18+
name: async-gateway
19+
namespace: default
20+
---
21+
apiVersion: apps/v1
22+
kind: Deployment
23+
metadata:
24+
name: async-gateway
25+
namespace: default
26+
spec:
27+
selector:
28+
matchLabels:
29+
app: async-gateway
30+
strategy:
31+
rollingUpdate:
32+
maxSurge: 25%
33+
maxUnavailable: 25%
34+
type: RollingUpdate
35+
template:
36+
metadata:
37+
name: async-gateway
38+
labels:
39+
app: async-gateway
40+
spec:
41+
serviceAccountName: async-gateway
42+
containers:
43+
- name: gateway
44+
image: {{ config["image_async_gateway"] }}
45+
imagePullPolicy: Always
46+
args:
47+
- --port
48+
- "8888"
49+
- --cluster-uid
50+
- "{{ config["cluster_uid"] }}"
51+
- --bucket
52+
- "{{ config["bucket"] }}"
53+
envFrom:
54+
- configMapRef:
55+
name: env-vars
56+
ports:
57+
- containerPort: 8888
58+
readinessProbe:
59+
httpGet:
60+
path: /healthz
61+
port: 8888
62+
scheme: HTTP
63+
livenessProbe:
64+
httpGet:
65+
path: /healthz
66+
port: 8888
67+
scheme: HTTP
68+
resources:
69+
requests:
70+
cpu: 400m
71+
memory: 512Mi
72+
limits:
73+
cpu: 400m
74+
---
75+
apiVersion: v1
76+
kind: Service
77+
metadata:
78+
name: async-gateway
79+
spec:
80+
type: ClusterIP
81+
selector:
82+
app: async-gateway
83+
ports:
84+
- port: 8888
85+
---
86+
apiVersion: autoscaling/v2beta2
87+
kind: HorizontalPodAutoscaler
88+
metadata:
89+
name: async-gateway
90+
spec:
91+
maxReplicas: 20
92+
minReplicas: 1
93+
scaleTargetRef:
94+
apiVersion: apps/v1
95+
kind: Deployment
96+
name: async-gateway
97+
metrics:
98+
- type: Resource
99+
resource:
100+
name: cpu
101+
target:
102+
type: Utilization
103+
averageUtilization: 90
104+
- type: Resource
105+
resource:
106+
name: memory
107+
target:
108+
type: Utilization
109+
averageUtilization: 90

manager/manifests/cluster-autoscaler.yaml.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ spec:
185185
cpu: 300m
186186
requests:
187187
cpu: 100m
188-
memory: 200Mi
188+
memory: 400Mi
189189
command:
190190
- ./cluster-autoscaler
191191
- --v=4

manager/manifests/istio.yaml.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ spec:
137137
resources:
138138
requests:
139139
cpu: 400m
140-
memory: 128Mi
140+
memory: 512Mi
141141
limits:
142142
cpu: 1500m
143143
memory: 1024Mi

manager/manifests/prometheus-monitoring.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,6 @@ spec:
252252
selector:
253253
matchLabels:
254254
apiKind: AsyncAPI
255-
cortex.dev/async: api
256255
matchExpressions:
257256
- { key: prometheus-ignore, operator: DoesNotExist }
258257
namespaceSelector:

pkg/async-gateway/endpoint.go

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"fmt"
2222
"net/http"
2323

24+
"github.com/cortexlabs/cortex/pkg/consts"
2425
"github.com/cortexlabs/cortex/pkg/lib/errors"
2526
"github.com/cortexlabs/cortex/pkg/lib/telemetry"
2627
"github.com/cortexlabs/cortex/pkg/types/async"
@@ -50,14 +51,28 @@ func (e *Endpoint) CreateWorkload(w http.ResponseWriter, r *http.Request) {
5051
return
5152
}
5253

54+
apiName := r.Header.Get(consts.CortexAPINameHeader)
55+
if apiName == "" { // reject requests that lack the API name header (requestID was already validated above)
56+
respondPlainText(w, http.StatusBadRequest, fmt.Sprintf("error: missing %s key in request header", consts.CortexAPINameHeader))
57+
return
58+
}
59+
r.Header.Del(consts.CortexAPINameHeader)
60+
61+
queueURL := r.Header.Get(consts.CortexQueueURLHeader)
62+
if queueURL == "" {
63+
respondPlainText(w, http.StatusBadRequest, fmt.Sprintf("error: missing %s key in request header", consts.CortexQueueURLHeader))
64+
return
65+
}
66+
r.Header.Del(consts.CortexQueueURLHeader)
67+
5368
body := r.Body
5469
defer func() {
5570
_ = r.Body.Close()
5671
}()
5772

58-
log := e.logger.With(zap.String("id", requestID))
73+
log := e.logger.With(zap.String("id", requestID), zap.String("apiName", apiName))
5974

60-
id, err := e.service.CreateWorkload(requestID, body, r.Header)
75+
id, err := e.service.CreateWorkload(requestID, apiName, queueURL, body, r.Header)
6176
if err != nil {
6277
respondPlainText(w, http.StatusInternalServerError, fmt.Sprintf("error: %v", err))
6378
logErrorWithTelemetry(log, errors.Wrap(err, "failed to create workload"))
@@ -79,9 +94,16 @@ func (e *Endpoint) GetWorkload(w http.ResponseWriter, r *http.Request) {
7994
return
8095
}
8196

82-
log := e.logger.With(zap.String("id", id))
97+
apiName := r.Header.Get(consts.CortexAPINameHeader)
98+
if apiName == "" {
99+
respondPlainText(w, http.StatusBadRequest, fmt.Sprintf("error: missing %s key in request header", consts.CortexAPINameHeader))
100+
return
101+
}
102+
r.Header.Del(consts.CortexAPINameHeader)
103+
104+
log := e.logger.With(zap.String("id", id), zap.String("apiName", apiName))
83105

84-
res, err := e.service.GetWorkload(id)
106+
res, err := e.service.GetWorkload(id, apiName)
85107
if err != nil {
86108
respondPlainText(w, http.StatusInternalServerError, fmt.Sprintf("error: %v", err))
87109
logErrorWithTelemetry(log, errors.Wrap(err, "failed to get workload"))

0 commit comments

Comments
 (0)