Skip to content

Commit 85b0cc0

Browse files
authored
Add out-of-memory error (#418)
1 parent bafc72a commit 85b0cc0

File tree

6 files changed

+63
-32
lines changed

6 files changed

+63
-32
lines changed

cli/cmd/get.go

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -236,12 +236,12 @@ func describeAPI(name string, resourcesRes *schema.GetResourcesResponse, flagVer
236236

237237
row := []interface{}{
238238
groupStatus.Message(),
239-
s.Int32(groupStatus.ReadyUpdated),
240-
s.Int32(groupStatus.Available()),
241-
s.Int32(groupStatus.Requested),
242-
s.Int32(groupStatus.ReadyStaleCompute),
243-
s.Int32(groupStatus.ReadyStaleModel),
244-
s.Int32(groupStatus.FailedUpdated),
239+
groupStatus.ReadyUpdated,
240+
groupStatus.Available(),
241+
groupStatus.Requested,
242+
groupStatus.ReadyStaleCompute,
243+
groupStatus.ReadyStaleModel,
244+
groupStatus.FailedUpdated,
245245
libtime.Since(updatedAt),
246246
}
247247

@@ -550,6 +550,7 @@ func apiResourceTable(apiGroupStatuses map[string]*resource.APIGroupStatus) stri
550550

551551
rows = append(rows, []interface{}{
552552
name,
553+
groupStatus.Message(),
553554
groupStatus.ReadyUpdated,
554555
groupStatus.Available(),
555556
groupStatus.Requested,
@@ -563,6 +564,7 @@ func apiResourceTable(apiGroupStatuses map[string]*resource.APIGroupStatus) stri
563564
t := table.Table{
564565
Headers: []table.Header{
565566
{Title: resource.APIType.UserFacing()},
567+
{Title: "status"},
566568
{Title: "up-to-date"},
567569
{Title: "available"},
568570
{Title: "requested"},

docs/deployments/statuses.md

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,14 @@
44

55
| Status | Meaning |
66
|----------------------|---|
7-
| live | API is deployed and ready to serve prediction requests (at least one replica is running) |
8-
| pending | API is waiting for another resource to be ready |
9-
| creating | API is being created |
10-
| stopping | API is stopping |
11-
| stopped | API is stopped |
12-
| error | API was not created due to an error; run `cortex logs <name>` to view the logs |
13-
| skipped | API was not created due to an error in another resource |
14-
| upstream error | API was not created due to an error in one of its dependencies; a previous version of this API may be ready |
15-
| upstream termination | API was not created because one of its dependencies was terminated; a previous version of this API may be ready |
16-
| compute unavailable | API could not start due to insufficient memory, CPU, or GPU in the cluster; some replicas may be ready |
7+
| live | API is deployed and ready to serve prediction requests (at least one replica is running) |
8+
| pending | API is waiting for another resource to be ready |
9+
| creating | API is being created |
10+
| stopping | API is stopping |
11+
| stopped | API is stopped |
12+
| error | API was not created due to an error; run `cortex logs <name>` to view the logs |
13+
| skipped | API was not created due to an error in another resource |
14+
| upstream error | API was not created due to an error in one of its dependencies; a previous version of this API may be ready |
15+
| upstream termination | API was not created because one of its dependencies was terminated; a previous version of this API may be ready |
16+
| error (out of memory) | API was terminated due to excessive memory usage; try allocating more memory to the API and re-deploying |
17+
| compute unavailable | API could not start due to insufficient memory, CPU, or GPU in the cluster; some replicas may be ready |

pkg/lib/k8s/pod.go

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -182,13 +182,16 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P
182182
numSucceeded := 0
183183
numFailed := 0
184184
numKilled := 0
185+
numKilledOOM := 0
185186
for _, containerStatus := range containerStatuses {
186-
if containerStatus.State.Running != nil {
187+
if containerStatus.State.Running != nil && containerStatus.RestartCount == 0 {
187188
numRunning++
188189
} else if containerStatus.State.Terminated != nil {
189190
exitCode := containerStatus.State.Terminated.ExitCode
190191
if exitCode == 0 {
191192
numSucceeded++
193+
} else if exitCode == 137 {
194+
numKilledOOM++
192195
} else if killStatuses[exitCode] {
193196
numKilled++
194197
} else {
@@ -198,6 +201,8 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P
198201
exitCode := containerStatus.LastTerminationState.Terminated.ExitCode
199202
if exitCode == 0 {
200203
numSucceeded++
204+
} else if exitCode == 137 {
205+
numKilledOOM++
201206
} else if killStatuses[exitCode] {
202207
numKilled++
203208
} else {
@@ -208,7 +213,9 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P
208213
numWaiting++
209214
}
210215
}
211-
if numKilled > 0 {
216+
if numKilledOOM > 0 {
217+
return PodStatusKilledOOM
218+
} else if numKilled > 0 {
212219
return PodStatusKilled
213220
} else if numFailed > 0 {
214221
return PodStatusFailed

pkg/operator/api/resource/status.go

Lines changed: 14 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,10 @@ limitations under the License.
1616

1717
package resource
1818

19+
import (
20+
"github.com/cortexlabs/cortex/pkg/lib/k8s"
21+
)
22+
1923
type DataStatus struct {
2024
DataSavedStatus
2125
Code StatusCode `json:"status_code"`
@@ -30,7 +34,8 @@ type APIStatus struct {
3034
InitReplicas int32 `json:"init_replicas"`
3135
TargetCPUUtilization int32 `json:"target_cpu_utilization"`
3236
ReplicaCounts `json:"replica_counts"`
33-
Code StatusCode `json:"status_code"`
37+
PodStatuses []k8s.PodStatus `json:"pod_statuses"`
38+
Code StatusCode `json:"status_code"`
3439
}
3540

3641
type ReplicaCounts struct {
@@ -142,14 +147,14 @@ var _ = [1]int{}[int(StatusStopped)-(len(statusCodes)-1)] // Ensure list length
142147
var statusCodeMessages = []string{
143148
"unknown", // StatusUnknown
144149

145-
"pending", // StatusPending
146-
"compute unavailable", // StatusPendingCompute
147-
"pending", // StatusWaiting
148-
"skipped", // StatusSkipped
149-
"error", // StatusError
150-
"upstream error", // StatusParentFailed
151-
"upstream termination", // StatusParentKilled
152-
"terminated (out of mem)", // StatusDataOOM
150+
"pending", // StatusPending
151+
"compute unavailable", // StatusPendingCompute
152+
"pending", // StatusWaiting
153+
"skipped", // StatusSkipped
154+
"error", // StatusError
155+
"upstream error", // StatusParentFailed
156+
"upstream termination", // StatusParentKilled
157+
"error (out of memory)", // StatusKilledOOM
153158

154159
"running", // StatusRunning
155160
"ready", // StatusSucceeded

pkg/operator/workloads/api_status.go

Lines changed: 19 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ func getCurrentAPIStatuses(
6868
return nil, errors.Wrap(err, "api statuses", ctx.App.Name)
6969
}
7070

71-
replicaCountsMap := getReplicaCountsMap(podList, deployments, ctx)
71+
replicaCountsMap, podStatusMap := getReplicaCountsMap(podList, deployments, ctx)
7272

7373
currentResourceWorkloadIDs := ctx.APIResourceWorkloadIDs()
7474

@@ -117,6 +117,7 @@ func getCurrentAPIStatuses(
117117
for resourceID, apiStatus := range apiStatuses {
118118
apiStatus.Path = context.APIPath(apiStatus.APIName, apiStatus.AppName)
119119
apiStatus.ReplicaCounts = replicaCountsMap[resourceID]
120+
apiStatus.PodStatuses = podStatusMap[resourceID]
120121
apiStatus.Code = apiStatusCode(apiStatus)
121122
}
122123

@@ -135,7 +136,7 @@ func getReplicaCountsMap(
135136
podList []kcore.Pod,
136137
deployments map[string]*kapps.Deployment, // api.Name -> deployment
137138
ctx *context.Context,
138-
) map[string]resource.ReplicaCounts {
139+
) (map[string]resource.ReplicaCounts, map[string][]k8s.PodStatus) {
139140

140141
apiComputeIDMap := make(map[string]string)
141142
for _, api := range ctx.APIs {
@@ -149,6 +150,7 @@ func getReplicaCountsMap(
149150
}
150151

151152
replicaCountsMap := make(map[string]resource.ReplicaCounts)
153+
podStatusMap := make(map[string][]k8s.PodStatus)
152154
for _, pod := range podList {
153155
resourceID := pod.Labels["resourceID"]
154156
podAPIComputeID := APIPodComputeID(pod.Spec.Containers)
@@ -170,7 +172,7 @@ func getReplicaCountsMap(
170172
replicaCounts.ReadyStaleCompute++
171173
}
172174
}
173-
if podStatus == k8s.PodStatusFailed {
175+
if podStatus == k8s.PodStatusFailed || podStatus == k8s.PodStatusKilled || podStatus == k8s.PodStatusKilledOOM {
174176
if computeMatches {
175177
replicaCounts.FailedUpdatedCompute++
176178
} else {
@@ -179,6 +181,7 @@ func getReplicaCountsMap(
179181
}
180182

181183
replicaCountsMap[resourceID] = replicaCounts
184+
podStatusMap[resourceID] = append(podStatusMap[resourceID], podStatus)
182185
}
183186

184187
for _, deployment := range deployments {
@@ -191,7 +194,7 @@ func getReplicaCountsMap(
191194
replicaCountsMap[resourceID] = replicaCounts
192195
}
193196

194-
return replicaCountsMap
197+
return replicaCountsMap, podStatusMap
195198
}
196199

197200
func numUpdatedReadyReplicas(ctx *context.Context, api *context.API) (int32, error) {
@@ -229,6 +232,18 @@ func apiStatusCode(apiStatus *resource.APIStatus) resource.StatusCode {
229232
}
230233

231234
if apiStatus.TotalFailed() > 0 {
235+
for _, podStatus := range apiStatus.PodStatuses {
236+
if podStatus == k8s.PodStatusKilledOOM {
237+
return resource.StatusKilledOOM
238+
}
239+
}
240+
241+
for _, podStatus := range apiStatus.PodStatuses {
242+
if podStatus == k8s.PodStatusKilled {
243+
return resource.StatusKilled
244+
}
245+
}
246+
232247
return resource.StatusError
233248
}
234249

pkg/operator/workloads/api_workload.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -218,7 +218,8 @@ func (aw *APIWorkload) IsFailed(ctx *context.Context) (bool, error) {
218218
}
219219

220220
for _, pod := range pods {
221-
if k8s.GetPodStatus(&pod) == k8s.PodStatusFailed {
221+
podStatus := k8s.GetPodStatus(&pod)
222+
if podStatus == k8s.PodStatusFailed || podStatus == k8s.PodStatusKilled || podStatus == k8s.PodStatusKilledOOM {
222223
return true, nil
223224
}
224225
}

0 commit comments

Comments
 (0)