Skip to content

Add out-of-memory error #418

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions cli/cmd/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,12 +236,12 @@ func describeAPI(name string, resourcesRes *schema.GetResourcesResponse, flagVer

row := []interface{}{
groupStatus.Message(),
s.Int32(groupStatus.ReadyUpdated),
s.Int32(groupStatus.Available()),
s.Int32(groupStatus.Requested),
s.Int32(groupStatus.ReadyStaleCompute),
s.Int32(groupStatus.ReadyStaleModel),
s.Int32(groupStatus.FailedUpdated),
groupStatus.ReadyUpdated,
groupStatus.Available(),
groupStatus.Requested,
groupStatus.ReadyStaleCompute,
groupStatus.ReadyStaleModel,
groupStatus.FailedUpdated,
libtime.Since(updatedAt),
}

Expand Down Expand Up @@ -550,6 +550,7 @@ func apiResourceTable(apiGroupStatuses map[string]*resource.APIGroupStatus) stri

rows = append(rows, []interface{}{
name,
groupStatus.Message(),
groupStatus.ReadyUpdated,
groupStatus.Available(),
groupStatus.Requested,
Expand All @@ -563,6 +564,7 @@ func apiResourceTable(apiGroupStatuses map[string]*resource.APIGroupStatus) stri
t := table.Table{
Headers: []table.Header{
{Title: resource.APIType.UserFacing()},
{Title: "status"},
{Title: "up-to-date"},
{Title: "available"},
{Title: "requested"},
Expand Down
21 changes: 11 additions & 10 deletions docs/deployments/statuses.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@

| Status | Meaning |
|----------------------|---|
| live | API is deployed and ready to serve prediction requests (at least one replica is running) |
| pending | API is waiting for another resource to be ready |
| creating | API is being created |
| stopping | API is stopping |
| stopped | API is stopped |
| error | API was not created due to an error; run `cortex logs <name>` to view the logs |
| skipped | API was not created due to an error in another resource |
| upstream error | API was not created due to an error in one of its dependencies; a previous version of this API may be ready |
| upstream termination | API was not created because one of its dependencies was terminated; a previous version of this API may be ready |
| compute unavailable | API could not start due to insufficient memory, CPU, or GPU in the cluster; some replicas may be ready |
| live | API is deployed and ready to serve prediction requests (at least one replica is running) |
| pending | API is waiting for another resource to be ready |
| creating | API is being created |
| stopping | API is stopping |
| stopped | API is stopped |
| error | API was not created due to an error; run `cortex logs <name>` to view the logs |
| skipped | API was not created due to an error in another resource |
| upstream error | API was not created due to an error in one of its dependencies; a previous version of this API may be ready |
| upstream termination | API was not created because one of its dependencies was terminated; a previous version of this API may be ready |
| error (out of memory) | API was terminated due to excessive memory usage; try allocating more memory to the API and redeploying |
| compute unavailable | API could not start due to insufficient memory, CPU, or GPU in the cluster; some replicas may be ready |
11 changes: 9 additions & 2 deletions pkg/lib/k8s/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,13 +182,16 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P
numSucceeded := 0
numFailed := 0
numKilled := 0
numKilledOOM := 0
for _, containerStatus := range containerStatuses {
if containerStatus.State.Running != nil {
if containerStatus.State.Running != nil && containerStatus.RestartCount == 0 {
numRunning++
} else if containerStatus.State.Terminated != nil {
exitCode := containerStatus.State.Terminated.ExitCode
if exitCode == 0 {
numSucceeded++
} else if exitCode == 137 {
numKilledOOM++
} else if killStatuses[exitCode] {
numKilled++
} else {
Expand All @@ -198,6 +201,8 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P
exitCode := containerStatus.LastTerminationState.Terminated.ExitCode
if exitCode == 0 {
numSucceeded++
} else if exitCode == 137 {
numKilledOOM++
} else if killStatuses[exitCode] {
numKilled++
} else {
Expand All @@ -208,7 +213,9 @@ func PodStatusFromContainerStatuses(containerStatuses []kcore.ContainerStatus) P
numWaiting++
}
}
if numKilled > 0 {
if numKilledOOM > 0 {
return PodStatusKilledOOM
} else if numKilled > 0 {
return PodStatusKilled
} else if numFailed > 0 {
return PodStatusFailed
Expand Down
23 changes: 14 additions & 9 deletions pkg/operator/api/resource/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ limitations under the License.

package resource

import (
"github.com/cortexlabs/cortex/pkg/lib/k8s"
)

type DataStatus struct {
DataSavedStatus
Code StatusCode `json:"status_code"`
Expand All @@ -30,7 +34,8 @@ type APIStatus struct {
InitReplicas int32 `json:"init_replicas"`
TargetCPUUtilization int32 `json:"target_cpu_utilization"`
ReplicaCounts `json:"replica_counts"`
Code StatusCode `json:"status_code"`
PodStatuses []k8s.PodStatus `json:"pod_statuses"`
Code StatusCode `json:"status_code"`
}

type ReplicaCounts struct {
Expand Down Expand Up @@ -142,14 +147,14 @@ var _ = [1]int{}[int(StatusStopped)-(len(statusCodes)-1)] // Ensure list length
var statusCodeMessages = []string{
"unknown", // StatusUnknown

"pending", // StatusPending
"compute unavailable", // StatusPendingCompute
"pending", // StatusWaiting
"skipped", // StatusSkipped
"error", // StatusError
"upstream error", // StatusParentFailed
"upstream termination", // StatusParentKilled
"terminated (out of mem)", // StatusDataOOM
"pending", // StatusPending
"compute unavailable", // StatusPendingCompute
"pending", // StatusWaiting
"skipped", // StatusSkipped
"error", // StatusError
"upstream error", // StatusParentFailed
"upstream termination", // StatusParentKilled
"error (out of memory)", // StatusKilledOOM

"running", // StatusRunning
"ready", // StatusSucceeded
Expand Down
23 changes: 19 additions & 4 deletions pkg/operator/workloads/api_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ func getCurrentAPIStatuses(
return nil, errors.Wrap(err, "api statuses", ctx.App.Name)
}

replicaCountsMap := getReplicaCountsMap(podList, deployments, ctx)
replicaCountsMap, podStatusMap := getReplicaCountsMap(podList, deployments, ctx)

currentResourceWorkloadIDs := ctx.APIResourceWorkloadIDs()

Expand Down Expand Up @@ -117,6 +117,7 @@ func getCurrentAPIStatuses(
for resourceID, apiStatus := range apiStatuses {
apiStatus.Path = context.APIPath(apiStatus.APIName, apiStatus.AppName)
apiStatus.ReplicaCounts = replicaCountsMap[resourceID]
apiStatus.PodStatuses = podStatusMap[resourceID]
apiStatus.Code = apiStatusCode(apiStatus)
}

Expand All @@ -135,7 +136,7 @@ func getReplicaCountsMap(
podList []kcore.Pod,
deployments map[string]*kapps.Deployment, // api.Name -> deployment
ctx *context.Context,
) map[string]resource.ReplicaCounts {
) (map[string]resource.ReplicaCounts, map[string][]k8s.PodStatus) {

apiComputeIDMap := make(map[string]string)
for _, api := range ctx.APIs {
Expand All @@ -149,6 +150,7 @@ func getReplicaCountsMap(
}

replicaCountsMap := make(map[string]resource.ReplicaCounts)
podStatusMap := make(map[string][]k8s.PodStatus)
for _, pod := range podList {
resourceID := pod.Labels["resourceID"]
podAPIComputeID := APIPodComputeID(pod.Spec.Containers)
Expand All @@ -170,7 +172,7 @@ func getReplicaCountsMap(
replicaCounts.ReadyStaleCompute++
}
}
if podStatus == k8s.PodStatusFailed {
if podStatus == k8s.PodStatusFailed || podStatus == k8s.PodStatusKilled || podStatus == k8s.PodStatusKilledOOM {
if computeMatches {
replicaCounts.FailedUpdatedCompute++
} else {
Expand All @@ -179,6 +181,7 @@ func getReplicaCountsMap(
}

replicaCountsMap[resourceID] = replicaCounts
podStatusMap[resourceID] = append(podStatusMap[resourceID], podStatus)
}

for _, deployment := range deployments {
Expand All @@ -191,7 +194,7 @@ func getReplicaCountsMap(
replicaCountsMap[resourceID] = replicaCounts
}

return replicaCountsMap
return replicaCountsMap, podStatusMap
}

func numUpdatedReadyReplicas(ctx *context.Context, api *context.API) (int32, error) {
Expand Down Expand Up @@ -229,6 +232,18 @@ func apiStatusCode(apiStatus *resource.APIStatus) resource.StatusCode {
}

if apiStatus.TotalFailed() > 0 {
for _, podStatus := range apiStatus.PodStatuses {
if podStatus == k8s.PodStatusKilledOOM {
return resource.StatusKilledOOM
}
}

for _, podStatus := range apiStatus.PodStatuses {
if podStatus == k8s.PodStatusKilled {
return resource.StatusKilled
}
}

return resource.StatusError
}

Expand Down
3 changes: 2 additions & 1 deletion pkg/operator/workloads/api_workload.go
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,8 @@ func (aw *APIWorkload) IsFailed(ctx *context.Context) (bool, error) {
}

for _, pod := range pods {
if k8s.GetPodStatus(&pod) == k8s.PodStatusFailed {
podStatus := k8s.GetPodStatus(&pod)
if podStatus == k8s.PodStatusFailed || podStatus == k8s.PodStatusKilled || podStatus == k8s.PodStatusKilledOOM {
return true, nil
}
}
Expand Down