Improve cluster info output for batch jobs (#2270)

deliahu · web-flow · commit f515b5cb7726 · 2021-06-21T23:33:37.000-07:00
diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
@@ -933,7 +933,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
 	numAPIInstances := len(infoResponse.NodeInfos)
 
 	var totalReplicas int
-	var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncAPIs bool
+	var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool
 	for _, nodeInfo := range infoResponse.NodeInfos {
 		totalReplicas += nodeInfo.NumReplicas
 		if nodeInfo.ComputeUserCapacity.GPU > 0 {
@@ -943,7 +943,10 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
 			doesClusterHaveInfs = true
 		}
 		if nodeInfo.NumAsyncGatewayReplicas > 0 {
-			doesClusterHaveAsyncAPIs = true
+			doesClusterHaveAsyncGateways = true
+		}
+		if nodeInfo.NumEnqueuerReplicas > 0 {
+			doesClusterHaveEnqueuers = true
 		}
 	}
 
@@ -962,7 +965,8 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
 		{Title: "instance type"},
 		{Title: "lifecycle"},
 		{Title: "replicas"},
-		{Title: "async gateway replicas", Hidden: !doesClusterHaveAsyncAPIs},
+		{Title: "async gateway replicas", Hidden: !doesClusterHaveAsyncGateways},
+		{Title: "batch enqueuer replicas", Hidden: !doesClusterHaveEnqueuers},
 		{Title: "CPU (requested / total allocatable)"},
 		{Title: "memory (requested / total allocatable)"},
 		{Title: "GPU (requested / total allocatable)", Hidden: !doesClusterHaveGPUs},
@@ -980,7 +984,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
 		memStr := nodeInfo.ComputeUserRequested.Mem.String() + " / " + nodeInfo.ComputeUserCapacity.Mem.String()
 		gpuStr := s.Int64(nodeInfo.ComputeUserRequested.GPU) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.GPU)
 		infStr := s.Int64(nodeInfo.ComputeUserRequested.Inf) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.Inf)
-		rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumAsyncGatewayReplicas, cpuStr, memStr, gpuStr, infStr})
+		rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumAsyncGatewayReplicas, nodeInfo.NumEnqueuerReplicas, cpuStr, memStr, gpuStr, infStr})
 	}
 
 	t := table.Table{
diff --git a/pkg/operator/endpoints/info.go b/pkg/operator/endpoints/info.go
@@ -104,8 +104,14 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) {
 	for i := range pods {
 		pod := pods[i]
 
+		if pod.Status.Phase == kcore.PodSucceeded || pod.Status.Phase == kcore.PodFailed {
+			// skip only finished pods (succeeded/failed); pending pods are intentionally kept since they can already be scheduled on nodes (image pull in progress)
+			continue
+		}
+
 		_, isAPIPod := pod.Labels["apiName"]
-		asyncDeploymentType, isAsyncPod := pod.Labels["cortex.dev/async"]
+		asyncPodType, isAsyncPod := pod.Labels["cortex.dev/async"]
+		batchPodType, isBatchPod := pod.Labels["cortex.dev/batch"]
 
 		if pod.Spec.NodeName == "" && isAPIPod {
 			numPendingReplicas++
@@ -118,8 +124,10 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) {
 		}
 
 		if isAPIPod {
-			if isAsyncPod && asyncDeploymentType == "gateway" {
+			if isAsyncPod && asyncPodType == "gateway" {
 				node.NumAsyncGatewayReplicas++
+			} else if isBatchPod && batchPodType == "enqueuer" {
+				node.NumEnqueuerReplicas++
 			} else {
 				node.NumReplicas++
 			}
diff --git a/pkg/operator/schema/schema.go b/pkg/operator/schema/schema.go
@@ -38,6 +38,7 @@ type NodeInfo struct {
 	Price                   float64            `json:"price" yaml:"price"`
 	NumReplicas             int                `json:"num_replicas" yaml:"num_replicas"`
 	NumAsyncGatewayReplicas int                `json:"num_async_gateway_replicas" yaml:"num_async_gateway_replicas"`
+	NumEnqueuerReplicas     int                `json:"num_enqueuer_replicas" yaml:"num_enqueuer_replicas"`
 	ComputeUserCapacity     userconfig.Compute `json:"compute_user_capacity" yaml:"compute_user_capacity"`   // the total resources available to the user on a node
 	ComputeAvailable        userconfig.Compute `json:"compute_available" yaml:"compute_unavailable"`         // unused resources on a node
 	ComputeUserRequested    userconfig.Compute `json:"compute_user_requested" yaml:"compute_user_requested"` // total resources requested by user on a node