Skip to content

Commit f515b5c

Browse files
authored
Improve cluster info output for batch jobs (#2270)
1 parent 9176a65 commit f515b5c

File tree

3 files changed

+19
-6
lines changed

3 files changed

+19
-6
lines changed

cli/cmd/cluster.go

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -933,7 +933,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
933933
numAPIInstances := len(infoResponse.NodeInfos)
934934

935935
var totalReplicas int
936-
var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncAPIs bool
936+
var doesClusterHaveGPUs, doesClusterHaveInfs, doesClusterHaveAsyncGateways, doesClusterHaveEnqueuers bool
937937
for _, nodeInfo := range infoResponse.NodeInfos {
938938
totalReplicas += nodeInfo.NumReplicas
939939
if nodeInfo.ComputeUserCapacity.GPU > 0 {
@@ -943,7 +943,10 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
943943
doesClusterHaveInfs = true
944944
}
945945
if nodeInfo.NumAsyncGatewayReplicas > 0 {
946-
doesClusterHaveAsyncAPIs = true
946+
doesClusterHaveAsyncGateways = true
947+
}
948+
if nodeInfo.NumEnqueuerReplicas > 0 {
949+
doesClusterHaveEnqueuers = true
947950
}
948951
}
949952

@@ -962,7 +965,8 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
962965
{Title: "instance type"},
963966
{Title: "lifecycle"},
964967
{Title: "replicas"},
965-
{Title: "async gateway replicas", Hidden: !doesClusterHaveAsyncAPIs},
968+
{Title: "async gateway replicas", Hidden: !doesClusterHaveAsyncGateways},
969+
{Title: "batch enqueuer replicas", Hidden: !doesClusterHaveEnqueuers},
966970
{Title: "CPU (requested / total allocatable)"},
967971
{Title: "memory (requested / total allocatable)"},
968972
{Title: "GPU (requested / total allocatable)", Hidden: !doesClusterHaveGPUs},
@@ -980,7 +984,7 @@ func printInfoNodes(infoResponse *schema.InfoResponse) {
980984
memStr := nodeInfo.ComputeUserRequested.Mem.String() + " / " + nodeInfo.ComputeUserCapacity.Mem.String()
981985
gpuStr := s.Int64(nodeInfo.ComputeUserRequested.GPU) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.GPU)
982986
infStr := s.Int64(nodeInfo.ComputeUserRequested.Inf) + " / " + s.Int64(nodeInfo.ComputeUserCapacity.Inf)
983-
rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumAsyncGatewayReplicas, cpuStr, memStr, gpuStr, infStr})
987+
rows = append(rows, []interface{}{nodeInfo.InstanceType, lifecycle, nodeInfo.NumReplicas, nodeInfo.NumAsyncGatewayReplicas, nodeInfo.NumEnqueuerReplicas, cpuStr, memStr, gpuStr, infStr})
984988
}
985989

986990
t := table.Table{

pkg/operator/endpoints/info.go

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -104,8 +104,14 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) {
104104
for i := range pods {
105105
pod := pods[i]
106106

107+
if pod.Status.Phase == kcore.PodSucceeded || pod.Status.Phase == kcore.PodFailed {
108+
// note: only terminated (succeeded/failed) pods are skipped; pending pods are not skipped because they can already be scheduled on nodes (e.g. image pull in progress)
109+
continue
110+
}
111+
107112
_, isAPIPod := pod.Labels["apiName"]
108-
asyncDeploymentType, isAsyncPod := pod.Labels["cortex.dev/async"]
113+
asyncPodType, isAsyncPod := pod.Labels["cortex.dev/async"]
114+
batchPodType, isBatchPod := pod.Labels["cortex.dev/batch"]
109115

110116
if pod.Spec.NodeName == "" && isAPIPod {
111117
numPendingReplicas++
@@ -118,8 +124,10 @@ func getNodeInfos() ([]schema.NodeInfo, int, error) {
118124
}
119125

120126
if isAPIPod {
121-
if isAsyncPod && asyncDeploymentType == "gateway" {
127+
if isAsyncPod && asyncPodType == "gateway" {
122128
node.NumAsyncGatewayReplicas++
129+
} else if isBatchPod && batchPodType == "enqueuer" {
130+
node.NumEnqueuerReplicas++
123131
} else {
124132
node.NumReplicas++
125133
}

pkg/operator/schema/schema.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ type NodeInfo struct {
3838
Price float64 `json:"price" yaml:"price"`
3939
NumReplicas int `json:"num_replicas" yaml:"num_replicas"`
4040
NumAsyncGatewayReplicas int `json:"num_async_gateway_replicas" yaml:"num_async_gateway_replicas"`
41+
NumEnqueuerReplicas int `json:"num_enqueuer_replicas" yaml:"num_enqueuer_replicas"`
4142
ComputeUserCapacity userconfig.Compute `json:"compute_user_capacity" yaml:"compute_user_capacity"` // the total resources available to the user on a node
4243
ComputeAvailable userconfig.Compute `json:"compute_available" yaml:"compute_unavailable"` // unused resources on a node
4344
ComputeUserRequested userconfig.Compute `json:"compute_user_requested" yaml:"compute_user_requested"` // total resources requested by user on a node

0 commit comments

Comments
 (0)