Skip to content

Commit

Permalink
Disappeared or maximally retried nodes should be job failures
Browse files Browse the repository at this point in the history
As discussed: #2518 (comment)

It is a job failure if the `any` strategy cannot schedule the job onto
enough nodes. This makes sense because we have not been able to fulfil
the user's Deal.

This commit modifies the `all` strategy to have the same behaviour. If
the job cannot be successfully run on "all" nodes, the job now fails.
  • Loading branch information
simonwo committed Jun 12, 2023
1 parent 28ffeec commit 0c9551b
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 13 deletions.
10 changes: 4 additions & 6 deletions pkg/requester/selection/all.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,16 +89,12 @@ func (s *allNodeSelector) SelectNodesForRetry(ctx context.Context, job *model.Jo

failedNodes := make([]string, 0, len(jobState.Executions))
for nodeID, failureCount := range failureCounts {
if failureCount > 0 && failureCount < maxRetriesPerNode {
if failureCount > 0 {
failedNodes = append(failedNodes, nodeID)
}
}

log.Ctx(ctx).Debug().Int("Retryable", len(failedNodes)).Msg("Found nodes to retry")
if len(failedNodes) == 0 {
return nil, nil
}

foundNodes, err := s.nodeDiscoverer.FindNodes(ctx, *job)
if err != nil {
return nil, err
Expand All @@ -107,12 +103,14 @@ func (s *allNodeSelector) SelectNodesForRetry(ctx context.Context, job *model.Jo

retryNodes := make([]model.NodeInfo, 0, len(failedNodes))
for _, foundNode := range foundNodes {
if slices.Contains(failedNodes, string(foundNode.PeerInfo.ID)) {
id := string(foundNode.PeerInfo.ID)
if slices.Contains(failedNodes, id) && failureCounts[id] < maxRetriesPerNode {
retryNodes = append(retryNodes, foundNode)
}
}

if len(failedNodes) > len(retryNodes) {
// A node failed but has disappeared or a node is over the retry limit
return nil, requester.NewErrNotEnoughNodes(len(failedNodes), len(retryNodes))
}

Expand Down
12 changes: 5 additions & 7 deletions pkg/requester/selection/all_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ var retryTestCases = []nodeSelectorTestCase{
expectedNodes: []peer.ID{},
},
{
name: "does not retry job after many failed retries",
name: "many failed retries is a job failure",
nodes: []peer.ID{test1},
ranks: []int{1},
states: map[peer.ID][]model.ExecutionStateType{
Expand All @@ -101,8 +101,7 @@ var retryTestCases = []nodeSelectorTestCase{
model.ExecutionStateFailed,
},
},
checkError: require.NoError,
expectedNodes: []peer.ID{},
checkError: require.Error,
},
{
name: "does not retry job on a different node",
Expand All @@ -118,20 +117,19 @@ var retryTestCases = []nodeSelectorTestCase{
model.ExecutionStateCompleted,
},
},
checkError: require.NoError,
checkError: require.Error,
expectedNodes: []peer.ID{},
},
{
name: "throws error if node disappeared",
name: "disappeared node is a job failure",
nodes: []peer.ID{},
ranks: []int{},
states: map[peer.ID][]model.ExecutionStateType{
test1: {
model.ExecutionStateFailed,
},
},
checkError: require.Error,
expectedNodes: []peer.ID{},
checkError: require.Error,
},
}

Expand Down

0 comments on commit 0c9551b

Please sign in to comment.