Skip named nodes in the CNR that don't exist #83

Merged: 8 commits, Aug 19, 2024
9 changes: 9 additions & 0 deletions deploy/crds/atlassian.com_cyclenoderequests_crd.yaml
@@ -44,6 +44,8 @@ spec:
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
clusterName:
Collaborator:
Where does this field come from? Why is it only in the CycleNodeRequest resource?

Member Author:
It was part of the operator-sdk render, i.e. make generate-crds.

type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
@@ -101,6 +103,13 @@ spec:
- Drain
- Wait
type: string
strictValidation:
description: StrictValidation is a boolean which determines whether
named nodes selected in a CNR must exist and be valid nodes
before cycling can begin. If set to true when invalid nodes
are selected the CNR will be transitioned to the "Failed" phase
before cycling can begin again.
type: boolean
required:
- method
type: object
7 changes: 7 additions & 0 deletions deploy/crds/atlassian.com_cyclenodestatuses_crd.yaml
@@ -101,6 +101,13 @@ spec:
- Drain
- Wait
type: string
strictValidation:
Collaborator (@awprice, Aug 14, 2024):
So by default if not set this will be false?

> Added a new StrictValidation option to the CNR to keep backwards compatible functionality.

I guess this does keep backwards-compatible behaviour, but it is technically a breaking change, or at least a behaviour change; we'll need to document this in the release notes.

We'll also need to ensure cluster owners update the CRDs to include this new field.

Member Author (@vincentportella, Aug 14, 2024):
Yes, that is the intention. I would like the new behaviour to be the default rather than the exception, because it reduces how often a cycle fails by default.

However, I'm happy to change it if we decide it's better to default the flag to true.
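
For illustration only, here is a minimal sketch of the zero-value behaviour discussed above. The local type is a stand-in for the real CycleSettings in pkg/apis/atlassian/v1 and is not code from this PR:

package main

import "fmt"

// cycleSettings is a local stand-in for the CycleSettings type touched by this
// PR; only the new field is reproduced here.
type cycleSettings struct {
	StrictValidation bool `json:"strictValidation,omitempty"`
}

func main() {
	// A CNR that omits strictValidation decodes to the zero value (false), so
	// existing CNRs pick up the new skip-missing-nodes behaviour by default.
	var settings cycleSettings

	if settings.StrictValidation {
		fmt.Println("strict: fail the CNR when a named node cannot be found")
	} else {
		fmt.Println("default: log and skip the missing named node")
	}
}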

description: StrictValidation is a boolean which determines whether
named nodes selected in a CNR must exist and be valid nodes
before cycling can begin. If set to true when invalid nodes
are selected the CNR will be transitioned to the "Failed" phase
before cycling can begin again.
type: boolean
required:
- method
type: object
7 changes: 7 additions & 0 deletions deploy/crds/atlassian.com_nodegroups_crd.yaml
@@ -93,6 +93,13 @@ spec:
- Drain
- Wait
type: string
strictValidation:
description: StrictValidation is a boolean which determines whether
named nodes selected in a CNR must exist and be valid nodes
before cycling can begin. If set to true when invalid nodes
are selected the CNR will be transitioned to the "Failed" phase
before cycling can begin again.
type: boolean
required:
- method
type: object
5 changes: 5 additions & 0 deletions pkg/apis/atlassian/v1/common_types.go
@@ -46,6 +46,11 @@ type CycleSettings struct {
// in-progress CNS request timeout from the time it's worked on by the controller.
// If no cyclingTimeout is provided, CNS will use the default controller CNS cyclingTimeout.
CyclingTimeout *metav1.Duration `json:"cyclingTimeout,omitempty"`

// StrictValidation is a boolean which determines whether named nodes selected in a CNR must
// exist and be valid nodes before cycling can begin. If set to true when invalid nodes are
// selected the CNR will be transitioned to the "Failed" phase before cycling can begin again.
StrictValidation bool `json:"strictValidation,omitempty"`
Collaborator:
Do we ever see StrictValidation being used for other things?

At the moment, the description of this field suggests it's only for this one thing: when invalid nodes are found, transition to Failed. But the naming of the field suggests it generically enables/disables strict validation.

Naming is hard, but I'd suggest renaming the flag to something that describes specifically what it enables/disables.

Collaborator:
Does it also make sense to put this field into CycleSettings?

Doing this means it's available in a CycleNodeStatus object; do we need it there?

Should we move it elsewhere within CycleNodeRequest to make it not propagate down?

Could we have a "ValidationOptions" field within CycleNodeRequest for holding other validation/configuration options a user might want to configure?

Member Author (@vincentportella, Aug 14, 2024):
I can see your point. I initially intended for more validations to be affected by this PR, but then narrowed the scope to just checking for named nodes. Having said that, having ValidationOptions in the CNR would be better because we could then have more granular settings. 🤔

With this PR, we could make it something like ValidationOptions.SkipMissingNamedNodes. The existing behaviour would remain the default, and setting the flag would opt into the new functionality.
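
For reference, a rough sketch of what the ValidationOptions idea could look like; none of these names exist in the codebase and they are purely illustrative:

package main

import (
	"encoding/json"
	"fmt"
)

// ValidationOptions is a hypothetical grouping of validation toggles that
// would live on the CycleNodeRequest spec rather than in CycleSettings, so it
// would not propagate into CycleNodeStatus objects.
type ValidationOptions struct {
	// SkipMissingNamedNodes would let a CNR proceed when some of its named
	// nodes no longer exist in the cluster, instead of failing validation.
	SkipMissingNamedNodes bool `json:"skipMissingNamedNodes,omitempty"`
}

func main() {
	// A CNR spec fragment opting into the lenient behaviour.
	raw := []byte(`{"skipMissingNamedNodes": true}`)

	var opts ValidationOptions
	if err := json.Unmarshal(raw, &opts); err != nil {
		panic(err)
	}

	fmt.Printf("skip missing named nodes: %v\n", opts.SkipMissingNamedNodes)
}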

}

// HealthCheck defines the health check configuration for the NodeGroup
1 change: 1 addition & 0 deletions pkg/apis/atlassian/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions pkg/controller/cyclenoderequest/transitioner/checks.go
@@ -189,7 +189,7 @@ func (t *CycleNodeRequestTransitioner) performHealthCheck(node v1.CycleNodeReque

// performInitialHealthChecks on the nodes selected to be terminated before cycling begins. If any health
// check fails, return an error to prevent cycling from starting
func (t *CycleNodeRequestTransitioner) performInitialHealthChecks(kubeNodes []corev1.Node) error {
func (t *CycleNodeRequestTransitioner) performInitialHealthChecks(kubeNodes map[string]corev1.Node) error {
// Build a set of ready nodes from which to check below
readyNodesSet := make(map[string]v1.CycleNodeRequestNode)

@@ -241,7 +241,7 @@ func (t *CycleNodeRequestTransitioner) performInitialHealthChecks(kubeNodes []co

// performCyclingHealthChecks before terminating an instance selected for termination. Cycling pauses
// until all health checks pass for the new instance before terminating the old one
func (t *CycleNodeRequestTransitioner) performCyclingHealthChecks(kubeNodes []corev1.Node) (bool, error) {
func (t *CycleNodeRequestTransitioner) performCyclingHealthChecks(kubeNodes map[string]corev1.Node) (bool, error) {
var allHealthChecksPassed bool = true

// Find new instances attached to the nodegroup and perform health checks on them
90 changes: 53 additions & 37 deletions pkg/controller/cyclenoderequest/transitioner/node.go
@@ -11,12 +11,15 @@ import (

// listReadyNodes lists nodes that are "ready". By default lists nodes that have also not been touched by Cyclops.
// A label is used to determine whether nodes have been touched by this CycleNodeRequest.
func (t *CycleNodeRequestTransitioner) listReadyNodes(includeInProgress bool) (nodes []corev1.Node, err error) {
func (t *CycleNodeRequestTransitioner) listReadyNodes(includeInProgress bool) (map[string]corev1.Node, error) {
nodes := make(map[string]corev1.Node)

// Get the nodes
selector, err := t.cycleNodeRequest.NodeLabelSelector()
if err != nil {
return nodes, err
}

nodeList, err := t.rm.ListNodes(selector)
if err != nil {
return nodes, err
@@ -30,14 +33,16 @@ func (t *CycleNodeRequestTransitioner) listReadyNodes(includeInProgress bool) (n
continue
}
}

// Only add "Ready" nodes
for _, cond := range node.Status.Conditions {
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
nodes = append(nodes, node)
nodes[node.Spec.ProviderID] = node
break
}
}
}

return nodes, nil
}

@@ -56,29 +61,34 @@ func (t *CycleNodeRequestTransitioner) getNodesToTerminate(numNodes int64) (node
}

for _, kubeNode := range kubeNodes {
// Skip nodes that are already being worked on so we don't duplicate our work
if value, ok := kubeNode.Labels[cycleNodeLabel]; ok && value == t.cycleNodeRequest.Name {
numNodesInProgress++
}
}

for _, nodeToTerminate := range t.cycleNodeRequest.Status.NodesToTerminate {
kubeNode, found := kubeNodes[nodeToTerminate.ProviderID]

if !found {
continue
}

for _, nodeToTerminate := range t.cycleNodeRequest.Status.NodesToTerminate {
// Add nodes that need to be terminated but have not yet been actioned
if kubeNode.Name == nodeToTerminate.Name && kubeNode.Spec.ProviderID == nodeToTerminate.ProviderID {
nodes = append(nodes, &kubeNode)
// Skip nodes that are already being worked on so we don't duplicate our work
if value, ok := kubeNode.Labels[cycleNodeLabel]; ok && value == t.cycleNodeRequest.Name {
continue
}

for i := 0; i < len(t.cycleNodeRequest.Status.NodesAvailable); i++ {
if kubeNode.Name == t.cycleNodeRequest.Status.NodesAvailable[i].Name {
// Remove nodes from available if they are also scheduled for termination
// Slice syntax removes this node at `i` from the array
t.cycleNodeRequest.Status.NodesAvailable = append(
t.cycleNodeRequest.Status.NodesAvailable[:i],
t.cycleNodeRequest.Status.NodesAvailable[i+1:]...,
)
// Add nodes that need to be terminated but have not yet been actioned
nodes = append(nodes, &kubeNode)

break
}
}
for i := 0; i < len(t.cycleNodeRequest.Status.NodesAvailable); i++ {
if kubeNode.Name == t.cycleNodeRequest.Status.NodesAvailable[i].Name {
// Remove nodes from available if they are also scheduled for termination
// Slice syntax removes this node at `i` from the array
t.cycleNodeRequest.Status.NodesAvailable = append(
t.cycleNodeRequest.Status.NodesAvailable[:i],
t.cycleNodeRequest.Status.NodesAvailable[i+1:]...,
)

break
}
@@ -94,32 +104,38 @@
}

// addNamedNodesToTerminate adds the named nodes for this CycleNodeRequest to the list of nodes to terminate.
// Returns an error if any named node does not exist in the node group for this CycleNodeRequest.
func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes []corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) error {
for _, namedNode := range t.cycleNodeRequest.Spec.NodeNames {
foundNode := false
for _, kubeNode := range kubeNodes {
if kubeNode.Name == namedNode {
foundNode = true
// Skips any named node that does not exist in the node group for this CycleNodeRequest.
func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes map[string]corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) error {
nodeLookupByName := make(map[string]corev1.Node)

t.cycleNodeRequest.Status.NodesAvailable = append(
t.cycleNodeRequest.Status.NodesAvailable,
newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()),
)
for _, node := range kubeNodes {
nodeLookupByName[node.Name] = node
}

t.cycleNodeRequest.Status.NodesToTerminate = append(
t.cycleNodeRequest.Status.NodesToTerminate,
newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()),
)
for _, namedNode := range t.cycleNodeRequest.Spec.NodeNames {
kubeNode, found := nodeLookupByName[namedNode]

break
if !found {
t.rm.Logger.Info("could not find node by name, skipping", "nodeName", namedNode)

if t.cycleNodeRequest.Spec.CycleSettings.StrictValidation {
return fmt.Errorf("could not find node by name: %v", namedNode)
}
}

if !foundNode {
return fmt.Errorf("could not find node by name: %v", namedNode)
continue
}

t.cycleNodeRequest.Status.NodesAvailable = append(
t.cycleNodeRequest.Status.NodesAvailable,
newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()),
)

t.cycleNodeRequest.Status.NodesToTerminate = append(
t.cycleNodeRequest.Status.NodesToTerminate,
newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()),
)
}

return nil
}

52 changes: 40 additions & 12 deletions pkg/controller/cyclenoderequest/transitioner/transitions.go
@@ -64,10 +64,12 @@ func (t *CycleNodeRequestTransitioner) transitionUndefined() (reconcile.Result,
func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, error) {
// Fetch the node names for the cycleNodeRequest, using the label selector provided
t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Selecting nodes with label selector")

kubeNodes, err := t.listReadyNodes(true)
if err != nil {
return t.transitionToHealing(err)
}

if len(kubeNodes) == 0 {
Collaborator:
We're losing the ability to detect user error here. Is there a way we can keep user-error detection while also not failing on nonexistent nodes?

One strategy could be: if our selector did match some nodes, but none of them are still in the cluster, we can call it a success. If it matches no nodes at all, we don't. The only problem with this approach is node groups scaled to zero. Thoughts?

Member Author:
I don't really see how that's possible unless we have a way of knowing whether a node name in the CNR matches a node that existed before the CNR was created.

A flag here would help toggle strict node validation and enable both use cases, but I'm wary of adding more settings to a CNR.
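
To make the trade-off above concrete, here is a hedged sketch of the strategy being discussed; the function and its inputs are hypothetical and not part of this PR:

package main

import "fmt"

// classifySelection illustrates distinguishing "the selector matched nothing
// at all" (likely user error) from "named nodes were requested but none remain
// in the cluster". It deliberately ignores the node-groups-scaled-to-zero
// caveat raised above.
func classifySelection(matchedNodes, requestedNamedNodes int) string {
	switch {
	case matchedNodes > 0:
		return "proceed with cycling"
	case requestedNamedNodes > 0:
		return "named nodes already gone: could arguably be treated as success"
	default:
		return "selector matched no nodes: treat as user error and fail"
	}
}

func main() {
	fmt.Println(classifySelection(0, 3)) // named nodes requested but already gone
	fmt.Println(classifySelection(0, 0)) // nothing matched: likely user error
}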

return t.transitionToHealing(fmt.Errorf("no nodes matched selector"))
}
@@ -83,14 +85,12 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er
if err != nil {
return t.transitionToHealing(errors.Wrap(err, "failed to check instances that exist from cloud provider"))
}
var existingKubeNodes []corev1.Node

for _, node := range kubeNodes {
for _, validProviderID := range existingProviderIDs {
if node.Spec.ProviderID == validProviderID {
existingKubeNodes = append(existingKubeNodes, node)
break
}
existingKubeNodes := make(map[string]corev1.Node)

for _, validProviderID := range existingProviderIDs {
if node, found := kubeNodes[validProviderID]; found {
existingKubeNodes[node.Spec.ProviderID] = node
}
}

@@ -120,19 +120,40 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er
// Do some sanity checking before we start filtering things
// Check the instance count of the node group matches the number of nodes found in Kubernetes
if len(kubeNodes) != len(nodeGroupInstances) {
nodesNotInCPNodeGroup, nodesNotInKube := findOffendingNodes(kubeNodes, nodeGroupInstances)
var offendingNodesInfo string

nodesNotInCPNodeGroup, nodesNotInKube := findOffendingNodes(kubeNodes, nodeGroupInstances)

if len(nodesNotInCPNodeGroup) > 0 {
providerIDs := make([]string, 0)

for providerID := range nodesNotInCPNodeGroup {
providerIDs = append(providerIDs,
fmt.Sprintf("id %q", providerID),
)
}

offendingNodesInfo += "nodes not in node group: "
offendingNodesInfo += strings.Join(nodesNotInCPNodeGroup, ",")
offendingNodesInfo += strings.Join(providerIDs, ",")
}

if len(nodesNotInKube) > 0 {
if offendingNodesInfo != "" {
offendingNodesInfo += ";"
}

providerIDs := make([]string, 0)

for providerID, node := range nodesNotInKube {
providerIDs = append(providerIDs,
fmt.Sprintf("id %q in %q", providerID, node.NodeGroupName()),
)
}

offendingNodesInfo += "nodes not inside cluster: "
offendingNodesInfo += strings.Join(nodesNotInKube, ",")
offendingNodesInfo += strings.Join(providerIDs, ",")
}

t.rm.LogEvent(t.cycleNodeRequest, "NodeCountMismatch",
"node group: %v, kube: %v. %v",
len(nodeGroupInstances), len(kubeNodes), offendingNodesInfo)
@@ -142,12 +163,16 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er
if err != nil {
return t.transitionToHealing(err)
}

if timedOut {
err := fmt.Errorf(
"node count mismatch, number of kubernetes of nodes does not match number of cloud provider instances after %v",
nodeEquilibriumWaitLimit)
"node count mismatch, number of kubernetes nodes does not match number of cloud provider instances after %v",
nodeEquilibriumWaitLimit,
)

return t.transitionToHealing(err)
}

return reconcile.Result{Requeue: true, RequeueAfter: requeueDuration}, nil
}

@@ -162,6 +187,7 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er
} else {
// Otherwise just add all the nodes in the node group
t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Adding all node group nodes to NodesToTerminate")

for _, kubeNode := range kubeNodes {
// Check to ensure the kubeNode object maps to an existing node in the ASG
// If this isn't the case, this is a phantom node. Fail the cnr to be safe.
@@ -213,7 +239,9 @@ func (t *CycleNodeRequestTransitioner) transitionInitialised() (reconcile.Result
// The maximum nodes we can select are bounded by our concurrency. We take into account the number
// of nodes we are already working on, and only introduce up to our concurrency cap more nodes in this step.
maxNodesToSelect := t.cycleNodeRequest.Spec.CycleSettings.Concurrency - t.cycleNodeRequest.Status.ActiveChildren

t.rm.Logger.Info("Selecting nodes to terminate", "numNodes", maxNodesToSelect)

nodes, numNodesInProgress, err := t.getNodesToTerminate(maxNodesToSelect)
if err != nil {
return t.transitionToHealing(err)