Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip named nodes in the CNR that don't exist #83

Merged
merged 8 commits into from
Aug 19, 2024
15 changes: 15 additions & 0 deletions deploy/crds/atlassian.com_cyclenoderequests_crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ spec:
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
clusterName:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where does this field come from? Why is it only in the CycleNodeRequest resource?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was part of the operator-sdk render, make generate-crds

type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
Expand Down Expand Up @@ -327,6 +329,19 @@ spec:
description: SkipPreTerminationChecks is an optional flag to skip
pre-termination checks during cycling
type: boolean
validationOptions:
description: ValidationOptions stores the settings to use for validating
state of nodegroups in kube and the cloud provider for cycling the
nodes.
properties:
skipMissingNamedNodes:
description: SkipMissingNamedNodes is a boolean which determines
whether named nodes selected in a CNR must exist and be valid
nodes before cycling can begin. If set to true named nodes which
don't exist will be ignored rather than transitioning the CNR
to the failed phase.
type: boolean
type: object
required:
- cycleSettings
- nodeGroupName
Expand Down
13 changes: 13 additions & 0 deletions deploy/crds/atlassian.com_nodegroups_crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,19 @@ spec:
description: SkipPreTerminationChecks is an optional flag to skip
pre-termination checks during cycling
type: boolean
validationOptions:
description: ValidationOptions stores the settings to use for validating
state of nodegroups in kube and the cloud provider for cycling the
nodes.
properties:
skipMissingNamedNodes:
description: SkipMissingNamedNodes is a boolean which determines
whether named nodes selected in a CNR must exist and be valid
nodes before cycling can begin. If set to true named nodes which
don't exist will be ignored rather than transitioning the CNR
to the failed phase.
type: boolean
type: object
required:
- cycleSettings
- nodeGroupName
Expand Down
2 changes: 2 additions & 0 deletions docs/automation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ spec:
cycleSettings:
method: Drain
concurrency: 1
validationOptions:
skipMissingNamedNodes: true
healthChecks:
- endpoint: http://{{ .NodeIP }}:8080/ready
regexMatch: Ready
Expand Down
5 changes: 5 additions & 0 deletions docs/cycling/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ spec:
- "node-name-A"
- "node-name-B"

# Optional section - collection of validation options to define stricter or more lenient validation during cycling.
validationOptions:
# Optional field - Skip node names defined in the CNR that do not match any existing nodes in the Kubernetes API.
skipMissingNamedNodes: true|false

cycleNodeSettings:
# Method can be "Wait" or "Drain", defaults to "Drain" if not provided
# "Wait" will wait for pods on the node to complete, while "Drain" will forcefully drain them off the node
Expand Down
9 changes: 9 additions & 0 deletions pkg/apis/atlassian/v1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,12 @@ type TLSConfig struct {
// sent as part of the request to the upstream host for mTLS.
Key string `json:"key,omitempty"`
}

// ValidationOptions stores the settings to use for validating state of nodegroups
// in kube and the cloud provider for cycling the nodes.
type ValidationOptions struct {
// SkipMissingNodeNames is a boolean which determines whether named nodes selected in a CNR must
// exist and be valid nodes before cycling can begin. If set to true named nodes which don't exist
// will be ignored rather than transitioning the CNR to the failed phase.
SkipMissingNamedNodes bool `json:"skipMissingNamedNodes,omitempty"`
}
4 changes: 4 additions & 0 deletions pkg/apis/atlassian/v1/cyclenoderequest_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ type CycleNodeRequestSpec struct {
// CycleSettings stores the settings to use for cycling the nodes.
CycleSettings CycleSettings `json:"cycleSettings"`

// ValidationOptions stores the settings to use for validating state of nodegroups
// in kube and the cloud provider for cycling the nodes.
ValidationOptions ValidationOptions `json:"validationOptions,omitempty"`

// HealthChecks stores the settings to configure instance custom health checks
HealthChecks []HealthCheck `json:"healthChecks,omitempty"`

Expand Down
4 changes: 4 additions & 0 deletions pkg/apis/atlassian/v1/nodegroup_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ type NodeGroupSpec struct {
// CycleSettings stores the settings to use for cycling the nodes.
CycleSettings CycleSettings `json:"cycleSettings"`

// ValidationOptions stores the settings to use for validating state of nodegroups
// in kube and the cloud provider for cycling the nodes.
ValidationOptions ValidationOptions `json:"validationOptions,omitempty"`

// Healthchecks stores the settings to configure instance custom health checks
HealthChecks []HealthCheck `json:"healthChecks,omitempty"`

Expand Down
19 changes: 19 additions & 0 deletions pkg/apis/atlassian/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pkg/controller/cyclenoderequest/transitioner/checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ func (t *CycleNodeRequestTransitioner) performHealthCheck(node v1.CycleNodeReque

// performInitialHealthChecks on the nodes selected to be terminated before cycling begin. If any health
// check fails return an error to prevent cycling from starting
func (t *CycleNodeRequestTransitioner) performInitialHealthChecks(kubeNodes []corev1.Node) error {
func (t *CycleNodeRequestTransitioner) performInitialHealthChecks(kubeNodes map[string]corev1.Node) error {
// Build a set of ready nodes from which to check below
readyNodesSet := make(map[string]v1.CycleNodeRequestNode)

Expand Down Expand Up @@ -241,7 +241,7 @@ func (t *CycleNodeRequestTransitioner) performInitialHealthChecks(kubeNodes []co

// performCyclingHealthChecks before terminating an instance selected for termination. Cycling pauses
// until all health checks pass for the new instance before terminating the old one
func (t *CycleNodeRequestTransitioner) performCyclingHealthChecks(kubeNodes []corev1.Node) (bool, error) {
func (t *CycleNodeRequestTransitioner) performCyclingHealthChecks(kubeNodes map[string]corev1.Node) (bool, error) {
var allHealthChecksPassed bool = true

// Find new instances attached to the nodegroup and perform health checks on them
Expand Down
91 changes: 54 additions & 37 deletions pkg/controller/cyclenoderequest/transitioner/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@ import (

// listReadyNodes lists nodes that are "ready". By default lists nodes that have also not been touched by Cyclops.
// A label is used to determine whether nodes have been touched by this CycleNodeRequest.
func (t *CycleNodeRequestTransitioner) listReadyNodes(includeInProgress bool) (nodes []corev1.Node, err error) {
func (t *CycleNodeRequestTransitioner) listReadyNodes(includeInProgress bool) (map[string]corev1.Node, error) {
nodes := make(map[string]corev1.Node)

// Get the nodes
selector, err := t.cycleNodeRequest.NodeLabelSelector()
if err != nil {
return nodes, err
}

nodeList, err := t.rm.ListNodes(selector)
if err != nil {
return nodes, err
Expand All @@ -30,14 +33,16 @@ func (t *CycleNodeRequestTransitioner) listReadyNodes(includeInProgress bool) (n
continue
}
}

// Only add "Ready" nodes
for _, cond := range node.Status.Conditions {
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
nodes = append(nodes, node)
nodes[node.Spec.ProviderID] = node
break
}
}
}

return nodes, nil
}

Expand All @@ -56,29 +61,34 @@ func (t *CycleNodeRequestTransitioner) getNodesToTerminate(numNodes int64) (node
}

for _, kubeNode := range kubeNodes {
// Skip nodes that are already being worked on so we don't duplicate our work
if value, ok := kubeNode.Labels[cycleNodeLabel]; ok && value == t.cycleNodeRequest.Name {
numNodesInProgress++
}
}

for _, nodeToTerminate := range t.cycleNodeRequest.Status.NodesToTerminate {
kubeNode, found := kubeNodes[nodeToTerminate.ProviderID]

if !found {
continue
}

for _, nodeToTerminate := range t.cycleNodeRequest.Status.NodesToTerminate {
// Add nodes that need to be terminated but have not yet been actioned
if kubeNode.Name == nodeToTerminate.Name && kubeNode.Spec.ProviderID == nodeToTerminate.ProviderID {
nodes = append(nodes, &kubeNode)
// Skip nodes that are already being worked on so we don't duplicate our work
if value, ok := kubeNode.Labels[cycleNodeLabel]; ok && value == t.cycleNodeRequest.Name {
continue
}

for i := 0; i < len(t.cycleNodeRequest.Status.NodesAvailable); i++ {
if kubeNode.Name == t.cycleNodeRequest.Status.NodesAvailable[i].Name {
// Remove nodes from available if they are also scheduled for termination
// Slice syntax removes this node at `i` from the array
t.cycleNodeRequest.Status.NodesAvailable = append(
t.cycleNodeRequest.Status.NodesAvailable[:i],
t.cycleNodeRequest.Status.NodesAvailable[i+1:]...,
)
// Add nodes that need to be terminated but have not yet been actioned
nodes = append(nodes, &kubeNode)

break
}
}
for i := 0; i < len(t.cycleNodeRequest.Status.NodesAvailable); i++ {
if kubeNode.Name == t.cycleNodeRequest.Status.NodesAvailable[i].Name {
// Remove nodes from available if they are also scheduled for termination
// Slice syntax removes this node at `i` from the array
t.cycleNodeRequest.Status.NodesAvailable = append(
t.cycleNodeRequest.Status.NodesAvailable[:i],
t.cycleNodeRequest.Status.NodesAvailable[i+1:]...,
)

break
}
Expand All @@ -94,32 +104,39 @@ func (t *CycleNodeRequestTransitioner) getNodesToTerminate(numNodes int64) (node
}

// addNamedNodesToTerminate adds the named nodes for this CycleNodeRequest to the list of nodes to terminate.
// Returns an error if any named node does not exist in the node group for this CycleNodeRequest.
func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes []corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) error {
for _, namedNode := range t.cycleNodeRequest.Spec.NodeNames {
foundNode := false
for _, kubeNode := range kubeNodes {
if kubeNode.Name == namedNode {
foundNode = true
// Skips any named node that does not exist in the node group for this CycleNodeRequest.
func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes map[string]corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) error {
nodeLookupByName := make(map[string]corev1.Node)

t.cycleNodeRequest.Status.NodesAvailable = append(
t.cycleNodeRequest.Status.NodesAvailable,
newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()),
)
for _, node := range kubeNodes {
nodeLookupByName[node.Name] = node
}

t.cycleNodeRequest.Status.NodesToTerminate = append(
t.cycleNodeRequest.Status.NodesToTerminate,
newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()),
)
for _, namedNode := range t.cycleNodeRequest.Spec.NodeNames {
kubeNode, found := nodeLookupByName[namedNode]

break
if !found {
t.rm.Logger.Info("could not find node by name, skipping", "nodeName", namedNode)

if !t.cycleNodeRequest.Spec.ValidationOptions.SkipMissingNamedNodes {
return fmt.Errorf("could not find node by name: %v", namedNode)
}
}

if !foundNode {
return fmt.Errorf("could not find node by name: %v", namedNode)
t.rm.LogEvent(t.cycleNodeRequest, "SkipMissingNamedNode", "Named node %s not found", namedNode)
continue
}

t.cycleNodeRequest.Status.NodesAvailable = append(
t.cycleNodeRequest.Status.NodesAvailable,
newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()),
)

t.cycleNodeRequest.Status.NodesToTerminate = append(
t.cycleNodeRequest.Status.NodesToTerminate,
newCycleNodeRequestNode(&kubeNode, nodeGroupInstances[kubeNode.Spec.ProviderID].NodeGroupName()),
)
}

return nil
}

Expand Down
Loading
Loading