Add a max failed CNRs threshold to Nodegroups #88

Merged: 8 commits, Sep 2, 2024
5 changes: 5 additions & 0 deletions deploy/crds/atlassian.com_nodegroups_crd.yaml
@@ -149,6 +149,11 @@ spec:
- waitPeriod
type: object
type: array
maxFailedCycleNodeRequests:
description: MaxFailedCycleNodeRequests defines the maximum number
of allowed failed CNRs for a nodegroup before the observer stops
generating them.
type: integer
nodeGroupName:
description: NodeGroupName is the name of the node group in the cloud
provider that corresponds to this NodeGroup resource.
20 changes: 20 additions & 0 deletions pkg/apis/atlassian/v1/common.go
@@ -31,3 +31,23 @@ func buildNodeGroupNames(nodeGroupsList []string, nodeGroupName string) []string

return nodeGroups
}

// sameNodeGroups compares two lists of nodegroup names and checks that they are the
// same. Ordering does not affect equality.
func sameNodeGroups(groupA, groupB []string) bool {
if len(groupA) != len(groupB) {
return false
}

groupMap := make(map[string]struct{})
for _, group := range groupA {
groupMap[group] = struct{}{}
}

for _, group := range groupB {
if _, ok := groupMap[group]; !ok {
return false
}
}
return true
}
41 changes: 41 additions & 0 deletions pkg/apis/atlassian/v1/common_test.go
@@ -87,3 +87,44 @@ func TestBuildNodeGroupNames(t *testing.T) {
})
}
}

func Test_sameNodeGroups(t *testing.T) {
tests := []struct {
name string
groupA []string
groupB []string
expect bool
}{
{
"pass case with same order",
[]string{"ingress-us-west-2a", "ingress-us-west-2b", "ingress-us-west-2c"},
[]string{"ingress-us-west-2a", "ingress-us-west-2b", "ingress-us-west-2c"},
true,
},
{
"pass case with different order",
[]string{"ingress-us-west-2a", "ingress-us-west-2b", "ingress-us-west-2c"},
[]string{"ingress-us-west-2b", "ingress-us-west-2c", "ingress-us-west-2a"},
true,
},
{
"failure case with different length",
[]string{"ingress-us-west-2a", "ingress-us-west-2b", "ingress-us-west-2c"},
[]string{"ingress-us-west-2b", "ingress-us-west-2c"},
false,
},
{
"failure case with different items",
[]string{"ingress-us-west-2a", "ingress-us-west-2b", "ingress-us-west-2c"},
[]string{"ingress-us-west-2b", "ingress-us-west-2c", "system"},
false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := sameNodeGroups(tt.groupA, tt.groupB)
assert.Equal(t, tt.expect, got)
})
}
}
21 changes: 21 additions & 0 deletions pkg/apis/atlassian/v1/cyclenoderequest.go
@@ -15,3 +15,24 @@ func (in *CycleNodeRequest) NodeLabelSelector() (labels.Selector, error) {
func (in *CycleNodeRequest) GetNodeGroupNames() []string {
return buildNodeGroupNames(in.Spec.NodeGroupsList, in.Spec.NodeGroupName)
}

// IsFromNodeGroup returns whether the CycleNodeRequest is part of the
// provided NodeGroup by comparing the list of named cloud provider nodegroups
// defined in each one. Ordering does not affect equality.
func (in *CycleNodeRequest) IsFromNodeGroup(nodegroup NodeGroup) bool {
return sameNodeGroups(
buildNodeGroupNames(in.Spec.NodeGroupsList, in.Spec.NodeGroupName),
nodegroup.GetNodeGroupNames(),
)
}

// IsFromSameNodeGroup returns whether the CycleNodeRequest is part of the
// same nodegroup as the provided CycleNodeRequest by comparing the
// list of named cloud provider nodegroups defined in each one. Ordering does
// not affect equality.
func (in *CycleNodeRequest) IsFromSameNodeGroup(cnr CycleNodeRequest) bool {
return sameNodeGroups(
buildNodeGroupNames(in.Spec.NodeGroupsList, in.Spec.NodeGroupName),
cnr.GetNodeGroupNames(),
)
}
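
The observer-side change that consumes these helpers is not shown in the hunks on this page. As a rough illustration only, the new IsFromNodeGroup method and the MaxFailedCycleNodeRequests field added to NodeGroupSpec later in this diff could be combined along these lines; the package, function, and variable names below are hypothetical and not code from this PR:

    package observer

    import (
        v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1"
    )

    // shouldSkipNodeGroup is a hypothetical sketch: it counts failed CNRs that
    // belong to a nodegroup and reports whether the observer should stop
    // generating new ones for it.
    func shouldSkipNodeGroup(nodegroup v1.NodeGroup, cnrs []v1.CycleNodeRequest) bool {
        // Treat 0 as "no limit" here; the PR's actual semantics may differ.
        if nodegroup.Spec.MaxFailedCycleNodeRequests == 0 {
            return false
        }

        var failed uint

        for _, cnr := range cnrs {
            // Count only CNRs that failed and that target the same set of
            // cloud provider nodegroups as this NodeGroup resource.
            if cnr.Status.Phase == v1.CycleNodeRequestFailed && cnr.IsFromNodeGroup(nodegroup) {
                failed++
            }
        }

        return failed >= nodegroup.Spec.MaxFailedCycleNodeRequests
    }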
4 changes: 4 additions & 0 deletions pkg/apis/atlassian/v1/nodegroup_types.go
@@ -19,6 +19,10 @@ type NodeGroupSpec struct {
// CycleSettings stores the settings to use for cycling the nodes.
CycleSettings CycleSettings `json:"cycleSettings"`

// MaxFailedCycleNodeRequests defines the maximum number of allowed failed CNRs for a nodegroup before the observer
// stops generating them.
MaxFailedCycleNodeRequests uint `json:"maxFailedCycleNodeRequests,omitempty"`

// ValidationOptions stores the settings to use for validating state of nodegroups
// in kube and the cloud provider for cycling the nodes.
ValidationOptions ValidationOptions `json:"validationOptions,omitempty"`
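For reference, a NodeGroup carrying the new threshold might be constructed like this in Go client code or a test; the values are illustrative and this object does not appear in the PR:

    nodegroup := v1.NodeGroup{
        ObjectMeta: metav1.ObjectMeta{
            Name: "ng-1",
        },
        Spec: v1.NodeGroupSpec{
            NodeGroupName: "ng-1",
            CycleSettings: v1.CycleSettings{
                Concurrency: 1,
                Method:      v1.CycleNodeRequestMethodDrain,
            },
            // Stop the observer from generating new CNRs for this nodegroup
            // once three of them have failed.
            MaxFailedCycleNodeRequests: 3,
        },
    }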
26 changes: 24 additions & 2 deletions pkg/controller/cyclenoderequest/transitioner/test_helpers.go
@@ -6,6 +6,8 @@ import (
v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1"
"github.com/atlassian-labs/cyclops/pkg/controller"
"github.com/atlassian-labs/cyclops/pkg/mock"

"sigs.k8s.io/controller-runtime/pkg/client"
)

type Option func(t *Transitioner)
@@ -22,6 +24,18 @@ func WithKubeNodes(nodes []*mock.Node) Option {
}
}

func WithExtraKubeObject(extraKubeObject client.Object) Option {
return func(t *Transitioner) {
t.extrakubeObjects = append(t.extrakubeObjects, extraKubeObject)
}
}

func WithTransitionerOptions(options Options) Option {
return func(t *Transitioner) {
t.transitionerOptions = options
}
}
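
The tests shown later in this diff do not exercise the two new options; a hypothetical setup using them might look like the following (the NodeGroup object, node slices, and option values are illustrative, not taken from the PR):

    fakeTransitioner := NewFakeTransitioner(cnr,
        WithKubeNodes(kubeNodes),
        WithCloudProviderInstances(cloudProviderInstances),
        // Make an extra object, e.g. a NodeGroup, visible to the fake kube
        // client backing the transitioner.
        WithExtraKubeObject(&v1.NodeGroup{
            ObjectMeta: metav1.ObjectMeta{Name: "ng-1"},
        }),
        // Enable the CNR auto-deletion path in transitionSuccessful.
        WithTransitionerOptions(Options{
            DeleteCNR:       true,
            DeleteCNRExpiry: time.Hour,
        }),
    )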

// ************************************************************************** //

type Transitioner struct {
@@ -30,6 +44,10 @@ type Transitioner struct {

CloudProviderInstances []*mock.Node
KubeNodes []*mock.Node

extrakubeObjects []client.Object

Review comment from a collaborator on extrakubeObjects: nit: extraKubeObjects

transitionerOptions Options
}

func NewFakeTransitioner(cnr *v1.CycleNodeRequest, opts ...Option) *Transitioner {
@@ -38,13 +56,17 @@ func NewFakeTransitioner(cnr *v1.CycleNodeRequest, opts ...Option) *Transitioner
// override these as needed
CloudProviderInstances: make([]*mock.Node, 0),
KubeNodes: make([]*mock.Node, 0),
extrakubeObjects: []client.Object{cnr},
transitionerOptions: Options{},
}

for _, opt := range opts {
opt(t)
}

t.Client = mock.NewClient(t.KubeNodes, t.CloudProviderInstances, cnr)
t.Client = mock.NewClient(
t.KubeNodes, t.CloudProviderInstances, t.extrakubeObjects...,
)

rm := &controller.ResourceManager{
Client: t.K8sClient,
@@ -54,7 +76,7 @@ }
}

t.CycleNodeRequestTransitioner = NewCycleNodeRequestTransitioner(
cnr, rm, Options{},
cnr, rm, t.transitionerOptions,
)

return t
13 changes: 11 additions & 2 deletions pkg/controller/cyclenoderequest/transitioner/transitions.go
@@ -582,10 +582,18 @@ func (t *CycleNodeRequestTransitioner) transitionSuccessful() (reconcile.Result,
if err != nil {
return t.transitionToHealing(err)
}

if shouldRequeue {
return reconcile.Result{Requeue: true, RequeueAfter: transitionDuration}, nil
}

// Delete failed sibling CNRs regardless of whether the CNR for the
// transitioner should be deleted. If failed CNRs pile up, they will prevent
// the Cyclops observer from auto-generating new CNRs for a nodegroup.
if err := t.deleteFailedSiblingCNRs(); err != nil {
return t.transitionToHealing(err)
}

// If deleting CycleNodeRequests is not enabled, stop here
if !t.options.DeleteCNR {
return reconcile.Result{}, nil
@@ -595,10 +603,11 @@
// than the time configured to keep them for.
if t.cycleNodeRequest.CreationTimestamp.Add(t.options.DeleteCNRExpiry).Before(time.Now()) {
t.rm.Logger.Info("Deleting CycleNodeRequest")
err := t.rm.Client.Delete(context.TODO(), t.cycleNodeRequest)
if err != nil {

if err := t.rm.Client.Delete(context.TODO(), t.cycleNodeRequest); err != nil {
t.rm.Logger.Error(err, "Unable to delete expired CycleNodeRequest")
}

return reconcile.Result{}, nil
}

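The deleteFailedSiblingCNRs helper called above is not part of the hunks shown on this page. A minimal sketch of what such a helper could look like, assuming it lists CNRs in the request's namespace through the controller-runtime client and reuses the new IsFromSameNodeGroup method (an illustration only, not the PR's actual implementation):

    func (t *CycleNodeRequestTransitioner) deleteFailedSiblingCNRs() error {
        var cnrList v1.CycleNodeRequestList

        // List every CNR in the same namespace as the CNR being transitioned.
        if err := t.rm.Client.List(context.TODO(), &cnrList,
            client.InNamespace(t.cycleNodeRequest.Namespace)); err != nil {
            return err
        }

        for i := range cnrList.Items {
            cnr := cnrList.Items[i]

            // Skip the current CNR, CNRs that have not failed, and CNRs that
            // target a different set of cloud provider nodegroups.
            if cnr.Name == t.cycleNodeRequest.Name ||
                cnr.Status.Phase != v1.CycleNodeRequestFailed ||
                !t.cycleNodeRequest.IsFromSameNodeGroup(cnr) {
                continue
            }

            if err := t.rm.Client.Delete(context.TODO(), &cnr); err != nil {
                return err
            }
        }

        return nil
    }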
@@ -546,6 +546,74 @@ func TestPendingTimeoutReached(t *testing.T) {
assert.Equal(t, v1.CycleNodeRequestHealing, cnr.Status.Phase)
}

// Test to ensure that Cyclops will not proceed if there is a node detached from
// the nodegroup on the cloud provider. It should wait for the issue to resolve
// and transition to Initialised once it does, provided this happens before the
// timeout period is reached.
func TestPendingReattachedCloudProviderNode(t *testing.T) {
nodegroup, err := mock.NewNodegroup("ng-1", 2)
if err != nil {
assert.NoError(t, err)
}

// "detach" one instance from the asg
nodegroup[0].Nodegroup = ""

cnr := &v1.CycleNodeRequest{
ObjectMeta: metav1.ObjectMeta{
Name: "cnr-1",
Namespace: "kube-system",
},
Spec: v1.CycleNodeRequestSpec{
NodeGroupsList: []string{"ng-1"},
CycleSettings: v1.CycleSettings{
Concurrency: 1,
Method: v1.CycleNodeRequestMethodDrain,
},
Selector: metav1.LabelSelector{
MatchLabels: map[string]string{
"customer": "kitt",
},
},
},
Status: v1.CycleNodeRequestStatus{
Phase: v1.CycleNodeRequestPending,
},
}

fakeTransitioner := NewFakeTransitioner(cnr,
WithKubeNodes(nodegroup),
WithCloudProviderInstances(nodegroup),
)

// Should requeue while it tries to wait
_, err = fakeTransitioner.Run()
assert.NoError(t, err)
assert.Equal(t, v1.CycleNodeRequestPending, cnr.Status.Phase)

// Simulate waiting for 1s less than the wait limit
cnr.Status.EquilibriumWaitStarted = &metav1.Time{
Time: time.Now().Add(-nodeEquilibriumWaitLimit + time.Second),
}

_, err = fakeTransitioner.Autoscaling.AttachInstances(&autoscaling.AttachInstancesInput{
AutoScalingGroupName: aws.String("ng-1"),
InstanceIds: aws.StringSlice([]string{nodegroup[0].InstanceID}),
})

assert.NoError(t, err)

// "re-attach" the instance to the asg
fakeTransitioner.CloudProviderInstances[0].Nodegroup = "ng-1"

// The CNR should transition to the Initialised phase because the state of
// the nodes is now correct and this happened within the timeout period.
_, err = fakeTransitioner.Run()
assert.NoError(t, err)
assert.Equal(t, v1.CycleNodeRequestInitialised, cnr.Status.Phase)
assert.Len(t, cnr.Status.NodesToTerminate, 2)
}

// Test to ensure that Cyclops will not proceed if there is a node detached from
// the nodegroup on the cloud provider. It should wait and especially should not
// succeed if the instance is re-attached by the final requeuing of the Pending
@@ -596,6 +664,16 @@ func TestPendingReattachedCloudProviderNodeTooLate(t *testing.T) {
Time: time.Now().Add(-nodeEquilibriumWaitLimit - time.Second),
}

_, err = fakeTransitioner.Autoscaling.AttachInstances(&autoscaling.AttachInstancesInput{
AutoScalingGroupName: aws.String("ng-1"),
InstanceIds: aws.StringSlice([]string{nodegroup[0].InstanceID}),
})

assert.NoError(t, err)

// "re-attach" the instance to the asg
fakeTransitioner.CloudProviderInstances[0].Nodegroup = "ng-1"

// This time should transition to the healing phase even though the state
// is correct because the timeout check happens first
_, err = fakeTransitioner.Run()