Add a max failed CNRs threshold to Nodegroups #88

Merged · 8 commits · Sep 2, 2024
5 changes: 5 additions & 0 deletions deploy/crds/atlassian.com_nodegroups_crd.yaml
@@ -149,6 +149,11 @@ spec:
               - waitPeriod
               type: object
             type: array
+          maxFailedCycleNodeRequests:
+            description: MaxFailedCycleNodeRequests defines the maximum number
+              of allowed failed CNRs for a nodegroup before the observer stops
+              generating them.
+            type: integer
           nodeGroupName:
            description: NodeGroupName is the name of the node group in the cloud
              provider that corresponds to this NodeGroup resource.
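For illustration, a nodegroup opting into the new threshold just sets the field alongside its existing spec fields. A minimal sketch using the Go types from pkg/apis (shown in the next file); the wrapper NodeGroup type and the value 3 are placeholders, not part of this diff:

	// A nodegroup that tolerates at most 3 failed CNRs before the
	// observer stops generating new ones for it.
	ng := v1.NodeGroup{
		Spec: v1.NodeGroupSpec{
			NodeGroupName:              "ng-1",
			MaxFailedCycleNodeRequests: 3,
		},
	}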
4 changes: 4 additions & 0 deletions pkg/apis/atlassian/v1/nodegroup_types.go
@@ -19,6 +19,10 @@ type NodeGroupSpec struct {
 	// CycleSettings stores the settings to use for cycling the nodes.
 	CycleSettings CycleSettings `json:"cycleSettings"`
 
+	// MaxFailedCycleNodeRequests defines the maximum number of allowed failed CNRs for a nodegroup before the observer
+	// stops generating them.
+	MaxFailedCycleNodeRequests uint `json:"maxFailedCycleNodeRequests,omitempty"`
+
 	// ValidationOptions stores the settings to use for validating state of nodegroups
 	// in kube and the cloud provider for cycling the nodes.
 	ValidationOptions ValidationOptions `json:"validationOptions,omitempty"`
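The observer-side check that consumes this field is not part of the hunks shown in this PR view. A rough sketch of the semantics the doc comment describes — the helper name and the zero-means-no-limit reading of omitempty are assumptions:

	// shouldGenerateCNR reports whether the observer may generate another
	// CNR for the nodegroup, given how many failed CNRs it currently has.
	func shouldGenerateCNR(nodeGroup v1.NodeGroup, failedCNRs int) bool {
		// Assumed: the zero value (field omitted) applies no limit.
		if nodeGroup.Spec.MaxFailedCycleNodeRequests == 0 {
			return true
		}
		return uint(failedCNRs) < nodeGroup.Spec.MaxFailedCycleNodeRequests
	}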
38 changes: 30 additions & 8 deletions pkg/controller/cyclenoderequest/transitioner/test_helpers.go
@@ -6,19 +6,33 @@ import (
 	v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1"
 	"github.com/atlassian-labs/cyclops/pkg/controller"
 	"github.com/atlassian-labs/cyclops/pkg/mock"
+
+	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
 type Option func(t *Transitioner)
 
 func WithCloudProviderInstances(nodes []*mock.Node) Option {
 	return func(t *Transitioner) {
-		t.cloudProviderInstances = append(t.cloudProviderInstances, nodes...)
+		t.CloudProviderInstances = append(t.CloudProviderInstances, nodes...)
 	}
 }
 
 func WithKubeNodes(nodes []*mock.Node) Option {
 	return func(t *Transitioner) {
-		t.kubeNodes = append(t.kubeNodes, nodes...)
+		t.KubeNodes = append(t.KubeNodes, nodes...)
 	}
 }
+
+func WithExtraKubeObject(extraKubeObject client.Object) Option {
+	return func(t *Transitioner) {
+		t.extrakubeObjects = append(t.extrakubeObjects, extraKubeObject)
+	}
+}
+
+func WithTransitionerOptions(options Options) Option {
+	return func(t *Transitioner) {
+		t.transitionerOptions = options
+	}
+}

@@ -28,23 +42,31 @@ type Transitioner struct {
 	*CycleNodeRequestTransitioner
 	*mock.Client
 
-	cloudProviderInstances []*mock.Node
-	kubeNodes              []*mock.Node
+	CloudProviderInstances []*mock.Node
+	KubeNodes              []*mock.Node
+
+	extrakubeObjects []client.Object
[Review comment · Collaborator] nit: extraKubeObjects
+
+	transitionerOptions Options
 }
 
 func NewFakeTransitioner(cnr *v1.CycleNodeRequest, opts ...Option) *Transitioner {
 	t := &Transitioner{
 		// By default there are no nodes and each test will
 		// override these as needed
-		cloudProviderInstances: make([]*mock.Node, 0),
-		kubeNodes:              make([]*mock.Node, 0),
+		CloudProviderInstances: make([]*mock.Node, 0),
+		KubeNodes:              make([]*mock.Node, 0),
+		extrakubeObjects:       []client.Object{cnr},
+		transitionerOptions:    Options{},
 	}
 
 	for _, opt := range opts {
 		opt(t)
 	}
 
-	t.Client = mock.NewClient(t.kubeNodes, t.cloudProviderInstances, cnr)
+	t.Client = mock.NewClient(
+		t.KubeNodes, t.CloudProviderInstances, t.extrakubeObjects...,
+	)
 
 	rm := &controller.ResourceManager{
 		Client: t.K8sClient,
@@ -54,7 +76,7 @@ func NewFakeTransitioner(cnr *v1.CycleNodeRequest, opts ...Option) *Transitioner
 	}
 
 	t.CycleNodeRequestTransitioner = NewCycleNodeRequestTransitioner(
-		cnr, rm, Options{},
+		cnr, rm, t.transitionerOptions,
 	)
 
 	return t
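A hypothetical test using the two new options could wire the helper up as below. The nodegroup object and the Options values are placeholders, while DeleteCNR and DeleteCNRExpiry come from the transitioner options exercised in transitions.go (next file); only the Nodegroup field of mock.Node is taken from this diff:

	cnr := &v1.CycleNodeRequest{}
	nodegroup := &v1.NodeGroup{}

	fakeTransitioner := NewFakeTransitioner(cnr,
		WithKubeNodes([]*mock.Node{{Nodegroup: "ng-1"}}),
		WithCloudProviderInstances([]*mock.Node{{Nodegroup: "ng-1"}}),
		// Seed the mock client with an extra object alongside the CNR.
		WithExtraKubeObject(nodegroup),
		// Plumb non-default options through instead of the old hardcoded Options{}.
		WithTransitionerOptions(Options{
			DeleteCNR:       true,
			DeleteCNRExpiry: time.Hour,
		}),
	)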
13 changes: 11 additions & 2 deletions pkg/controller/cyclenoderequest/transitioner/transitions.go
@@ -594,10 +594,18 @@ func (t *CycleNodeRequestTransitioner) transitionSuccessful() (reconcile.Result,
 	if err != nil {
 		return t.transitionToHealing(err)
 	}
+
 	if shouldRequeue {
 		return reconcile.Result{Requeue: true, RequeueAfter: transitionDuration}, nil
 	}
+
+	// Delete failed sibling CNRs regardless of whether the CNR for the
+	// transitioner should be deleted. If failed CNRs pile up, that will prevent
+	// the Cyclops observer from auto-generating new CNRs for the nodegroup.
+	if err := t.deleteFailedSiblingCNRs(); err != nil {
+		return t.transitionToHealing(err)
+	}
 
 	// If deleting CycleNodeRequests is not enabled, stop here
 	if !t.options.DeleteCNR {
 		return reconcile.Result{}, nil
@@ -607,10 +615,11 @@
 	// than the time configured to keep them for.
 	if t.cycleNodeRequest.CreationTimestamp.Add(t.options.DeleteCNRExpiry).Before(time.Now()) {
 		t.rm.Logger.Info("Deleting CycleNodeRequest")
-		err := t.rm.Client.Delete(context.TODO(), t.cycleNodeRequest)
-		if err != nil {
+
+		if err := t.rm.Client.Delete(context.TODO(), t.cycleNodeRequest); err != nil {
 			t.rm.Logger.Error(err, "Unable to delete expired CycleNodeRequest")
 		}
+
 		return reconcile.Result{}, nil
 	}
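The body of deleteFailedSiblingCNRs is not included in the hunks shown here. A minimal sketch of what such a cleanup could look like, assuming the usual generated CycleNodeRequestList type, a Failed phase constant, and a NodeGroupName field on the CNR spec (all assumptions, not code from this PR):

	func (t *CycleNodeRequestTransitioner) deleteFailedSiblingCNRs() error {
		var list v1.CycleNodeRequestList
		if err := t.rm.Client.List(context.TODO(), &list,
			client.InNamespace(t.cycleNodeRequest.Namespace)); err != nil {
			return err
		}

		for i := range list.Items {
			cnr := &list.Items[i]

			// Only delete failed CNRs for the same nodegroup, and never this CNR.
			if cnr.Name == t.cycleNodeRequest.Name ||
				cnr.Spec.NodeGroupName != t.cycleNodeRequest.Spec.NodeGroupName ||
				cnr.Status.Phase != v1.CycleNodeRequestFailed {
				continue
			}

			if err := t.rm.Client.Delete(context.TODO(), cnr); err != nil {
				return err
			}
		}

		return nil
	}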
@@ -406,7 +406,7 @@ func TestPendingReattachedCloudProviderNode(t *testing.T) {
 	assert.NoError(t, err)
 
 	// "re-attach" the instance to the asg
-	fakeTransitioner.cloudProviderInstances[0].Nodegroup = "ng-1"
+	fakeTransitioner.CloudProviderInstances[0].Nodegroup = "ng-1"
 
 	// The CNR should transition to the Initialised phase because the state of
 	// the nodes is now correct and this happened within the timeout period.
@@ -474,7 +474,7 @@ func TestPendingReattachedCloudProviderNodeTooLate(t *testing.T) {
 	assert.NoError(t, err)
 
 	// "re-attach" the instance to the asg
-	fakeTransitioner.cloudProviderInstances[0].Nodegroup = "ng-1"
+	fakeTransitioner.CloudProviderInstances[0].Nodegroup = "ng-1"
 
 	// This time should transition to the healing phase even though the state
 	// is correct because the timeout check happens first