Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions pkg/etcdcli/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,11 @@ func HasStarted(member *etcdserverpb.Member) bool {
// loss of a single etcd member. Such loss is common during new static pod revision.
func IsQuorumFaultTolerant(memberHealth []healthCheck) bool {
totalMembers := len(memberHealth)
quorum := totalMembers/2 + 1
quorum, err := MinimumTolerableQuorum(totalMembers)
if err != nil {
klog.Errorf("etcd cluster could not determine minimum quorum required. total number of members is %v. minimum quorum required is %v: %w", totalMembers, quorum, err)
return false
}
healthyMembers := len(GetHealthyMemberNames(memberHealth))
switch {
case totalMembers-quorum < 1:
Expand All @@ -256,7 +260,10 @@ func IsQuorumFaultTolerant(memberHealth []healthCheck) bool {
// IsQuorumFaultTolerantErr is the same as IsQuorumFaultTolerant but with an error return instead of the log
func IsQuorumFaultTolerantErr(memberHealth []healthCheck) error {
totalMembers := len(memberHealth)
quorum := totalMembers/2 + 1
quorum, err := MinimumTolerableQuorum(totalMembers)
if err != nil {
return fmt.Errorf("etcd cluster could not determine minimum quorum required. total number of members is %v. minimum quorum required is %v: %w", totalMembers, quorum, err)
}
healthyMembers := len(GetHealthyMemberNames(memberHealth))
switch {
case totalMembers-quorum < 1:
Expand Down Expand Up @@ -320,3 +327,10 @@ func (c *raftTermsCollector) Collect(ch chan<- prometheus.Metric) {
)
}
}

// MinimumTolerableQuorum returns the smallest number of voting members that
// must stay healthy for an etcd cluster of the given size to retain quorum,
// i.e. a strict majority: floor(members/2) + 1.
// It returns an error when members is zero or negative, since a quorum is
// undefined for an empty cluster.
func MinimumTolerableQuorum(members int) (int, error) {
	if members > 0 {
		return members/2 + 1, nil
	}
	return 0, fmt.Errorf("invalid etcd member length: %v", members)
}
46 changes: 46 additions & 0 deletions pkg/etcdcli/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package etcdcli

import (
"fmt"
"github.com/stretchr/testify/require"
"reflect"
"testing"

Expand Down Expand Up @@ -260,6 +261,51 @@ func healthyMember(member int) healthCheck {
}
}

// TestMinimumTolerableQuorum verifies the majority calculation for valid
// member counts and the error path for zero and negative counts.
func TestMinimumTolerableQuorum(t *testing.T) {
	testCases := []struct {
		name    string
		members int
		wantErr error
		want    int
	}{
		{name: "valid input `3`", members: 3, wantErr: nil, want: 2},
		{name: "valid input `5`", members: 5, wantErr: nil, want: 3},
		{name: "invalid input `0`", members: 0, wantErr: fmt.Errorf("invalid etcd member length: %v", 0), want: 0},
		{name: "invalid input `-10`", members: -10, wantErr: fmt.Errorf("invalid etcd member length: %v", -10), want: 0},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := MinimumTolerableQuorum(tc.members)
			require.Equal(t, tc.wantErr, err)
			require.Equal(t, tc.want, got)
		})
	}
}

func unHealthyMember(member int) healthCheck {
return healthCheck{
Member: &etcdserverpb.Member{
Expand Down
25 changes: 16 additions & 9 deletions pkg/operator/ceohelpers/common.go
Original file line number Diff line number Diff line change
@@ -1,23 +1,24 @@
package ceohelpers

import (
"context"
"encoding/json"
"fmt"
"github.com/openshift/cluster-etcd-operator/pkg/dnshelpers"
"github.com/openshift/cluster-etcd-operator/pkg/etcdcli"
"net"
"net/url"

"go.etcd.io/etcd/api/v3/etcdserverpb"

machinev1beta1 "github.com/openshift/api/machine/v1beta1"
machinelistersv1beta1 "github.com/openshift/client-go/machine/listers/machine/v1beta1"
"github.com/openshift/library-go/pkg/operator/resource/resourcemerge"
"github.com/openshift/library-go/pkg/operator/v1helpers"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
corev1listers "k8s.io/client-go/listers/core/v1"

machinelistersv1beta1 "github.com/openshift/client-go/machine/listers/machine/v1beta1"
"github.com/openshift/library-go/pkg/operator/resource/resourcemerge"
"github.com/openshift/library-go/pkg/operator/v1helpers"

"github.com/openshift/cluster-etcd-operator/pkg/operator/configobservation/controlplanereplicascount"
)
Expand Down Expand Up @@ -178,14 +179,20 @@ func memberToURL(member *etcdserverpb.Member) (string, error) {
return member.PeerURLs[0], nil
}

func VotingMemberIPListSet(configMapLister corev1listers.ConfigMapNamespaceLister) (sets.String, error) {
etcdEndpointsConfigMap, err := configMapLister.Get("etcd-endpoints")
func VotingMemberIPListSet(ctx context.Context, cli etcdcli.EtcdClient) (sets.String, error) {
members, err := cli.VotingMemberList(ctx)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tjungblu is this correct ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch 👍🏽

if err != nil {
return sets.NewString(), err // should not happen
}
currentVotingMemberIPListSet := sets.NewString()
for _, votingMemberIP := range etcdEndpointsConfigMap.Data {
currentVotingMemberIPListSet.Insert(votingMemberIP)

for _, member := range members {
// Use of PeerURL is expected here because it is a mandatory field, and it will mirror ClientURL.
ip, err := dnshelpers.GetIPFromAddress(member.PeerURLs[0])
if err != nil {
return sets.NewString(), err
}
currentVotingMemberIPListSet.Insert(ip)
}

return currentVotingMemberIPListSet, nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ func (c *ClusterMemberController) isEtcdContainerRunningNotReady(node *corev1.No
// The voting members are read from the etcd-endpoints configmap
func (c *ClusterMemberController) allNodesMapToVotingMembers(nodes []*corev1.Node) ([]*corev1.Node, error) {
var nonVotingMemberNodes []*corev1.Node
currentVotingMemberIPListSet, err := ceohelpers.VotingMemberIPListSet(c.configMapLister)
currentVotingMemberIPListSet, err := ceohelpers.VotingMemberIPListSet(context.Background(), c.etcdClient)
if err != nil {
return nonVotingMemberNodes, fmt.Errorf("failed to get the set of voting members: %v", err)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ func (c *clusterMemberRemovalController) sync(ctx context.Context, syncCtx facto
// attemptToScaleDown attempts to remove a voting member only once we have identified that
// a Machine resource is being deleted and a replacement member has been created
func (c *clusterMemberRemovalController) attemptToScaleDown(ctx context.Context, recorder events.Recorder) error {
currentVotingMemberIPListSet, err := ceohelpers.VotingMemberIPListSet(c.configMapListerForTargetNamespace)
currentVotingMemberIPListSet, err := ceohelpers.VotingMemberIPListSet(ctx, c.etcdClient)
if err != nil {
return err
}
Expand All @@ -125,15 +125,21 @@ func (c *clusterMemberRemovalController) attemptToScaleDown(ctx context.Context,
if desiredControlPlaneReplicasCount == 0 {
return fmt.Errorf("desired control plane replicas count cannot be empty")
}
if currentVotingMemberIPListSet.Len() <= desiredControlPlaneReplicasCount {
klog.V(4).Infof("Ignoring scale-down since the number of etcd voting members (%d) < desired number of control-plane replicas (%d) ", currentVotingMemberIPListSet.Len(), desiredControlPlaneReplicasCount)

liveVotingMembers, err := c.getAllVotingMembers(ctx)
if err != nil {
return fmt.Errorf("could not list etcd members: %w", err)
}

if len(liveVotingMembers) < desiredControlPlaneReplicasCount {
klog.V(2).Infof("Ignoring scale-down since the number of etcd members (%d) < desired number of control-plane replicas (%d) ", len(liveVotingMembers), desiredControlPlaneReplicasCount)
return nil
}

// machines with master role and deletion hook
memberMachines, err := ceohelpers.CurrentMemberMachinesWithDeletionHooks(c.masterMachineSelector, c.masterMachineLister)
if err != nil {
return err
return fmt.Errorf("could not find master machines with deletion hook: %w", err)
}

var votingMembersMachines []*machinev1beta1.Machine
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just to make the below logic consistent, should the currentVotingMemberIPListSet sourced from liveVotingMembers too? Otherwise you have one set generated by configmap, the other from the etcd memberlist.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually wanted to do this, but did not want to change the logic completely.

I had to use liveVotingMembers for deciding whether attempt scaling down, or skip. During testing, I noticed that the configmap takes time to reflect the cluster membership, and the CEO keep spinning waiting for a new CM to be created.

+1 to replace currentVotingMemberIPListSet with liveVotingMembers, wdyt @hasbro17 ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had the opposite thought actually. For the default case when we're checking if scale-down is allowed i.e votingMembers >= desiredControlPlaneReplicasCount we don't want to make live calls to the etcd cluster.

The reason why we use currentVotingMemberIPListSet (sourced from the etcd-endpoints configmap) is so we're not inducing unnecessary load in the default case by using a cache instead of live calls to cluster.

During testing, I noticed that the configmap takes time to reflect the cluster membership, and the CEO keep spinning waiting for a new CM to be created.

Not sure what CM here is (assuming machine) but other than the delay @Elbehery Is there a reason why you can't use the currentVotingMemberIPListSet in L128-135 above? How much is the delay in practice? I would expect the etcd-endpoints controller to update within the minute once there is an actual membership change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @hasbro17 for your review

votingMembers >= desiredControlPlaneReplicasCount we don't want to make live calls to the etcd cluster.

the etcd-endpoints CM contains all the voting members (i.e. non-learners), whether healthy or unhealthy. In #937 (comment) we agreed to constrain the scaling-down invariant around healthyVotingMembers iiuc

we're not inducing unnecessary load in the default case by using a cache instead of live calls to cluster.

I think listing the cluster members is not a load, moreover the recent changes to etcdcli package reuses the connections :)

I would expect the etcd-endpoints controller to update within the minute once there is an actual membership change.

Yes this is correct, it takes around a minute to update the CM .. I can change it back, but may I ask why listing the healthy members directly from the etcd cluster is not correct?

Copy link
Contributor

@tjungblu tjungblu Oct 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason why we use currentVotingMemberIPListSet (sourced from the etcd-endpoints configmap) is so we're not inducing unnecessary load in the default case by using a cache instead of live calls to cluster.

That makes sense, I think we make lots of MemberList calls across the operator already (health probe, etc). The connection to etcd is already open as Mustafa said, so getting the current list of members is not an expensive call.

I'm all up for consolidating this listing into a single controller that queries this in the background every 5-10s and all other controllers just feed from that.

How much is the delay in practice?

I don't have a direct answer for that, but let's go with the case I've added to the quorum guard here:
https://github.com/openshift/cluster-etcd-operator/blob/master/pkg/operator/etcdendpointscontroller/etcdendpointscontroller.go#L145

So when you're down to two nodes, we'll never update the CM until you add a new node. I realize now that this is stupid, but we have no other way to selectively tell the static pod controller not to rollout a revision based on that.
In that worst case, it takes how long it takes the cluster admin to add a new node.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm all up for consolidating this listing into a single controller that queries this in the background every 5-10s and all other controllers just feed from that.

+1

we'll never update the CM until you add a new node.

In this case I think listing the members from etcd client makes more sense, or ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I forgot that we'll have client connections open from the client cache so I guess having this controller make periodic live list calls might not add a significant load. So I guess we can keep the live call here.

So when you're down to two nodes, we'll never update the CM until you add a new node

Yeah that's not ideal especially since the apiserver is using the endpoints configmap and will end up talking to a non-existent endpoint if we have 2 members and the CM is outdated with 3. But it's a necessary trade-off that we've made given that we don't want quorum loss mid rollout.
Plus with CPMS we should soon have a new machine whose member would be added and promoted right after the unhealthy one is removed, so this gap shouldn't be too large.

Let's keep the live list for now, and leave off the consolidation of list calls to a better internal cache than the etcd-endpoints CM as a follow up.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we agree on sticking with the member listing, then I would definitely like to see currentVotingMemberIPListSet to come from the member listing. Otherwise this is going to be wildly inconsistent.

Expand All @@ -151,14 +157,19 @@ func (c *clusterMemberRemovalController) attemptToScaleDown(ctx context.Context,
// do not trust data in the cache, compare with the current state
healthyLiveVotingMembers, err := c.getHealthyVotingMembers(ctx)
if err != nil {
return err
return fmt.Errorf("could not list etcd healthy members: %w", err)
}

// scaling down invariant
if len(healthyLiveVotingMembers) < desiredControlPlaneReplicasCount {
klog.V(2).Infof("Ignoring scale down since the number of healthy live etcd voting members (%d) < desired number of control-plane replicas (%d) ", len(healthyLiveVotingMembers), desiredControlPlaneReplicasCount)
minTolerableQuorum, err := etcdcli.MinimumTolerableQuorum(desiredControlPlaneReplicasCount)
if err != nil {
klog.V(2).Infof("etcd cluster could not determine minimum quorum required. desiredControlPlaneReplicasCount is %v. minimum quorum required is %v: %w", desiredControlPlaneReplicasCount, minTolerableQuorum, err)
}

if len(healthyLiveVotingMembers) < minTolerableQuorum {
klog.V(2).Infof("ignoring scale down since the number of healthy live etcd members (%d) < minimum required to maintain quorum (%d) ", len(healthyLiveVotingMembers), minTolerableQuorum)
if time.Now().After(c.lastTimeScaleDownEventWasSent.Add(5 * time.Minute)) {
recorder.Eventf("ScaleDown", "Ignoring scale down since the number of healthy live etcd voting members (%d) < desired number of control-plane replicas (%d) ", len(healthyLiveVotingMembers), desiredControlPlaneReplicasCount)
recorder.Eventf("ScaleDown", "Ignoring scale down since the number of healthy live etcd members (%d) < minimum required to maintain quorum (%d) ", len(healthyLiveVotingMembers), minTolerableQuorum)
c.lastTimeScaleDownEventWasSent = time.Now()
}
return nil
Expand Down Expand Up @@ -192,25 +203,19 @@ func (c *clusterMemberRemovalController) attemptToScaleDown(ctx context.Context,
unhealthyMembersURLs = append(unhealthyMembersURLs, unhealthyMember.Name)
}
}
if len(unhealthyVotingMemberMachinesPendingDeletion) > 0 {
klog.V(4).Infof("found unhealthy voting members with machine pending deletion: %v", unhealthyVotingMemberMachinesPendingDeletion)
klog.V(4).Infof("unhealthy members found: %v", unhealthyMembersURLs)
} else {
return fmt.Errorf("cannot proceed with scaling down, unhealthy voting members found: %v, none are pending deletion", unhealthyMembersURLs)
if len(unhealthyVotingMemberMachinesPendingDeletion) == 0 {
klog.V(2).Infof("cannot proceed with scaling down, unhealthy voting etcd members found: %v but none are pending deletion", unhealthyMembersURLs)
return fmt.Errorf("cannot proceed with scaling down, unhealthy voting etcd members found: %v but none are pending deletion", unhealthyMembersURLs)
}
}

// remove the unhealthy machine pending deletion first
// if no unhealthy machine pending deletion found, then attempt to scale down the healthy machines pending deletion
if len(unhealthyVotingMemberMachinesPendingDeletion) > 0 {
klog.V(2).Infof("found unhealthy voting etcd members with machine pending deletion: %v", unhealthyVotingMemberMachinesPendingDeletion)
votingMembersMachinesPendingDeletion = append(unhealthyVotingMemberMachinesPendingDeletion, votingMembersMachinesPendingDeletion...)
}

liveVotingMembers, err := c.getAllVotingMembers(ctx)
if err != nil {
return err
}

var allErrs []error
for _, votingMemberMachinePendingDeletion := range votingMembersMachinesPendingDeletion {
removed, errs := c.attemptToRemoveMemberFor(ctx, liveVotingMembers, votingMemberMachinePendingDeletion, recorder)
Expand Down Expand Up @@ -290,13 +295,13 @@ func (c *clusterMemberRemovalController) removeMemberWithoutMachine(ctx context.

// attemptToRemoveLearningMember attempts to remove a learning member pending deletion regardless of whether a replacement member has been found
func (c *clusterMemberRemovalController) attemptToRemoveLearningMember(ctx context.Context, recorder events.Recorder) error {
currentVotingMemberIPListSet, err := ceohelpers.VotingMemberIPListSet(c.configMapListerForTargetNamespace)
currentVotingMemberIPListSet, err := ceohelpers.VotingMemberIPListSet(ctx, c.etcdClient)
if err != nil {
return err
}
memberMachines, err := ceohelpers.CurrentMemberMachinesWithDeletionHooks(c.masterMachineSelector, c.masterMachineLister)
if err != nil {
return err
return fmt.Errorf("could not find master machines with deletion hook: %w", err)
}
var learningMachines []*machinev1beta1.Machine
for memberMachineIP, memberMachine := range ceohelpers.IndexMachinesByNodeInternalIP(memberMachines) {
Expand Down
Loading