From 40e69af2242fcd91f4a351da02de1b94158d419c Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Tue, 28 Feb 2023 22:58:57 +0400 Subject: [PATCH] fix: improve etcd leave on reset process When removing a member from `etcd`, the server does a pre-check to make sure the member is connected to a quorum of other members, and the remove request might fail. Add a retry to wait for the etcd to be fully connected before giving up, as some parts of the reset flow have already run. Also fix an issue which appears in the integration test, when `reset` is called early in the boot sequence when local etcd hasn't started fully yet. Signed-off-by: Andrey Smirnov --- internal/pkg/etcd/etcd.go | 15 ++++++++++++++- internal/pkg/etcd/local.go | 7 ++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/internal/pkg/etcd/etcd.go b/internal/pkg/etcd/etcd.go index ed69ba9a0f..ca4d2e191a 100644 --- a/internal/pkg/etcd/etcd.go +++ b/internal/pkg/etcd/etcd.go @@ -14,6 +14,7 @@ import ( "time" "github.com/cosi-project/runtime/pkg/state" + "github.com/siderolabs/go-retry/retry" "go.etcd.io/etcd/api/v3/etcdserverpb" "go.etcd.io/etcd/api/v3/v3rpc/rpctypes" "go.etcd.io/etcd/client/pkg/v3/transport" @@ -155,7 +156,19 @@ func (c *Client) LeaveCluster(ctx context.Context, st state.State) error { return err } - if err := c.RemoveMemberByMemberID(ctx, memberID); err != nil { + if err := retry.Constant(5*time.Minute, retry.WithUnits(10*time.Second)).RetryWithContext(ctx, func(ctx context.Context) error { + err := c.RemoveMemberByMemberID(ctx, memberID) + if err == nil { + return nil + } + + if errors.Is(err, rpctypes.ErrUnhealthy) { + // unhealthy is returned when the member hasn't established connections with quorum other members + return retry.ExpectedError(err) + } + + return err + }); err != nil { return err } diff --git a/internal/pkg/etcd/local.go b/internal/pkg/etcd/local.go index a88d8a8d32..ee61629545 100644 --- a/internal/pkg/etcd/local.go +++ b/internal/pkg/etcd/local.go @@ -7,6 
+7,7 @@ package etcd import ( "context" "fmt" + "time" "github.com/cosi-project/runtime/pkg/safe" "github.com/cosi-project/runtime/pkg/state" @@ -16,10 +17,14 @@ import ( // GetLocalMemberID gets the etcd member id of the local node via resources. func GetLocalMemberID(ctx context.Context, s state.State) (uint64, error) { - member, err := safe.ReaderGet[*etcd.Member]( + ctx, cancel := context.WithTimeout(ctx, 3*time.Minute) + defer cancel() + + member, err := safe.StateWatchFor[*etcd.Member]( ctx, s, etcd.NewMember(etcd.NamespaceName, etcd.LocalMemberID).Metadata(), + state.WithEventTypes(state.Created), ) if err != nil { return 0, fmt.Errorf("failed to get local etcd member ID: %w", err)