From d2a2a6b41c90c99b17765703c8e2e2e2bcbbfb87 Mon Sep 17 00:00:00 2001 From: Kimmo Lehto Date: Wed, 15 May 2024 13:08:30 +0300 Subject: [PATCH] Check new controllers against etcd member-list to detect replaced hosts Signed-off-by: Kimmo Lehto --- phase/gather_k0s_facts.go | 46 +++++++++++++++++++ phase/reset_controllers.go | 6 +-- phase/validate_facts.go | 38 +++++++++++++++ .../k0sctl.k0sproject.io/v1beta1/cluster.go | 5 +- 4 files changed, 88 insertions(+), 7 deletions(-) diff --git a/phase/gather_k0s_facts.go b/phase/gather_k0s_facts.go index 92ace13d..af2c3fdf 100644 --- a/phase/gather_k0s_facts.go +++ b/phase/gather_k0s_facts.go @@ -4,6 +4,8 @@ import ( "context" "encoding/json" "fmt" + "net" + "net/url" "path" "strings" @@ -83,6 +85,50 @@ func (p *GatherK0sFacts) Run() error { return err } + if p.leader.Metadata.K0sRunningVersion != nil { + if err := p.listEtcdMembers(p.leader); err != nil { + return err + } + } + + return nil +} + +func (p *GatherK0sFacts) listEtcdMembers(h *cluster.Host) error { + log.Infof("%s: listing etcd members", h) + // The etcd member-list subcommand prints JSON shaped like: + // {"members":{"controller0":"https://172.17.0.2:2380","controller1":"https://172.17.0.3:2380"}} + // Versions around 1.21.x print it to stderr, so the command below folds stderr into stdout (2>&1). 
+ output, err := h.ExecOutput(h.Configurer.K0sCmdf("etcd member-list --data-dir=%s 2>&1", h.K0sDataDir()), exec.Sudo(h)) + if err != nil { + return fmt.Errorf("failed to run list etcd members command: %w", err) + } + + result := make(map[string]any) + if err := json.Unmarshal([]byte(output), &result); err != nil { + return fmt.Errorf("failed to decode etcd member-list output: %w", err) + } + + etcdMembers := []string{} + if members, ok := result["members"].(map[string]any); ok { + for _, urlField := range members { + urlFieldStr, ok := urlField.(string) + if ok { + memberURL, err := url.Parse(urlFieldStr) + if err != nil { + return fmt.Errorf("failed to parse etcd member URL: %w", err) + } + memberHost, _, err := net.SplitHostPort(memberURL.Host) + if err != nil { + return fmt.Errorf("failed to split etcd member URL: %w", err) + } + log.Debugf("%s: detected etcd member %s", h, memberHost) + etcdMembers = append(etcdMembers, memberHost) + } + } + } + + p.Config.Metadata.EtcdMembers = etcdMembers return nil } diff --git a/phase/reset_controllers.go b/phase/reset_controllers.go index 5b33f718..f7eb8707 100644 --- a/phase/reset_controllers.go +++ b/phase/reset_controllers.go @@ -90,11 +90,7 @@ func (p *ResetControllers) Run() error { if !p.NoLeave { log.Debugf("%s: leaving etcd...", h) - etcdAddress := h.SSH.Address - if h.PrivateAddress != "" { - etcdAddress = h.PrivateAddress - } - if err := h.Exec(h.Configurer.K0sCmdf("etcd leave --peer-address %s --datadir %s", etcdAddress, h.K0sDataDir()), exec.Sudo(h)); err != nil { + if err := h.Exec(h.Configurer.K0sCmdf("etcd leave --peer-address %s --datadir %s", h.PrivateAddress, h.K0sDataDir()), exec.Sudo(h)); err != nil { log.Warnf("%s: failed to leave etcd: %s", h, err.Error()) } log.Debugf("%s: leaving etcd completed", h) diff --git a/phase/validate_facts.go b/phase/validate_facts.go index 831929b2..89958cd6 100644 --- a/phase/validate_facts.go +++ b/phase/validate_facts.go @@ -2,6 +2,7 @@ package phase import ( "fmt" + 
"slices" log "github.com/sirupsen/logrus" ) @@ -27,6 +28,10 @@ func (p *ValidateFacts) Run() error { return err } + if err := p.validateControllerSwap(); err != nil { + return err + } + return nil } @@ -69,3 +74,36 @@ func (p *ValidateFacts) validateDefaultVersion() error { return nil } + +// validateControllerSwap fails when a controller that has no k0s running is already +// present in the etcd member list stored in the cluster metadata, which suggests that +// the host was replaced. Fresh clusters and single-node leaders are skipped. +func (p *ValidateFacts) validateControllerSwap() error { + log.Debugf("validating controller list vs etcd member list") + if p.Config.Spec.K0sLeader().Metadata.K0sRunningVersion == nil { + log.Debugf("%s: leader has no k0s running, assuming a fresh cluster", p.Config.Spec.K0sLeader()) + return nil + } + + if p.Config.Spec.K0sLeader().Role == "single" { + log.Debugf("%s: leader is a single node, assuming no etcd", p.Config.Spec.K0sLeader()) + return nil + } + + if len(p.Config.Metadata.EtcdMembers) > len(p.Config.Spec.Hosts.Controllers()) { + log.Warnf("there are more etcd members in the cluster than controllers listed in the k0sctl configuration") + } + + for _, h := range p.Config.Spec.Hosts.Controllers() { + if h.Metadata.K0sRunningVersion != nil { + continue + } + log.Debugf("%s: host is new, checking if etcd members list %+v already contains %s", h, p.Config.Metadata.EtcdMembers, h.PrivateAddress) + if slices.Contains(p.Config.Metadata.EtcdMembers, h.PrivateAddress) { + return fmt.Errorf("controller %s is listed as an etcd member but k0s is not running on it, host may have been replaced", h) + } + log.Debugf("%s: no match, assuming its safe to install", h) + } + + return nil +} diff --git a/pkg/apis/k0sctl.k0sproject.io/v1beta1/cluster.go b/pkg/apis/k0sctl.k0sproject.io/v1beta1/cluster.go index 6b07972f..a9603286 100644 --- a/pkg/apis/k0sctl.k0sproject.io/v1beta1/cluster.go +++ b/pkg/apis/k0sctl.k0sproject.io/v1beta1/cluster.go @@ -14,8 
+14,9 @@ const APIVersion = "k0sctl.k0sproject.io/v1beta1" // ClusterMetadata defines cluster metadata type ClusterMetadata struct { - Name string `yaml:"name" validate:"required" default:"k0s-cluster"` - Kubeconfig string `yaml:"-"` + Name string 
`yaml:"name" validate:"required" default:"k0s-cluster"` + Kubeconfig string `yaml:"-"` + EtcdMembers []string `yaml:"-"` } // Cluster describes launchpad.yaml configuration