Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "Delete and init kubeadm on subsequent starts" #13506

Merged
merged 2 commits into from
Feb 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 188 additions & 27 deletions pkg/minikube/bootstrapper/kubeadm/kubeadm.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
"github.com/docker/machine/libmachine/state"
"github.com/pkg/errors"
core "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
Expand All @@ -61,6 +62,7 @@ import (
"k8s.io/minikube/pkg/minikube/sysinit"
"k8s.io/minikube/pkg/minikube/vmpath"
"k8s.io/minikube/pkg/util"
"k8s.io/minikube/pkg/util/retry"
"k8s.io/minikube/pkg/version"
kconst "k8s.io/minikube/third_party/kubeadm/app/constants"
)
Expand Down Expand Up @@ -396,10 +398,13 @@ func (k *Bootstrapper) StartCluster(cfg config.ClusterConfig) error {
}

if err := bsutil.ExistingConfig(k.c); err == nil {
if reconfigure := k.needsReconfigure(cfg); !reconfigure {
klog.Infof("found existing configuration files, will attempt cluster restart")
rerr := k.restartControlPlane(cfg)
if rerr == nil {
return nil
}

out.ErrT(style.Embarrassed, "Unable to restart cluster, will reset it: {{.error}}", out.V{"error": rerr})
if err := k.DeleteCluster(cfg.KubernetesConfig); err != nil {
klog.Warningf("delete failed: %v", err)
}
Expand Down Expand Up @@ -558,21 +563,71 @@ func (k *Bootstrapper) ensureServiceStarted(svc string) error {
}

// needsReconfigure returns whether or not the cluster needs to be reconfigured
func (k *Bootstrapper) needsReconfigure(cfg config.ClusterConfig) bool {
func (k *Bootstrapper) needsReconfigure(conf string, hostname string, port int, client *kubernetes.Clientset, version string) bool {
	// Reports true as soon as any of the checks below fails, and false only
	// when every one of them passes:
	//   1. conf and conf+".new" compare equal under `sudo diff -u`
	//   2. the API server at hostname:port reports state.Running within the wait window
	//   3. the apps in kverify.AppsRunningList are running
	//   4. the API server version matches the requested version

	// Any error from the diff command is treated as "the configs differ".
	diffCmd := exec.Command("sudo", "diff", "-u", conf, conf+".new")
	rr, err := k.c.RunCmd(diffCmd)
	if err != nil {
		klog.Infof("needs reconfigure: configs differ:\n%s", rr.Output())
		return true
	}

	// cruntime.Enable() may restart kube-apiserver but does not wait for it to return back
	const apiWait = 3 * time.Second
	apiState, err := kverify.WaitForAPIServerStatus(k.c, apiWait, hostname, port)
	switch {
	case err != nil:
		klog.Infof("needs reconfigure: apiserver error: %v", err)
		return true
	case apiState != state.Running:
		klog.Infof("needs reconfigure: apiserver in state %s", apiState)
		return true
	}

	// The remaining checks share the same failure handling; they are wrapped
	// in closures so each one runs only if the previous one passed.
	checks := []func() error{
		func() error { return kverify.ExpectAppsRunning(client, kverify.AppsRunningList) },
		func() error { return kverify.APIServerVersionMatch(client, version) },
	}
	for _, check := range checks {
		if err := check(); err != nil {
			klog.Infof("needs reconfigure: %v", err)
			return true
		}
	}

	// DANGER: This log message is hard-coded in an integration test!
	klog.Infof("The running cluster does not require reconfiguration: %s", hostname)
	return false
}

// restartControlPlane restarts the Kubernetes cluster configured by kubeadm
func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
klog.Infof("restartCluster start")

start := time.Now()
defer func() {
klog.Infof("restartCluster took %s", time.Since(start))
}()

k8sVersion, err := util.ParseKubernetesVersion(cfg.KubernetesConfig.KubernetesVersion)
if err != nil {
return errors.Wrap(err, "parsing Kubernetes version")
}

phase := "alpha"
controlPlane := "controlplane"
if k8sVersion.GTE(semver.MustParse("1.13.0")) {
phase = "init"
controlPlane = "control-plane"
}

if err := k.createCompatSymlinks(); err != nil {
klog.Errorf("failed to create compat symlinks: %v", err)
}

cp, err := config.PrimaryControlPlane(&cfg)
if err != nil {
klog.Warningf("needs reconfigure: primary control plane error: %v", err)
return true
return errors.Wrap(err, "primary control plane")
}

hostname, _, port, err := driver.ControlPlaneEndpoint(&cfg, &cp, cfg.Driver)
if err != nil {
klog.Warningf("needs reconfigure: control plane error: %v", err)
return true
return errors.Wrap(err, "control plane")
}

// Save the costly tax of reinstalling Kubernetes if the only issue is a missing kube context
Expand All @@ -583,40 +638,125 @@ func (k *Bootstrapper) needsReconfigure(cfg config.ClusterConfig) bool {

client, err := k.client(hostname, port)
if err != nil {
klog.Warningf("needs reconfigure: getting k8s client error: %v", err)
return true
return errors.Wrap(err, "getting k8s client")
}

// If the cluster is running, check if we have any work to do.
conf := bsutil.KubeadmYamlPath

if rr, err := k.c.RunCmd(exec.Command("sudo", "diff", "-u", conf, conf+".new")); err != nil {
klog.Infof("needs reconfigure: configs differ:\n%s", rr.Output())
return true
if !k.needsReconfigure(conf, hostname, port, client, cfg.KubernetesConfig.KubernetesVersion) {
klog.Infof("Taking a shortcut, as the cluster seems to be properly configured")
return nil
}
// cruntime.Enable() may restart kube-apiserver but does not wait for it to return back
apiStatusTimeout := 3 * time.Second
st, err := kverify.WaitForAPIServerStatus(k.c, apiStatusTimeout, hostname, port)

if err := k.stopKubeSystem(cfg); err != nil {
klog.Warningf("Failed to stop kube-system containers: port conflicts may arise: %v", err)
}

if err := sysinit.New(k.c).Stop("kubelet"); err != nil {
klog.Warningf("Failed to stop kubelet, this might cause upgrade errors: %v", err)
}

if err := k.clearStaleConfigs(cfg); err != nil {
return errors.Wrap(err, "clearing stale configs")
}

if _, err := k.c.RunCmd(exec.Command("sudo", "cp", conf+".new", conf)); err != nil {
return errors.Wrap(err, "cp")
}

baseCmd := fmt.Sprintf("%s %s", bsutil.InvokeKubeadm(cfg.KubernetesConfig.KubernetesVersion), phase)
cmds := []string{
fmt.Sprintf("%s phase certs all --config %s", baseCmd, conf),
fmt.Sprintf("%s phase kubeconfig all --config %s", baseCmd, conf),
fmt.Sprintf("%s phase kubelet-start --config %s", baseCmd, conf),
fmt.Sprintf("%s phase %s all --config %s", baseCmd, controlPlane, conf),
fmt.Sprintf("%s phase etcd local --config %s", baseCmd, conf),
}

klog.Infof("reconfiguring cluster from %s", conf)
// Run commands one at a time so that it is easier to root cause failures.
for _, c := range cmds {
if _, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", c)); err != nil {
klog.Errorf("%s failed - will try once more: %v", c, err)

if _, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", c)); err != nil {
return errors.Wrap(err, "run")
}
}
}

cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
klog.Warningf("needs reconfigure: apiserver error: %v", err)
return true
return errors.Wrap(err, "runtime")
}
if st != state.Running {
klog.Warningf("needs reconfigure: apiserver in state %s", st.String())
return true

// We must ensure that the apiserver is healthy before proceeding
if err := kverify.WaitForAPIServerProcess(cr, k, cfg, k.c, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver healthz")
}

if err := kverify.ExpectAppsRunning(client, kverify.AppsRunningList); err != nil {
klog.Warningf("needs reconfigure: %v", err)
return true
if err := kverify.WaitForHealthyAPIServer(cr, k, cfg, k.c, client, time.Now(), hostname, port, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver health")
}

if err := kverify.APIServerVersionMatch(client, cfg.KubernetesConfig.KubernetesVersion); err != nil {
klog.Warningf("needs reconfigure: %v", err)
return true
// because reboots clear /etc/cni
if err := k.applyCNI(cfg); err != nil {
return errors.Wrap(err, "apply cni")
}

klog.Infof("%s: %s", constants.ReconfigurationNotRequired, hostname)
return false
if err := kverify.WaitForSystemPods(cr, k, cfg, k.c, client, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "system pods")
}

if err := kverify.NodePressure(client); err != nil {
adviseNodePressure(err, cfg.Name, cfg.Driver)
}

// This can fail during upgrades if the old pods have not shut down yet
addonPhase := func() error {
_, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", fmt.Sprintf("%s phase addon all --config %s", baseCmd, conf)))
return err
}
if err = retry.Expo(addonPhase, 100*time.Microsecond, 30*time.Second); err != nil {
klog.Warningf("addon install failed, wil retry: %v", err)
return errors.Wrap(err, "addons")
}

// must be called after applyCNI and `kubeadm phase addon all` (ie, coredns redeploy)
if cfg.VerifyComponents[kverify.ExtraKey] {
// after kubelet is restarted (with 'kubeadm init phase kubelet-start' above),
// it appears as to be immediately Ready as well as all kube-system pods (last observed state),
// then (after ~10sec) it realises it has some changes to apply, implying also pods restarts,
// and by that time we would exit completely, so we wait until kubelet begins restarting pods
klog.Info("waiting for restarted kubelet to initialise ...")
start := time.Now()
wait := func() error {
pods, err := client.CoreV1().Pods(meta.NamespaceSystem).List(context.Background(), meta.ListOptions{LabelSelector: "tier=control-plane"})
if err != nil {
return err
}
for _, pod := range pods.Items {
if ready, _ := kverify.IsPodReady(&pod); !ready {
return nil
}
}
return fmt.Errorf("kubelet not initialised")
}
_ = retry.Expo(wait, 250*time.Millisecond, 1*time.Minute)
klog.Infof("kubelet initialised")
klog.Infof("duration metric: took %s waiting for restarted kubelet to initialise ...", time.Since(start))

if err := kverify.WaitExtra(client, kverify.CorePodsLabels, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "extra")
}
}

if err := bsutil.AdjustResourceLimits(k.c); err != nil {
klog.Warningf("unable to adjust resource limits: %v", err)
}

return nil
}

// JoinCluster adds new node to an existing cluster.
Expand Down Expand Up @@ -909,6 +1049,27 @@ func (k *Bootstrapper) elevateKubeSystemPrivileges(cfg config.ClusterConfig) err
return nil
}

// stopKubeSystem stops every container the container runtime lists under the
// "kube-system" namespace, to prevent #8740 when doing hot upgrade.
//
// It returns a wrapped error if the runtime cannot be created, the containers
// cannot be listed, or stopping them fails. When no containers are listed it
// returns nil without calling StopContainers.
func (k *Bootstrapper) stopKubeSystem(cfg config.ClusterConfig) error {
	klog.Info("stopping kube-system containers ...")

	runtimeCfg := cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c}
	runtime, err := cruntime.New(runtimeCfg)
	if err != nil {
		return errors.Wrap(err, "new cruntime")
	}

	listOpts := cruntime.ListContainersOptions{Namespaces: []string{"kube-system"}}
	containerIDs, err := runtime.ListContainers(listOpts)
	if err != nil {
		return errors.Wrap(err, "list")
	}

	// Nothing to stop.
	if len(containerIDs) == 0 {
		return nil
	}

	// errors.Wrap returns nil when given a nil error, so a successful stop
	// still yields nil here.
	return errors.Wrap(runtime.StopContainers(containerIDs), "stop")
}

// adviseNodePressure will advise the user what to do with different pressure errors based on their environment
func adviseNodePressure(err error, name string, drv string) {
if diskErr, ok := err.(*kverify.ErrDiskPressure); ok {
Expand Down
3 changes: 0 additions & 3 deletions pkg/minikube/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,6 @@ const (
MountTypeFlag = "type"
// MountUIDFlag is the flag used to set the mount UID
MountUIDFlag = "uid"

// ReconfigurationNotRequired is the message logged when reconfiguration is not required
ReconfigurationNotRequired = "The running cluster does not require reconfiguration"
)

var (
Expand Down
3 changes: 1 addition & 2 deletions test/integration/pause_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
"testing"

"k8s.io/minikube/cmd/minikube/cmd"
"k8s.io/minikube/pkg/minikube/constants"
)

// TestPause tests minikube pause functionality
Expand Down Expand Up @@ -97,7 +96,7 @@ func validateStartNoReconfigure(ctx context.Context, t *testing.T, profile strin
}

if !NoneDriver() {
softLog := constants.ReconfigurationNotRequired
softLog := "The running cluster does not require reconfiguration"
if !strings.Contains(rr.Output(), softLog) {
t.Errorf("expected the second start log output to include %q but got: %s", softLog, rr.Output())
}
Expand Down
1 change: 1 addition & 0 deletions translations/de.json
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,7 @@
"Unable to pull images, which may be OK: {{.error}}": "Bilder können nicht abgerufen werden, was möglicherweise kein Problem darstellt: {{.error}}",
"Unable to push cached images: {{.error}}": "Kann gecachete Image nicht veröffentlichen (push): {{.error}}",
"Unable to remove machine directory": "Kann Maschinen Verzeichnis nicht entfernen",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "Kann existierenden Kubernetes v{{.old}} Cluster nicht auf Version v{{.new}} downgraden",
"Unable to stop VM": "Kann VM nicht stoppen",
"Unable to update {{.driver}} driver: {{.error}}": "Kann Treiber {{.driver}} nicht aktualisieren: {{.error}}",
Expand Down
1 change: 1 addition & 0 deletions translations/es.json
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,7 @@
"Unable to pull images, which may be OK: {{.error}}": "No se ha podido recuperar imágenes, que podrían estar en buen estado: {{.error}}",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 1 addition & 0 deletions translations/ko.json
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,7 @@
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to remove machine directory: %v": "머신 디렉토리를 제거할 수 없습니다: %v",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to start VM. Please investigate and run 'minikube delete' if possible": "가상 머신을 시작할 수 없습니다. 확인 후 가능하면 'minikube delete' 를 실행하세요",
"Unable to stop VM": "가상 머신을 중지할 수 없습니다",
Expand Down
1 change: 1 addition & 0 deletions translations/pl.json
Original file line number Diff line number Diff line change
Expand Up @@ -782,6 +782,7 @@
"Unable to pick a default driver. Here is what was considered, in preference order:": "",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to start VM": "Nie można uruchomić maszyny wirtualnej",
"Unable to stop VM": "Nie można zatrzymać maszyny wirtualnej",
Expand Down
1 change: 1 addition & 0 deletions translations/ru.json
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,7 @@
"Unable to pick a default driver. Here is what was considered, in preference order:": "",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 1 addition & 0 deletions translations/strings.txt
Original file line number Diff line number Diff line change
Expand Up @@ -713,6 +713,7 @@
"Unable to pick a default driver. Here is what was considered, in preference order:": "",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 1 addition & 0 deletions translations/zh-CN.json
Original file line number Diff line number Diff line change
Expand Up @@ -881,6 +881,7 @@
"Unable to pull images, which may be OK: {{.error}}": "无法拉取镜像,有可能是正常状况:{{.error}}",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to start VM. Please investigate and run 'minikube delete' if possible": "无法启动虚拟机。可能的话请检查后执行 'minikube delete'",
"Unable to stop VM": "无法停止虚拟机",
Expand Down