Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delete and init kubeadm on subsequent starts #13121

Merged
merged 4 commits into from
Dec 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 27 additions & 187 deletions pkg/minikube/bootstrapper/kubeadm/kubeadm.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ import (
"github.com/docker/machine/libmachine/state"
"github.com/pkg/errors"
core "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
Expand All @@ -62,7 +61,6 @@ import (
"k8s.io/minikube/pkg/minikube/sysinit"
"k8s.io/minikube/pkg/minikube/vmpath"
"k8s.io/minikube/pkg/util"
"k8s.io/minikube/pkg/util/retry"
"k8s.io/minikube/pkg/version"
kconst "k8s.io/minikube/third_party/kubeadm/app/constants"
)
Expand Down Expand Up @@ -398,13 +396,10 @@ func (k *Bootstrapper) StartCluster(cfg config.ClusterConfig) error {
}

if err := bsutil.ExistingConfig(k.c); err == nil {
klog.Infof("found existing configuration files, will attempt cluster restart")
rerr := k.restartControlPlane(cfg)
if rerr == nil {
if reconfigure := k.needsReconfigure(cfg); !reconfigure {
return nil
}

out.ErrT(style.Embarrassed, "Unable to restart cluster, will reset it: {{.error}}", out.V{"error": rerr})
if err := k.DeleteCluster(cfg.KubernetesConfig); err != nil {
klog.Warningf("delete failed: %v", err)
}
Expand Down Expand Up @@ -563,71 +558,21 @@ func (k *Bootstrapper) ensureServiceStarted(svc string) error {
}

// needsReconfigure reports whether the cluster needs to be reconfigured.
//
// It returns true as soon as one of the following checks fails, logging the
// reason each time: the diff between conf and conf+".new" returns an error,
// the apiserver status cannot be obtained or is not state.Running,
// kverify.ExpectAppsRunning returns an error, or
// kverify.APIServerVersionMatch returns an error for version.
// It returns false only when every check passes.
func (k *Bootstrapper) needsReconfigure(conf string, hostname string, port int, client *kubernetes.Clientset, version string) bool {
	diffCmd := exec.Command("sudo", "diff", "-u", conf, conf+".new")
	if result, diffErr := k.c.RunCmd(diffCmd); diffErr != nil {
		klog.Infof("needs reconfigure: configs differ:\n%s", result.Output())
		return true
	}

	// cruntime.Enable() may restart kube-apiserver but does not wait for it to come back,
	// so give the apiserver a short grace period before judging its state.
	statusTimeout := 3 * time.Second
	apiserverState, statusErr := kverify.WaitForAPIServerStatus(k.c, statusTimeout, hostname, port)
	switch {
	case statusErr != nil:
		klog.Infof("needs reconfigure: apiserver error: %v", statusErr)
		return true
	case apiserverState != state.Running:
		klog.Infof("needs reconfigure: apiserver in state %s", apiserverState)
		return true
	}

	appsErr := kverify.ExpectAppsRunning(client, kverify.AppsRunningList)
	if appsErr != nil {
		klog.Infof("needs reconfigure: %v", appsErr)
		return true
	}

	versionErr := kverify.APIServerVersionMatch(client, version)
	if versionErr != nil {
		klog.Infof("needs reconfigure: %v", versionErr)
		return true
	}

	// DANGER: This log message is hard-coded in an integration test!
	klog.Infof("The running cluster does not require reconfiguration: %s", hostname)
	return false
}

// restartControlPlane restarts the Kubernetes cluster configured by kubeadm
func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
klog.Infof("restartCluster start")

start := time.Now()
defer func() {
klog.Infof("restartCluster took %s", time.Since(start))
}()

k8sVersion, err := util.ParseKubernetesVersion(cfg.KubernetesConfig.KubernetesVersion)
if err != nil {
return errors.Wrap(err, "parsing Kubernetes version")
}

phase := "alpha"
controlPlane := "controlplane"
if k8sVersion.GTE(semver.MustParse("1.13.0")) {
phase = "init"
controlPlane = "control-plane"
}

func (k *Bootstrapper) needsReconfigure(cfg config.ClusterConfig) bool {
if err := k.createCompatSymlinks(); err != nil {
klog.Errorf("failed to create compat symlinks: %v", err)
}

cp, err := config.PrimaryControlPlane(&cfg)
if err != nil {
return errors.Wrap(err, "primary control plane")
klog.Warningf("needs reconfigure: primary control plane error: %v", err)
return true
}

hostname, _, port, err := driver.ControlPlaneEndpoint(&cfg, &cp, cfg.Driver)
if err != nil {
return errors.Wrap(err, "control plane")
klog.Warningf("needs reconfigure: control plane error: %v", err)
return true
}

// Save the costly tax of reinstalling Kubernetes if the only issue is a missing kube context
Expand All @@ -638,124 +583,40 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {

client, err := k.client(hostname, port)
if err != nil {
return errors.Wrap(err, "getting k8s client")
klog.Warningf("needs reconfigure: getting k8s client error: %v", err)
return true
}

// If the cluster is running, check if we have any work to do.
conf := bsutil.KubeadmYamlPath
if !k.needsReconfigure(conf, hostname, port, client, cfg.KubernetesConfig.KubernetesVersion) {
klog.Infof("Taking a shortcut, as the cluster seems to be properly configured")
return nil
}

if err := k.stopKubeSystem(cfg); err != nil {
klog.Warningf("Failed to stop kube-system containers: port conflicts may arise: %v", err)
}

if err := sysinit.New(k.c).Stop("kubelet"); err != nil {
klog.Warningf("Failed to stop kubelet, this might cause upgrade errors: %v", err)
}

if err := k.clearStaleConfigs(cfg); err != nil {
return errors.Wrap(err, "clearing stale configs")
}

if _, err := k.c.RunCmd(exec.Command("sudo", "cp", conf+".new", conf)); err != nil {
return errors.Wrap(err, "cp")
}

baseCmd := fmt.Sprintf("%s %s", bsutil.InvokeKubeadm(cfg.KubernetesConfig.KubernetesVersion), phase)
cmds := []string{
fmt.Sprintf("%s phase certs all --config %s", baseCmd, conf),
fmt.Sprintf("%s phase kubeconfig all --config %s", baseCmd, conf),
fmt.Sprintf("%s phase kubelet-start --config %s", baseCmd, conf),
fmt.Sprintf("%s phase %s all --config %s", baseCmd, controlPlane, conf),
fmt.Sprintf("%s phase etcd local --config %s", baseCmd, conf),
}

klog.Infof("reconfiguring cluster from %s", conf)
// Run commands one at a time so that it is easier to root cause failures.
for _, c := range cmds {
if _, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", c)); err != nil {
klog.Errorf("%s failed - will try once more: %v", c, err)

if _, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", c)); err != nil {
return errors.Wrap(err, "run")
}
}
if rr, err := k.c.RunCmd(exec.Command("sudo", "diff", "-u", conf, conf+".new")); err != nil {
klog.Infof("needs reconfigure: configs differ:\n%s", rr.Output())
return true
}

cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
// cruntime.Enable() may restart kube-apiserver but does not wait for it to come back
apiStatusTimeout := 3 * time.Second
st, err := kverify.WaitForAPIServerStatus(k.c, apiStatusTimeout, hostname, port)
if err != nil {
return errors.Wrap(err, "runtime")
}

// We must ensure that the apiserver is healthy before proceeding
if err := kverify.WaitForAPIServerProcess(cr, k, cfg, k.c, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver healthz")
}

if err := kverify.WaitForHealthyAPIServer(cr, k, cfg, k.c, client, time.Now(), hostname, port, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver health")
}

// because reboots clear /etc/cni
if err := k.applyCNI(cfg); err != nil {
return errors.Wrap(err, "apply cni")
}

if err := kverify.WaitForSystemPods(cr, k, cfg, k.c, client, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "system pods")
}

if err := kverify.NodePressure(client); err != nil {
adviseNodePressure(err, cfg.Name, cfg.Driver)
}

// This can fail during upgrades if the old pods have not shut down yet
addonPhase := func() error {
_, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", fmt.Sprintf("%s phase addon all --config %s", baseCmd, conf)))
return err
klog.Warningf("needs reconfigure: apiserver error: %v", err)
return true
}
if err = retry.Expo(addonPhase, 100*time.Microsecond, 30*time.Second); err != nil {
klog.Warningf("addon install failed, wil retry: %v", err)
return errors.Wrap(err, "addons")
if st != state.Running {
klog.Warningf("needs reconfigure: apiserver in state %s", st.String())
return true
}

// must be called after applyCNI and `kubeadm phase addon all` (ie, coredns redeploy)
if cfg.VerifyComponents[kverify.ExtraKey] {
// after kubelet is restarted (with 'kubeadm init phase kubelet-start' above),
// it appears to be immediately Ready, as do all kube-system pods (last observed state),
// then (after ~10sec) it realises it has some changes to apply, which also implies pod restarts,
// and by that time we would have exited completely, so we wait until kubelet begins restarting pods
klog.Info("waiting for restarted kubelet to initialise ...")
start := time.Now()
wait := func() error {
pods, err := client.CoreV1().Pods(meta.NamespaceSystem).List(context.Background(), meta.ListOptions{LabelSelector: "tier=control-plane"})
if err != nil {
return err
}
for _, pod := range pods.Items {
if ready, _ := kverify.IsPodReady(&pod); !ready {
return nil
}
}
return fmt.Errorf("kubelet not initialised")
}
_ = retry.Expo(wait, 250*time.Millisecond, 1*time.Minute)
klog.Infof("kubelet initialised")
klog.Infof("duration metric: took %s waiting for restarted kubelet to initialise ...", time.Since(start))

if err := kverify.WaitExtra(client, kverify.CorePodsLabels, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "extra")
}
if err := kverify.ExpectAppsRunning(client, kverify.AppsRunningList); err != nil {
klog.Warningf("needs reconfigure: %v", err)
return true
}

if err := bsutil.AdjustResourceLimits(k.c); err != nil {
klog.Warningf("unable to adjust resource limits: %v", err)
if err := kverify.APIServerVersionMatch(client, cfg.KubernetesConfig.KubernetesVersion); err != nil {
klog.Warningf("needs reconfigure: %v", err)
return true
}

return nil
klog.Infof("%s: %s", constants.ReconfigurationNotRequired, hostname)
return false
}

// JoinCluster adds new node to an existing cluster.
Expand Down Expand Up @@ -1045,27 +906,6 @@ func (k *Bootstrapper) elevateKubeSystemPrivileges(cfg config.ClusterConfig) err
return nil
}

// stopKubeSystem stops every container listed for the kube-system namespace,
// to prevent #8740 when doing a hot upgrade.
//
// It builds a container runtime from cfg.KubernetesConfig.ContainerRuntime,
// lists the kube-system containers and stops them. When the list is empty
// nothing is stopped and nil is returned. Errors from creating the runtime,
// listing, or stopping are returned wrapped.
func (k *Bootstrapper) stopKubeSystem(cfg config.ClusterConfig) error {
	klog.Info("stopping kube-system containers ...")

	rtConfig := cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c}
	rt, err := cruntime.New(rtConfig)
	if err != nil {
		return errors.Wrap(err, "new cruntime")
	}

	listOpts := cruntime.ListContainersOptions{Namespaces: []string{"kube-system"}}
	containerIDs, err := rt.ListContainers(listOpts)
	if err != nil {
		return errors.Wrap(err, "list")
	}

	// Nothing to stop: skip the StopContainers call entirely.
	if len(containerIDs) == 0 {
		return nil
	}

	if stopErr := rt.StopContainers(containerIDs); stopErr != nil {
		return errors.Wrap(stopErr, "stop")
	}
	return nil
}

// adviseNodePressure will advise the user what to do with different pressure errors based on their environment
func adviseNodePressure(err error, name string, drv string) {
if diskErr, ok := err.(*kverify.ErrDiskPressure); ok {
Expand Down
3 changes: 3 additions & 0 deletions pkg/minikube/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ const (
MountTypeFlag = "type"
// MountUIDFlag is the flag used to set the mount UID
MountUIDFlag = "uid"

// ReconfigurationNotRequired is the message logged when reconfiguration is not required
ReconfigurationNotRequired = "The running cluster does not require reconfiguration"
)

var (
Expand Down
2 changes: 1 addition & 1 deletion pkg/minikube/node/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ func setupKubeAdm(mAPI libmachine.API, cfg config.ClusterConfig, n config.Node,
if err != nil {
exit.Error(reason.InternalBootstrapper, "Failed to get bootstrapper", err)
}
for _, eo := range config.ExtraOptions {
for _, eo := range cfg.KubernetesConfig.ExtraOptions {
out.Infof("{{.extra_option_component_name}}.{{.key}}={{.value}}", out.V{"extra_option_component_name": eo.Component, "key": eo.Key, "value": eo.Value})
}
// Loads cached images, generates config files, download binaries
Expand Down
3 changes: 2 additions & 1 deletion test/integration/pause_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"testing"

"k8s.io/minikube/cmd/minikube/cmd"
"k8s.io/minikube/pkg/minikube/constants"
)

// TestPause tests minikube pause functionality
Expand Down Expand Up @@ -96,7 +97,7 @@ func validateStartNoReconfigure(ctx context.Context, t *testing.T, profile strin
}

if !NoneDriver() {
softLog := "The running cluster does not require reconfiguration"
softLog := constants.ReconfigurationNotRequired
if !strings.Contains(rr.Output(), softLog) {
t.Errorf("expected the second start log output to include %q but got: %s", softLog, rr.Output())
}
Expand Down
1 change: 0 additions & 1 deletion translations/de.json
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,6 @@
"Unable to pull images, which may be OK: {{.error}}": "Bilder können nicht abgerufen werden, was möglicherweise kein Problem darstellt: {{.error}}",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 0 additions & 1 deletion translations/es.json
Original file line number Diff line number Diff line change
Expand Up @@ -770,7 +770,6 @@
"Unable to pull images, which may be OK: {{.error}}": "No se ha podido recuperar imágenes, que podrían estar en buen estado: {{.error}}",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 0 additions & 1 deletion translations/ko.json
Original file line number Diff line number Diff line change
Expand Up @@ -770,7 +770,6 @@
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to remove machine directory: %v": "머신 디렉토리를 제거할 수 없습니다: %v",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to start VM. Please investigate and run 'minikube delete' if possible": "가상 머신을 시작할 수 없습니다. 확인 후 가능하면 'minikube delete' 를 실행하세요",
"Unable to stop VM": "가상 머신을 중지할 수 없습니다",
Expand Down
1 change: 0 additions & 1 deletion translations/pl.json
Original file line number Diff line number Diff line change
Expand Up @@ -778,7 +778,6 @@
"Unable to pick a default driver. Here is what was considered, in preference order:": "",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to start VM": "Nie można uruchomić maszyny wirtualnej",
"Unable to stop VM": "Nie można zatrzymać maszyny wirtualnej",
Expand Down
1 change: 0 additions & 1 deletion translations/ru.json
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,6 @@
"Unable to pick a default driver. Here is what was considered, in preference order:": "",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 0 additions & 1 deletion translations/strings.txt
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,6 @@
"Unable to pick a default driver. Here is what was considered, in preference order:": "",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 0 additions & 1 deletion translations/zh-CN.json
Original file line number Diff line number Diff line change
Expand Up @@ -877,7 +877,6 @@
"Unable to pull images, which may be OK: {{.error}}": "无法拉取镜像,有可能是正常状况:{{.error}}",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to start VM. Please investigate and run 'minikube delete' if possible": "无法启动虚拟机。可能的话请检查后执行 'minikube delete'",
"Unable to stop VM": "无法停止虚拟机",
Expand Down