Skip to content

Commit

Permalink
Merge pull request #13121 from spowelljr/fixSecondStart
Browse files Browse the repository at this point in the history
Delete and init kubeadm on subsequent starts
  • Loading branch information
spowelljr authored Dec 15, 2021
2 parents 5de7650 + 78aef25 commit 1d43372
Show file tree
Hide file tree
Showing 11 changed files with 33 additions and 196 deletions.
214 changes: 27 additions & 187 deletions pkg/minikube/bootstrapper/kubeadm/kubeadm.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ import (
"github.com/docker/machine/libmachine/state"
"github.com/pkg/errors"
core "k8s.io/api/core/v1"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
Expand All @@ -62,7 +61,6 @@ import (
"k8s.io/minikube/pkg/minikube/sysinit"
"k8s.io/minikube/pkg/minikube/vmpath"
"k8s.io/minikube/pkg/util"
"k8s.io/minikube/pkg/util/retry"
"k8s.io/minikube/pkg/version"
kconst "k8s.io/minikube/third_party/kubeadm/app/constants"
)
Expand Down Expand Up @@ -398,13 +396,10 @@ func (k *Bootstrapper) StartCluster(cfg config.ClusterConfig) error {
}

if err := bsutil.ExistingConfig(k.c); err == nil {
klog.Infof("found existing configuration files, will attempt cluster restart")
rerr := k.restartControlPlane(cfg)
if rerr == nil {
if reconfigure := k.needsReconfigure(cfg); !reconfigure {
return nil
}

out.ErrT(style.Embarrassed, "Unable to restart cluster, will reset it: {{.error}}", out.V{"error": rerr})
if err := k.DeleteCluster(cfg.KubernetesConfig); err != nil {
klog.Warningf("delete failed: %v", err)
}
Expand Down Expand Up @@ -563,71 +558,21 @@ func (k *Bootstrapper) ensureServiceStarted(svc string) error {
}

// needsReconfigure reports whether the cluster has to be reconfigured.
//
// It returns true as soon as any of the following checks fails, logging the reason:
//   - diffing conf against conf+".new" returns an error
//   - waiting for the apiserver status errors out, or the status is not Running
//   - the expected apps are not running
//   - the apiserver version does not match version
//
// It returns false only when every check passes.
func (k *Bootstrapper) needsReconfigure(conf string, hostname string, port int, client *kubernetes.Clientset, version string) bool {
	diffCmd := exec.Command("sudo", "diff", "-u", conf, conf+".new")
	if rr, diffErr := k.c.RunCmd(diffCmd); diffErr != nil {
		klog.Infof("needs reconfigure: configs differ:\n%s", rr.Output())
		return true
	}

	// cruntime.Enable() may restart kube-apiserver without waiting for it to come back,
	// so allow a short grace period before judging its status.
	const apiStatusTimeout = 3 * time.Second
	st, statusErr := kverify.WaitForAPIServerStatus(k.c, apiStatusTimeout, hostname, port)
	switch {
	case statusErr != nil:
		klog.Infof("needs reconfigure: apiserver error: %v", statusErr)
		return true
	case st != state.Running:
		klog.Infof("needs reconfigure: apiserver in state %s", st)
		return true
	}

	if appsErr := kverify.ExpectAppsRunning(client, kverify.AppsRunningList); appsErr != nil {
		klog.Infof("needs reconfigure: %v", appsErr)
		return true
	}

	if versionErr := kverify.APIServerVersionMatch(client, version); versionErr != nil {
		klog.Infof("needs reconfigure: %v", versionErr)
		return true
	}

	// DANGER: the text of this log message is hard-coded in an integration test!
	klog.Infof("The running cluster does not require reconfiguration: %s", hostname)
	return false
}

// restartControlPlane restarts the Kubernetes cluster configured by kubeadm
func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {
klog.Infof("restartCluster start")

start := time.Now()
defer func() {
klog.Infof("restartCluster took %s", time.Since(start))
}()

k8sVersion, err := util.ParseKubernetesVersion(cfg.KubernetesConfig.KubernetesVersion)
if err != nil {
return errors.Wrap(err, "parsing Kubernetes version")
}

phase := "alpha"
controlPlane := "controlplane"
if k8sVersion.GTE(semver.MustParse("1.13.0")) {
phase = "init"
controlPlane = "control-plane"
}

func (k *Bootstrapper) needsReconfigure(cfg config.ClusterConfig) bool {
if err := k.createCompatSymlinks(); err != nil {
klog.Errorf("failed to create compat symlinks: %v", err)
}

cp, err := config.PrimaryControlPlane(&cfg)
if err != nil {
return errors.Wrap(err, "primary control plane")
klog.Warningf("needs reconfigure: primary control plane error: %v", err)
return true
}

hostname, _, port, err := driver.ControlPlaneEndpoint(&cfg, &cp, cfg.Driver)
if err != nil {
return errors.Wrap(err, "control plane")
klog.Warningf("needs reconfigure: control plane error: %v", err)
return true
}

// Save the costly tax of reinstalling Kubernetes if the only issue is a missing kube context
Expand All @@ -638,124 +583,40 @@ func (k *Bootstrapper) restartControlPlane(cfg config.ClusterConfig) error {

client, err := k.client(hostname, port)
if err != nil {
return errors.Wrap(err, "getting k8s client")
klog.Warningf("needs reconfigure: getting k8s client error: %v", err)
return true
}

// If the cluster is running, check if we have any work to do.
conf := bsutil.KubeadmYamlPath
if !k.needsReconfigure(conf, hostname, port, client, cfg.KubernetesConfig.KubernetesVersion) {
klog.Infof("Taking a shortcut, as the cluster seems to be properly configured")
return nil
}

if err := k.stopKubeSystem(cfg); err != nil {
klog.Warningf("Failed to stop kube-system containers: port conflicts may arise: %v", err)
}

if err := sysinit.New(k.c).Stop("kubelet"); err != nil {
klog.Warningf("Failed to stop kubelet, this might cause upgrade errors: %v", err)
}

if err := k.clearStaleConfigs(cfg); err != nil {
return errors.Wrap(err, "clearing stale configs")
}

if _, err := k.c.RunCmd(exec.Command("sudo", "cp", conf+".new", conf)); err != nil {
return errors.Wrap(err, "cp")
}

baseCmd := fmt.Sprintf("%s %s", bsutil.InvokeKubeadm(cfg.KubernetesConfig.KubernetesVersion), phase)
cmds := []string{
fmt.Sprintf("%s phase certs all --config %s", baseCmd, conf),
fmt.Sprintf("%s phase kubeconfig all --config %s", baseCmd, conf),
fmt.Sprintf("%s phase kubelet-start --config %s", baseCmd, conf),
fmt.Sprintf("%s phase %s all --config %s", baseCmd, controlPlane, conf),
fmt.Sprintf("%s phase etcd local --config %s", baseCmd, conf),
}

klog.Infof("reconfiguring cluster from %s", conf)
// Run commands one at a time so that it is easier to root cause failures.
for _, c := range cmds {
if _, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", c)); err != nil {
klog.Errorf("%s failed - will try once more: %v", c, err)

if _, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", c)); err != nil {
return errors.Wrap(err, "run")
}
}
if rr, err := k.c.RunCmd(exec.Command("sudo", "diff", "-u", conf, conf+".new")); err != nil {
klog.Infof("needs reconfigure: configs differ:\n%s", rr.Output())
return true
}

cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
// cruntime.Enable() may restart kube-apiserver but does not wait for it to return back
apiStatusTimeout := 3 * time.Second
st, err := kverify.WaitForAPIServerStatus(k.c, apiStatusTimeout, hostname, port)
if err != nil {
return errors.Wrap(err, "runtime")
}

// We must ensure that the apiserver is healthy before proceeding
if err := kverify.WaitForAPIServerProcess(cr, k, cfg, k.c, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver healthz")
}

if err := kverify.WaitForHealthyAPIServer(cr, k, cfg, k.c, client, time.Now(), hostname, port, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver health")
}

// because reboots clear /etc/cni
if err := k.applyCNI(cfg); err != nil {
return errors.Wrap(err, "apply cni")
}

if err := kverify.WaitForSystemPods(cr, k, cfg, k.c, client, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "system pods")
}

if err := kverify.NodePressure(client); err != nil {
adviseNodePressure(err, cfg.Name, cfg.Driver)
}

// This can fail during upgrades if the old pods have not shut down yet
addonPhase := func() error {
_, err := k.c.RunCmd(exec.Command("/bin/bash", "-c", fmt.Sprintf("%s phase addon all --config %s", baseCmd, conf)))
return err
klog.Warningf("needs reconfigure: apiserver error: %v", err)
return true
}
if err = retry.Expo(addonPhase, 100*time.Microsecond, 30*time.Second); err != nil {
klog.Warningf("addon install failed, wil retry: %v", err)
return errors.Wrap(err, "addons")
if st != state.Running {
klog.Warningf("needs reconfigure: apiserver in state %s", st.String())
return true
}

// must be called after applyCNI and `kubeadm phase addon all` (ie, coredns redeploy)
if cfg.VerifyComponents[kverify.ExtraKey] {
// after kubelet is restarted (with 'kubeadm init phase kubelet-start' above),
// it appears as to be immediately Ready as well as all kube-system pods (last observed state),
// then (after ~10sec) it realises it has some changes to apply, implying also pods restarts,
// and by that time we would exit completely, so we wait until kubelet begins restarting pods
klog.Info("waiting for restarted kubelet to initialise ...")
start := time.Now()
wait := func() error {
pods, err := client.CoreV1().Pods(meta.NamespaceSystem).List(context.Background(), meta.ListOptions{LabelSelector: "tier=control-plane"})
if err != nil {
return err
}
for _, pod := range pods.Items {
if ready, _ := kverify.IsPodReady(&pod); !ready {
return nil
}
}
return fmt.Errorf("kubelet not initialised")
}
_ = retry.Expo(wait, 250*time.Millisecond, 1*time.Minute)
klog.Infof("kubelet initialised")
klog.Infof("duration metric: took %s waiting for restarted kubelet to initialise ...", time.Since(start))

if err := kverify.WaitExtra(client, kverify.CorePodsLabels, kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "extra")
}
if err := kverify.ExpectAppsRunning(client, kverify.AppsRunningList); err != nil {
klog.Warningf("needs reconfigure: %v", err)
return true
}

if err := bsutil.AdjustResourceLimits(k.c); err != nil {
klog.Warningf("unable to adjust resource limits: %v", err)
if err := kverify.APIServerVersionMatch(client, cfg.KubernetesConfig.KubernetesVersion); err != nil {
klog.Warningf("needs reconfigure: %v", err)
return true
}

return nil
klog.Infof("%s: %s", constants.ReconfigurationNotRequired, hostname)
return false
}

// JoinCluster adds new node to an existing cluster.
Expand Down Expand Up @@ -1045,27 +906,6 @@ func (k *Bootstrapper) elevateKubeSystemPrivileges(cfg config.ClusterConfig) err
return nil
}

// stopKubeSystem stops every container in the kube-system namespace, to prevent #8740 when doing hot upgrade.
//
// It returns a wrapped error if the container runtime cannot be created, if the
// containers cannot be listed, or if stopping them fails. When no containers are
// listed it returns nil without calling stop.
func (k *Bootstrapper) stopKubeSystem(cfg config.ClusterConfig) error {
	klog.Info("stopping kube-system containers ...")

	runtimeCfg := cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c}
	cr, err := cruntime.New(runtimeCfg)
	if err != nil {
		return errors.Wrap(err, "new cruntime")
	}

	listOpts := cruntime.ListContainersOptions{Namespaces: []string{"kube-system"}}
	ids, err := cr.ListContainers(listOpts)
	if err != nil {
		return errors.Wrap(err, "list")
	}

	// Nothing listed: skip the stop call entirely.
	if len(ids) == 0 {
		return nil
	}

	if stopErr := cr.StopContainers(ids); stopErr != nil {
		return errors.Wrap(stopErr, "stop")
	}
	return nil
}

// adviseNodePressure will advise the user what to do with different pressure errors based on their environment
func adviseNodePressure(err error, name string, drv string) {
if diskErr, ok := err.(*kverify.ErrDiskPressure); ok {
Expand Down
3 changes: 3 additions & 0 deletions pkg/minikube/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ const (
MountTypeFlag = "type"
// MountUIDFlag is the flag used to set the mount UID
MountUIDFlag = "uid"

// ReconfigurationNotRequired is the message logged when reconfiguration is not required
ReconfigurationNotRequired = "The running cluster does not require reconfiguration"
)

var (
Expand Down
2 changes: 1 addition & 1 deletion pkg/minikube/node/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ func setupKubeAdm(mAPI libmachine.API, cfg config.ClusterConfig, n config.Node,
if err != nil {
exit.Error(reason.InternalBootstrapper, "Failed to get bootstrapper", err)
}
for _, eo := range config.ExtraOptions {
for _, eo := range cfg.KubernetesConfig.ExtraOptions {
out.Infof("{{.extra_option_component_name}}.{{.key}}={{.value}}", out.V{"extra_option_component_name": eo.Component, "key": eo.Key, "value": eo.Value})
}
// Loads cached images, generates config files, download binaries
Expand Down
3 changes: 2 additions & 1 deletion test/integration/pause_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"testing"

"k8s.io/minikube/cmd/minikube/cmd"
"k8s.io/minikube/pkg/minikube/constants"
)

// TestPause tests minikube pause functionality
Expand Down Expand Up @@ -96,7 +97,7 @@ func validateStartNoReconfigure(ctx context.Context, t *testing.T, profile strin
}

if !NoneDriver() {
softLog := "The running cluster does not require reconfiguration"
softLog := constants.ReconfigurationNotRequired
if !strings.Contains(rr.Output(), softLog) {
t.Errorf("expected the second start log output to include %q but got: %s", softLog, rr.Output())
}
Expand Down
1 change: 0 additions & 1 deletion translations/de.json
Original file line number Diff line number Diff line change
Expand Up @@ -762,7 +762,6 @@
"Unable to pull images, which may be OK: {{.error}}": "Bilder können nicht abgerufen werden, was möglicherweise kein Problem darstellt: {{.error}}",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 0 additions & 1 deletion translations/es.json
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,6 @@
"Unable to pull images, which may be OK: {{.error}}": "No se ha podido recuperar imágenes, que podrían estar en buen estado: {{.error}}",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 0 additions & 1 deletion translations/ko.json
Original file line number Diff line number Diff line change
Expand Up @@ -771,7 +771,6 @@
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to remove machine directory: %v": "머신 디렉토리를 제거할 수 없습니다: %v",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to start VM. Please investigate and run 'minikube delete' if possible": "가상 머신을 시작할 수 없습니다. 확인 후 가능하면 'minikube delete' 를 실행하세요",
"Unable to stop VM": "가상 머신을 중지할 수 없습니다",
Expand Down
1 change: 0 additions & 1 deletion translations/pl.json
Original file line number Diff line number Diff line change
Expand Up @@ -779,7 +779,6 @@
"Unable to pick a default driver. Here is what was considered, in preference order:": "",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to start VM": "Nie można uruchomić maszyny wirtualnej",
"Unable to stop VM": "Nie można zatrzymać maszyny wirtualnej",
Expand Down
1 change: 0 additions & 1 deletion translations/ru.json
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,6 @@
"Unable to pick a default driver. Here is what was considered, in preference order:": "",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 0 additions & 1 deletion translations/strings.txt
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,6 @@
"Unable to pick a default driver. Here is what was considered, in preference order:": "",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to stop VM": "",
"Unable to update {{.driver}} driver: {{.error}}": "",
Expand Down
1 change: 0 additions & 1 deletion translations/zh-CN.json
Original file line number Diff line number Diff line change
Expand Up @@ -878,7 +878,6 @@
"Unable to pull images, which may be OK: {{.error}}": "无法拉取镜像,有可能是正常状况:{{.error}}",
"Unable to push cached images: {{.error}}": "",
"Unable to remove machine directory": "",
"Unable to restart cluster, will reset it: {{.error}}": "",
"Unable to safely downgrade existing Kubernetes v{{.old}} cluster to v{{.new}}": "",
"Unable to start VM. Please investigate and run 'minikube delete' if possible": "无法启动虚拟机。可能的话请检查后执行 'minikube delete'",
"Unable to stop VM": "无法停止虚拟机",
Expand Down

0 comments on commit 1d43372

Please sign in to comment.