Skip to content

Commit

Permalink
feat: add host preflights for all needed ports (#1205)
Browse files Browse the repository at this point in the history
* feat: add host preflights for all needed ports

checks if all needed ports are available on the server prior to starting
the installation.

* chore: add e2e test for port failures

make sure preflights are failing if some ports are in use.

* feat: auto reboot node on reset

now we reboot the node after finishing the reset command.

* chore(e2e): wait for dbus + remove --reboot from reset

* chore: simplify things by just sleeping

* chore: log when sleeping

* chore(e2e): persist default route
  • Loading branch information
ricardomaraschini authored Sep 25, 2024
1 parent 2c793d9 commit 9150c74
Show file tree
Hide file tree
Showing 7 changed files with 481 additions and 30 deletions.
17 changes: 3 additions & 14 deletions cmd/embedded-cluster/reset.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,6 @@ var resetCommand = &cli.Command{
Usage: "Disable interactive prompts",
Value: false,
},
&cli.BoolFlag{
Name: "reboot",
Usage: "Reboot system after resetting the node",
Value: false,
},
},
Usage: fmt.Sprintf("Remove %s from the current node", binName),
Action: func(c *cli.Context) error {
Expand All @@ -378,7 +373,7 @@ var resetCommand = &cli.Command{
}

logrus.Info("This will remove this node from the cluster and completely reset it, removing all data stored on the node.")
logrus.Info("Do not reset another node until this is complete.")
logrus.Info("This action will cause the node to reboot. Do not reset another node until this is complete.")
if !c.Bool("force") && !c.Bool("no-prompt") && !prompts.New().Confirm("Do you want to continue?", false) {
return fmt.Errorf("Aborting")
}
Expand Down Expand Up @@ -449,10 +444,6 @@ var resetCommand = &cli.Command{
return err
}

if !c.Bool("reboot") {
logrus.Infof("Node has been reset. Please reboot to ensure transient configuration is also reset.")
}

if err := helpers.RemoveAll(defaults.PathToK0sConfig()); err != nil {
return fmt.Errorf("failed to remove k0s config: %w", err)
}
Expand Down Expand Up @@ -501,10 +492,8 @@ var resetCommand = &cli.Command{
return fmt.Errorf("failed to remove k0s binary: %w", err)
}

if c.Bool("reboot") {
if _, err := exec.Command("reboot").Output(); err != nil {
return err
}
if _, err := exec.Command("reboot").Output(); err != nil {
return err
}

return nil
Expand Down
16 changes: 7 additions & 9 deletions e2e/cluster/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,22 +359,20 @@ func ConfigureProxy(in *Input) {
// them trust it.
for i := 0; i < in.Nodes; i++ {
name := fmt.Sprintf("node-%s-%02d", in.id, i)
for _, cmd := range [][]string{
{"ip", "route", "del", "default"},
{"ip", "route", "add", "default", "via", "10.0.0.254"},
{"mkdir", "-p", "/usr/local/share/ca-certificates/proxy"},
} {
RunCommandOnNode(in, cmd, name)
}
RunCommandOnNode(in, []string{"mkdir", "-p", "/usr/local/share/ca-certificates/proxy"}, name)

CopyFileToNode(in, name, File{
SourcePath: "/tmp/ca.crt",
DestPath: "/usr/local/share/ca-certificates/proxy/ca.crt",
Mode: 0644,
})

cmd := []string{"update-ca-certificates"}
RunCommandOnNode(in, cmd, name)
for _, cmd := range [][]string{
{"update-ca-certificates"},
{"/usr/local/bin/default-route-through-proxy.sh"},
} {
RunCommandOnNode(in, cmd, name)
}
}
}

Expand Down
6 changes: 6 additions & 0 deletions e2e/install_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,9 @@ func TestResetAndReinstall(t *testing.T) {
t.Fatalf("fail to reset the installation: %v", err)
}

t.Logf("%s: waiting for nodes to reboot", time.Now().Format(time.RFC3339))
time.Sleep(30 * time.Second)

t.Logf("%s: installing embedded-cluster on node 0 after reset", time.Now().Format(time.RFC3339))
line = []string{"single-node-install.sh", "ui"}
if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil {
Expand Down Expand Up @@ -698,6 +701,9 @@ func TestResetAndReinstallAirgap(t *testing.T) {
t.Fatalf("fail to reset the installation: %v", err)
}

t.Logf("%s: waiting for nodes to reboot", time.Now().Format(time.RFC3339))
time.Sleep(30 * time.Second)

t.Logf("%s: installing embedded-cluster on node 0", time.Now().Format(time.RFC3339))
line = []string{"single-node-airgap-install.sh"}
if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil {
Expand Down
45 changes: 40 additions & 5 deletions e2e/preflights_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,24 @@ func TestPreflights(t *testing.T) {
})

_, stderr, err := container.Exec(cli,
"apt-get update && apt-get install -y apt-utils kmod",
"apt-get update && apt-get install -y apt-utils kmod netcat-traditional",
)
if err != nil {
t.Fatalf("failed to install deps: err=%v, stderr=%s", err, stderr)
}

if _, stderr, err = container.Exec(cli, "nohup netcat -l -p 10250 &"); err != nil {
t.Fatalf("failed to start netcat: err=%v, stderr=%s", err, stderr)
}

if _, stderr, err = container.Exec(cli, "nohup netcat -l 127.0.0.1 -p 50000 &"); err != nil {
t.Fatalf("failed to start netcat: err=%v, stderr=%s", err, stderr)
}

if _, stderr, err = container.Exec(cli, "nohup netcat -l -u -p 4789 &"); err != nil {
t.Fatalf("failed to start netcat: err=%v, stderr=%s", err, stderr)
}

runCmd := fmt.Sprintf("%s install run-preflights --no-prompt", container.GetECBinaryPath())
if os.Getenv("LICENSE_PATH") != "" {
runCmd = fmt.Sprintf("%s --license %s", runCmd, container.GetLicensePath())
Expand Down Expand Up @@ -93,10 +105,13 @@ func TestPreflights(t *testing.T) {
assert: func(t *testing.T, results *preflights.Output) {
expected := map[string]bool{
// TODO: work to remove these
"System Clock": true,
"'devices' Cgroup Controller": true,
"API Access": true,
"Proxy Registry Access": true,
"System Clock": true,
"'devices' Cgroup Controller": true,
"API Access": true,
"Proxy Registry Access": true,
"Kubelet Port Availability": true,
"Calico Communication Port Availability": true,
"Local Artifact Mirror Port Availability": true,
// as long as fio ran successfully, we're good
"Filesystem Write Latency": true,
}
Expand Down Expand Up @@ -124,6 +139,26 @@ func TestPreflights(t *testing.T) {
}
},
},
{
name: "Should contain port failures",
assert: func(t *testing.T, results *preflights.Output) {
expected := map[string]bool{
"Kubelet Port Availability": false,
"Calico Communication Port Availability": false,
"Local Artifact Mirror Port Availability": false,
}
for _, res := range results.Fail {
if _, ok := expected[res.Title]; ok {
expected[res.Title] = true
}
}
for title, found := range expected {
if !found {
t.Errorf("expected port failure not found: %q", title)
}
}
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down
17 changes: 15 additions & 2 deletions e2e/restore_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ func TestSingleNodeDisasterRecovery(t *testing.T) {
t.Fatalf("fail to reset the installation: %v", err)
}

t.Logf("%s: waiting for nodes to reboot", time.Now().Format(time.RFC3339))
time.Sleep(30 * time.Second)

t.Logf("%s: restoring the installation", time.Now().Format(time.RFC3339))
line = append([]string{"restore-installation.exp"}, testArgs...)
if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil {
Expand Down Expand Up @@ -156,6 +159,9 @@ func TestSingleNodeDisasterRecoveryWithProxy(t *testing.T) {
t.Fatalf("fail to reset the installation: %v", err)
}

t.Logf("%s: waiting for nodes to reboot", time.Now().Format(time.RFC3339))
time.Sleep(30 * time.Second)

t.Logf("%s: restoring the installation", time.Now().Format(time.RFC3339))
line = append([]string{"restore-installation.exp"}, testArgs...)
line = append(line, "--http-proxy", cluster.HTTPProxy)
Expand Down Expand Up @@ -229,6 +235,9 @@ func TestSingleNodeResumeDisasterRecovery(t *testing.T) {
t.Fatalf("fail to reset the installation: %v", err)
}

t.Logf("%s: waiting for nodes to reboot", time.Now().Format(time.RFC3339))
time.Sleep(30 * time.Second)

t.Logf("%s: restoring the installation", time.Now().Format(time.RFC3339))
line = append([]string{"resume-restore.exp"}, testArgs...)
if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil {
Expand Down Expand Up @@ -329,6 +338,10 @@ func TestSingleNodeAirgapDisasterRecovery(t *testing.T) {
if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil {
t.Fatalf("fail to reset the installation: %v", err)
}

t.Logf("%s: waiting for nodes to reboot", time.Now().Format(time.RFC3339))
time.Sleep(30 * time.Second)

installTestDependenciesDebian(t, tc, 0, true)
t.Logf("%s: restoring the installation", time.Now().Format(time.RFC3339))
testArgs = append(testArgs, "--pod-cidr", "10.128.0.0/20", "--service-cidr", "10.129.0.0/20")
Expand Down Expand Up @@ -463,7 +476,7 @@ func TestMultiNodeHADisasterRecovery(t *testing.T) {
}

// reset the cluster
line = []string{"reset-installation.sh", "--force", "--reboot"}
line = []string{"reset-installation.sh", "--force"}
t.Logf("%s: resetting the installation on node 2", time.Now().Format(time.RFC3339))
if _, _, err := RunCommandOnNode(t, tc, 2, line); err != nil {
t.Fatalf("fail to reset the installation: %v", err)
Expand Down Expand Up @@ -681,7 +694,7 @@ func TestMultiNodeAirgapHADisasterRecovery(t *testing.T) {
}

// reset the cluster
line = []string{"reset-installation.sh", "--force", "--reboot"}
line = []string{"reset-installation.sh", "--force"}
t.Logf("%s: resetting the installation on node 2", time.Now().Format(time.RFC3339))
if _, _, err := RunCommandOnNode(t, tc, 2, line); err != nil {
t.Fatalf("fail to reset the installation: %v", err)
Expand Down
20 changes: 20 additions & 0 deletions e2e/scripts/default-route-through-proxy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
#
# Installs, enables and starts a systemd oneshot unit that replaces the node's
# default route with one going through 10.0.0.254 (the proxy node, per the
# unit description). Because the unit is enabled for multi-user.target, the
# route is re-applied on every boot instead of being lost when the node
# restarts.

# Stop at the first failing step so we never enable/start a unit whose file
# could not be written.
set -euo pipefail

unit_file=/etc/systemd/system/default-route-through-proxy.service

# The heredoc delimiter is quoted so the single quotes around the ExecStart
# command line reach the unit file untouched.
#
# After=/Wants= must name network-online.target exactly: ordering against a
# unit name that does not exist has no effect, which would let the route
# change race with network bring-up.
#
# Inside ExecStart, "ip route del default" is best-effort (there may be no
# default route yet); the unit's result is that of the final "ip route add".
cat > "${unit_file}" <<'EOF'
[Unit]
Description=Set default route through proxy node
After=network-online.target
Wants=network-online.target
[Service]
Type=oneshot
ExecStartPre=/bin/sleep 5
ExecStart=/bin/bash -c 'ip route del default; ip route add default via 10.0.0.254'
RemainAfterExit=true
[Install]
WantedBy=multi-user.target
EOF

# Make systemd pick up the new unit, run it on every boot, and apply it now.
systemctl daemon-reload
systemctl enable default-route-through-proxy
systemctl start default-route-through-proxy
Loading

0 comments on commit 9150c74

Please sign in to comment.