Skip to content

Commit

Permalink
Merge pull request openshift#28157 from liqcui/nto66086
Browse files Browse the repository at this point in the history
OCP-66086-Automate PSAP NTO Prevent from stalld continually restarting
  • Loading branch information
openshift-merge-robot authored Sep 9, 2023
2 parents cdd0a95 + 8107b57 commit 6229a08
Show file tree
Hide file tree
Showing 8 changed files with 254 additions and 0 deletions.
1 change: 1 addition & 0 deletions test/extended/include.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
_ "github.com/openshift/origin/test/extended/kubevirt"
_ "github.com/openshift/origin/test/extended/machines"
_ "github.com/openshift/origin/test/extended/networking"
_ "github.com/openshift/origin/test/extended/node_tuning"
_ "github.com/openshift/origin/test/extended/oauth"
_ "github.com/openshift/origin/test/extended/olm"
_ "github.com/openshift/origin/test/extended/operators"
Expand Down
8 changes: 8 additions & 0 deletions test/extended/node_tuning/OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
reviewers:
- liqcui
- shahsahil264
- jmencak
approvers:
- liqcui
- shahsahil264
- jmencak
116 changes: 116 additions & 0 deletions test/extended/node_tuning/node_tuning.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package node_tuning

import (
"fmt"
"path/filepath"
"strings"
"time"

g "github.com/onsi/ginkgo/v2"
o "github.com/onsi/gomega"

exutil "github.com/openshift/origin/test/extended/util"
"k8s.io/apimachinery/pkg/util/wait"
e2e "k8s.io/kubernetes/test/e2e/framework"
)

// Node Tuning Operator (NTO) specs. The single spec below labels one RHCOS
// worker node, applies a custom Tuned profile that starts the stalld service
// on it, and then verifies that stalld keeps the same PID (i.e. is not being
// restarted) for the duration of the observation window.
var _ = g.Describe("[sig-node-tuning] NTO should", func() {
	defer g.GinkgoRecover()

	var (
		ntoNamespace        = "openshift-cluster-node-tuning-operator"
		oc                  = exutil.NewCLIWithoutNamespace("nto")
		buildPruningBaseDir = exutil.FixturePath("testdata", "node_tuning")
		ntoStalldFile       = filepath.Join(buildPruningBaseDir, "nto-stalld.yaml")
		stalldCurrentPID    string
	)

	// OCP-66086 - [OCPBUGS-11150] Node Tuning Operator - NTO Prevent from stalld continually restarting
	// author: liqcui@redhat.com
	// OCP Bugs: https://issues.redhat.com/browse/OCPBUGS-11150

	g.It("OCP-66086 NTO Prevent from stalld continually restarting [Slow]", func() {
		e2e.Logf("get the first rhcos worker nodes as label node")
		firstCoreOSWorkerNodes, err := exutil.GetFirstCoreOsWorkerNode(oc)
		o.Expect(err).NotTo(o.HaveOccurred())
		if len(firstCoreOSWorkerNodes) == 0 {
			g.Skip("no rhcos worker node was found - skipping test ...")
		}
		e2e.Logf("the firstCoreOSWorkerNodes is:%v", firstCoreOSWorkerNodes)

		// Cleanup. Deferred calls run last-in-first-out, so the custom Tuned
		// object is deleted before the node label is removed.
		defer oc.AsAdmin().WithoutNamespace().Run("label").Args("node", firstCoreOSWorkerNodes, "node-role.kubernetes.io/worker-stalld-", "--overwrite").Execute()
		defer oc.AsAdmin().WithoutNamespace().Run("delete").Args("tuned", "openshift-stalld", "-n", ntoNamespace, "--ignore-not-found").Execute()

		e2e.Logf("label the first rhcos node with node-role.kubernetes.io/worker-stalld=")
		err = oc.AsAdmin().WithoutNamespace().Run("label").Args("node", firstCoreOSWorkerNodes, "node-role.kubernetes.io/worker-stalld=", "--overwrite").Execute()
		o.Expect(err).NotTo(o.HaveOccurred())

		e2e.Logf("create custom profile openshift-stalld")
		err = oc.AsAdmin().WithoutNamespace().Run("apply").Args("-f", ntoStalldFile, "-n", ntoNamespace).Execute()
		o.Expect(err).NotTo(o.HaveOccurred())

		e2e.Logf("assert if the tuned openshift-stalld created successfully")
		tunedStdOut, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("tuned", "-n", ntoNamespace).Output()
		e2e.Logf("current tuned status is:\n%s,", tunedStdOut)
		o.Expect(err).NotTo(o.HaveOccurred())
		o.Expect(tunedStdOut).NotTo(o.BeEmpty())
		o.Expect(tunedStdOut).To(o.ContainSubstring("openshift-stalld"))

		// Assert, with retries, that the profile has been applied to the
		// labelled node: the "Applied" condition must be True and the node's
		// tuned profile must be openshift-stalld.
		o.Eventually(func() bool {
			// The jsonpath below is wrapped in literal single quotes, so the
			// output is quoted as well; substring matching tolerates that.
			appliedStatus, err1 := oc.AsAdmin().WithoutNamespace().Run("get").Args("-n", ntoNamespace, "profile", firstCoreOSWorkerNodes, `-ojsonpath='{.status.conditions[?(@.type=="Applied")].status}'`).Output()
			tunedProfile, err2 := oc.AsAdmin().WithoutNamespace().Run("get").Args("-n", ntoNamespace, "profile", firstCoreOSWorkerNodes, "-ojsonpath={.status.tunedProfile}").Output()
			if err1 != nil || err2 != nil || strings.Contains(appliedStatus, "False") || strings.Contains(appliedStatus, "Unknown") || tunedProfile != "openshift-stalld" {
				e2e.Logf("failed to apply custom profile to nodes, the status is %s and profile is %s, check again", appliedStatus, tunedProfile)
			}
			return strings.Contains(appliedStatus, "True") && tunedProfile == "openshift-stalld"
		}, 5*time.Second, time.Second).Should(o.BeTrue())

		e2e.Logf("assert if the custom profile openshift-stalld applied to label node")
		profileStdOut, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("-n", ntoNamespace, "profile", firstCoreOSWorkerNodes, "-ojsonpath={.status.tunedProfile}").Output()
		e2e.Logf("current profile status is [ %s ] on [ %s ]", profileStdOut, firstCoreOSWorkerNodes)
		o.Expect(err).NotTo(o.HaveOccurred())
		o.Expect(profileStdOut).NotTo(o.BeEmpty())
		o.Expect(profileStdOut).To(o.ContainSubstring("openshift-stalld"))

		e2e.Logf("check if stalld service is running ...")
		stalldStatus, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, firstCoreOSWorkerNodes, ntoNamespace, "systemctl", "status", "stalld")
		o.Expect(err).NotTo(o.HaveOccurred())
		o.Expect(stalldStatus).To(o.ContainSubstring("active (running)"))

		e2e.Logf("assert if stalld service restart ...")
		stalldPreviousPID, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, firstCoreOSWorkerNodes, ntoNamespace, "pidof", "stalld")
		o.Expect(err).NotTo(o.HaveOccurred())
		o.Expect(stalldPreviousPID).NotTo(o.BeEmpty())
		e2e.Logf("record the previous stalld PID is <stalldPreviousPID: %v>", stalldPreviousPID)

		e2e.Logf("start to periodically check stalld PID and compare if stalld pid change")
		// Check the stalld PID every 2 minutes for up to 10 minutes. The poll
		// stops early (condition met) as soon as the PID differs from the
		// recorded one or the PID cannot be read; running into the timeout is
		// therefore the successful outcome.
		var pidErr error
		errWait := wait.Poll(2*time.Minute, 10*time.Minute, func() (bool, error) {
			stalldCurrentPID, pidErr = exutil.DebugNodeRetryWithOptionsAndChroot(oc, firstCoreOSWorkerNodes, ntoNamespace, "pidof", "stalld")
			e2e.Logf("the current PID of stalld is < %v >", stalldCurrentPID)
			// A failed lookup is reported separately from a PID change so the
			// log does not claim a restart that was never observed.
			if pidErr != nil {
				e2e.Logf("[ NOTE ] unable to get the PID of stalld, the stalld service may be abnormal: %v", pidErr)
				return true, nil
			}
			if stalldCurrentPID != stalldPreviousPID {
				e2e.Logf("[ NOTE ] <stalldPreviousPID: %v stalldCurrentPID: %v > The PID of stalld has been changed due to stalld service restarted.", stalldPreviousPID, stalldCurrentPID)
				return true, nil
			}
			e2e.Logf("no restart of stalld process as expected, the stalld PID still is %v", stalldCurrentPID)
			return false, nil
		})
		// Fail here with a precise message instead of on the follow-up ps
		// call, which would otherwise be run with an unusable PID.
		o.Expect(pidErr).NotTo(o.HaveOccurred(), "failed to get the PID of stalld while watching for restarts")

		e2e.Logf("get how many minutes stalld process keep up and running ...")
		stalldRuntimeDuration, err := exutil.DebugNodeRetryWithOptionsAndChroot(oc, firstCoreOSWorkerNodes, ntoNamespace, "/bin/bash", "-c", "ps -o etime= -p "+stalldCurrentPID)
		o.Expect(err).NotTo(o.HaveOccurred())
		o.Expect(stalldRuntimeDuration).NotTo(o.BeEmpty())
		e2e.Logf("the the stalld process keep running for %v", stalldRuntimeDuration)

		// The condition function never returns an error, so a non-nil errWait
		// means the poll timed out without seeing a PID change: test passed.
		if errWait != nil {
			e2e.Logf("%v", errWait)
			return
		}

		// The poll ended early: stalld was restarted. Report both PIDs
		// (errWait is nil on this path and carries no information).
		err = fmt.Errorf("case: %v\nstalld service restarted: previous PID %v, current PID %v", g.CurrentSpecReport().FullText(), stalldPreviousPID, stalldCurrentPID)
		o.Expect(err).NotTo(o.HaveOccurred())

	})
})
45 changes: 45 additions & 0 deletions test/extended/testdata/bindata.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions test/extended/testdata/node_tuning/OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
reviewers:
- liqcui
- shahsahil264
- jmencak
approvers:
- liqcui
- shahsahil264
- jmencak
23 changes: 23 additions & 0 deletions test/extended/testdata/node_tuning/nto-stalld.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
apiVersion: tuned.openshift.io/v1
kind: Tuned
metadata:
name: openshift-stalld
namespace: openshift-cluster-node-tuning-operator
spec:
profile:
- data: |
[main]
summary=Custom OpenShift profile
include=openshift-node,realtime
[sysctl]
kernel.sched_rt_runtime_us = -1
[service]
service.stalld=start,enable
name: openshift-stalld
recommend:
- match:
- label: node-role.kubernetes.io/worker-stalld
priority: 20
profile: openshift-stalld

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

51 changes: 51 additions & 0 deletions test/extended/util/nodes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package util

import (
"strings"
"time"

"k8s.io/apimachinery/pkg/util/wait"
)

// GetClusterNodesByRole returns the names of the cluster nodes that carry the
// label node-role.kubernetes.io/<role>.
//
// The names are read from the space-separated jsonpath output of "oc get node".
// The jsonpath expression is wrapped in literal single quotes, so the quotes
// are stripped from the output before it is split. When no node matches, an
// empty slice is returned (never a slice holding one empty name). When the oc
// command fails, a nil slice and the command error are returned.
func GetClusterNodesByRole(oc *CLI, role string) ([]string, error) {
	nodes, err := oc.AsAdmin().WithoutNamespace().Run("get").Args("node", "-l", "node-role.kubernetes.io/"+role, "-o", "jsonpath='{.items[*].metadata.name}'").Output()
	if err != nil {
		return nil, err
	}
	// strings.Fields, unlike strings.Split, yields no elements for empty or
	// whitespace-only input and tolerates repeated separators.
	return strings.Fields(strings.Trim(strings.TrimSpace(nodes), "'")), nil
}

// GetFirstCoreOsWorkerNode looks up the first node with the "worker" role
// whose OS id is "rhcos" and returns its name together with any lookup error.
func GetFirstCoreOsWorkerNode(oc *CLI) (string, error) {
	const (
		workerRole = "worker"
		coreOSID   = "rhcos"
	)
	nodeName, lookupErr := getFirstNodeByOsID(oc, workerRole, coreOSID)
	return nodeName, lookupErr
}

// getFirstNodeByOsID returns the name of the first node with the given role
// whose node.openshift.io/os_id label equals osID.
//
// It returns:
//   - the node name and a nil error on the first match;
//   - "" and the error from listing the nodes when that listing fails;
//   - "" and the first per-node lookup error when no node matched and at
//     least one label lookup failed;
//   - "" and a nil error when every lookup succeeded but no node matched.
func getFirstNodeByOsID(oc *CLI, role string, osID string) (string, error) {
	nodes, err := GetClusterNodesByRole(oc, role)
	if err != nil {
		return "", err
	}

	var firstLookupErr error
	for _, node := range nodes {
		// Guard against blank names so we never query "node/".
		if node == "" {
			continue
		}
		// The jsonpath expression is wrapped in literal double quotes, hence
		// the strings.Trim on the output below.
		stdout, lookupErr := oc.AsAdmin().WithoutNamespace().Run("get").Args("node/"+node, "-o", "jsonpath=\"{.metadata.labels.node\\.openshift\\.io/os_id}\"").Output()
		if lookupErr != nil {
			// Keep searching the remaining nodes, but remember the failure so
			// it is not silently lost if nothing matches.
			if firstLookupErr == nil {
				firstLookupErr = lookupErr
			}
			continue
		}
		if strings.Trim(stdout, "\"") == osID {
			return node, nil
		}
	}
	return "", firstLookupErr
}

// DebugNodeRetryWithOptionsAndChroot runs cmd on the given node through
// "oc debug node/<nodeName> -n<debugNodeNamespace> -- chroot /host <cmd...>".
//
// The debug command is retried every 3 seconds for up to 30 seconds until it
// succeeds, to ride out failures such as "error: unable to create the debug
// pod". The stdout and error of the last attempt are returned.
func DebugNodeRetryWithOptionsAndChroot(oc *CLI, nodeName string, debugNodeNamespace string, cmd ...string) (string, error) {
	debugArgs := append([]string{"node/" + nodeName, "-n" + debugNodeNamespace, "--", "chroot", "/host"}, cmd...)

	var (
		output  string
		lastErr error
	)
	// The poll's own result is intentionally discarded: lastErr already holds
	// the outcome of the final attempt, including after a timeout.
	_ = wait.Poll(3*time.Second, 30*time.Second, func() (bool, error) {
		output, _, lastErr = oc.AsAdmin().WithoutNamespace().Run("debug").Args(debugArgs...).Outputs()
		return lastErr == nil, nil
	})
	return output, lastErr
}

0 comments on commit 6229a08

Please sign in to comment.