Merge pull request #28110 from jeff-roche/rt-latency-tests

OCPVE-292: OCPVE-293: OCPVE-294: feat: added a realtime latency test suite
openshift · Aug 3, 2023 · 6ee9dc5 · 6ee9dc5
2 parents 4a3ca7b + 8f43ef2
commit 6ee9dc5
Show file tree

Hide file tree

Showing 6 changed files with 269 additions and 9 deletions.
diff --git a/pkg/testsuites/standard_suites.go b/pkg/testsuites/standard_suites.go
@@ -367,4 +367,17 @@ var staticSuites = []ginkgo.TestSuite{
 		},
 		TestTimeout: 30 * time.Minute,
 	},
+	{
+		Name: "openshift/nodes/realtime/latency",
+		Description: templates.LongDesc(`
+		This test suite runs tests to validate realtime latency on nodes.
+		`),
+		Matches: func(name string) bool {
+			if isDisabled(name) {
+				return false
+			}
+			return strings.Contains(name, "[Suite:openshift/nodes/realtime/latency")
+		},
+		TestTimeout: 30 * time.Minute,
+	},
 }
diff --git a/test/extended/kernel/OWNERS b/test/extended/kernel/OWNERS
@@ -0,0 +1,6 @@
+approvers:
+- eggfoobar
+- jeff-roche
+- jerpeter1
+- jakobmoellerdev
+
diff --git a/test/extended/kernel/kernel_rt_pi_stress.go → test/extended/kernel/kernel_rt_functional.go b/test/extended/kernel/kernel_rt_pi_stress.go → test/extended/kernel/kernel_rt_functional.go
@@ -21,21 +21,18 @@ var _ = g.Describe("[sig-node][Suite:openshift/nodes/realtime][Disruptive] Real
 		startRtTestPod(oc)
 	})
 
-	g.It("pi_stress to run successfully with the default algorithm", func() {
-		args := []string{rtPodName, "--", "pi_stress", "--duration=600", "--groups=1"}
-		_, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output()
+	g.It("pi_stress to run successfully with the fifo algorithm", func() {
+		err := runPiStressFifo(oc)
 		o.Expect(err).NotTo(o.HaveOccurred(), "error occured running pi_stress with the fifo algorithm")
 	})
 
 	g.It("pi_stress to run successfully with the round robin algorithm", func() {
-		args := []string{rtPodName, "--", "pi_stress", "--duration=600", "--groups=1", "--rr"}
-		_, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output()
+		err := runPiStressRR(oc)
 		o.Expect(err).NotTo(o.HaveOccurred(), "error occured running pi_stress with the round robin algorithm")
 	})
 
 	g.It("deadline_test to run successfully", func() {
-		args := []string{rtPodName, "--", "deadline_test"}
-		_, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output()
+		err := runDeadlineTest(oc)
 		o.Expect(err).NotTo(o.HaveOccurred(), "error occured running deadline_test")
 	})
 
@@ -46,5 +43,4 @@ var _ = g.Describe("[sig-node][Suite:openshift/nodes/realtime][Disruptive] Real
 	g.AfterAll(func() {
 		cleanupRealtimeTestEnvironment(oc)
 	})
-
 })
diff --git a/test/extended/kernel/kernel_rt_latency.go b/test/extended/kernel/kernel_rt_latency.go
@@ -0,0 +1,52 @@
+package kernel
+
+import (
+	g "github.com/onsi/ginkgo/v2"
+	o "github.com/onsi/gomega"
+	exutil "github.com/openshift/origin/test/extended/util"
+)
+
+// Realtime latency suite. The container is ordered: the realtime environment
+// is configured once up front and cleaned up once at the end, while
+// startRtTestPod / cleanupRtTestPod bracket every individual spec. Each spec
+// runs one latency tool through its helper and fails on any returned error.
+var _ = g.Describe("[sig-node][Suite:openshift/nodes/realtime/latency][Disruptive] Real time kernel should meet latency requirements when tested with", g.Ordered, func() {
+	defer g.GinkgoRecover()
+	var (
+		oc = exutil.NewCLI(rtNamespace).AsAdmin()
+	)
+
+	g.BeforeAll(func() {
+		failIfNotRT(oc)
+		configureRealtimeTestEnvironment(oc)
+	})
+
+	g.BeforeEach(func() {
+		startRtTestPod(oc)
+	})
+
+	g.It("hwlatdetect", func() {
+		err := runHwlatdetect(oc)
+		o.Expect(err).NotTo(o.HaveOccurred(), "error occurred running hwlatdetect")
+	})
+
+	g.It("oslat", func() {
+		// The helper derives its CPU list from the number of online processors in the pod.
+		cpuCount, err := getProcessorCount(oc)
+		o.Expect(err).NotTo(o.HaveOccurred(), "unable to get the number of processors online")
+
+		err = runOslat(cpuCount, oc)
+		o.Expect(err).NotTo(o.HaveOccurred(), "error occurred running oslat")
+	})
+
+	g.It("cyclictest", func() {
+		// The helper derives thread count and affinity from the number of online processors in the pod.
+		cpuCount, err := getProcessorCount(oc)
+		o.Expect(err).NotTo(o.HaveOccurred(), "unable to get the number of processors online")
+
+		err = runCyclictest(cpuCount, oc)
+		o.Expect(err).NotTo(o.HaveOccurred(), "error occurred running cyclictest")
+	})
+
+	g.AfterEach(func() {
+		cleanupRtTestPod(oc)
+	})
+
+	g.AfterAll(func() {
+		cleanupRealtimeTestEnvironment(oc)
+	})
+})
diff --git a/test/extended/kernel/tools.go b/test/extended/kernel/tools.go
@@ -0,0 +1,187 @@
+package kernel
+
+import (
+	"encoding/json"
+	"fmt"
+	"sort"
+	"strconv"
+
+	exutil "github.com/openshift/origin/test/extended/util"
+	"github.com/pkg/errors"
+)
+
+// Latency ceilings for the realtime latency tools, expressed in microseconds
+// (per the "usec" suffix of each name).
+const (
+	hwlatdetectThresholdusec = 5000 // handed to hwlatdetect as --threshold=<n>us
+	oslatThresholdusec       = 5000 // compared against each thread's "max" in the oslat report
+	cyclictestThresholdusec  = 5000 // handed to cyclictest as --breaktrace and compared against each thread's "max"
+)
+
+// runPiStressFifo execs `pi_stress --duration=600 --groups=1` inside the
+// realtime test pod and returns the exec error, if any. The command output is
+// discarded.
+func runPiStressFifo(oc *exutil.CLI) error {
+	_, err := oc.SetNamespace(rtNamespace).
+		Run("exec").
+		Args(rtPodName, "--", "pi_stress", "--duration=600", "--groups=1").
+		Output()
+	return err
+}
+
+// runPiStressRR execs `pi_stress --duration=600 --groups=1 --rr` inside the
+// realtime test pod and returns the exec error, if any. The command output is
+// discarded.
+func runPiStressRR(oc *exutil.CLI) error {
+	cmd := []string{
+		rtPodName, "--",
+		"pi_stress", "--duration=600", "--groups=1", "--rr",
+	}
+	_, execErr := oc.SetNamespace(rtNamespace).Run("exec").Args(cmd...).Output()
+	return execErr
+}
+
+// runDeadlineTest execs `deadline_test` with no extra flags inside the
+// realtime test pod and returns the exec error, if any.
+func runDeadlineTest(oc *exutil.CLI) error {
+	_, err := oc.SetNamespace(rtNamespace).
+		Run("exec").
+		Args(rtPodName, "--", "deadline_test").
+		Output()
+	return err
+}
+
+// runHwlatdetect execs hwlatdetect inside the realtime test pod with the
+// threshold taken from hwlatdetectThresholdusec. A failing exec is returned
+// wrapped; per the original author's note it means either the threshold was
+// exceeded or the tool itself had a problem.
+func runHwlatdetect(oc *exutil.CLI) error {
+	threshold := fmt.Sprintf("--threshold=%dus", hwlatdetectThresholdusec)
+	cmd := []string{rtPodName, "--", "hwlatdetect", "--duration=600s", "--window=1s", "--width=500ms", threshold}
+
+	if _, err := oc.SetNamespace(rtNamespace).Run("exec").Args(cmd...).Output(); err != nil {
+		return errors.Wrap(err, "error running hwlatdetect")
+	}
+	return nil
+}
+
+func runOslat(cpuCount int, oc *exutil.CLI) error {
+	oslatReportFile := "/tmp/oslatresults.json"
+
+	// Make sure there is enough hardware for this test
+	if cpuCount <= 4 {
+		return fmt.Errorf("more than 4 cores are required to run this oslat test. Found %d cores", cpuCount)
+	}
+
+	// Run the test
+	args := []string{rtPodName, "--", "oslat", "--cpu-list", fmt.Sprintf("4-%d", cpuCount-1), "--rtprio", "1", "--duration", "600", "--json", oslatReportFile}
+	_, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output()
+	if err != nil {
+		return errors.Wrap(err, "error running oslat")
+	}
+
+	// Get the results
+	args = []string{rtPodName, "--", "cat", oslatReportFile}
+	report, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output()
+	if err != nil {
+		return errors.Wrap(err, "error retrieving oslat results")
+	}
+
+	// Parse the results and return any errors detected
+	if err = parseOslatResults(report, oslatThresholdusec); err != nil {
+		return errors.Wrap(err, "error parsing oslat report")
+	}
+
+	return nil
+}
+
+// parseOslatResults decodes an oslat JSON report and compares every thread's
+// "max" value against maxThresholdusec.
+//
+// It returns an error when the report is not valid JSON, when the "thread"
+// object is missing or empty, or when at least one thread's max is strictly
+// greater than the threshold. In the last case the error lists the offending
+// CPUs in ascending order. It returns nil when every thread is within the
+// threshold.
+func parseOslatResults(jsonReport string, maxThresholdusec int) error {
+	var oslatReport struct {
+		Threads map[string]struct {
+			Cpu int `json:"cpu"`
+			Max int `json:"max"`
+		} `json:"thread"`
+	}
+
+	// Parse the data
+	if err := json.Unmarshal([]byte(jsonReport), &oslatReport); err != nil {
+		return errors.Wrap(err, "unable to decode oslat report json")
+	}
+
+	if len(oslatReport.Threads) == 0 {
+		return fmt.Errorf("no thread reports found")
+	}
+
+	failedCPUs := make([]int, 0, len(oslatReport.Threads)) // Report all failed cores
+	for _, thread := range oslatReport.Threads {
+		if thread.Max > maxThresholdusec {
+			failedCPUs = append(failedCPUs, thread.Cpu)
+		}
+	}
+
+	if len(failedCPUs) > 0 {
+		// Map iteration order is unspecified; sort so the same report always
+		// produces the same failure message.
+		sort.Ints(failedCPUs)
+		return fmt.Errorf("the following CPUs were over the max latency threshold: %v", failedCPUs)
+	}
+
+	return nil
+}
+
+// runCyclictest runs cyclictest in the realtime test pod with measurement
+// threads pinned to CPUs 4..cpuCount-1, reads back the JSON report the tool
+// wrote inside the pod, and validates it against cyclictestThresholdusec.
+//
+// It returns an error when cpuCount is 4 or less, when either exec fails, or
+// when parseCyclictestResults rejects the report.
+func runCyclictest(cpuCount int, oc *exutil.CLI) error {
+	cyclictestReportFile := "/tmp/cyclictestresults.json"
+
+	// Make sure there is enough hardware for this test
+	if cpuCount <= 4 {
+		return fmt.Errorf("more than 4 cores are required to run cyclictest. Found %d cores", cpuCount)
+	}
+
+	// Run the test
+	// NOTE(review): with exactly 5 CPUs this passes --threads=0; confirm that
+	// cyclictest accepts a zero thread count or tighten the guard above.
+	args := []string{
+		rtPodName, "--", "cyclictest",
+		"--duration=10m",
+		"--priority=95",
+		fmt.Sprintf("--threads=%d", cpuCount-5),
+		fmt.Sprintf("--affinity=4-%d", cpuCount-1),
+		"--interval=1000",
+		fmt.Sprintf("--breaktrace=%d", cyclictestThresholdusec),
+		"--mainaffinity=4",
+		"-m",
+		fmt.Sprintf("--json=%s", cyclictestReportFile),
+	}
+	_, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output()
+	if err != nil {
+		return errors.Wrap(err, "error running cyclictest")
+	}
+
+	// Gather the results
+	args = []string{rtPodName, "--", "cat", cyclictestReportFile}
+	report, err := oc.SetNamespace(rtNamespace).Run("exec").Args(args...).Output()
+	if err != nil {
+		return errors.Wrap(err, "error retrieving cyclictest results")
+	}
+
+	// Parse the results and return any errors detected
+	if err = parseCyclictestResults(report, cyclictestThresholdusec); err != nil {
+		return errors.Wrap(err, "error parsing cyclictest report")
+	}
+
+	return nil
+}
+
+// parseCyclictestResults decodes a cyclictest JSON report and compares every
+// thread's "max" value against maxThresholdusec.
+//
+// It returns an error when the report is not valid JSON, when the "thread"
+// object is missing or empty, or when at least one thread's max is strictly
+// greater than the threshold. In the last case the error lists the offending
+// CPUs in ascending order. It returns nil when every thread is within the
+// threshold.
+func parseCyclictestResults(jsonReport string, maxThresholdusec int) error {
+	var cyclictestReport struct {
+		Threads map[string]struct {
+			Cpu int `json:"cpu"`
+			Max int `json:"max"`
+		} `json:"thread"`
+	}
+
+	// Parse the data
+	if err := json.Unmarshal([]byte(jsonReport), &cyclictestReport); err != nil {
+		return errors.Wrap(err, "unable to decode cyclictest report json")
+	}
+
+	if len(cyclictestReport.Threads) == 0 {
+		return fmt.Errorf("no thread reports found")
+	}
+
+	failedCPUs := make([]int, 0, len(cyclictestReport.Threads)) // Report all failed cores
+	for _, thread := range cyclictestReport.Threads {
+		if thread.Max > maxThresholdusec {
+			failedCPUs = append(failedCPUs, thread.Cpu)
+		}
+	}
+
+	if len(failedCPUs) > 0 {
+		// Map iteration order is unspecified; sort so the same report always
+		// produces the same failure message.
+		sort.Ints(failedCPUs)
+		return fmt.Errorf("the following CPUs were over the max latency threshold: %v", failedCPUs)
+	}
+
+	return nil
+}
+
+// getProcessorCount execs `getconf _NPROCESSORS_ONLN` inside the realtime test
+// pod and returns its output converted to an int. It returns 0 and the
+// underlying error when the exec fails or the output is not a valid integer.
+func getProcessorCount(oc *exutil.CLI) (int, error) {
+	out, execErr := oc.SetNamespace(rtNamespace).
+		Run("exec").
+		Args(rtPodName, "--", "getconf", "_NPROCESSORS_ONLN").
+		Output()
+	if execErr != nil {
+		return 0, execErr
+	}
+
+	// Convert the textual count; any conversion failure yields 0 and the error.
+	n, convErr := strconv.Atoi(out)
+	if convErr != nil {
+		return 0, convErr
+	}
+	return n, nil
+}
diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go