From 93a0effc5df56245d6296b6e080832df94cc7794 Mon Sep 17 00:00:00 2001
From: machine424
Date: Wed, 24 Apr 2024 12:40:44 +0200
Subject: [PATCH] MON-3513: Add availability test for Metrics API

This should ensure the availability of the Metrics API during e2e tests
including upgrades. Thus it should also help with
https://issues.redhat.com/browse/MON-3539.

The correctness of the API: whether the right/expected content is
returned, should be tested elsewhere (we already have tests for that in
CMO, and the HPA tests already make use of that etc.). These tests only
check the availability.
---
 pkg/defaultmonitortests/types.go              |   2 +
 .../disruptionmetricsapi/monitortest.go       | 170 ++++++++++++++++++
 2 files changed, 172 insertions(+)
 create mode 100644 pkg/monitortests/monitoring/disruptionmetricsapi/monitortest.go

diff --git a/pkg/defaultmonitortests/types.go b/pkg/defaultmonitortests/types.go
index ecbab70b1261..905fbeb3bd08 100644
--- a/pkg/defaultmonitortests/types.go
+++ b/pkg/defaultmonitortests/types.go
@@ -19,6 +19,7 @@ import (
 	"github.com/openshift/origin/pkg/monitortests/kubeapiserver/disruptionnewapiserver"
 	"github.com/openshift/origin/pkg/monitortests/kubeapiserver/legacykubeapiservermonitortests"
+	"github.com/openshift/origin/pkg/monitortests/monitoring/disruptionmetricsapi"
 	"github.com/openshift/origin/pkg/monitortests/monitoring/statefulsetsrecreation"
 	"github.com/openshift/origin/pkg/monitortests/network/disruptioningress"
 	"github.com/openshift/origin/pkg/monitortests/network/disruptionpodnetwork"
 	"github.com/openshift/origin/pkg/monitortests/network/disruptionserviceloadbalancer"
@@ -118,6 +119,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI
 	monitorTestRegistry.AddMonitorTestOrDie("disruption-summary-serializer", "Test Framework", disruptionserializer.NewDisruptionSummarySerializer())
 	monitorTestRegistry.AddMonitorTestOrDie("monitoring-statefulsets-recreation", "Monitoring",
 		statefulsetsrecreation.NewStatefulsetsChecker())
+	monitorTestRegistry.AddMonitorTestOrDie("metrics-api-availability", "Monitoring", disruptionmetricsapi.NewAvailabilityInvariant())
 
 	return monitorTestRegistry
 }
diff --git a/pkg/monitortests/monitoring/disruptionmetricsapi/monitortest.go b/pkg/monitortests/monitoring/disruptionmetricsapi/monitortest.go
new file mode 100644
index 000000000000..8ee274241b88
--- /dev/null
+++ b/pkg/monitortests/monitoring/disruptionmetricsapi/monitortest.go
@@ -0,0 +1,170 @@
+package disruptionmetricsapi
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/openshift/origin/pkg/monitortestframework"
+
+	appsv1 "k8s.io/api/apps/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	utilerrors "k8s.io/apimachinery/pkg/util/errors"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+
+	"github.com/openshift/origin/pkg/monitor/backenddisruption"
+	"github.com/openshift/origin/pkg/monitor/monitorapi"
+	"github.com/openshift/origin/pkg/monitortestlibrary/disruptionlibrary"
+	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
+)
+
+const (
+	monitoringNamespace             = "openshift-monitoring"
+	prometheusAdapterDeploymentName = "prometheus-adapter"
+	metricsServerDeploymentName     = "metrics-server"
+)
+
+type availability struct {
+	disruptionCheckers []*disruptionlibrary.Availability
+	notSupportedReason error
+}
+
+func NewAvailabilityInvariant() monitortestframework.MonitorTest {
+	return &availability{}
+}
+
+func createAPIServerBackendSampler(clusterConfig *rest.Config, disruptionBackendName, url string, connectionType monitorapi.BackendConnectionType) (*backenddisruption.BackendSampler, error) {
+	backendSampler, err := backenddisruption.NewAPIServerBackend(clusterConfig, disruptionBackendName, url, connectionType)
+	if err != nil {
+		return nil, err
+	}
+	backendSampler = backendSampler.WithUserAgent(fmt.Sprintf("openshift-external-backend-sampler-%s-%s", connectionType, disruptionBackendName))
+
+	return backendSampler, nil
+}
+
+func (w *availability) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
+	var err error
+
+	// Skip for single replica Deployments.
+	kubeClient, err := kubernetes.NewForConfig(adminRESTConfig)
+	if err != nil {
+		return err
+	}
+	var deployment *appsv1.Deployment
+	deployment, err = kubeClient.AppsV1().Deployments(monitoringNamespace).Get(ctx, metricsServerDeploymentName, metav1.GetOptions{})
+	if apierrors.IsNotFound(err) {
+		// TODO: remove this in 4.17
+		deployment, err = kubeClient.AppsV1().Deployments(monitoringNamespace).Get(ctx, prometheusAdapterDeploymentName, metav1.GetOptions{})
+		if err != nil {
+			return err
+		}
+	} else if err != nil {
+		return err
+	}
+	if deployment.Spec.Replicas != nil && *deployment.Spec.Replicas == 1 {
+		w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: fmt.Sprintf("%s only has a single replica", deployment.Name)}
+		return w.notSupportedReason
+	}
+
+	disruptionBackendName := "metrics-api"
+
+	newConnectionTestName := "[sig-instrumentation] disruption/metrics-api connection/new should be available throughout the test"
+	reusedConnectionTestName := "[sig-instrumentation] disruption/metrics-api connection/reused should be available throughout the test"
+
+	// TODO: clean up/refactor following.
+
+	// For nodes metrics
+	newConnections, err := createAPIServerBackendSampler(adminRESTConfig, disruptionBackendName, "/apis/metrics.k8s.io/v1beta1/nodes", monitorapi.NewConnectionType)
+	if err != nil {
+		return err
+	}
+	reusedConnections, err := createAPIServerBackendSampler(adminRESTConfig, disruptionBackendName, "/apis/metrics.k8s.io/v1beta1/nodes", monitorapi.ReusedConnectionType)
+	if err != nil {
+		return err
+	}
+
+	w.disruptionCheckers = append(w.disruptionCheckers, disruptionlibrary.NewAvailabilityInvariant(
+		newConnectionTestName, reusedConnectionTestName,
+		newConnections, reusedConnections,
+	))
+
+	// For pods metrics, monitoringNamespace is always available and we don't want to ask cluster wide.
+	newConnections, err = createAPIServerBackendSampler(adminRESTConfig, disruptionBackendName, fmt.Sprintf("/apis/metrics.k8s.io/v1beta1/namespaces/%s/pods", monitoringNamespace), monitorapi.NewConnectionType)
+	if err != nil {
+		return err
+	}
+	reusedConnections, err = createAPIServerBackendSampler(adminRESTConfig, disruptionBackendName, fmt.Sprintf("/apis/metrics.k8s.io/v1beta1/namespaces/%s/pods", monitoringNamespace), monitorapi.ReusedConnectionType)
+	if err != nil {
+		return err
+	}
+
+	w.disruptionCheckers = append(w.disruptionCheckers, disruptionlibrary.NewAvailabilityInvariant(
+		newConnectionTestName, reusedConnectionTestName,
+		newConnections, reusedConnections,
+	))
+
+	for i := range w.disruptionCheckers {
+		if err := w.disruptionCheckers[i].StartCollection(ctx, adminRESTConfig, recorder); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func (w *availability) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
+	if w.notSupportedReason != nil {
+		return nil, nil, w.notSupportedReason
+	}
+
+	intervals := monitorapi.Intervals{}
+	junits := []*junitapi.JUnitTestCase{}
+	errs := []error{}
+
+	for i := range w.disruptionCheckers {
+		// TODO: check for nil after refactoring in StartCollection
+		localIntervals, localJunits, localErr := w.disruptionCheckers[i].CollectData(ctx)
+		intervals = append(intervals, localIntervals...)
+		junits = append(junits, localJunits...)
+		if localErr != nil {
+			errs = append(errs, localErr)
+		}
+	}
+
+	return intervals, junits, utilerrors.NewAggregate(errs)
+}
+
+func (w *availability) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
+	return nil, w.notSupportedReason
+}
+
+func (w *availability) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
+	if w.notSupportedReason != nil {
+		return nil, w.notSupportedReason
+	}
+
+	junits := []*junitapi.JUnitTestCase{}
+	errs := []error{}
+
+	for i := range w.disruptionCheckers {
+		// TODO: check for nil after refactoring in StartCollection
+		localJunits, localErr := w.disruptionCheckers[i].EvaluateTestsFromConstructedIntervals(ctx, finalIntervals)
+		junits = append(junits, localJunits...)
+		if localErr != nil {
+			errs = append(errs, localErr)
+		}
+	}
+
+	return junits, utilerrors.NewAggregate(errs)
+}
+
+func (w *availability) WriteContentToStorage(ctx context.Context, storageDir string, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
+	return w.notSupportedReason
+}
+
+func (w *availability) Cleanup(ctx context.Context) error {
+	return w.notSupportedReason
+}