From 432ee31b00f9d8dd954cffef828c6873a25a7bd0 Mon Sep 17 00:00:00 2001
From: machine424
Date: Thu, 26 Dec 2024 12:45:48 +0100
Subject: [PATCH] Add disruption check for Thanos Querier API

It is uncertain whether the SLO is achievable with the current
configuration (a check every second); API adjustments may be needed to
improve reliability, or we may need to relax the check.

The check relies on a Route, which could increase the likelihood of
false positives.

Auth isn't set up; we only check for a 401. A 5xx would indicate an
issue with the Route's backend (the Thanos Querier pods). We may be
able to get token-based auth working with little effort.
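For illustration, the availability semantics the sampler relies on boil
down to the following (hypothetical sketch, not part of this change;
classifyProbe and routeHost are made-up names, and TLS/CA setup is
omitted):

    package probe

    import "net/http"

    // classifyProbe classifies a single unauthenticated probe of the
    // Thanos Querier Route's /api path.
    func classifyProbe(routeHost string) string {
        resp, err := http.Get("https://" + routeHost + "/api")
        if err != nil {
            return "disruption" // connection-level failure
        }
        defer resp.Body.Close()
        if resp.StatusCode == http.StatusUnauthorized {
            return "available" // 401: the Route reached a live backend
        }
        if resp.StatusCode >= 500 {
            return "disruption" // 5xx: Thanos Querier pods are unhealthy
        }
        return "available"
    }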
---
 pkg/defaultmonitortests/types.go              |   2 +
 .../disruptionthanosquerierapi/monitortest.go | 126 ++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 pkg/monitortests/monitoring/disruptionthanosquerierapi/monitortest.go

diff --git a/pkg/defaultmonitortests/types.go b/pkg/defaultmonitortests/types.go
index ad172476409a..a252ff0249e2 100644
--- a/pkg/defaultmonitortests/types.go
+++ b/pkg/defaultmonitortests/types.go
@@ -23,6 +23,7 @@ import (
 	"github.com/openshift/origin/pkg/monitortests/kubeapiserver/legacykubeapiservermonitortests"
 	"github.com/openshift/origin/pkg/monitortests/machines/watchmachines"
 	"github.com/openshift/origin/pkg/monitortests/monitoring/disruptionmetricsapi"
+	"github.com/openshift/origin/pkg/monitortests/monitoring/disruptionthanosquerierapi"
 	"github.com/openshift/origin/pkg/monitortests/monitoring/statefulsetsrecreation"
 	"github.com/openshift/origin/pkg/monitortests/network/disruptioningress"
 	"github.com/openshift/origin/pkg/monitortests/network/disruptionpodnetwork"
@@ -131,6 +132,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI
 	monitorTestRegistry.AddMonitorTestOrDie("monitoring-statefulsets-recreation", "Monitoring", statefulsetsrecreation.NewStatefulsetsChecker())
 	monitorTestRegistry.AddMonitorTestOrDie("metrics-api-availability", "Monitoring", disruptionmetricsapi.NewAvailabilityInvariant())
+	monitorTestRegistry.AddMonitorTestOrDie("thanos-querier-api-availability", "Monitoring", disruptionthanosquerierapi.NewAvailabilityInvariant())
 
 	monitorTestRegistry.AddMonitorTestOrDie(apiunreachablefromclientmetrics.MonitorName, "kube-apiserver", apiunreachablefromclientmetrics.NewMonitorTest())
 	monitorTestRegistry.AddMonitorTestOrDie(faultyloadbalancer.MonitorName, "kube-apiserver", faultyloadbalancer.NewMonitorTest())
diff --git a/pkg/monitortests/monitoring/disruptionthanosquerierapi/monitortest.go b/pkg/monitortests/monitoring/disruptionthanosquerierapi/monitortest.go
new file mode 100644
index 000000000000..a19530d8a406
--- /dev/null
+++ b/pkg/monitortests/monitoring/disruptionthanosquerierapi/monitortest.go
@@ -0,0 +1,126 @@
+package disruptionthanosquerierapi
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/openshift/origin/pkg/monitortestframework"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+
+	"github.com/openshift/origin/pkg/monitor/backenddisruption"
+	"github.com/openshift/origin/pkg/monitor/monitorapi"
+	"github.com/openshift/origin/pkg/monitortestlibrary/disruptionlibrary"
+	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
+)
+
+const (
+	monitoringNamespace = "openshift-monitoring"
+	thanosQuerierName   = "thanos-querier"
+)
+
+type availability struct {
+	disruptionChecker  *disruptionlibrary.Availability
+	notSupportedReason error
+}
+
+func NewAvailabilityInvariant() monitortestframework.MonitorTest {
+	return &availability{}
+}
+
+// createRouteBackendSampler builds a sampler that probes the given Route path
+// with the given connection type.
+func createRouteBackendSampler(clusterConfig *rest.Config, namespace, name, disruptionBackendName, path string, connectionType monitorapi.BackendConnectionType) (*backenddisruption.BackendSampler, error) {
+	backendSampler := backenddisruption.NewRouteBackend(
+		clusterConfig,
+		namespace,
+		name,
+		disruptionBackendName,
+		path,
+		connectionType).
+		WithUserAgent(fmt.Sprintf("openshift-external-backend-sampler-%s-%s", connectionType, disruptionBackendName)).
+		// Auth isn't configured. An Unauthorized response should be enough to indicate that the Route's backend is reachable.
+		WithExpectedStatusCode(401)
+	return backendSampler, nil
+}
+
+func (w *availability) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
+	kubeClient, err := kubernetes.NewForConfig(adminRESTConfig)
+	if err != nil {
+		return err
+	}
+
+	deploymentScale, err := kubeClient.AppsV1().Deployments(monitoringNamespace).GetScale(ctx, thanosQuerierName, metav1.GetOptions{})
+	if err != nil {
+		return err
+	}
+	// Skip single-replica Deployments: disruption is expected there.
+	if deploymentScale.Spec.Replicas == 1 {
+		w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: fmt.Sprintf("%s only has a single replica", deploymentScale.Name)}
+		return w.notSupportedReason
+	}
+
+	disruptionBackendName := "thanos-querier-api"
+	newConnectionTestName := fmt.Sprintf("[sig-instrumentation] disruption/%s connection/new should be available throughout the test", disruptionBackendName)
+	reusedConnectionTestName := fmt.Sprintf("[sig-instrumentation] disruption/%s connection/reused should be available throughout the test", disruptionBackendName)
+	path := "/api"
+
+	newConnections, err := createRouteBackendSampler(adminRESTConfig, monitoringNamespace, thanosQuerierName, disruptionBackendName, path, monitorapi.NewConnectionType)
+	if err != nil {
+		return err
+	}
+	reusedConnections, err := createRouteBackendSampler(adminRESTConfig, monitoringNamespace, thanosQuerierName, disruptionBackendName, path, monitorapi.ReusedConnectionType)
+	if err != nil {
+		return err
+	}
+
+	w.disruptionChecker = disruptionlibrary.NewAvailabilityInvariant(
+		newConnectionTestName, reusedConnectionTestName,
+		newConnections, reusedConnections,
+	)
+
+	if err := w.disruptionChecker.StartCollection(ctx, adminRESTConfig, recorder); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (w *availability) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
+	if w.notSupportedReason != nil {
+		return nil, nil, w.notSupportedReason
+	}
+	// Setup failed and the error was already reported from StartCollection.
+	if w.disruptionChecker == nil {
+		return nil, nil, nil
+	}
+
+	return w.disruptionChecker.CollectData(ctx)
+}
+
+func (w *availability) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
+	return nil, w.notSupportedReason
+}
+
+func (w *availability) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
+	if w.notSupportedReason != nil {
+		return nil, w.notSupportedReason
+	}
+	// Setup failed and the error was already reported from StartCollection.
+	if w.disruptionChecker == nil {
+		return nil, nil
+	}
+
+	return w.disruptionChecker.EvaluateTestsFromConstructedIntervals(ctx, finalIntervals)
+}
+
+func (w *availability) WriteContentToStorage(ctx context.Context, storageDir string, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
+	return w.notSupportedReason
+}
+
+func (w *availability) Cleanup(ctx context.Context) error {
+	return w.notSupportedReason
+}
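+
+// Possible follow-up (hypothetical sketch, untested): instead of expecting a
+// 401 from an unauthenticated probe, the check could authenticate with a
+// ServiceAccount token and expect a 200, which would also exercise the query
+// path end to end. With a plain HTTP client (routeHost and
+// serviceAccountToken are illustrative names only):
+//
+//	req, err := http.NewRequest("GET", "https://"+routeHost+"/api/v1/query?query=up", nil)
+//	if err != nil {
+//		return err
+//	}
+//	req.Header.Set("Authorization", "Bearer "+serviceAccountToken)
+//	resp, err := http.DefaultClient.Do(req)
+//
+// Whether BackendSampler already exposes an auth hook for this has not been
+// verified here.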