Skip to content

Commit

Permalink
Add disruption check for Thanos Querier API
Browse files Browse the repository at this point in the history
It is uncertain whether the SLO is achievable with the current config (a check every second).
API adjustments may be needed to improve reliability, or we may need to loosen the check.

The check goes through the Route, and that extra dependency could increase the likelihood of false positives.

Auth isn't configured; we only check for a 401 response. A 5xx would indicate issues with the Route backend (the Thanos Querier Pods).
We may be able to easily get token-based auth to work.
  • Loading branch information
machine424 committed Dec 26, 2024
1 parent 929fc7c commit 432ee31
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 0 deletions.
2 changes: 2 additions & 0 deletions pkg/defaultmonitortests/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/openshift/origin/pkg/monitortests/kubeapiserver/legacykubeapiservermonitortests"
"github.com/openshift/origin/pkg/monitortests/machines/watchmachines"
"github.com/openshift/origin/pkg/monitortests/monitoring/disruptionmetricsapi"
"github.com/openshift/origin/pkg/monitortests/monitoring/disruptionthanosquerierapi"
"github.com/openshift/origin/pkg/monitortests/monitoring/statefulsetsrecreation"
"github.com/openshift/origin/pkg/monitortests/network/disruptioningress"
"github.com/openshift/origin/pkg/monitortests/network/disruptionpodnetwork"
Expand Down Expand Up @@ -131,6 +132,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI

monitorTestRegistry.AddMonitorTestOrDie("monitoring-statefulsets-recreation", "Monitoring", statefulsetsrecreation.NewStatefulsetsChecker())
monitorTestRegistry.AddMonitorTestOrDie("metrics-api-availability", "Monitoring", disruptionmetricsapi.NewAvailabilityInvariant())
monitorTestRegistry.AddMonitorTestOrDie("thanos-querier-api-availability", "Monitoring", disruptionthanosquerierapi.NewAvailabilityInvariant())
monitorTestRegistry.AddMonitorTestOrDie(apiunreachablefromclientmetrics.MonitorName, "kube-apiserver", apiunreachablefromclientmetrics.NewMonitorTest())
monitorTestRegistry.AddMonitorTestOrDie(faultyloadbalancer.MonitorName, "kube-apiserver", faultyloadbalancer.NewMonitorTest())

Expand Down
126 changes: 126 additions & 0 deletions pkg/monitortests/monitoring/disruptionthanosquerierapi/monitortest.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package disruptionthanosquerierapi

import (
"context"
"fmt"
"time"

"github.com/openshift/origin/pkg/monitortestframework"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"

"github.com/openshift/origin/pkg/monitor/backenddisruption"
"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/monitortestlibrary/disruptionlibrary"
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
)

const (
	// monitoringNamespace is the namespace in which the Thanos Querier
	// Deployment is looked up and which is handed to the Route backend sampler.
	monitoringNamespace = "openshift-monitoring"

	// thanosQuerierName is used both as the Deployment name when reading the
	// replica count and as the name passed to the Route backend sampler.
	thanosQuerierName = "thanos-querier"
)

// availability is the monitor test that samples the Thanos Querier API and
// reports on its disruption. The zero value is ready for StartCollection.
type availability struct {
	// notSupportedReason is set by StartCollection when the test is skipped
	// (single-replica Deployment). Every later phase returns it as-is.
	notSupportedReason error

	// disruptionChecker performs the actual sampling and evaluation. It stays
	// nil until StartCollection has successfully built the samplers.
	disruptionChecker *disruptionlibrary.Availability
}

// NewAvailabilityInvariant returns a zero-valued availability monitor test.
// Its state is populated later by StartCollection.
func NewAvailabilityInvariant() monitortestframework.MonitorTest {
	return new(availability)
}

// createRouteBackendSampler builds a backend sampler through
// backenddisruption.NewRouteBackend for the given namespace/name, request path
// and connection type.
//
// The sampler's user agent embeds the connection type and
// disruptionBackendName, and it expects an HTTP 401 from the backend. The
// returned error is currently always nil.
func createRouteBackendSampler(clusterConfig *rest.Config, namespace, name, disruptionBackendName, path string, connectionType monitorapi.BackendConnectionType) (*backenddisruption.BackendSampler, error) {
	userAgent := fmt.Sprintf("openshift-external-backend-sampler-%s-%s", connectionType, disruptionBackendName)

	sampler := backenddisruption.NewRouteBackend(clusterConfig, namespace, name, disruptionBackendName, path, connectionType).
		WithUserAgent(userAgent).
		// No credentials are sent, so Unauthorized is the "healthy" answer: it
		// shows that the Route's backend is reachable and responding.
		WithExpectedStatusCode(401)

	return sampler, nil
}

// StartCollection sets up and starts disruption sampling of the Thanos Querier
// API through its Route, for both new and reused connections.
//
// When the thanos-querier Deployment is scaled to exactly one replica the test
// is skipped: a *monitortestframework.NotSupportedError is stored on the
// receiver and returned unwrapped. Any other failure is returned wrapped with
// the step that failed, and leaves disruptionChecker nil.
func (w *availability) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
	const (
		disruptionBackendName = "thanos-querier-api"
		path                  = "/api"
	)

	kubeClient, err := kubernetes.NewForConfig(adminRESTConfig)
	if err != nil {
		return fmt.Errorf("creating kube client: %w", err)
	}

	deploymentScale, err := kubeClient.AppsV1().Deployments(monitoringNamespace).GetScale(ctx, thanosQuerierName, metav1.GetOptions{})
	if err != nil {
		return fmt.Errorf("getting scale of deployment %s/%s: %w", monitoringNamespace, thanosQuerierName, err)
	}
	// Skip for single replica Deployments.
	if deploymentScale.Spec.Replicas == 1 {
		// Returned as-is (not wrapped) and remembered so the later phases
		// report the same reason.
		w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: fmt.Sprintf("%s only has a single replica", deploymentScale.Name)}
		return w.notSupportedReason
	}

	newConnectionTestName := fmt.Sprintf("[sig-instrumentation] disruption/%s connection/new should be available throughout the test", disruptionBackendName)
	reusedConnectionTestName := fmt.Sprintf("[sig-instrumentation] disruption/%s connection/reused should be available throughout the test", disruptionBackendName)

	newConnections, err := createRouteBackendSampler(adminRESTConfig, monitoringNamespace, thanosQuerierName, disruptionBackendName, path, monitorapi.NewConnectionType)
	if err != nil {
		return fmt.Errorf("creating new-connection sampler for %s: %w", disruptionBackendName, err)
	}
	reusedConnections, err := createRouteBackendSampler(adminRESTConfig, monitoringNamespace, thanosQuerierName, disruptionBackendName, path, monitorapi.ReusedConnectionType)
	if err != nil {
		return fmt.Errorf("creating reused-connection sampler for %s: %w", disruptionBackendName, err)
	}

	w.disruptionChecker = disruptionlibrary.NewAvailabilityInvariant(
		newConnectionTestName, reusedConnectionTestName,
		newConnections, reusedConnections,
	)

	if err := w.disruptionChecker.StartCollection(ctx, adminRESTConfig, recorder); err != nil {
		return fmt.Errorf("starting disruption collection for %s: %w", disruptionBackendName, err)
	}
	return nil
}

// CollectData returns the intervals and JUnit results gathered by the
// disruption checker.
//
// If the test was marked unsupported, that reason is returned as the error. If
// the checker was never created, it returns all nils. storageDir, beginning
// and end are not used.
func (w *availability) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
	switch {
	case w.notSupportedReason != nil:
		return nil, nil, w.notSupportedReason
	case w.disruptionChecker == nil:
		// Setup failed and already reported its error; nothing to collect.
		return nil, nil, nil
	default:
		return w.disruptionChecker.CollectData(ctx)
	}
}

// ConstructComputedIntervals computes no additional intervals. It only
// reports the not-supported reason, if one was recorded during setup.
func (w *availability) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
	if w.notSupportedReason != nil {
		return nil, w.notSupportedReason
	}
	return nil, nil
}

// EvaluateTestsFromConstructedIntervals hands finalIntervals to the disruption
// checker and returns its JUnit results.
//
// If the test was marked unsupported, that reason is returned as the error. If
// the checker was never created, it returns nil, nil.
func (w *availability) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
	if reason := w.notSupportedReason; reason != nil {
		return nil, reason
	}

	checker := w.disruptionChecker
	if checker == nil {
		// Setup failed and already reported its error; nothing to evaluate.
		return nil, nil
	}
	return checker.EvaluateTestsFromConstructedIntervals(ctx, finalIntervals)
}

// WriteContentToStorage writes nothing. It only reports the not-supported
// reason, if one was recorded during setup.
func (w *availability) WriteContentToStorage(ctx context.Context, storageDir string, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
	if w.notSupportedReason != nil {
		return w.notSupportedReason
	}
	return nil
}

// Cleanup performs no teardown. It only reports the not-supported reason, if
// one was recorded during setup.
func (w *availability) Cleanup(ctx context.Context) error {
	if w.notSupportedReason != nil {
		return w.notSupportedReason
	}
	return nil
}

0 comments on commit 432ee31

Please sign in to comment.