Skip to content

OSD-18645 - Initial implementation for CannotRetrieveUpdatesSRE #404

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions pkg/investigations/cannotretrieveupdatessre/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# cannotretrieveupdatessre Investigation

Investigates the CannotRetrieveUpdatesSRE alert by running the network verifier and checking the cluster version status, then adds the investigation details as a note to the PagerDuty alert.

## Investigation Logic

The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks:
1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints.
2. **ClusterVersion Check**: Examines the `RetrievedUpdates` condition of the `ClusterVersion` resource; when its status is `False`, the condition's reason (such as `VersionNotFound`) and message are reported as an update retrieval failure.

## Testing

Refer to the [testing README](./testing/README.md) for instructions on testing this investigation.
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
package cannotretrieveupdatessre

import (
"context"
"errors"
"fmt"
"strings"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
// alertname is the alert name returned by Name, embedded in Description and
// matched against incoming alert titles in ShouldInvestigateAlert.
alertname = "CannotRetrieveUpdatesSRE"
// remediationName is the name Run passes to k8sclient.New when it creates
// the k8s client for the cluster.
remediationName = "CannotRetrieveUpdatesSRE"
)

// Investigation is the CannotRetrieveUpdatesSRE investigation. It carries no
// state; all inputs are supplied to Run through investigation.Resources.
type Investigation struct{}

// Run executes the investigation for the CannotRetrieveUpdatesSRE alert.
//
// It creates a k8s client for the cluster, runs the network verifier, reads
// the ClusterVersion and checks it for update retrieval failures. Findings are
// collected in a note; a failure of the network verifier or of the
// ClusterVersion lookup is recorded in that note as a warning and does not
// abort the investigation. The incident is then escalated with the note.
//
// The results are named on purpose: the deferred cleanup joins an error from
// k8scli.Clean into err, and only a named result makes that visible to the
// caller. The only error returned without escalation is a failure to create
// the k8s client.
func (i *Investigation) Run(r *investigation.Resources) (result investigation.InvestigationResult, err error) {
	notes := notewriter.New(alertname, logging.RawLogger)

	k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
	if err != nil {
		return result, fmt.Errorf("unable to initialize k8s cli: %w", err)
	}
	defer func() {
		if cleanErr := k8scli.Clean(); cleanErr != nil {
			logging.Error(cleanErr)
			// err is the named result, so the cleanup failure is returned
			// together with whatever the return statement produced.
			err = errors.Join(err, cleanErr)
		}
	}()

	// Run network verifier. Its error is kept in a dedicated variable so it
	// cannot leak into the returned error; it is only reported in the note.
	verifierResult, failureReason, verifierErr := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
	if verifierErr != nil {
		notes.AppendWarning("NetworkVerifier failed to run:\n\t %s", verifierErr.Error())
	} else {
		switch verifierResult {
		case networkverifier.Failure:
			result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
			notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s", r.Cluster.ID(), failureReason)
		case networkverifier.Success:
			notes.AppendSuccess("Network verifier passed")
		}
	}

	// Check ClusterVersion. A lookup failure is reported in the note only.
	clusterVersion, cvErr := getClusterVersion(k8scli)
	if cvErr != nil {
		notes.AppendWarning("Failed to get ClusterVersion: %s", cvErr.Error())
	} else {
		notes.AppendSuccess("ClusterVersion found: %s", clusterVersion.Status.Desired.Version)

		if cvIssue := getUpdateRetrievalFailures(clusterVersion); cvIssue != "" {
			logging.Warnf("Detected ClusterVersion issue: %s", cvIssue)
			// The condition reason is not always VersionNotFound, so the
			// note states the version and channel without claiming a cause.
			notes.AppendWarning("ClusterVersion related issue detected: %s. Desired version: %s, channel: %s",
				cvIssue, clusterVersion.Status.Desired.Version, clusterVersion.Spec.Channel)
		}
	}

	notes.AppendWarning("Alert escalated to on-call primary for review and please check the ClusterVersion.")
	return result, r.PdClient.EscalateIncidentWithNote(notes.String())
}

// getClusterVersion reads the ClusterVersion object named "version" through
// the given client. If the lookup fails it returns a nil ClusterVersion and
// the client error wrapped with context.
func getClusterVersion(k8scli client.Client) (*configv1.ClusterVersion, error) {
	var cv configv1.ClusterVersion
	key := client.ObjectKey{Name: "version"}
	if err := k8scli.Get(context.TODO(), key, &cv); err != nil {
		return nil, fmt.Errorf("failed to get ClusterVersion: %w", err)
	}
	return &cv, nil
}

// getUpdateRetrievalFailures walks the ClusterVersion status conditions in
// order and returns the message for the first one that checkCondition reports
// as an update retrieval failure. It returns an empty string when none match.
func getUpdateRetrievalFailures(clusterVersion *configv1.ClusterVersion) string {
	conditions := clusterVersion.Status.Conditions
	for idx := range conditions {
		if failure, ok := checkCondition(conditions[idx]); ok {
			return failure
		}
	}
	return ""
}

// checkCondition reports whether condition is a "RetrievedUpdates" condition
// whose status is False. When it is, the returned string combines the
// condition's reason and message and the boolean is true; for every other
// condition it returns ("", false).
func checkCondition(condition configv1.ClusterOperatorStatusCondition) (string, bool) {
	retrievalFailed := condition.Type == "RetrievedUpdates" && condition.Status == configv1.ConditionFalse
	if !retrievalFailed {
		return "", false
	}
	return fmt.Sprintf("(Reason: %s). %s", condition.Reason, condition.Message), true
}

// Name returns the name of this investigation, which is the alertname
// constant ("CannotRetrieveUpdatesSRE").
func (i *Investigation) Name() string {
return alertname
}

// Description returns a one-line, human-readable summary of what this
// investigation does, with the handled alert name quoted inside it.
func (i *Investigation) Description() string {
	return "Investigates '" + alertname + "' alerts by running network verifier and checking ClusterVersion"
}

// ShouldInvestigateAlert reports whether this investigation applies to the
// given alert string. The match is a substring test: it is true whenever
// alert contains the alertname constant anywhere.
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
return strings.Contains(alert, alertname)
}

// IsExperimental always returns true, marking this investigation as
// experimental.
// NOTE(review): how callers treat experimental investigations is not visible
// in this file — confirm against the investigation registry/runner.
func (i *Investigation) IsExperimental() bool {
return true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
package cannotretrieveupdatessre

import (
"strings"
"testing"

configv1 "github.com/openshift/api/config/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/scheme"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// newFakeClient builds a controller-runtime fake client pre-populated with
// objs. The configv1 types are added to the shared client-go scheme.Scheme,
// which is then handed to the fake client builder. An error is returned only
// if adding the configv1 types to the scheme fails.
func newFakeClient(objs ...client.Object) (client.Client, error) {
	if err := configv1.AddToScheme(scheme.Scheme); err != nil {
		return nil, err
	}

	builder := fake.NewClientBuilder().WithScheme(scheme.Scheme).WithObjects(objs...)
	return builder.Build(), nil
}

// TestGetClusterVersion checks getClusterVersion against a fake client, both
// when the "version" ClusterVersion exists and when it is absent.
func TestGetClusterVersion(t *testing.T) {
	tests := []struct {
		name            string
		clusterVersion  *configv1.ClusterVersion
		expectedVersion string
		expectError     bool
	}{
		{
			name: "Valid ClusterVersion",
			clusterVersion: &configv1.ClusterVersion{
				ObjectMeta: metav1.ObjectMeta{
					Name: "version",
				},
				Spec: configv1.ClusterVersionSpec{
					Channel:   "stable-4.18",
					ClusterID: "d1ba89f3-fd3e-48d2-91c6-test",
				},
				Status: configv1.ClusterVersionStatus{
					Desired: configv1.Release{Version: "4.18.10"},
				},
			},
			expectedVersion: "4.18.10",
			expectError:     false,
		},
		{
			name:            "ClusterVersion Not Found",
			clusterVersion:  nil,
			expectedVersion: "",
			expectError:     true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Only seed the fake client when the case provides an object: a
			// nil *ClusterVersion passed as client.Object would be a non-nil
			// interface holding a nil pointer.
			var objs []client.Object
			if tt.clusterVersion != nil {
				objs = append(objs, tt.clusterVersion)
			}
			k8scli, err := newFakeClient(objs...)
			if err != nil {
				t.Fatalf("failed to create a fake client: %v", err)
			}

			got, err := getClusterVersion(k8scli)

			if tt.expectError {
				if err == nil {
					t.Fatalf("Expected an error, got none")
				}
				if got != nil {
					t.Errorf("Expected nil ClusterVersion on error, got %v", got)
				}
				if !apierrors.IsNotFound(err) && !strings.Contains(err.Error(), "failed to get ClusterVersion") {
					t.Errorf("Expected error to be related about failed to get the ClusterVersion, got %v", err)
				}
				return
			}

			// Stop here on an unexpected error: got is nil in that case and
			// dereferencing it below would panic instead of failing cleanly.
			if err != nil {
				t.Fatalf("Expected no error, got %v", err)
			}
			if got.Status.Desired.Version != tt.expectedVersion {
				t.Errorf("Expected version %q, got %q", tt.expectedVersion, got.Status.Desired.Version)
			}
		})
	}
}

// TestGetUpdateRetrievalFailures checks which ClusterVersion conditions
// getUpdateRetrievalFailures reports as update retrieval failures, including
// the cases where nothing should be reported.
func TestGetUpdateRetrievalFailures(t *testing.T) {
	// newClusterVersion builds the "version" ClusterVersion on channel
	// stable-4.18 carrying exactly the given status conditions.
	newClusterVersion := func(conditions ...configv1.ClusterOperatorStatusCondition) *configv1.ClusterVersion {
		return &configv1.ClusterVersion{
			ObjectMeta: metav1.ObjectMeta{
				Name: "version",
			},
			Spec: configv1.ClusterVersionSpec{
				Channel: "stable-4.18",
			},
			Status: configv1.ClusterVersionStatus{
				Conditions: conditions,
			},
		}
	}

	tests := []struct {
		name           string
		clusterVersion *configv1.ClusterVersion
		expectedNote   string
	}{
		{
			name: "RemoteFailed condition",
			clusterVersion: newClusterVersion(configv1.ClusterOperatorStatusCondition{
				Type:    "RetrievedUpdates",
				Status:  configv1.ConditionFalse,
				Reason:  "RemoteFailed",
				Message: "Unable to retrieve available updates",
			}),
			expectedNote: "(Reason: RemoteFailed). Unable to retrieve available updates",
		},
		{
			name: "VersionNotFound condition",
			clusterVersion: newClusterVersion(configv1.ClusterOperatorStatusCondition{
				Type:    "RetrievedUpdates",
				Status:  configv1.ConditionFalse,
				Reason:  "VersionNotFound",
				Message: "Unable to retrieve available updates",
			}),
			expectedNote: "(Reason: VersionNotFound). Unable to retrieve available updates",
		},
		{
			name: "Happy path",
			clusterVersion: newClusterVersion(configv1.ClusterOperatorStatusCondition{
				Type:    "RetrievedUpdates",
				Status:  configv1.ConditionTrue,
				Reason:  "UpdatesRetrieved",
				Message: "Available updates retrieved successfully",
			}),
			expectedNote: "",
		},
		{
			name:           "No conditions",
			clusterVersion: newClusterVersion(),
			expectedNote:   "",
		},
		{
			name: "Unrelated condition is ignored",
			clusterVersion: newClusterVersion(configv1.ClusterOperatorStatusCondition{
				Type:    "Progressing",
				Status:  configv1.ConditionFalse,
				Reason:  "AsExpected",
				Message: "Cluster version is 4.18.10",
			}),
			expectedNote: "",
		},
		{
			name: "RetrievedUpdates with unknown status",
			clusterVersion: newClusterVersion(configv1.ClusterOperatorStatusCondition{
				Type:    "RetrievedUpdates",
				Status:  configv1.ConditionUnknown,
				Reason:  "NoChannel",
				Message: "Update status is unknown",
			}),
			expectedNote: "",
		},
		{
			name: "Failure found after unrelated condition",
			clusterVersion: newClusterVersion(
				configv1.ClusterOperatorStatusCondition{
					Type:   "Progressing",
					Status: configv1.ConditionFalse,
				},
				configv1.ClusterOperatorStatusCondition{
					Type:    "RetrievedUpdates",
					Status:  configv1.ConditionFalse,
					Reason:  "RemoteFailed",
					Message: "Unable to retrieve available updates",
				},
			),
			expectedNote: "(Reason: RemoteFailed). Unable to retrieve available updates",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			reason := getUpdateRetrievalFailures(tt.clusterVersion)
			if reason != tt.expectedNote {
				t.Errorf("Expected note %q, got %q", tt.expectedNote, reason)
			}
		})
	}
}
17 changes: 17 additions & 0 deletions pkg/investigations/cannotretrieveupdatessre/testing/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Testing CannotRetrieveUpdatesSRE Investigation

### Update the ClusterVersion Channel
- The script below switches the ClusterVersion to a test channel (`stable-4.18-test`) so that the resulting ClusterVersion change can be checked.
```sh
#!/bin/bash

# Use test channel for the ClusterVersion
oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18-test"}}' --as backplane-cluster-admin
sleep 30

# Verify
oc get clusterversion version -o jsonpath='{.spec.channel}' | grep "stable-4.18-test" || { echo "Failed to set the channel"; exit 1; }

# Optional: Revert to the original channel
#oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18"}}' --as backplane-cluster-admin
```
2 changes: 2 additions & 0 deletions pkg/investigations/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package investigations

import (
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/apierrorbudgetburn"
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/cannotretrieveupdatessre"
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
Expand All @@ -22,6 +23,7 @@ var availableInvestigations = []investigation.Investigation{
&insightsoperatordown.Investigation{},
&upgradeconfigsyncfailureover4hr.Investigation{},
&machinehealthcheckunterminatedshortcircuitsre.Investigation{},
&cannotretrieveupdatessre.Investigation{},
}

// GetInvestigation returns the first Investigation that applies to the given alert title.
Expand Down
1 change: 1 addition & 0 deletions test/generate_incident.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ declare -A alert_mapping=(
["InsightsOperatorDown"]="InsightsOperatorDown"
["MachineHealthCheckUnterminatedShortCircuitSRE"]="MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)"
["ApiErrorBudgetBurn"]="api-ErrorBudgetBurn k8sgpt test CRITICAL (1)"
["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE"
)

# Function to print help message
Expand Down