-
Notifications
You must be signed in to change notification settings - Fork 50
OSD-18645 - Initial implementation for CannotRetrieveUpdatesSRE #404
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# cannotretrieveupdatessre Investigation | ||
|
||
Investigates the CannotRetrieveUpdatesSRE alert by running the network verifier and checking the cluster version status, then adds the investigation details as notes on the PagerDuty alert. | ||
|
||
## Investigation Logic | ||
|
||
The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks: | ||
1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints. | ||
2. **ClusterVersion Check**: Examines the `ClusterVersion` resource for a `RetrievedUpdates` condition whose status is `False`, and reports its reason (for example `VersionNotFound`) and message. | ||
|
||
## Testing | ||
|
||
Refer to the [testing README](./testing/README.md) for instructions on testing this investigation. |
117 changes: 117 additions & 0 deletions
117
pkg/investigations/cannotretrieveupdatessre/cannotretrieveupdatessre.go
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
package cannotretrieveupdatessre | ||
|
||
import ( | ||
"context" | ||
"errors" | ||
"fmt" | ||
"strings" | ||
|
||
configv1 "github.com/openshift/api/config/v1" | ||
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation" | ||
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s" | ||
"github.com/openshift/configuration-anomaly-detection/pkg/logging" | ||
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier" | ||
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter" | ||
"sigs.k8s.io/controller-runtime/pkg/client" | ||
) | ||
|
||
// alertname is the name of the alert handled by this investigation. It is
// reported by Name and matched against incoming alerts in ShouldInvestigateAlert.
const alertname = "CannotRetrieveUpdatesSRE"

// remediationName is the name handed to k8sclient.New when the investigation
// creates its Kubernetes client.
const remediationName = "CannotRetrieveUpdatesSRE"
|
||
// Investigation implements the CannotRetrieveUpdatesSRE investigation. It holds
// no state; everything it needs is supplied to its methods.
type Investigation struct{}
|
||
// Run executes the investigation for the CannotRetrieveUpdatesSRE alert | ||
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) { | ||
result := investigation.InvestigationResult{} | ||
notes := notewriter.New("CannotRetrieveUpdatesSRE", logging.RawLogger) | ||
k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName) | ||
if err != nil { | ||
return result, fmt.Errorf("unable to initialize k8s cli: %w", err) | ||
} | ||
defer func() { | ||
deferErr := k8scli.Clean() | ||
if deferErr != nil { | ||
logging.Error(deferErr) | ||
err = errors.Join(err, deferErr) | ||
} | ||
}() | ||
|
||
// Run network verifier | ||
verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient) | ||
if err != nil { | ||
notes.AppendWarning("NetworkVerifier failed to run:\n\t %s", err.Error()) | ||
} else { | ||
switch verifierResult { | ||
case networkverifier.Failure: | ||
result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil} | ||
notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s", r.Cluster.ID(), failureReason) | ||
case networkverifier.Success: | ||
notes.AppendSuccess("Network verifier passed") | ||
} | ||
} | ||
|
||
// Check ClusterVersion | ||
clusterVersion, err := getClusterVersion(k8scli) | ||
if err != nil { | ||
notes.AppendWarning("Failed to get ClusterVersion: %s", err.Error()) | ||
} else { | ||
notes.AppendSuccess("ClusterVersion found: %s", clusterVersion.Status.Desired.Version) | ||
|
||
failureReason := getUpdateRetrievalFailures(clusterVersion) | ||
if failureReason != "" { | ||
logging.Warnf("Detected ClusterVersion issue: %s", failureReason) | ||
notes.AppendWarning("ClusterVersion related issue detected: %s. Current version %s not found in channel %s", | ||
failureReason, clusterVersion.Status.Desired.Version, clusterVersion.Spec.Channel) | ||
} | ||
} | ||
notes.AppendWarning("Alert escalated to on-call primary for review and please check the ClusterVersion.") | ||
return result, r.PdClient.EscalateIncidentWithNote(notes.String()) | ||
} | ||
|
||
func getClusterVersion(k8scli client.Client) (*configv1.ClusterVersion, error) { | ||
anispate marked this conversation as resolved.
Show resolved
Hide resolved
|
||
clusterVersion := &configv1.ClusterVersion{} | ||
err := k8scli.Get(context.TODO(), client.ObjectKey{Name: "version"}, clusterVersion) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to get ClusterVersion: %w", err) | ||
} | ||
return clusterVersion, nil | ||
} | ||
|
||
// getUpdateRetrievalFailures checks for update retrieval failures in the ClusterVersion | ||
func getUpdateRetrievalFailures(clusterVersion *configv1.ClusterVersion) string { | ||
for _, condition := range clusterVersion.Status.Conditions { | ||
msg, found := checkCondition(condition) | ||
if found { | ||
return msg | ||
} | ||
} | ||
return "" | ||
} | ||
|
||
func checkCondition(condition configv1.ClusterOperatorStatusCondition) (string, bool) { | ||
if condition.Type != "RetrievedUpdates" { | ||
return "", false | ||
} | ||
if condition.Status == configv1.ConditionFalse { | ||
return fmt.Sprintf("(Reason: %s). %s", condition.Reason, condition.Message), true | ||
} | ||
return "", false | ||
} | ||
|
||
func (i *Investigation) Name() string { | ||
anispate marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return alertname | ||
} | ||
|
||
func (i *Investigation) Description() string { | ||
return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname) | ||
} | ||
|
||
func (i *Investigation) ShouldInvestigateAlert(alert string) bool { | ||
return strings.Contains(alert, alertname) | ||
} | ||
|
||
func (i *Investigation) IsExperimental() bool { | ||
return true | ||
} |
177 changes: 177 additions & 0 deletions
177
pkg/investigations/cannotretrieveupdatessre/cannotretrieveupdatessre_test.go
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
package cannotretrieveupdatessre | ||
|
||
import ( | ||
"strings" | ||
"testing" | ||
|
||
configv1 "github.com/openshift/api/config/v1" | ||
apierrors "k8s.io/apimachinery/pkg/api/errors" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/client-go/kubernetes/scheme" | ||
"sigs.k8s.io/controller-runtime/pkg/client" | ||
"sigs.k8s.io/controller-runtime/pkg/client/fake" | ||
) | ||
|
||
func newFakeClient(objs ...client.Object) (client.Client, error) { | ||
s := scheme.Scheme | ||
err := configv1.AddToScheme(s) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
client := fake.NewClientBuilder().WithScheme(s).WithObjects(objs...).Build() | ||
return client, nil | ||
} | ||
|
||
func TestGetClusterVersion(t *testing.T) { | ||
anispate marked this conversation as resolved.
Show resolved
Hide resolved
|
||
tests := []struct { | ||
name string | ||
clusterVersion *configv1.ClusterVersion | ||
expectedVersion string | ||
expectError bool | ||
}{ | ||
{ | ||
name: "Valid ClusterVersion", | ||
clusterVersion: &configv1.ClusterVersion{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Name: "version", | ||
}, | ||
Spec: configv1.ClusterVersionSpec{ | ||
Channel: "stable-4.18", | ||
ClusterID: "d1ba89f3-fd3e-48d2-91c6-test", | ||
}, | ||
Status: configv1.ClusterVersionStatus{ | ||
Desired: configv1.Release{Version: "4.18.10"}, | ||
}, | ||
}, | ||
expectedVersion: "4.18.10", | ||
expectError: false, | ||
}, | ||
{ | ||
name: "ClusterVersion Not Found", | ||
clusterVersion: nil, | ||
expectedVersion: "", | ||
expectError: true, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
var k8scli client.Client | ||
var err error | ||
if tt.clusterVersion != nil { | ||
k8scli, err = newFakeClient(tt.clusterVersion) | ||
} else { | ||
k8scli, err = newFakeClient() | ||
} | ||
if err != nil { | ||
t.Fatalf("failed to create a fake client: %v", err) | ||
} | ||
|
||
got, err := getClusterVersion(k8scli) | ||
|
||
if tt.expectError && err == nil { | ||
t.Errorf("Expected an error, got none") | ||
} else if !tt.expectError && err != nil { | ||
t.Errorf("Expected no error, got %v", err) | ||
} | ||
|
||
if !tt.expectError { | ||
if got.Status.Desired.Version != tt.expectedVersion { | ||
t.Errorf("Expected version %q, got %q", tt.expectedVersion, got.Status.Desired.Version) | ||
} | ||
} else { | ||
if got != nil { | ||
t.Errorf("Expected nil ClusterVersion error, got %v", got) | ||
} | ||
if err != nil && !apierrors.IsNotFound(err) && !strings.Contains(err.Error(), "failed to get ClusterVersion") { | ||
t.Errorf("Expected error to be related about failed to get the ClusterVersion, got %v", err) | ||
} | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestGetUpdateRetrievalFailures(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
clusterVersion *configv1.ClusterVersion | ||
expectedNote string | ||
}{ | ||
{ | ||
name: "RemoteFailed condition", | ||
clusterVersion: &configv1.ClusterVersion{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Name: "version", | ||
}, | ||
Spec: configv1.ClusterVersionSpec{ | ||
Channel: "stable-4.18", | ||
}, | ||
Status: configv1.ClusterVersionStatus{ | ||
Conditions: []configv1.ClusterOperatorStatusCondition{ | ||
{ | ||
Type: "RetrievedUpdates", | ||
Status: configv1.ConditionFalse, | ||
Reason: "RemoteFailed", | ||
Message: "Unable to retrieve available updates", | ||
}, | ||
}, | ||
}, | ||
}, | ||
expectedNote: "(Reason: RemoteFailed). Unable to retrieve available updates", | ||
}, | ||
{ | ||
name: "VersionNotFound condition", | ||
clusterVersion: &configv1.ClusterVersion{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Name: "version", | ||
}, | ||
Spec: configv1.ClusterVersionSpec{ | ||
Channel: "stable-4.18", | ||
}, | ||
Status: configv1.ClusterVersionStatus{ | ||
Conditions: []configv1.ClusterOperatorStatusCondition{ | ||
{ | ||
Type: "RetrievedUpdates", | ||
Status: configv1.ConditionFalse, | ||
Reason: "VersionNotFound", | ||
Message: "Unable to retrieve available updates", | ||
}, | ||
}, | ||
}, | ||
}, | ||
expectedNote: "(Reason: VersionNotFound). Unable to retrieve available updates", | ||
}, | ||
{ | ||
name: "Happy path", | ||
clusterVersion: &configv1.ClusterVersion{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Name: "version", | ||
}, | ||
Spec: configv1.ClusterVersionSpec{ | ||
Channel: "stable-4.18", | ||
}, | ||
Status: configv1.ClusterVersionStatus{ | ||
Conditions: []configv1.ClusterOperatorStatusCondition{ | ||
{ | ||
Type: "RetrievedUpdates", | ||
Status: configv1.ConditionTrue, | ||
Reason: "UpdatesRetrieved", | ||
Message: "Available updates retrieved successfully", | ||
}, | ||
}, | ||
}, | ||
}, | ||
expectedNote: "", | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
reason := getUpdateRetrievalFailures(tt.clusterVersion) | ||
if reason != tt.expectedNote { | ||
t.Errorf("Expected note %q, got %q", tt.expectedNote, reason) | ||
} | ||
}) | ||
} | ||
} |
File renamed without changes.
17 changes: 17 additions & 0 deletions
17
pkg/investigations/cannotretrieveupdatessre/testing/README.md
anispate marked this conversation as resolved.
Show resolved
Hide resolved
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Testing CannotRetrieveUpdatesSRE Investigation | ||
|
||
### Update the ClusterVersion Channel | ||
- The script below sets a test channel on the ClusterVersion so that the resulting change can be checked. | ||
```sh | ||
#!/bin/bash | ||
|
||
# Use test channel for the ClusterVersion | ||
oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18-test"}}' --as backplane-cluster-admin | ||
sleep 30 | ||
|
||
# Verify | ||
oc get clusterversion version -o jsonpath='{.spec.channel}' | grep "stable-4.18-test" || { echo "Failed to set the channel"; exit 1; } | ||
|
||
# Optional: revert to the original channel | ||
#oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18"}}' --as backplane-cluster-admin | ||
``` |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.