Skip to content

Commit 0a62198

Browse files
committed
Initial implementation for CannotRetrieveUpdatesSRE
1 parent 71ed508 commit 0a62198

File tree

7 files changed

+306
-0
lines changed

7 files changed

+306
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# cannotretrieveupdatessre Investigation
2+
3+
Investigates the CannotRetrieveUpdatesSRE alert by running the network verifier and checking the cluster's ClusterVersion status, then attaches the findings as notes to the PagerDuty alert.
4+
5+
## Investigation Logic
6+
7+
The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks:
8+
1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints.
9+
2. **ClusterVersion Check**: Examines the `RetrievedUpdates` condition on the `ClusterVersion` resource and, when it is `False`, reports the failure reason (for example `VersionNotFound`) and message.
10+
11+
## Testing
12+
13+
Refer to the [testing README](./testing/README.md) for instructions on testing this investigation.
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
package cannotretrieveupdatessre
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"strings"
8+
9+
configv1 "github.com/openshift/api/config/v1"
10+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
11+
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
12+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
14+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
15+
"sigs.k8s.io/controller-runtime/pkg/client"
16+
)
17+
18+
// Identifiers shared by the CannotRetrieveUpdatesSRE investigation.
const (
	// alertname is the alert title fragment this investigation matches on;
	// it is also what Name reports.
	alertname = "CannotRetrieveUpdatesSRE"
	// remediationName is the name handed to k8sclient.New when the cluster
	// client is created in Run.
	remediationName = "CannotRetrieveUpdatesSRE"
)
22+
23+
// Investigation implements the CannotRetrieveUpdatesSRE alert investigation.
// It carries no state; everything it needs is passed to Run.
type Investigation struct{}
24+
25+
// Run executes the investigation for the CannotRetrieveUpdatesSRE alert
26+
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
27+
result := investigation.InvestigationResult{}
28+
notes := notewriter.New("CannotRetrieveUpdatesSRE", logging.RawLogger)
29+
k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
30+
if err != nil {
31+
return result, fmt.Errorf("unable to initialize k8s cli: %w", err)
32+
}
33+
defer func() {
34+
deferErr := k8scli.Clean()
35+
if deferErr != nil {
36+
logging.Error(deferErr)
37+
err = errors.Join(err, deferErr)
38+
}
39+
}()
40+
41+
// Run network verifier
42+
verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
43+
if err != nil {
44+
notes.AppendWarning("NetworkVerifier failed to run:\n\t %s", err.Error())
45+
} else {
46+
switch verifierResult {
47+
case networkverifier.Failure:
48+
result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
49+
notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s", r.Cluster.ID(), failureReason)
50+
case networkverifier.Success:
51+
notes.AppendSuccess("Network verifier passed")
52+
}
53+
}
54+
55+
// Check ClusterVersion
56+
clusterVersion, err := getClusterVersion(k8scli)
57+
if err != nil {
58+
notes.AppendWarning("Failed to get ClusterVersion: %s", err.Error())
59+
} else {
60+
notes.AppendSuccess("ClusterVersion found: %s", clusterVersion.Status.Desired.Version)
61+
62+
failureReason := getUpdateRetrievalFailures(clusterVersion)
63+
if failureReason != "" {
64+
logging.Warnf("Detected ClusterVersion issue: %s", failureReason)
65+
notes.AppendWarning("ClusterVersion related issue detected: %s. Current version %s not found in channel %s",
66+
failureReason, clusterVersion.Status.Desired.Version, clusterVersion.Spec.Channel)
67+
}
68+
}
69+
notes.AppendWarning("Alert escalated to on-call primary for review and please check the ClusterVersion.")
70+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
71+
}
72+
73+
func getClusterVersion(k8scli client.Client) (*configv1.ClusterVersion, error) {
74+
clusterVersion := &configv1.ClusterVersion{}
75+
err := k8scli.Get(context.TODO(), client.ObjectKey{Name: "version"}, clusterVersion)
76+
if err != nil {
77+
return nil, fmt.Errorf("failed to get ClusterVersion: %w", err)
78+
}
79+
return clusterVersion, nil
80+
}
81+
82+
// getUpdateRetrievalFailures checks for update retrieval failures in the ClusterVersion
83+
func getUpdateRetrievalFailures(clusterVersion *configv1.ClusterVersion) string {
84+
for _, condition := range clusterVersion.Status.Conditions {
85+
msg, found := checkCondition(condition)
86+
if found {
87+
return msg
88+
}
89+
}
90+
return ""
91+
}
92+
93+
func checkCondition(condition configv1.ClusterOperatorStatusCondition) (string, bool) {
94+
if condition.Type != "RetrievedUpdates" {
95+
return "", false
96+
}
97+
if condition.Status == configv1.ConditionFalse {
98+
return fmt.Sprintf("(Reason: %s). %s", condition.Reason, condition.Message), true
99+
}
100+
return "", false
101+
}
102+
103+
func (i *Investigation) Name() string {
104+
return alertname
105+
}
106+
107+
func (i *Investigation) Description() string {
108+
return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname)
109+
}
110+
111+
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
112+
return strings.Contains(alert, alertname)
113+
}
114+
115+
func (i *Investigation) IsExperimental() bool {
116+
return true
117+
}
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
package cannotretrieveupdatessre
2+
3+
import (
4+
"testing"
5+
6+
configv1 "github.com/openshift/api/config/v1"
7+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
8+
"k8s.io/client-go/kubernetes/scheme"
9+
"sigs.k8s.io/controller-runtime/pkg/client"
10+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
11+
)
12+
13+
func newFakeClient(objs ...client.Object) (client.Client, error) {
14+
s := scheme.Scheme
15+
err := configv1.AddToScheme(s)
16+
if err != nil {
17+
return nil, err
18+
}
19+
20+
client := fake.NewClientBuilder().WithScheme(s).WithObjects(objs...).Build()
21+
return client, nil
22+
}
23+
24+
func TestGetClusterVersion(t *testing.T) {
25+
tests := []struct {
26+
name string
27+
clusterVersion *configv1.ClusterVersion
28+
expectedVersion string
29+
expectError bool
30+
}{
31+
{
32+
name: "Valid ClusterVersion",
33+
clusterVersion: &configv1.ClusterVersion{
34+
ObjectMeta: metav1.ObjectMeta{
35+
Name: "version",
36+
},
37+
Spec: configv1.ClusterVersionSpec{
38+
Channel: "stable-4.18",
39+
ClusterID: "d1ba89f3-fd3e-48d2-91c6-test",
40+
},
41+
Status: configv1.ClusterVersionStatus{
42+
Desired: configv1.Release{Version: "4.18.10"},
43+
},
44+
},
45+
expectedVersion: "4.18.10",
46+
expectError: false,
47+
},
48+
}
49+
50+
for _, tt := range tests {
51+
t.Run(tt.name, func(t *testing.T) {
52+
k8scli, err := newFakeClient(tt.clusterVersion)
53+
if err != nil {
54+
t.Fatalf("failed to create a fake client: %v", err)
55+
}
56+
57+
got, err := getClusterVersion(k8scli)
58+
59+
if tt.expectError && err == nil {
60+
t.Errorf("Expected an error, got none")
61+
} else if !tt.expectError && err != nil {
62+
t.Errorf("Expected no error, got %v", err)
63+
}
64+
65+
if !tt.expectError {
66+
if got.Status.Desired.Version != tt.expectedVersion {
67+
t.Errorf("Expected version %q, got %q", tt.expectedVersion, got.Status.Desired.Version)
68+
}
69+
}
70+
})
71+
}
72+
}
73+
74+
func TestGetUpdateRetrievalFailures(t *testing.T) {
75+
tests := []struct {
76+
name string
77+
clusterVersion *configv1.ClusterVersion
78+
expectedNote string
79+
}{
80+
{
81+
name: "RemoteFailed condition",
82+
clusterVersion: &configv1.ClusterVersion{
83+
ObjectMeta: metav1.ObjectMeta{
84+
Name: "version",
85+
},
86+
Spec: configv1.ClusterVersionSpec{
87+
Channel: "stable-4.18",
88+
},
89+
Status: configv1.ClusterVersionStatus{
90+
Conditions: []configv1.ClusterOperatorStatusCondition{
91+
{
92+
Type: "RetrievedUpdates",
93+
Status: configv1.ConditionFalse,
94+
Reason: "RemoteFailed",
95+
Message: "Unable to retrieve available updates",
96+
},
97+
},
98+
},
99+
},
100+
expectedNote: "(Reason: RemoteFailed). Unable to retrieve available updates",
101+
},
102+
{
103+
name: "VersionNotFound condition",
104+
clusterVersion: &configv1.ClusterVersion{
105+
ObjectMeta: metav1.ObjectMeta{
106+
Name: "version",
107+
},
108+
Spec: configv1.ClusterVersionSpec{
109+
Channel: "stable-4.18",
110+
},
111+
Status: configv1.ClusterVersionStatus{
112+
Conditions: []configv1.ClusterOperatorStatusCondition{
113+
{
114+
Type: "RetrievedUpdates",
115+
Status: configv1.ConditionFalse,
116+
Reason: "VersionNotFound",
117+
Message: "Unable to retrieve available updates",
118+
},
119+
},
120+
},
121+
},
122+
expectedNote: "(Reason: VersionNotFound). Unable to retrieve available updates",
123+
},
124+
{
125+
name: "Happy path",
126+
clusterVersion: &configv1.ClusterVersion{
127+
ObjectMeta: metav1.ObjectMeta{
128+
Name: "version",
129+
},
130+
Spec: configv1.ClusterVersionSpec{
131+
Channel: "stable-4.18",
132+
},
133+
Status: configv1.ClusterVersionStatus{
134+
Conditions: []configv1.ClusterOperatorStatusCondition{
135+
{
136+
Type: "RetrievedUpdates",
137+
Status: configv1.ConditionTrue,
138+
Reason: "UpdatesRetrieved",
139+
Message: "Available updates retrieved successfully",
140+
},
141+
},
142+
},
143+
},
144+
expectedNote: "",
145+
},
146+
}
147+
148+
for _, tt := range tests {
149+
t.Run(tt.name, func(t *testing.T) {
150+
reason := getUpdateRetrievalFailures(tt.clusterVersion)
151+
if reason != tt.expectedNote {
152+
t.Errorf("Expected note %q, got %q", tt.expectedNote, reason)
153+
}
154+
})
155+
}
156+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Testing CannotRetrieveUpdatesSRE Investigation
2+
3+
### Update the ClusterVersion Channel
4+
- The script below sets a test channel on the ClusterVersion so that the investigation's ClusterVersion check can be exercised.
5+
```sh
6+
#!/bin/bash
7+
8+
# Use test channel for the ClusterVersion
9+
oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18-test"}}' --as backplane-cluster-admin
10+
sleep 30
11+
12+
# Verify
13+
oc get clusterversion version -o jsonpath='{.spec.channel}' | grep "stable-4.18-test" || { echo "Failed to set the channel"; exit 1; }
14+
15+
# Optional: Revert to the original channel
16+
#oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18"}}' --as backplane-cluster-admin
17+
```

pkg/investigations/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package investigations
22

33
import (
44
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/apierrorbudgetburn"
5+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/cannotretrieveupdatessre"
56
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
67
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
78
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
@@ -22,6 +23,7 @@ var availableInvestigations = []investigation.Investigation{
2223
&insightsoperatordown.Investigation{},
2324
&upgradeconfigsyncfailureover4hr.Investigation{},
2425
&machinehealthcheckunterminatedshortcircuitsre.Investigation{},
26+
&cannotretrieveupdatessre.Investigation{},
2527
}
2628

2729
// GetInvestigation returns the first Investigation that applies to the given alert title.

test/generate_incident.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ declare -A alert_mapping=(
1010
["InsightsOperatorDown"]="InsightsOperatorDown"
1111
["MachineHealthCheckUnterminatedShortCircuitSRE"]="MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)"
1212
["ApiErrorBudgetBurn"]="api-ErrorBudgetBurn k8sgpt test CRITICAL (1)"
13+
["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE"
1314
)
1415

1516
# Function to print help message

0 commit comments

Comments
 (0)