Skip to content

Commit 503c320

Browse files
committed
Initial implementation for CannotRetrieveUpdatesSRE
1 parent 71ed508 commit 503c320

File tree

7 files changed

+327
-0
lines changed

7 files changed

+327
-0
lines changed
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# cannotretrieveupdatessre Investigation
2+
3+
Investigates the CannotRetrieveUpdatesSRE alert by running the network verifier and checking the cluster version status, then adds the investigation details as a note on the PagerDuty alert.
4+
5+
## Investigation Logic
6+
7+
The `CannotRetrieveUpdatesSRE` investigation is designed to diagnose issues where an OpenShift cluster cannot retrieve updates from its configured channel. It performs two main checks:
8+
1. **Network Verification**: Uses the `networkverifier` package to ensure the cluster can reach required update endpoints.
9+
2. **ClusterVersion Check**: Examines the `RetrievedUpdates` condition of the `ClusterVersion` resource and, when that condition is `False`, reports its reason (for example `VersionNotFound`) and message as an update retrieval failure.
10+
11+
## Testing
12+
13+
Refer to the [testing README](./testing/README.md) for instructions on testing this investigation.
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
package cannotretrieveupdatessre
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"strings"
8+
9+
configv1 "github.com/openshift/api/config/v1"
10+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/investigation"
11+
k8sclient "github.com/openshift/configuration-anomaly-detection/pkg/k8s"
12+
"github.com/openshift/configuration-anomaly-detection/pkg/logging"
13+
"github.com/openshift/configuration-anomaly-detection/pkg/networkverifier"
14+
"github.com/openshift/configuration-anomaly-detection/pkg/notewriter"
15+
"sigs.k8s.io/controller-runtime/pkg/client"
16+
)
17+
18+
// Names used by this investigation. alertname is returned by Name, embedded in
// Description and matched against incoming alert titles in
// ShouldInvestigateAlert. remediationName is passed to k8sclient.New when Run
// creates the cluster client.
const (
19+
alertname = "CannotRetrieveUpdatesSRE"
20+
remediationName = "CannotRetrieveUpdatesSRE"
21+
)
22+
23+
// Investigation implements the CannotRetrieveUpdatesSRE investigation. It is an
// empty struct and carries no state of its own.
type Investigation struct{}
24+
25+
// Run executes the investigation for the CannotRetrieveUpdatesSRE alert
26+
func (c *Investigation) Run(r *investigation.Resources) (investigation.InvestigationResult, error) {
27+
result := investigation.InvestigationResult{}
28+
notes := notewriter.New("CannotRetrieveUpdatesSRE", logging.RawLogger)
29+
k8scli, err := k8sclient.New(r.Cluster.ID(), r.OcmClient, remediationName)
30+
if err != nil {
31+
return result, fmt.Errorf("unable to initialize k8s cli: %w", err)
32+
}
33+
defer func() {
34+
deferErr := k8scli.Clean()
35+
if deferErr != nil {
36+
logging.Error(deferErr)
37+
err = errors.Join(err, deferErr)
38+
}
39+
}()
40+
41+
// Run network verifier
42+
verifierResult, failureReason, err := networkverifier.Run(r.Cluster, r.ClusterDeployment, r.AwsClient)
43+
if err != nil {
44+
notes.AppendWarning("NetworkVerifier failed to run:\n\t %s", err.Error())
45+
} else {
46+
switch verifierResult {
47+
case networkverifier.Failure:
48+
result.ServiceLogPrepared = investigation.InvestigationStep{Performed: true, Labels: nil}
49+
notes.AppendWarning("NetworkVerifier found unreachable targets. \n \n Verify and send service log if necessary: \n osdctl servicelog post %s -t https://raw.githubusercontent.com/openshift/managed-notifications/master/osd/required_network_egresses_are_blocked.json -p URLS=%s", r.Cluster.ID(), failureReason)
50+
case networkverifier.Success:
51+
notes.AppendSuccess("Network verifier passed")
52+
}
53+
}
54+
55+
// Check ClusterVersion
56+
clusterVersion, err := getClusterVersion(k8scli)
57+
if err != nil {
58+
notes.AppendWarning("Failed to get ClusterVersion: %s", err.Error())
59+
} else {
60+
notes.AppendSuccess("ClusterVersion found: %s", clusterVersion.Status.Desired.Version)
61+
62+
failureReason := getUpdateRetrievalFailures(clusterVersion)
63+
if failureReason != "" {
64+
logging.Warnf("Detected ClusterVersion issue: %s", failureReason)
65+
notes.AppendWarning("ClusterVersion related issue detected: %s. Current version %s not found in channel %s",
66+
failureReason, clusterVersion.Status.Desired.Version, clusterVersion.Spec.Channel)
67+
}
68+
}
69+
notes.AppendWarning("Alert escalated to on-call primary for review and please check the ClusterVersion.")
70+
return result, r.PdClient.EscalateIncidentWithNote(notes.String())
71+
}
72+
73+
func getClusterVersion(k8scli client.Client) (*configv1.ClusterVersion, error) {
74+
clusterVersion := &configv1.ClusterVersion{}
75+
err := k8scli.Get(context.TODO(), client.ObjectKey{Name: "version"}, clusterVersion)
76+
if err != nil {
77+
return nil, fmt.Errorf("failed to get ClusterVersion: %w", err)
78+
}
79+
return clusterVersion, nil
80+
}
81+
82+
// getUpdateRetrievalFailures checks for update retrieval failures in the ClusterVersion
83+
func getUpdateRetrievalFailures(clusterVersion *configv1.ClusterVersion) string {
84+
for _, condition := range clusterVersion.Status.Conditions {
85+
msg, found := checkCondition(condition)
86+
if found {
87+
return msg
88+
}
89+
}
90+
return ""
91+
}
92+
93+
func checkCondition(condition configv1.ClusterOperatorStatusCondition) (string, bool) {
94+
if condition.Type != "RetrievedUpdates" {
95+
return "", false
96+
}
97+
if condition.Status == configv1.ConditionFalse {
98+
return fmt.Sprintf("(Reason: %s). %s", condition.Reason, condition.Message), true
99+
}
100+
return "", false
101+
}
102+
103+
func (i *Investigation) Name() string {
104+
return alertname
105+
}
106+
107+
func (i *Investigation) Description() string {
108+
return fmt.Sprintf("Investigates '%s' alerts by running network verifier and checking ClusterVersion", alertname)
109+
}
110+
111+
func (i *Investigation) ShouldInvestigateAlert(alert string) bool {
112+
return strings.Contains(alert, alertname)
113+
}
114+
115+
func (i *Investigation) IsExperimental() bool {
116+
return true
117+
}
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
package cannotretrieveupdatessre
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
configv1 "github.com/openshift/api/config/v1"
8+
apierrors "k8s.io/apimachinery/pkg/api/errors"
9+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
10+
"k8s.io/client-go/kubernetes/scheme"
11+
"sigs.k8s.io/controller-runtime/pkg/client"
12+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
13+
)
14+
15+
func newFakeClient(objs ...client.Object) (client.Client, error) {
16+
s := scheme.Scheme
17+
err := configv1.AddToScheme(s)
18+
if err != nil {
19+
return nil, err
20+
}
21+
22+
client := fake.NewClientBuilder().WithScheme(s).WithObjects(objs...).Build()
23+
return client, nil
24+
}
25+
26+
func TestGetClusterVersion(t *testing.T) {
27+
tests := []struct {
28+
name string
29+
clusterVersion *configv1.ClusterVersion
30+
expectedVersion string
31+
expectError bool
32+
}{
33+
{
34+
name: "Valid ClusterVersion",
35+
clusterVersion: &configv1.ClusterVersion{
36+
ObjectMeta: metav1.ObjectMeta{
37+
Name: "version",
38+
},
39+
Spec: configv1.ClusterVersionSpec{
40+
Channel: "stable-4.18",
41+
ClusterID: "d1ba89f3-fd3e-48d2-91c6-test",
42+
},
43+
Status: configv1.ClusterVersionStatus{
44+
Desired: configv1.Release{Version: "4.18.10"},
45+
},
46+
},
47+
expectedVersion: "4.18.10",
48+
expectError: false,
49+
},
50+
{
51+
name: "ClusterVersion Not Found",
52+
clusterVersion: nil,
53+
expectedVersion: "",
54+
expectError: true,
55+
},
56+
}
57+
58+
for _, tt := range tests {
59+
t.Run(tt.name, func(t *testing.T) {
60+
var k8scli client.Client
61+
var err error
62+
if tt.clusterVersion != nil {
63+
k8scli, err = newFakeClient(tt.clusterVersion)
64+
} else {
65+
k8scli, err = newFakeClient()
66+
}
67+
if err != nil {
68+
t.Fatalf("failed to create a fake client: %v", err)
69+
}
70+
71+
got, err := getClusterVersion(k8scli)
72+
73+
if tt.expectError && err == nil {
74+
t.Errorf("Expected an error, got none")
75+
} else if !tt.expectError && err != nil {
76+
t.Errorf("Expected no error, got %v", err)
77+
}
78+
79+
if !tt.expectError {
80+
if got.Status.Desired.Version != tt.expectedVersion {
81+
t.Errorf("Expected version %q, got %q", tt.expectedVersion, got.Status.Desired.Version)
82+
}
83+
} else {
84+
if got != nil {
85+
t.Errorf("Expected nil ClusterVersion error, got %v", got)
86+
}
87+
if err != nil && !apierrors.IsNotFound(err) && !strings.Contains(err.Error(), "failed to get ClusterVersion") {
88+
t.Errorf("Expected error to be related about failed to get the ClusterVersion, got %v", err)
89+
}
90+
}
91+
})
92+
}
93+
}
94+
95+
func TestGetUpdateRetrievalFailures(t *testing.T) {
96+
tests := []struct {
97+
name string
98+
clusterVersion *configv1.ClusterVersion
99+
expectedNote string
100+
}{
101+
{
102+
name: "RemoteFailed condition",
103+
clusterVersion: &configv1.ClusterVersion{
104+
ObjectMeta: metav1.ObjectMeta{
105+
Name: "version",
106+
},
107+
Spec: configv1.ClusterVersionSpec{
108+
Channel: "stable-4.18",
109+
},
110+
Status: configv1.ClusterVersionStatus{
111+
Conditions: []configv1.ClusterOperatorStatusCondition{
112+
{
113+
Type: "RetrievedUpdates",
114+
Status: configv1.ConditionFalse,
115+
Reason: "RemoteFailed",
116+
Message: "Unable to retrieve available updates",
117+
},
118+
},
119+
},
120+
},
121+
expectedNote: "(Reason: RemoteFailed). Unable to retrieve available updates",
122+
},
123+
{
124+
name: "VersionNotFound condition",
125+
clusterVersion: &configv1.ClusterVersion{
126+
ObjectMeta: metav1.ObjectMeta{
127+
Name: "version",
128+
},
129+
Spec: configv1.ClusterVersionSpec{
130+
Channel: "stable-4.18",
131+
},
132+
Status: configv1.ClusterVersionStatus{
133+
Conditions: []configv1.ClusterOperatorStatusCondition{
134+
{
135+
Type: "RetrievedUpdates",
136+
Status: configv1.ConditionFalse,
137+
Reason: "VersionNotFound",
138+
Message: "Unable to retrieve available updates",
139+
},
140+
},
141+
},
142+
},
143+
expectedNote: "(Reason: VersionNotFound). Unable to retrieve available updates",
144+
},
145+
{
146+
name: "Happy path",
147+
clusterVersion: &configv1.ClusterVersion{
148+
ObjectMeta: metav1.ObjectMeta{
149+
Name: "version",
150+
},
151+
Spec: configv1.ClusterVersionSpec{
152+
Channel: "stable-4.18",
153+
},
154+
Status: configv1.ClusterVersionStatus{
155+
Conditions: []configv1.ClusterOperatorStatusCondition{
156+
{
157+
Type: "RetrievedUpdates",
158+
Status: configv1.ConditionTrue,
159+
Reason: "UpdatesRetrieved",
160+
Message: "Available updates retrieved successfully",
161+
},
162+
},
163+
},
164+
},
165+
expectedNote: "",
166+
},
167+
}
168+
169+
for _, tt := range tests {
170+
t.Run(tt.name, func(t *testing.T) {
171+
reason := getUpdateRetrievalFailures(tt.clusterVersion)
172+
if reason != tt.expectedNote {
173+
t.Errorf("Expected note %q, got %q", tt.expectedNote, reason)
174+
}
175+
})
176+
}
177+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Testing CannotRetrieveUpdatesSRE Investigation
2+
3+
### Update the ClusterVersion Channel
4+
- The script below sets the ClusterVersion to a test channel so that the investigation can be checked against the resulting ClusterVersion change.
5+
```sh
6+
#!/bin/bash
7+
8+
# Use test channel for the ClusterVersion
9+
oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18-test"}}' --as backplane-cluster-admin
10+
sleep 30
11+
12+
# Verify
13+
oc get clusterversion version -o jsonpath='{.spec.channel}' | grep "stable-4.18-test" || { echo "Failed to set the channel"; exit 1; }
14+
15+
# Optional: Revert to the original channel
16+
#oc patch clusterversion version --type merge -p '{"spec":{"channel":"stable-4.18"}}' --as backplane-cluster-admin
17+
```

pkg/investigations/registry.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package investigations
22

33
import (
44
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/apierrorbudgetburn"
5+
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/cannotretrieveupdatessre"
56
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/ccam"
67
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/chgm"
78
"github.com/openshift/configuration-anomaly-detection/pkg/investigations/clustermonitoringerrorbudgetburn"
@@ -22,6 +23,7 @@ var availableInvestigations = []investigation.Investigation{
2223
&insightsoperatordown.Investigation{},
2324
&upgradeconfigsyncfailureover4hr.Investigation{},
2425
&machinehealthcheckunterminatedshortcircuitsre.Investigation{},
26+
&cannotretrieveupdatessre.Investigation{},
2527
}
2628

2729
// GetInvestigation returns the first Investigation that applies to the given alert title.

test/generate_incident.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ declare -A alert_mapping=(
1010
["InsightsOperatorDown"]="InsightsOperatorDown"
1111
["MachineHealthCheckUnterminatedShortCircuitSRE"]="MachineHealthCheckUnterminatedShortCircuitSRE CRITICAL (1)"
1212
["ApiErrorBudgetBurn"]="api-ErrorBudgetBurn k8sgpt test CRITICAL (1)"
13+
["CannotRetrieveUpdatesSRE"]="CannotRetrieveUpdatesSRE"
1314
)
1415

1516
# Function to print help message

0 commit comments

Comments
 (0)