-
Notifications
You must be signed in to change notification settings - Fork 436
OCPBUGS-55638: OCPBUGS-55637: Add admin_acks handling in the MCO #5027
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
package operator | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"reflect" | ||
|
||
configv1 "github.com/openshift/api/config/v1" | ||
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" | ||
|
||
corev1 "k8s.io/api/core/v1" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/apimachinery/pkg/labels" | ||
"k8s.io/apimachinery/pkg/util/sets" | ||
"k8s.io/klog/v2" | ||
) | ||
|
||
const ( | ||
BootImageAWSGCPKey = "ack-4.18-boot-image-opt-out-in-4.19" | ||
BootImageAWSGCPMsg = "This cluster is GCP or AWS but lacks a boot image configuration. OCP will automatically opt this cluster into boot image management in 4.19. Please add a configuration to disable boot image updates if this is not desired. See https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/machine_configuration/mco-update-boot-images#mco-update-boot-images-disable_machine-configs-configure for more details." | ||
|
||
AArch64BootImageKey = "ack-4.18-aarch64-bootloader-4.19" | ||
AArch64BootImageMsg = "This cluster has at least one aarch64 node. Please ensure boot images are not too old by following KCS https://access.redhat.com/solutions/7119697 prior to upgrading to 4.19." | ||
|
||
AdminAckGatesConfigMapName = "admin-gates" | ||
) | ||
|
||
// Syncs the admin-ack configmap in the openshift-config-managed namespace, and adds MCO specific keys if needed. | ||
func (optr *Operator) syncAdminAckConfigMap() error { | ||
cm, err := optr.ocManagedConfigMapLister.ConfigMaps(ctrlcommon.OpenshiftConfigManagedNamespace).Get(AdminAckGatesConfigMapName) | ||
if err != nil { | ||
return fmt.Errorf("error fetching configmap %s during sync: %w", AdminAckGatesConfigMapName, err) | ||
} | ||
|
||
newCM := cm.DeepCopy() | ||
|
||
// Evaluate AWS/GCP boot image guards | ||
_, bootImageAWSGCPKeyExists := newCM.Data[BootImageAWSGCPKey] | ||
bootImageAWSGCPAdminGuardNeeded, err := optr.checkAWSGCPBootImageGuard() | ||
if err != nil { | ||
return fmt.Errorf("error determining aws/gcp boot image guard during configmap %s sync: %w", AdminAckGatesConfigMapName, err) | ||
} | ||
updateGuardKeyIfNeeded(bootImageAWSGCPAdminGuardNeeded, bootImageAWSGCPKeyExists, BootImageAWSGCPKey, BootImageAWSGCPMsg, newCM) | ||
|
||
// Evaluate aarch64 guards | ||
_, aarch64BootImageKeyExists := newCM.Data[AArch64BootImageKey] | ||
aarch64BootImageAdminGuardNeeded, err := optr.checkAArch64BootImageGuard() | ||
if err != nil { | ||
return fmt.Errorf("error determining aarch64 boot image guard during configmap %s sync: %w", AdminAckGatesConfigMapName, err) | ||
} | ||
updateGuardKeyIfNeeded(aarch64BootImageAdminGuardNeeded, aarch64BootImageKeyExists, AArch64BootImageKey, AArch64BootImageMsg, newCM) | ||
|
||
// Only send an API request if an update is needed | ||
if !reflect.DeepEqual(cm.Data, newCM.Data) { | ||
_, err = optr.kubeClient.CoreV1().ConfigMaps(ctrlcommon.OpenshiftConfigManagedNamespace).Update(context.TODO(), newCM, metav1.UpdateOptions{}) | ||
if err != nil { | ||
return fmt.Errorf("error updating configmap %s during sync: %w", AdminAckGatesConfigMapName, err) | ||
} | ||
klog.Infof("%s configmap update successful", newCM.Name) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// checkAWSGCPBootImageGuard checks if AWS & GCP Boot Image admin guard is needed | ||
func (optr *Operator) checkAWSGCPBootImageGuard() (bool, error) { | ||
|
||
// First, check if the cluster is on AWS or GCP platform. | ||
infra, err := optr.infraLister.Get("cluster") | ||
if err != nil { | ||
klog.Errorf("Could not get infra: %v", err) | ||
return false, err | ||
} | ||
platforms := sets.New(configv1.GCPPlatformType, configv1.AWSPlatformType) | ||
if !platforms.Has(infra.Status.PlatformStatus.Type) { | ||
return false, nil | ||
} | ||
|
||
// Next, check if there is any boot image configuration. | ||
mcop, err := optr.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName) | ||
if err != nil { | ||
klog.Errorf("Could not get MachineConfiguration object: %v", err) | ||
return false, err | ||
} | ||
|
||
// If no machine managers exist, no configuration has been defined | ||
return mcop.Spec.ManagedBootImages.MachineManagers == nil, nil | ||
} | ||
|
||
// checkAArch64BootImageGuard checks if aarch64 Boot Image admin guard is needed | ||
func (optr *Operator) checkAArch64BootImageGuard() (bool, error) { | ||
|
||
// Grab all current nodes | ||
nodes, err := optr.nodeLister.List(labels.Everything()) | ||
if err != nil { | ||
klog.Errorf("Could not get nodes: %v", err) | ||
return false, err | ||
} | ||
|
||
// Check if any node is of the arm64 type (translates to aarch64 in RPM world) | ||
for _, node := range nodes { | ||
if node.Status.NodeInfo.Architecture == "arm64" { | ||
return true, nil | ||
} | ||
} | ||
|
||
return false, nil | ||
} | ||
|
||
// updateGuardKeyIfNeeded adds a key with a message depending on: | ||
// If guard is needed and key doesn't exist => create it | ||
// If guard is needed and key exists => nothing to do | ||
// If guard is not needed and key exists => delete it | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is interesting, in the sense that we do clear stale admin acks, but only before you do the upgrade, since these syncs will be gone after 4.18. (again, this is fine, just thinking out loud) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, so I wanted the user to able to clear the gate either by addressing it via the |
||
// If guard is not needed and key doesn't exist => nothing to do | ||
func updateGuardKeyIfNeeded(guardNeeded, keyExists bool, key, msg string, cm *corev1.ConfigMap) { | ||
if guardNeeded && !keyExists { | ||
if cm.Data == nil { | ||
cm.Data = map[string]string{} | ||
} | ||
cm.Data[key] = msg | ||
klog.Infof("Adding key %s to configmap %s", key, cm.Name) | ||
} else if !guardNeeded && keyExists { | ||
delete(cm.Data, key) | ||
klog.Infof("Deleting key %s from configmap %s", key, cm.Name) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
package operator | ||
|
||
import ( | ||
"context" | ||
"testing" | ||
|
||
configv1 "github.com/openshift/api/config/v1" | ||
opv1 "github.com/openshift/api/operator/v1" | ||
configlistersv1 "github.com/openshift/client-go/config/listers/config/v1" | ||
mcoplistersv1 "github.com/openshift/client-go/operator/listers/operator/v1" | ||
"github.com/openshift/machine-config-operator/pkg/apihelpers" | ||
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" | ||
"github.com/stretchr/testify/assert" | ||
corev1 "k8s.io/api/core/v1" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/client-go/kubernetes/fake" | ||
corelisterv1 "k8s.io/client-go/listers/core/v1" | ||
"k8s.io/client-go/tools/cache" | ||
) | ||
|
||
func TestAdminAck(t *testing.T) { | ||
cases := []struct { | ||
name string | ||
infra *configv1.Infrastructure | ||
mcop *opv1.MachineConfiguration | ||
adminCM *corev1.ConfigMap | ||
node *corev1.Node | ||
expectBootImageGate bool | ||
expectAArch64Gate bool | ||
}{ | ||
{ | ||
name: "non AWS/GCP platform, with no boot image configuration", | ||
infra: buildInfra(withPlatformType(configv1.AzurePlatformType)), | ||
mcop: buildMachineConfigurationWithNoBootImageConfiguration(), | ||
adminCM: buildAdminAckConfigMapWithData(nil), | ||
}, | ||
{ | ||
name: "AWS platform, with no boot image configuration", | ||
infra: buildInfra(withPlatformType(configv1.AWSPlatformType)), | ||
mcop: buildMachineConfigurationWithNoBootImageConfiguration(), | ||
adminCM: buildAdminAckConfigMapWithData(nil), | ||
expectBootImageGate: true, | ||
}, | ||
{ | ||
name: "GCP platform, with no boot image configuration", | ||
infra: buildInfra(withPlatformType(configv1.GCPPlatformType)), | ||
mcop: buildMachineConfigurationWithNoBootImageConfiguration(), | ||
adminCM: buildAdminAckConfigMapWithData(nil), | ||
expectBootImageGate: true, | ||
}, | ||
{ | ||
name: "non AWS/GCP platform, with a boot image configuration", | ||
infra: buildInfra(withPlatformType(configv1.AzurePlatformType)), | ||
mcop: buildMachineConfigurationWithBootImageUpdateDisabled(), | ||
adminCM: buildAdminAckConfigMapWithData(nil), | ||
}, | ||
{ | ||
name: "AWS platform, with a boot image configuration", | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These tests all have a "disabled" boot image configuration (empty machinemanagers), and thereby do not need the admin ack, right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, disabled meaning the user wanted to opt out so no need to raise the alarm. I can add comments/make this more verbose if its not very clear in the current state 😄 |
||
infra: buildInfra(withPlatformType(configv1.AWSPlatformType)), | ||
mcop: buildMachineConfigurationWithBootImageUpdateDisabled(), | ||
adminCM: buildAdminAckConfigMapWithData(map[string]string{BootImageAWSGCPKey: BootImageAWSGCPMsg}), | ||
}, | ||
{ | ||
name: "GCP platform, with a boot image configuration", | ||
infra: buildInfra(withPlatformType(configv1.GCPPlatformType)), | ||
mcop: buildMachineConfigurationWithBootImageUpdateDisabled(), | ||
adminCM: buildAdminAckConfigMapWithData(map[string]string{BootImageAWSGCPKey: BootImageAWSGCPMsg}), | ||
}, | ||
{ | ||
name: "aarch64 Nodes present", | ||
infra: buildInfra(withPlatformType(configv1.GCPPlatformType)), | ||
mcop: buildMachineConfigurationWithBootImageUpdateDisabled(), | ||
adminCM: buildAdminAckConfigMapWithData(nil), | ||
node: buildNodeWithArch("arm64"), | ||
expectAArch64Gate: true, | ||
}, | ||
{ | ||
name: "aarch64 Nodes not present", | ||
infra: buildInfra(withPlatformType(configv1.GCPPlatformType)), | ||
mcop: buildMachineConfigurationWithBootImageUpdateDisabled(), | ||
adminCM: buildAdminAckConfigMapWithData(nil), | ||
node: buildNodeWithArch("amd64"), | ||
}, | ||
} | ||
for _, tc := range cases { | ||
t.Run(tc.name, func(t *testing.T) { | ||
t.Parallel() | ||
|
||
// Create clients and listers | ||
kubeClient := fake.NewSimpleClientset() | ||
infraIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) | ||
infraIndexer.Add(tc.infra) | ||
mcopIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) | ||
mcopIndexer.Add(tc.mcop) | ||
|
||
configMapIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) | ||
configMapIndexer.Add(tc.adminCM) | ||
_, err := kubeClient.CoreV1().ConfigMaps(ctrlcommon.OpenshiftConfigManagedNamespace).Create(context.TODO(), tc.adminCM, metav1.CreateOptions{}) | ||
assert.NoError(t, err) | ||
|
||
nodeIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}) | ||
if tc.node != nil { | ||
nodeIndexer.Add(tc.node) | ||
_, err = kubeClient.CoreV1().Nodes().Create(context.TODO(), tc.node, metav1.CreateOptions{}) | ||
assert.NoError(t, err) | ||
} | ||
|
||
optr := &Operator{ | ||
kubeClient: kubeClient, | ||
infraLister: configlistersv1.NewInfrastructureLister(infraIndexer), | ||
mcopLister: mcoplistersv1.NewMachineConfigurationLister(mcopIndexer), | ||
ocManagedConfigMapLister: corelisterv1.NewConfigMapLister(configMapIndexer), | ||
nodeLister: corelisterv1.NewNodeLister(nodeIndexer), | ||
} | ||
err = optr.syncAdminAckConfigMap() | ||
assert.NoError(t, err) | ||
|
||
adminCM, err := kubeClient.CoreV1().ConfigMaps(ctrlcommon.OpenshiftConfigManagedNamespace).Get(context.TODO(), AdminAckGatesConfigMapName, metav1.GetOptions{}) | ||
assert.NoError(t, err) | ||
|
||
_, gateFound := adminCM.Data[BootImageAWSGCPKey] | ||
assert.Equal(t, tc.expectBootImageGate, gateFound) | ||
|
||
_, gateFound = adminCM.Data[AArch64BootImageKey] | ||
assert.Equal(t, tc.expectAArch64Gate, gateFound) | ||
}) | ||
} | ||
} | ||
|
||
func buildMachineConfigurationWithBootImageUpdateDisabled() *opv1.MachineConfiguration { | ||
return &opv1.MachineConfiguration{Spec: opv1.MachineConfigurationSpec{ManagedBootImages: apihelpers.GetManagedBootImagesWithUpdateDisabled()}, ObjectMeta: metav1.ObjectMeta{Name: "cluster"}} | ||
} | ||
|
||
func buildMachineConfigurationWithNoBootImageConfiguration() *opv1.MachineConfiguration { | ||
return &opv1.MachineConfiguration{Spec: opv1.MachineConfigurationSpec{ManagedBootImages: apihelpers.GetManagedBootImagesWithNoConfiguration()}, ObjectMeta: metav1.ObjectMeta{Name: "cluster"}} | ||
} | ||
|
||
func buildAdminAckConfigMapWithData(data map[string]string) *corev1.ConfigMap { | ||
return &corev1.ConfigMap{ | ||
ObjectMeta: metav1.ObjectMeta{Name: AdminAckGatesConfigMapName, Namespace: ctrlcommon.OpenshiftConfigManagedNamespace}, | ||
Data: data, | ||
} | ||
} | ||
|
||
func buildNodeWithArch(arch string) *corev1.Node { | ||
return &corev1.Node{ | ||
ObjectMeta: metav1.ObjectMeta{Name: "node-1"}, | ||
Status: corev1.NodeStatus{ | ||
NodeInfo: corev1.NodeSystemInfo{ | ||
Architecture: arch, | ||
}, | ||
}, | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So this is saying that if the user already has opted in in a previous version, we don't raise the admin ack?
(makes sense, just thinking out loud)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not necessarily opt-ed in, but if they have any sort of opinion(all/none/partial), we'll either remove the admin-ack if one exists, or otherwise a no-op