Skip to content

OCPBUGS-55638: OCPBUGS-55637: Add admin_acks handling in the MCO #5027

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/machine-config-operator/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ func runStartCmd(_ *cobra.Command, _ []string) {
ctrlctx.KubeNamespacedInformerFactory.Core().V1().Secrets(),
ctrlctx.OpenShiftConfigKubeNamespacedInformerFactory.Core().V1().Secrets(),
ctrlctx.OpenShiftConfigManagedKubeNamespacedInformerFactory.Core().V1().Secrets(),
ctrlctx.OpenShiftConfigManagedKubeNamespacedInformerFactory.Core().V1().ConfigMaps(),
ctrlctx.ConfigInformerFactory.Config().V1().ClusterOperators(),
ctrlctx.ClientBuilder.OperatorClientOrDie(componentName),
ctrlctx.OperatorInformerFactory.Operator().V1().MachineConfigurations(),
Expand Down
10 changes: 10 additions & 0 deletions pkg/apihelpers/apihelpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -486,3 +486,13 @@ func CheckNodeDisruptionActionsForTargetActions(actions []opv1.NodeDisruptionPol

return currentActions.HasAny(targetActions...)
}

// Returns a MachineConfiguration object with the cluster opted out of boot image updates.
func GetManagedBootImagesWithUpdateDisabled() opv1.ManagedBootImages {
return opv1.ManagedBootImages{MachineManagers: []opv1.MachineManager{}}
}

// Returns a MachineConfiguration object with an empty configuration; to be used in testing a situation where admin has no opinion.
func GetManagedBootImagesWithNoConfiguration() opv1.ManagedBootImages {
return opv1.ManagedBootImages{}
}
126 changes: 126 additions & 0 deletions pkg/operator/admin_ack.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package operator

import (
"context"
"fmt"
"reflect"

configv1 "github.com/openshift/api/config/v1"
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"
)

const (
BootImageAWSGCPKey = "ack-4.18-boot-image-opt-out-in-4.19"
BootImageAWSGCPMsg = "This cluster is GCP or AWS but lacks a boot image configuration. OCP will automatically opt this cluster into boot image management in 4.19. Please add a configuration to disable boot image updates if this is not desired. See https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/machine_configuration/mco-update-boot-images#mco-update-boot-images-disable_machine-configs-configure for more details."

AArch64BootImageKey = "ack-4.18-aarch64-bootloader-4.19"
AArch64BootImageMsg = "This cluster has at least one aarch64 node. Please ensure boot images are not too old by following KCS https://access.redhat.com/solutions/7119697 prior to upgrading to 4.19."

AdminAckGatesConfigMapName = "admin-gates"
)

// Syncs the admin-ack configmap in the openshift-config-managed namespace, and adds MCO specific keys if needed.
func (optr *Operator) syncAdminAckConfigMap() error {
cm, err := optr.ocManagedConfigMapLister.ConfigMaps(ctrlcommon.OpenshiftConfigManagedNamespace).Get(AdminAckGatesConfigMapName)
if err != nil {
return fmt.Errorf("error fetching configmap %s during sync: %w", AdminAckGatesConfigMapName, err)
}

newCM := cm.DeepCopy()

// Evaluate AWS/GCP boot image guards
_, bootImageAWSGCPKeyExists := newCM.Data[BootImageAWSGCPKey]
bootImageAWSGCPAdminGuardNeeded, err := optr.checkAWSGCPBootImageGuard()
if err != nil {
return fmt.Errorf("error determining aws/gcp boot image guard during configmap %s sync: %w", AdminAckGatesConfigMapName, err)
}
updateGuardKeyIfNeeded(bootImageAWSGCPAdminGuardNeeded, bootImageAWSGCPKeyExists, BootImageAWSGCPKey, BootImageAWSGCPMsg, newCM)

// Evaluate aarch64 guards
_, aarch64BootImageKeyExists := newCM.Data[AArch64BootImageKey]
aarch64BootImageAdminGuardNeeded, err := optr.checkAArch64BootImageGuard()
if err != nil {
return fmt.Errorf("error determining aarch64 boot image guard during configmap %s sync: %w", AdminAckGatesConfigMapName, err)
}
updateGuardKeyIfNeeded(aarch64BootImageAdminGuardNeeded, aarch64BootImageKeyExists, AArch64BootImageKey, AArch64BootImageMsg, newCM)

// Only send an API request if an update is needed
if !reflect.DeepEqual(cm.Data, newCM.Data) {
_, err = optr.kubeClient.CoreV1().ConfigMaps(ctrlcommon.OpenshiftConfigManagedNamespace).Update(context.TODO(), newCM, metav1.UpdateOptions{})
if err != nil {
return fmt.Errorf("error updating configmap %s during sync: %w", AdminAckGatesConfigMapName, err)
}
klog.Infof("%s configmap update successful", newCM.Name)
}

return nil
}

// checkAWSGCPBootImageGuard checks if AWS & GCP Boot Image admin guard is needed
func (optr *Operator) checkAWSGCPBootImageGuard() (bool, error) {

// First, check if the cluster is on AWS or GCP platform.
infra, err := optr.infraLister.Get("cluster")
if err != nil {
klog.Errorf("Could not get infra: %v", err)
return false, err
}
platforms := sets.New(configv1.GCPPlatformType, configv1.AWSPlatformType)
if !platforms.Has(infra.Status.PlatformStatus.Type) {
return false, nil
}

// Next, check if there is any boot image configuration.
mcop, err := optr.mcopLister.Get(ctrlcommon.MCOOperatorKnobsObjectName)
if err != nil {
klog.Errorf("Could not get MachineConfiguration object: %v", err)
return false, err
}

// If no machine managers exist, no configuration has been defined
return mcop.Spec.ManagedBootImages.MachineManagers == nil, nil
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is saying that if the user already has opted in in a previous version, we don't raise the admin ack?

(makes sense, just thinking out loud)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not necessarily opt-ed in, but if they have any sort of opinion(all/none/partial), we'll either remove the admin-ack if one exists, or otherwise a no-op

}

// checkAArch64BootImageGuard checks if aarch64 Boot Image admin guard is needed
func (optr *Operator) checkAArch64BootImageGuard() (bool, error) {

// Grab all current nodes
nodes, err := optr.nodeLister.List(labels.Everything())
if err != nil {
klog.Errorf("Could not get nodes: %v", err)
return false, err
}

// Check if any node is of the arm64 type (translates to aarch64 in RPM world)
for _, node := range nodes {
if node.Status.NodeInfo.Architecture == "arm64" {
return true, nil
}
}

return false, nil
}

// updateGuardKeyIfNeeded adds a key with a message depending on:
// If guard is needed and key doesn't exist => create it
// If guard is needed and key exists => nothing to do
// If guard is not needed and key exists => delete it
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is interesting, in the sense that we do clear stale admin acks, but only before you do the upgrade, since these syncs will be gone after 4.18.

(again, this is fine, just thinking out loud)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, so I wanted the user to able to clear the gate either by addressing it via the admin-ack configmap or via defining an boot image opinion. In 4.19, this should not have any effect per Trevor since these keys only target 4.18(but we can double check that again)

// If guard is not needed and key doesn't exist => nothing to do
func updateGuardKeyIfNeeded(guardNeeded, keyExists bool, key, msg string, cm *corev1.ConfigMap) {
if guardNeeded && !keyExists {
if cm.Data == nil {
cm.Data = map[string]string{}
}
cm.Data[key] = msg
klog.Infof("Adding key %s to configmap %s", key, cm.Name)
} else if !guardNeeded && keyExists {
delete(cm.Data, key)
klog.Infof("Deleting key %s from configmap %s", key, cm.Name)
}
}
154 changes: 154 additions & 0 deletions pkg/operator/admin_ack_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
package operator

import (
"context"
"testing"

configv1 "github.com/openshift/api/config/v1"
opv1 "github.com/openshift/api/operator/v1"
configlistersv1 "github.com/openshift/client-go/config/listers/config/v1"
mcoplistersv1 "github.com/openshift/client-go/operator/listers/operator/v1"
"github.com/openshift/machine-config-operator/pkg/apihelpers"
ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common"
"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
corelisterv1 "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
)

func TestAdminAck(t *testing.T) {
cases := []struct {
name string
infra *configv1.Infrastructure
mcop *opv1.MachineConfiguration
adminCM *corev1.ConfigMap
node *corev1.Node
expectBootImageGate bool
expectAArch64Gate bool
}{
{
name: "non AWS/GCP platform, with no boot image configuration",
infra: buildInfra(withPlatformType(configv1.AzurePlatformType)),
mcop: buildMachineConfigurationWithNoBootImageConfiguration(),
adminCM: buildAdminAckConfigMapWithData(nil),
},
{
name: "AWS platform, with no boot image configuration",
infra: buildInfra(withPlatformType(configv1.AWSPlatformType)),
mcop: buildMachineConfigurationWithNoBootImageConfiguration(),
adminCM: buildAdminAckConfigMapWithData(nil),
expectBootImageGate: true,
},
{
name: "GCP platform, with no boot image configuration",
infra: buildInfra(withPlatformType(configv1.GCPPlatformType)),
mcop: buildMachineConfigurationWithNoBootImageConfiguration(),
adminCM: buildAdminAckConfigMapWithData(nil),
expectBootImageGate: true,
},
{
name: "non AWS/GCP platform, with a boot image configuration",
infra: buildInfra(withPlatformType(configv1.AzurePlatformType)),
mcop: buildMachineConfigurationWithBootImageUpdateDisabled(),
adminCM: buildAdminAckConfigMapWithData(nil),
},
{
name: "AWS platform, with a boot image configuration",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These tests all have a "disabled" boot image configuration (empty machinemanagers), and thereby do not need the admin ack, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, disabled meaning the user wanted to opt out so no need to raise the alarm. I can add comments/make this more verbose if its not very clear in the current state 😄

infra: buildInfra(withPlatformType(configv1.AWSPlatformType)),
mcop: buildMachineConfigurationWithBootImageUpdateDisabled(),
adminCM: buildAdminAckConfigMapWithData(map[string]string{BootImageAWSGCPKey: BootImageAWSGCPMsg}),
},
{
name: "GCP platform, with a boot image configuration",
infra: buildInfra(withPlatformType(configv1.GCPPlatformType)),
mcop: buildMachineConfigurationWithBootImageUpdateDisabled(),
adminCM: buildAdminAckConfigMapWithData(map[string]string{BootImageAWSGCPKey: BootImageAWSGCPMsg}),
},
{
name: "aarch64 Nodes present",
infra: buildInfra(withPlatformType(configv1.GCPPlatformType)),
mcop: buildMachineConfigurationWithBootImageUpdateDisabled(),
adminCM: buildAdminAckConfigMapWithData(nil),
node: buildNodeWithArch("arm64"),
expectAArch64Gate: true,
},
{
name: "aarch64 Nodes not present",
infra: buildInfra(withPlatformType(configv1.GCPPlatformType)),
mcop: buildMachineConfigurationWithBootImageUpdateDisabled(),
adminCM: buildAdminAckConfigMapWithData(nil),
node: buildNodeWithArch("amd64"),
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()

// Create clients and listers
kubeClient := fake.NewSimpleClientset()
infraIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
infraIndexer.Add(tc.infra)
mcopIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
mcopIndexer.Add(tc.mcop)

configMapIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
configMapIndexer.Add(tc.adminCM)
_, err := kubeClient.CoreV1().ConfigMaps(ctrlcommon.OpenshiftConfigManagedNamespace).Create(context.TODO(), tc.adminCM, metav1.CreateOptions{})
assert.NoError(t, err)

nodeIndexer := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
if tc.node != nil {
nodeIndexer.Add(tc.node)
_, err = kubeClient.CoreV1().Nodes().Create(context.TODO(), tc.node, metav1.CreateOptions{})
assert.NoError(t, err)
}

optr := &Operator{
kubeClient: kubeClient,
infraLister: configlistersv1.NewInfrastructureLister(infraIndexer),
mcopLister: mcoplistersv1.NewMachineConfigurationLister(mcopIndexer),
ocManagedConfigMapLister: corelisterv1.NewConfigMapLister(configMapIndexer),
nodeLister: corelisterv1.NewNodeLister(nodeIndexer),
}
err = optr.syncAdminAckConfigMap()
assert.NoError(t, err)

adminCM, err := kubeClient.CoreV1().ConfigMaps(ctrlcommon.OpenshiftConfigManagedNamespace).Get(context.TODO(), AdminAckGatesConfigMapName, metav1.GetOptions{})
assert.NoError(t, err)

_, gateFound := adminCM.Data[BootImageAWSGCPKey]
assert.Equal(t, tc.expectBootImageGate, gateFound)

_, gateFound = adminCM.Data[AArch64BootImageKey]
assert.Equal(t, tc.expectAArch64Gate, gateFound)
})
}
}

func buildMachineConfigurationWithBootImageUpdateDisabled() *opv1.MachineConfiguration {
return &opv1.MachineConfiguration{Spec: opv1.MachineConfigurationSpec{ManagedBootImages: apihelpers.GetManagedBootImagesWithUpdateDisabled()}, ObjectMeta: metav1.ObjectMeta{Name: "cluster"}}
}

func buildMachineConfigurationWithNoBootImageConfiguration() *opv1.MachineConfiguration {
return &opv1.MachineConfiguration{Spec: opv1.MachineConfigurationSpec{ManagedBootImages: apihelpers.GetManagedBootImagesWithNoConfiguration()}, ObjectMeta: metav1.ObjectMeta{Name: "cluster"}}
}

func buildAdminAckConfigMapWithData(data map[string]string) *corev1.ConfigMap {
return &corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{Name: AdminAckGatesConfigMapName, Namespace: ctrlcommon.OpenshiftConfigManagedNamespace},
Data: data,
}
}

func buildNodeWithArch(arch string) *corev1.Node {
return &corev1.Node{
ObjectMeta: metav1.ObjectMeta{Name: "node-1"},
Status: corev1.NodeStatus{
NodeInfo: corev1.NodeSystemInfo{
Architecture: arch,
},
},
}
}
Loading