From 2411a2eb06a0fcca343721fb84d0727130921aaf Mon Sep 17 00:00:00 2001 From: Vicente Cheng Date: Thu, 22 Aug 2024 00:22:10 +0800 Subject: [PATCH] fix(networking): cleanup service/endpoint if needed We meet a corner case that the service/endpoint would not be cleanup. That will cause the service keep the ClusterIP `None`. With this config, the endpoint of sharemanager would not correct. So the CSI driver cannot perform the mountpoint well. We would like to have a checking mechanism to know if the service/ endpoint did not cleanup. Then we will cleanup the service/endpoint to ensure the correct endpoint. Remove the cleanup function in the setting controller, we could do the cleanup on the sm controller Signed-off-by: Vicente Cheng --- controller/setting_controller.go | 40 --------------- controller/share_manager_controller.go | 68 ++++++++++++++++++++++++++ datastore/longhorn.go | 8 +++ 3 files changed, 76 insertions(+), 40 deletions(-) diff --git a/controller/setting_controller.go b/controller/setting_controller.go index 0ce531efeb..9dc236e7b4 100644 --- a/controller/setting_controller.go +++ b/controller/setting_controller.go @@ -359,14 +359,6 @@ func (sc *SettingController) syncDangerZoneSettingsForManagedComponents(settingN return &types.ErrorInvalidState{Reason: fmt.Sprintf("failed to apply %v setting to Longhorn components when there are attached volumes. It will be eventually applied", types.SettingNameStorageNetworkForRWXVolumeEnabled)} } - // Perform cleanup of the share manager Service - // This is to allow the creation of the correct Service - // and Endpoint when switching between cluster network - // and storage network. - if err := sc.cleanupShareManagerServiceAndEndpoints(); err != nil { - return err - } - return nil } @@ -943,38 +935,6 @@ func (sc *SettingController) updateKubernetesClusterAutoscalerEnabled() error { return nil } -func (sc *SettingController) cleanupShareManagerServiceAndEndpoints() error { - var err error - defer func() { - if err != nil { - err = errors.Wrapf(err, "failed to cleanup share manager service and endpoints for %s setting update", types.SettingNameStorageNetworkForRWXVolumeEnabled) - } - }() - - shareManagers, err := sc.ds.ListShareManagers() - if err != nil { - return err - } - - for _, shareManager := range shareManagers { - log := sc.logger.WithField("shareManager", shareManager.Name) - - log.WithField("service", shareManager.Name).Infof("Deleting Service for %v setting update", types.SettingNameStorageNetworkForRWXVolumeEnabled) - err := sc.ds.DeleteService(shareManager.Namespace, shareManager.Name) - if err != nil && !apierrors.IsNotFound(err) { - return err - } - - log.WithField("endpoint", shareManager.Name).Infof("Deleting Endpoint for %v setting update", types.SettingNameStorageNetworkForRWXVolumeEnabled) - err = sc.ds.DeleteKubernetesEndpoint(shareManager.Namespace, shareManager.Name) - if err != nil && !apierrors.IsNotFound(err) { - return err - } - } - - return nil -} - // updateCNI deletes all system-managed data plane components immediately with the updated CNI annotation. func (sc *SettingController) updateCNI(funcPreupdate func() error) error { storageNetwork, err := sc.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork) diff --git a/controller/share_manager_controller.go b/controller/share_manager_controller.go index f14ff90934..6177b9c172 100644 --- a/controller/share_manager_controller.go +++ b/controller/share_manager_controller.go @@ -1050,6 +1050,69 @@ func (c *ShareManagerController) getShareManagerTolerationsFromStorageClass(sc * return tolerations } +func (c *ShareManagerController) checkStorageNetworkApplied() (bool, error) { + targetSettings := []types.SettingName{types.SettingNameStorageNetwork, types.SettingNameStorageNetworkForRWXVolumeEnabled} + for _, item := range targetSettings { + if applied, err := c.ds.GetSettingApplied(item); err != nil || !applied { + return applied, err + } + } + return true, nil +} + +func (c *ShareManagerController) cleanupServiceAndEndpoint(shareManager *longhorn.ShareManager) error { + service, err := c.ds.GetService(c.namespace, shareManager.Name) + if err != nil { + // if NotFound, means the service/endpoint is already cleaned up + if apierrors.IsNotFound(err) { + return nil + } + return errors.Wrapf(err, "failed to get service for share manager %v", shareManager.Name) + } + + applied, err := c.checkStorageNetworkApplied() + if err != nil { + return errors.Wrapf(err, "failed to check if the storage network setting is applied") + } + if !applied { + c.logger.Warnf("`StorageNetwork` related settings are not applied, do nothing.") + return nil + } + + storageNetwork, err := c.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork) + if err != nil { + return errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetwork) + } + + storageNetworkForRWXVolumeEnabled, err := c.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled) + if err != nil { + return errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetworkForRWXVolumeEnabled) + } + + // no need to cleanup because looks the service file is correct + if types.IsStorageNetworkForRWXVolume(storageNetwork, storageNetworkForRWXVolumeEnabled) && service.Spec.ClusterIP == core.ClusterIPNone { + return nil + } + if !types.IsStorageNetworkForRWXVolume(storageNetwork, storageNetworkForRWXVolumeEnabled) && service.Spec.ClusterIP != core.ClusterIPNone { + return nil + } + + // let's cleanup + c.logger.Infof("Deleting Service for sharemanager %v", shareManager.Name) + err = c.ds.DeleteService(c.namespace, shareManager.Name) + if err != nil && !apierrors.IsNotFound(err) { + return errors.Wrapf(err, "failed to delete service for share manager %v", shareManager.Name) + } + + c.logger.Infof("Deleting Endpoint for sharemanager %v", shareManager.Name) + err = c.ds.DeleteKubernetesEndpoint(c.namespace, shareManager.Name) + if err != nil && !apierrors.IsNotFound(err) { + return errors.Wrapf(err, "failed to delete Endpoint for share manager %v", shareManager.Name) + } + + return nil +} + func (c *ShareManagerController) createServiceAndEndpoint(shareManager *longhorn.ShareManager) error { // check if we need to create the service _, err := c.ds.GetService(c.namespace, shareManager.Name) @@ -1122,6 +1185,11 @@ func (c *ShareManagerController) createShareManagerPod(sm *longhorn.ShareManager } priorityClass := setting.Value + err = c.cleanupServiceAndEndpoint(sm) + if err != nil { + return nil, errors.Wrapf(err, "failed to cleanup service and endpoint for share manager %v", sm.Name) + } + err = c.createServiceAndEndpoint(sm) if err != nil { return nil, errors.Wrapf(err, "failed to create service and endpoint for share manager %v", sm.Name) diff --git a/datastore/longhorn.go b/datastore/longhorn.go index 5443653cf3..663c07ebf1 100644 --- a/datastore/longhorn.go +++ b/datastore/longhorn.go @@ -619,6 +619,14 @@ func (s *DataStore) GetSettingExactRO(sName types.SettingName) (*longhorn.Settin return resultRO, nil } +func (s *DataStore) GetSettingApplied(sName types.SettingName) (bool, error) { + resultRO, err := s.getSettingRO(string(sName)) + if err != nil { + return false, err + } + return resultRO.Status.Applied, nil +} + // GetSetting will automatically fill the non-existing setting if it's a valid // setting name. // The function will not return nil for *longhorn.Setting when error is nil