Skip to content

Commit

Permalink
fix(manager): fix logic for when RWX workload is restarted after node…
Browse files Browse the repository at this point in the history
… failure

Signed-off-by: James Munson <james.munson@suse.com>
  • Loading branch information
james-munson committed Aug 30, 2024
1 parent 69f8cdb commit 8b1a9cd
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 30 deletions.
25 changes: 15 additions & 10 deletions controller/kubernetes_pod_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,19 +209,12 @@ func (kc *KubernetesPodController) handleWorkloadPodDeletionIfCSIPluginPodIsDown
return nil
}

storageNetworkSetting, err := kc.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
isStorageNetworkForRWXVolume, err := kc.ds.IsStorageNetworkForRWXVolume()
if err != nil {
log.WithError(err).Warnf("%s. Failed to get setting %v", logAbort, types.SettingNameStorageNetwork)
log.WithError(err).Warnf("%s. Failed to check isStorageNetwork", logAbort)
return nil
}

storageNetworkForRWXVolumeEnabled, err := kc.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
if err != nil {
log.WithError(err).Warnf("%s. Failed to get setting %v", logAbort, types.SettingNameStorageNetworkForRWXVolumeEnabled)
return nil
}

if !types.IsStorageNetworkForRWXVolume(storageNetworkSetting, storageNetworkForRWXVolumeEnabled) {
if !isStorageNetworkForRWXVolume {
return nil
}

Expand Down Expand Up @@ -476,6 +469,11 @@ func (kc *KubernetesPodController) handlePodDeletionIfVolumeRequestRemount(pod *
return err
}

isStorageNetworkForRWXVolume, err := kc.ds.IsStorageNetworkForRWXVolume()
if err != nil {
kc.logger.WithError(err).Warn("Failed to check isStorageNetwork, assuming not")
}

// Only delete pod which has startTime < vol.Status.RemountRequestAt AND timeNow > vol.Status.RemountRequestAt + delayDuration
// The delayDuration is to make sure that we don't repeatedly delete the pod too fast
// when vol.Status.RemountRequestAt is updated too quickly by volumeController
Expand All @@ -493,6 +491,13 @@ func (kc *KubernetesPodController) handlePodDeletionIfVolumeRequestRemount(pod *
if vol.Status.RemountRequestedAt == "" {
continue
}

// NFS clients can generally recover without a restart/remount when the NFS server restarts using the same Cluster IP.
// A remount is required when the storage network for RWX is in use because the new NFS server has a different IP.
if isRegularRWXVolume(vol) && !isStorageNetworkForRWXVolume {
continue
}

remountRequestedAt, err := time.Parse(time.RFC3339, vol.Status.RemountRequestedAt)
if err != nil {
return err
Expand Down
20 changes: 3 additions & 17 deletions controller/share_manager_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ func (c *ShareManagerController) syncShareManagerEndpoint(sm *longhorn.ShareMana
return nil
}

storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume()
if err != nil {
return err
}
Expand Down Expand Up @@ -1045,20 +1045,6 @@ func (c *ShareManagerController) getShareManagerTolerationsFromStorageClass(sc *
return tolerations
}

func (c *ShareManagerController) isStorageNetworkForRWXVolume() (bool, error) {
storageNetwork, err := c.ds.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
if err != nil {
return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetwork)
}

storageNetworkForRWXVolumeEnabled, err := c.ds.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
if err != nil {
return false, errors.Wrapf(err, "failed to get setting value %v", types.SettingNameStorageNetworkForRWXVolumeEnabled)
}

return types.IsStorageNetworkForRWXVolume(storageNetwork, storageNetworkForRWXVolumeEnabled), nil
}

func (c *ShareManagerController) checkStorageNetworkApplied() (bool, error) {
targetSettings := []types.SettingName{types.SettingNameStorageNetwork, types.SettingNameStorageNetworkForRWXVolumeEnabled}
for _, item := range targetSettings {
Expand Down Expand Up @@ -1091,7 +1077,7 @@ func (c *ShareManagerController) canCleanupService(shareManagerName string) (boo
return false, nil
}

storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume()
if err != nil {
return false, err
}
Expand Down Expand Up @@ -1362,7 +1348,7 @@ func (c *ShareManagerController) createServiceManifest(sm *longhorn.ShareManager

log := getLoggerForShareManager(c.logger, sm)

storageNetworkForRWXVolume, err := c.isStorageNetworkForRWXVolume()
storageNetworkForRWXVolume, err := c.ds.IsStorageNetworkForRWXVolume()
if err != nil {
log.WithError(err).Warnf("Failed to check storage network for RWX volume")
}
Expand Down
6 changes: 3 additions & 3 deletions controller/volume_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -4521,9 +4521,9 @@ func (c *VolumeController) ReconcileShareManagerState(volume *longhorn.Volume) e
log.Infof("Updated image for share manager from %v to %v", sm.Spec.Image, c.smImage)
}

// kill the workload pods, when the share manager goes into error state
// easiest approach is to set the RemountRequestedAt variable,
// since that is already responsible for killing the workload pods
// Give the workload pods a chance to restart when the share manager goes into error state.
// Easiest approach is to set the RemountRequestedAt variable. Pods will make that decision
// in the kubernetes_pod_controller.
if sm.Status.State == longhorn.ShareManagerStateError || sm.Status.State == longhorn.ShareManagerStateUnknown {
volume.Status.RemountRequestedAt = c.nowHandler()
msg := fmt.Sprintf("Volume %v requested remount at %v", volume.Name, volume.Status.RemountRequestedAt)
Expand Down
14 changes: 14 additions & 0 deletions datastore/longhorn.go
Original file line number Diff line number Diff line change
Expand Up @@ -5486,3 +5486,17 @@ func (s *DataStore) GetOneBackingImageReadyNodeDisk(backingImage *longhorn.Backi

return nil, "", fmt.Errorf("failed to find one ready backing image %v", backingImage.Name)
}

func (s *DataStore) IsStorageNetworkForRWXVolume() (bool, error) {
storageNetworkSetting, err := s.GetSettingWithAutoFillingRO(types.SettingNameStorageNetwork)
if err != nil {
return false, errors.Wrapf(err, "Failed to get setting %v", types.SettingNameStorageNetwork)
}

storageNetworkForRWXVolumeEnabled, err := s.GetSettingAsBool(types.SettingNameStorageNetworkForRWXVolumeEnabled)
if err != nil {
return false, errors.Wrapf(err, "Failed to get setting %v", types.SettingNameStorageNetworkForRWXVolumeEnabled)
}

return types.IsStorageNetworkForRWXVolume(storageNetworkSetting, storageNetworkForRWXVolumeEnabled), nil
}

0 comments on commit 8b1a9cd

Please sign in to comment.