Skip to content

Commit

Permalink
Follow-on fixes for BIAv2 controller work
Browse files Browse the repository at this point in the history
Signed-off-by: Scott Seago <sseago@redhat.com>
  • Loading branch information
sseago committed Mar 13, 2023
1 parent 36163c9 commit dd63e81
Show file tree
Hide file tree
Showing 32 changed files with 629 additions and 569 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/5971-sseago
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Follow-on fixes for BIAv2 controller work
26 changes: 13 additions & 13 deletions config/crd/v1/bases/velero.io_backups.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -420,19 +420,19 @@ spec:
status:
description: BackupStatus captures the current status of a Velero backup.
properties:
asyncBackupItemOperationsAttempted:
description: AsyncBackupItemOperationsAttempted is the total number
of attempted async BackupItemAction operations for this backup.
backupItemOperationsAttempted:
description: BackupItemOperationsAttempted is the total number of
attempted async BackupItemAction operations for this backup.
type: integer
asyncBackupItemOperationsCompleted:
description: AsyncBackupItemOperationsCompleted is the total number
of successfully completed async BackupItemAction operations for
this backup.
backupItemOperationsCompleted:
description: BackupItemOperationsCompleted is the total number of
successfully completed async BackupItemAction operations for this
backup.
type: integer
asyncBackupItemOperationsFailed:
description: AsyncBackupItemOperationsFailed is the total number of
async BackupItemAction operations for this backup which ended with
an error.
backupItemOperationsFailed:
description: BackupItemOperationsFailed is the total number of async
BackupItemAction operations for this backup which ended with an
error.
type: integer
completionTimestamp:
description: CompletionTimestamp records the time a backup was completed.
Expand Down Expand Up @@ -476,8 +476,8 @@ spec:
- InProgress
- WaitingForPluginOperations
- WaitingForPluginOperationsPartiallyFailed
- FinalizingAfterPluginOperations
- FinalizingAfterPluginOperationsPartiallyFailed
- Finalizing
- FinalizingPartiallyFailed
- Completed
- PartiallyFailed
- Failed
Expand Down
2 changes: 1 addition & 1 deletion config/crd/v1/crds/crds.go

Large diffs are not rendered by default.

74 changes: 36 additions & 38 deletions design/general-progress-monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,11 +123,11 @@ from a plugin could cause a backup or restore to move to "PartiallyFailed". If
deleted (cancelled), the plug-ins will attempt to delete the snapshots and stop the data movement -
this may not be possible with all storage systems.

In addition, for backups (but not restores), there will also be two additional phases,
"FinalizingAfterPluginOperations" and "FinalizingAfterPluginOperationsPartiallyFailed", which will
handle any steps required after plugin operations have all completed. Initially, this will just
include adding any required resources to the backup that might have changed during asynchronous
operation execution, although eventually other cleanup actions could be added to this phase.
In addition, for backups (but not restores), there will also be two additional phases, "Finalizing"
and "FinalizingPartiallyFailed", which will handle any steps required after plugin operations have
all completed. Initially, this will just include adding any required resources to the backup that
might have changed during asynchronous operation execution, although eventually other cleanup
actions could be added to this phase.

### State progression

Expand Down Expand Up @@ -156,24 +156,24 @@ asynchronous plugin operations and no errors so far, "WaitingForPluginOperations
backups or restores which have unfinished asynchronous plugin operations and at least one error,
"Completed" for restores with no unfinished asynchronous plugin operations and no errors,
"PartiallyFailed" for restores with no unfinished asynchronous plugin operations and at least one
error, "FinalizingAfterPluginOperations" for backups with no unfinished asynchronous plugin
operations and no errors, "FinalizingAfterPluginOperationsPartiallyFailed" for backups with no
unfinished asynchronous plugin operations and at least one error, or "PartiallyFailed".
Backups/restores which would have a final phase of "Completed" or "PartiallyFailed" may move to the
"WaitingForPluginOperations" or "WaitingForPluginOperationsPartiallyFailed" state. A backup/restore
which will be marked "Failed" will go directly to the "Failed" phase. Uploads may continue in the
background for snapshots that were taken by a "Failed" backup/restore, but no progress will not be
monitored or updated. If there are any operations in progress when a backup is moved to the "Failed"
phase (although with the current workflow, that shouldn't happen), Cancel() should be called on
these operations. When a "Failed" backup is deleted, all snapshots will be deleted and at that point
any uploads still in progress should be aborted.
error, "Finalizing" for backups with no unfinished asynchronous plugin operations and no errors,
"FinalizingPartiallyFailed" for backups with no unfinished asynchronous plugin operations and at
least one error, or "PartiallyFailed". Backups/restores which would have a final phase of
"Completed" or "PartiallyFailed" may move to the "WaitingForPluginOperations" or
"WaitingForPluginOperationsPartiallyFailed" state. A backup/restore which will be marked "Failed"
will go directly to the "Failed" phase. Uploads may continue in the background for snapshots that
were taken by a "Failed" backup/restore, but progress will not be monitored or updated. If there
are any operations in progress when a backup is moved to the "Failed" phase (although with the
current workflow, that shouldn't happen), Cancel() should be called on these operations. When a
"Failed" backup is deleted, all snapshots will be deleted and at that point any uploads still in
progress should be aborted.

### WaitingForPluginOperations (new)
The "WaitingForPluginOperations" phase signifies that the main part of the backup/restore, including
snapshotting, has completed successfully and uploading and any other asynchronous BIA/RIA plugin
operations are continuing. In the event of an error during this phase, the phase will change to
WaitingForPluginOperationsPartiallyFailed. On success, the phase changes to
"FinalizingAfterPluginOperations" for backups and "Completed" for restores. Backups cannot be
"Finalizing" for backups and "Completed" for restores. Backups cannot be
restored from when they are in the WaitingForPluginOperations state.

### WaitingForPluginOperationsPartiallyFailed (new)
Expand All @@ -182,21 +182,19 @@ backup/restore, including snapshotting has completed, but there were partial fai
the main part or during any async operations, including snapshot uploads. Backups cannot be
restored from when they are in the WaitingForPluginOperationsPartiallyFailed state.

### FinalizingAfterPluginOperations (new)
The "FinalizingAfterPluginOperations" phase signifies that asynchronous backup operations have all
completed successfully and Velero is currently backing up any resources indicated by asynchronous
plugins as items to back up after operations complete. Once this is done, the phase changes to
Completed. Backups cannot be restored from when they are in the FinalizingAfterPluginOperations
state.
### Finalizing (new)
The "Finalizing" phase signifies that asynchronous backup operations have all completed successfully
and Velero is currently backing up any resources indicated by asynchronous plugins as items to back
up after operations complete. Once this is done, the phase changes to Completed. Backups cannot be
restored from when they are in the Finalizing state.

### FinalizingAfterPluginOperationsPartiallyFailed (new)
### FinalizingPartiallyFailed (new)

The "FinalizingAfterPluginOperationsPartiallyFailed" phase signifies that, for a backup which had
errors during initial processing or asynchronous plugin operation, asynchronous backup operations
have all completed and Velero is currently backing up any resources indicated by asynchronous
plugins as items to back up after operations complete. Once this is done, the phase changes to
PartiallyFailed. Backups cannot be restored from when they are in the
FinalizingAfterPluginOperationsPartiallyFailed state.
The "FinalizingPartiallyFailed" phase signifies that, for a backup which had errors during initial
processing or asynchronous plugin operation, asynchronous backup operations have all completed and
Velero is currently backing up any resources indicated by asynchronous plugins as items to back up
after operations complete. Once this is done, the phase changes to PartiallyFailed. Backups cannot
be restored from when they are in the FinalizingPartiallyFailed state.

### Failed
When a backup/restore has had fatal errors it is marked as "Failed". Backups in this state cannot be
Expand Down Expand Up @@ -244,14 +242,14 @@ WaitingForPluginOperationsPartiallyFailed phase, another backup/restore may be s
While in the WaitingForPluginOperations or WaitingForPluginOperationsPartiallyFailed phase, the
snapshots and item actions will be periodically polled. When all of the snapshots and item actions
have reported success, restores will move directly to the Completed or PartiallyFailed phase, and
backups will move to the FinalizingAfterPluginOperations or
FinalizingAfterPluginOperationsPartiallyFailed phase, depending on whether the backup/restore was in
the WaitingForPluginOperations or WaitingForPluginOperationsPartiallyFailed phase.
backups will move to the Finalizing or FinalizingPartiallyFailed phase, depending on whether the
backup/restore was in the WaitingForPluginOperations or WaitingForPluginOperationsPartiallyFailed
phase.

While in the FinalizingAfterPluginOperations or FinalizingAfterPluginOperationsPartiallyFailed
phase, Velero will update the backup with any resources indicated by plugins that they must be added
to the backup after operations are completed, and then the backup will move to the Completed or
PartiallyFailed phase, depending on whether there are any backup errors.
While in the Finalizing or FinalizingPartiallyFailed phase, Velero will update the backup with any
resources indicated by plugins that they must be added to the backup after operations are completed,
and then the backup will move to the Completed or PartiallyFailed phase, depending on whether there
are any backup errors.

The Backup resources will be written to object storage at the time the backup leaves the InProgress
phase, but it will not be synced to other clusters (or usable for restores in the current cluster)
Expand All @@ -272,7 +270,7 @@ ignored.
type OperationProgress struct {
Completed bool // True when the operation has completed, either successfully or with a failure
Err string // Set when the operation has failed
NCompleted, NTotal int64 // Quantity completed so far and the total quantity associated with the operaation in operationUnits
NCompleted, NTotal int64 // Quantity completed so far and the total quantity associated with the operation in operationUnits
// For data mover and volume snapshotter use cases, this would be in bytes
// On successful completion, completed and total should be the same.
OperationUnits string // Units represented by completed and total -- for data mover and item
Expand Down
22 changes: 11 additions & 11 deletions pkg/apis/velero/v1/backup_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ const (

// BackupPhase is a string representation of the lifecycle phase
// of a Velero backup.
// +kubebuilder:validation:Enum=New;FailedValidation;InProgress;WaitingForPluginOperations;WaitingForPluginOperationsPartiallyFailed;FinalizingAfterPluginOperations;FinalizingAfterPluginOperationsPartiallyFailed;Completed;PartiallyFailed;Failed;Deleting
// +kubebuilder:validation:Enum=New;FailedValidation;InProgress;WaitingForPluginOperations;WaitingForPluginOperationsPartiallyFailed;Finalizing;FinalizingPartiallyFailed;Completed;PartiallyFailed;Failed;Deleting
type BackupPhase string

const (
Expand Down Expand Up @@ -256,22 +256,22 @@ const (
// ongoing. The backup is not usable yet.
BackupPhaseWaitingForPluginOperationsPartiallyFailed BackupPhase = "WaitingForPluginOperationsPartiallyFailed"

// BackupPhaseFinalizingAfterPluginOperations means the backup of
// BackupPhaseFinalizing means the backup of
// Kubernetes resources, creation of snapshots, and other
// async plugin operations were successful and snapshot upload and
// other plugin operations are now complete, but the Backup is awaiting
// final update of resources modified during async operations.
// The backup is not usable yet.
BackupPhaseFinalizingAfterPluginOperations BackupPhase = "FinalizingAfterPluginOperations"
BackupPhaseFinalizing BackupPhase = "Finalizing"

// BackupPhaseFinalizingAfterPluginOperationsPartiallyFailed means the backup of
// BackupPhaseFinalizingPartiallyFailed means the backup of
// Kubernetes resources, creation of snapshots, and other
// async plugin operations were successful and snapshot upload and
// other plugin operations are now complete, but one or more errors
// occurred during backup or async operation processing, and the
// Backup is awaiting final update of resources modified during async
// operations. The backup is not usable yet.
BackupPhaseFinalizingAfterPluginOperationsPartiallyFailed BackupPhase = "FinalizingAfterPluginOperationsPartiallyFailed"
BackupPhaseFinalizingPartiallyFailed BackupPhase = "FinalizingPartiallyFailed"

// BackupPhaseCompleted means the backup has run successfully without
// errors.
Expand Down Expand Up @@ -374,20 +374,20 @@ type BackupStatus struct {
// +optional
CSIVolumeSnapshotsCompleted int `json:"csiVolumeSnapshotsCompleted,omitempty"`

// AsyncBackupItemOperationsAttempted is the total number of attempted
// BackupItemOperationsAttempted is the total number of attempted
// async BackupItemAction operations for this backup.
// +optional
AsyncBackupItemOperationsAttempted int `json:"asyncBackupItemOperationsAttempted,omitempty"`
BackupItemOperationsAttempted int `json:"backupItemOperationsAttempted,omitempty"`

// AsyncBackupItemOperationsCompleted is the total number of successfully completed
// BackupItemOperationsCompleted is the total number of successfully completed
// async BackupItemAction operations for this backup.
// +optional
AsyncBackupItemOperationsCompleted int `json:"asyncBackupItemOperationsCompleted,omitempty"`
BackupItemOperationsCompleted int `json:"backupItemOperationsCompleted,omitempty"`

// AsyncBackupItemOperationsFailed is the total number of async
// BackupItemOperationsFailed is the total number of async
// BackupItemAction operations for this backup which ended with an error.
// +optional
AsyncBackupItemOperationsFailed int `json:"asyncBackupItemOperationsFailed,omitempty"`
BackupItemOperationsFailed int `json:"backupItemOperationsFailed,omitempty"`
}

// BackupProgress stores information about the progress of a Backup's execution.
Expand Down
12 changes: 6 additions & 6 deletions pkg/backup/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,7 @@ func (kb *kubernetesBackupper) BackupWithResolvers(log logrus.FieldLogger,
}

func (kb *kubernetesBackupper) backupItem(log logrus.FieldLogger, gr schema.GroupResource, itemBackupper *itemBackupper, unstructured *unstructured.Unstructured, preferredGVR schema.GroupVersionResource) bool {
backedUpItem, err := itemBackupper.backupItem(log, unstructured, gr, preferredGVR, false)
backedUpItem, _, err := itemBackupper.backupItem(log, unstructured, gr, preferredGVR, false, false)
if aggregate, ok := err.(kubeerrs.Aggregate); ok {
log.WithField("name", unstructured.GetName()).Infof("%d errors encountered backup up item", len(aggregate.Errors()))
// log each error separately so we get error location info in the log, and an
Expand All @@ -441,7 +441,7 @@ func (kb *kubernetesBackupper) backupItem(log logrus.FieldLogger, gr schema.Grou
}

func (kb *kubernetesBackupper) finalizeItem(log logrus.FieldLogger, gr schema.GroupResource, itemBackupper *itemBackupper, unstructured *unstructured.Unstructured, preferredGVR schema.GroupVersionResource) (bool, []FileForArchive) {
backedUpItem, updateFiles, err := itemBackupper.finalizeItem(log, unstructured, gr, preferredGVR)
backedUpItem, updateFiles, err := itemBackupper.backupItem(log, unstructured, gr, preferredGVR, true, true)
if aggregate, ok := err.(kubeerrs.Aggregate); ok {
log.WithField("name", unstructured.GetName()).Infof("%d errors encountered backup up item", len(aggregate.Errors()))
// log each error separately so we get error location info in the log, and an
Expand Down Expand Up @@ -563,15 +563,15 @@ func (kb *kubernetesBackupper) FinalizeBackup(log logrus.FieldLogger,
pageSize: kb.clientPageSize,
}

// Get item list from itemoperation.BackupOperation.Spec.ItemsToUpdate
// Get item list from itemoperation.BackupOperation.Spec.PostOperationItems
var resourceIDs []velero.ResourceIdentifier
for _, operation := range asyncBIAOperations {
if len(operation.Spec.ItemsToUpdate) != 0 {
resourceIDs = append(resourceIDs, operation.Spec.ItemsToUpdate...)
if len(operation.Spec.PostOperationItems) != 0 {
resourceIDs = append(resourceIDs, operation.Spec.PostOperationItems...)
}
}
items := collector.getItemsFromResourceIdentifiers(resourceIDs)
log.WithField("progress", "").Infof("Collected %d items from the async BIA operations ItemsToUpdate list", len(items))
log.WithField("progress", "").Infof("Collected %d items from the async BIA operations PostOperationItems list", len(items))

itemBackupper := &itemBackupper{
backupRequest: backupRequest,
Expand Down
16 changes: 8 additions & 8 deletions pkg/backup/backup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1136,23 +1136,23 @@ func TestBackupResourceOrdering(t *testing.T) {
// to run for specific resources/namespaces and simply records the items
// that it is executed for.
type recordResourcesAction struct {
selector velero.ResourceSelector
ids []string
backups []velerov1.Backup
additionalItems []velero.ResourceIdentifier
operationID string
itemsToUpdate []velero.ResourceIdentifier
selector velero.ResourceSelector
ids []string
backups []velerov1.Backup
additionalItems []velero.ResourceIdentifier
operationID string
postOperationItems []velero.ResourceIdentifier
}

func (a *recordResourcesAction) Execute(item runtime.Unstructured, backup *velerov1.Backup) (runtime.Unstructured, []velero.ResourceIdentifier, string, []velero.ResourceIdentifier, error) {
metadata, err := meta.Accessor(item)
if err != nil {
return item, a.additionalItems, a.operationID, a.itemsToUpdate, err
return item, a.additionalItems, a.operationID, a.postOperationItems, err
}
a.ids = append(a.ids, kubeutil.NamespaceAndName(metadata))
a.backups = append(a.backups, *backup)

return item, a.additionalItems, a.operationID, a.itemsToUpdate, nil
return item, a.additionalItems, a.operationID, a.postOperationItems, nil
}

func (a *recordResourcesAction) AppliesTo() (velero.ResourceSelector, error) {
Expand Down
Loading

0 comments on commit dd63e81

Please sign in to comment.