Merge pull request #4833 from ywk253100/220413_restart
Mark in-progress backups/restores as failed during reconciliation
sseago authored Apr 27, 2022
2 parents 40261dc + dfc8656 commit 58a8371
Showing 9 changed files with 99 additions and 19 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/4833-ywk253100
@@ -0,0 +1 @@
Mark in-progress backups/restores as failed during reconciliation to avoid them hanging in the InProgress status forever
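In outline, both controllers now treat an InProgress backup or restore found at reconcile time as the leftover of an interrupted run and fail it. A minimal, self-contained sketch of that pattern (the types, constants, and function names below are illustrative only, not Velero's actual API):

```go
package main

import (
	"fmt"
	"time"
)

// Illustrative phase values; Velero's real constants live in pkg/apis/velero/v1.
const (
	phaseNew        = "New"
	phaseInProgress = "InProgress"
	phaseFailed     = "Failed"
)

// item stands in for a Backup or Restore object.
type item struct {
	Phase               string
	FailureReason       string
	CompletionTimestamp time.Time
}

// reconcile fails any item that is still marked InProgress, since finding one
// at reconcile time means a previous run was interrupted (for example by a
// controller restart) or its final status update never landed.
func reconcile(it *item) {
	switch it.Phase {
	case "", phaseNew:
		// Normal path: start processing the new item.
	case phaseInProgress:
		it.Phase = phaseFailed
		it.FailureReason = "controller restarted while the operation was in progress"
		it.CompletionTimestamp = time.Now()
	default:
		// Completed, Failed, etc.: nothing to do.
	}
}

func main() {
	stale := &item{Phase: phaseInProgress}
	reconcile(stale)
	fmt.Println(stale.Phase+":", stale.FailureReason)
}
```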
4 changes: 4 additions & 0 deletions config/crd/v1/bases/velero.io_backups.yaml
@@ -372,6 +372,10 @@ spec:
format: date-time
nullable: true
type: string
failureReason:
description: FailureReason is an error that caused the entire backup
to fail.
type: string
formatVersion:
description: FormatVersion is the backup format version, including
major, minor, and patch version.
2 changes: 1 addition & 1 deletion config/crd/v1/crds/crds.go

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions pkg/apis/velero/v1/backup.go
@@ -292,6 +292,10 @@ type BackupStatus struct {
// +optional
VolumeSnapshotsCompleted int `json:"volumeSnapshotsCompleted,omitempty"`

// FailureReason is an error that caused the entire backup to fail.
// +optional
FailureReason string `json:"failureReason,omitempty"`

// Warnings is a count of all warning messages that were generated during
// execution of the backup. The actual warnings are in the backup's log
// file in object storage.
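With the new field on BackupStatus, callers that hold a *velerov1api.Backup can report why a backup failed straight from its status. A small sketch of such a consumer, assuming Velero's usual module path for the API package; the helper name is made up for illustration:

```go
package example

import (
	"fmt"

	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
)

// describeBackupFailure is a hypothetical helper that turns the backup's
// phase and the new FailureReason field into a human-readable summary.
func describeBackupFailure(backup *velerov1api.Backup) string {
	if backup.Status.Phase != velerov1api.BackupPhaseFailed {
		return fmt.Sprintf("backup %s is in phase %s", backup.Name, backup.Status.Phase)
	}
	if backup.Status.FailureReason == "" {
		return fmt.Sprintf("backup %s failed (no failure reason recorded)", backup.Name)
	}
	return fmt.Sprintf("backup %s failed: %s", backup.Name, backup.Status.FailureReason)
}
```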
24 changes: 20 additions & 4 deletions pkg/controller/backup_controller.go
@@ -147,13 +147,12 @@ func NewBackupController(
backup := obj.(*velerov1api.Backup)

switch backup.Status.Phase {
case "", velerov1api.BackupPhaseNew:
// only process new backups
case "", velerov1api.BackupPhaseNew, velerov1api.BackupPhaseInProgress:
default:
c.logger.WithFields(logrus.Fields{
"backup": kubeutil.NamespaceAndName(backup),
"phase": backup.Status.Phase,
}).Debug("Backup is not new, skipping")
}).Debug("Backup is not new or in-progress, skipping")
return
}

@@ -241,7 +240,22 @@ func (c *backupController) processBackup(key string) error {
// this key (even though it was a no-op).
switch original.Status.Phase {
case "", velerov1api.BackupPhaseNew:
// only process new backups
case velerov1api.BackupPhaseInProgress:
// A backup may remain in-progress forever if
// 1) the controller restarts while the backup is being processed, or
// 2) the in-progress backup is never successfully updated to the Completed or Failed phase.
// Mark such backups as Failed so they don't hang in InProgress.
updated := original.DeepCopy()
updated.Status.Phase = velerov1api.BackupPhaseFailed
updated.Status.FailureReason = fmt.Sprintf("got a Backup with unexpected status %q, this may be due to a restart of the controller during the backing up, mark it as %q",
velerov1api.BackupPhaseInProgress, updated.Status.Phase)
updated.Status.CompletionTimestamp = &metav1.Time{Time: c.clock.Now()}
_, err = patchBackup(original, updated, c.client)
if err != nil {
return errors.Wrapf(err, "error updating Backup status to %s", updated.Status.Phase)
}
log.Warn(updated.Status.FailureReason)
return nil
default:
return nil
}
@@ -261,6 +275,7 @@ func (c *backupController) processBackup(key string) error {
if err != nil {
return errors.Wrapf(err, "error updating Backup status to %s", request.Status.Phase)
}

// store ref to just-updated item for creating patch
original = updatedBackup
request.Backup = updatedBackup.DeepCopy()
@@ -287,6 +302,7 @@ func (c *backupController) processBackup(key string) error {
// result in the backup being Failed.
log.WithError(err).Error("backup failed")
request.Status.Phase = velerov1api.BackupPhaseFailed
request.Status.FailureReason = err.Error()
}

switch request.Status.Phase {
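The new InProgress branch relies on patchBackup, whose body is not part of this diff. Helpers like this are typically written as a two-object JSON merge patch: marshal the original and updated objects, diff them into a patch, and send it with the typed client. A sketch of that pattern under those assumptions (not necessarily Velero's exact implementation; the clientset import path and function name are assumed for illustration):

```go
package example

import (
	"context"
	"encoding/json"

	jsonpatch "github.com/evanphx/json-patch"
	"github.com/pkg/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"

	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
	velerov1client "github.com/vmware-tanzu/velero/pkg/generated/clientset/versioned/typed/velero/v1"
)

// patchBackupSketch builds a JSON merge patch from the original and updated
// objects so that only the changed fields (here the status) go over the wire.
func patchBackupSketch(original, updated *velerov1api.Backup, client velerov1client.BackupsGetter) (*velerov1api.Backup, error) {
	origBytes, err := json.Marshal(original)
	if err != nil {
		return nil, errors.Wrap(err, "error marshalling original backup")
	}

	updatedBytes, err := json.Marshal(updated)
	if err != nil {
		return nil, errors.Wrap(err, "error marshalling updated backup")
	}

	patchBytes, err := jsonpatch.CreateMergePatch(origBytes, updatedBytes)
	if err != nil {
		return nil, errors.Wrap(err, "error creating json merge patch for backup")
	}

	return client.Backups(original.Namespace).Patch(context.TODO(), original.Name, types.MergePatchType, patchBytes, metav1.PatchOptions{})
}
```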
29 changes: 24 additions & 5 deletions pkg/controller/backup_controller_test.go
@@ -94,11 +94,6 @@ func TestProcessBackupNonProcessedItems(t *testing.T) {
key: "velero/backup-1",
backup: defaultBackup().Phase(velerov1api.BackupPhaseFailedValidation).Result(),
},
{
name: "InProgress backup is not processed",
key: "velero/backup-1",
backup: defaultBackup().Phase(velerov1api.BackupPhaseInProgress).Result(),
},
{
name: "Completed backup is not processed",
key: "velero/backup-1",
@@ -140,6 +135,28 @@
}
}

func TestMarkInProgressBackupAsFailed(t *testing.T) {
backup := defaultBackup().Phase(velerov1api.BackupPhaseInProgress).Result()
clientset := fake.NewSimpleClientset(backup)
sharedInformers := informers.NewSharedInformerFactory(clientset, 0)
logger := logging.DefaultLogger(logrus.DebugLevel, logging.FormatText)

c := &backupController{
genericController: newGenericController("backup-test", logger),
client: clientset.VeleroV1(),
lister: sharedInformers.Velero().V1().Backups().Lister(),
clock: &clock.RealClock{},
}
require.NoError(t, sharedInformers.Velero().V1().Backups().Informer().GetStore().Add(backup))

err := c.processBackup(fmt.Sprintf("%s/%s", backup.Namespace, backup.Name))
require.Nil(t, err)

res, err := clientset.VeleroV1().Backups(backup.Namespace).Get(context.TODO(), backup.Name, metav1.GetOptions{})
require.NoError(t, err)
assert.Equal(t, velerov1api.BackupPhaseFailed, res.Status.Phase)
}

func TestProcessBackupValidationFailures(t *testing.T) {
defaultBackupLocation := builder.ForBackupStorageLocation("velero", "loc-1").Result()

@@ -729,6 +746,7 @@ func TestProcessBackupCompletions(t *testing.T) {
},
Status: velerov1api.BackupStatus{
Phase: velerov1api.BackupPhaseFailed,
FailureReason: "backup already exists in object storage",
Version: 1,
FormatVersion: "1.1.0",
StartTimestamp: &timestamp,
@@ -766,6 +784,7 @@ func TestProcessBackupCompletions(t *testing.T) {
},
Status: velerov1api.BackupStatus{
Phase: velerov1api.BackupPhaseFailed,
FailureReason: "error checking if backup already exists in object storage: Backup already exists in object storage",
Version: 1,
FormatVersion: "1.1.0",
StartTimestamp: &timestamp,
21 changes: 17 additions & 4 deletions pkg/controller/restore_controller.go
@@ -144,13 +144,12 @@ func NewRestoreController(
restore := obj.(*api.Restore)

switch restore.Status.Phase {
case "", api.RestorePhaseNew:
// only process new restores
case "", api.RestorePhaseNew, api.RestorePhaseInProgress:
default:
c.logger.WithFields(logrus.Fields{
"restore": kubeutil.NamespaceAndName(restore),
"phase": restore.Status.Phase,
}).Debug("Restore is not new, skipping")
}).Debug("Restore is not new or in-progress, skipping")
return
}

@@ -202,7 +201,21 @@ func (c *restoreController) processQueueItem(key string) error {
// is ("" | New)
switch restore.Status.Phase {
case "", api.RestorePhaseNew:
// only process new restores
case api.RestorePhaseInProgress:
// A restore may remain in-progress forever if
// 1) the controller restarts while the restore is being processed, or
// 2) the in-progress restore is never successfully updated to the Completed or Failed phase.
// Mark such restores as Failed so they don't hang in InProgress.
updated := restore.DeepCopy()
updated.Status.Phase = api.RestorePhaseFailed
updated.Status.FailureReason = fmt.Sprintf("got a Restore with unexpected status %q, this may be due to a restart of the controller during the restore, mark it as %q",
api.RestorePhaseInProgress, updated.Status.Phase)
_, err = patchRestore(restore, updated, c.restoreClient)
if err != nil {
return errors.Wrapf(err, "error updating Restore status to %s", updated.Status.Phase)
}
log.Warn(updated.Status.FailureReason)
return nil
default:
return nil
}
31 changes: 26 additions & 5 deletions pkg/controller/restore_controller_test.go
@@ -20,6 +20,7 @@ import (
"bytes"
"context"
"encoding/json"
"fmt"
"io/ioutil"
"testing"
"time"
@@ -170,11 +171,6 @@ func TestProcessQueueItemSkips(t *testing.T) {
restoreKey: "foo/bar",
expectError: true,
},
{
name: "restore with phase InProgress does not get processed",
restoreKey: "foo/bar",
restore: builder.ForRestore("foo", "bar").Phase(velerov1api.RestorePhaseInProgress).Result(),
},
{
name: "restore with phase Completed does not get processed",
restoreKey: "foo/bar",
@@ -226,6 +222,31 @@
}
}

func TestMarkInProgressRestoreAsFailed(t *testing.T) {
var (
restore = builder.ForRestore("velero", "bar").Phase(velerov1api.RestorePhaseInProgress).Result()
client = fake.NewSimpleClientset(restore)
sharedInformers = informers.NewSharedInformerFactory(client, 0)
logger = velerotest.NewLogger()
)

c := restoreController{
genericController: newGenericController("restore-test", logger),
restoreClient: client.VeleroV1(),
restoreLister: sharedInformers.Velero().V1().Restores().Lister(),
}

err := sharedInformers.Velero().V1().Restores().Informer().GetStore().Add(restore)
require.Nil(t, err)

err = c.processQueueItem(fmt.Sprintf("%s/%s", restore.Namespace, restore.Name))
require.Nil(t, err)

res, err := c.restoreClient.Restores(restore.Namespace).Get(context.Background(), restore.Name, metav1.GetOptions{})
require.Nil(t, err)
assert.Equal(t, velerov1api.RestorePhaseFailed, res.Status.Phase)
}

func TestProcessQueueItem(t *testing.T) {

defaultStorageLocation := builder.ForBackupStorageLocation("velero", "default").Provider("myCloud").Bucket("bucket").Result()
2 changes: 2 additions & 0 deletions site/content/docs/main/api-types/backup.md
@@ -144,5 +144,7 @@ status:
warnings: 2
# Number of errors that were logged by the backup.
errors: 0
# An error that caused the entire backup to fail.
failureReason: ""

```
