Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make in-progress backup/restore as failed when doing the reconcile #4833

Merged
merged 1 commit into from
Apr 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelogs/unreleased/4833-ywk253100
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Make in-progress backup/restore as failed when doing the reconcile to avoid hanging in in-progress status
4 changes: 4 additions & 0 deletions config/crd/v1/bases/velero.io_backups.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,10 @@ spec:
format: date-time
nullable: true
type: string
failureReason:
description: FailureReason is an error that caused the entire backup
to fail.
type: string
formatVersion:
description: FormatVersion is the backup format version, including
major, minor, and patch version.
Expand Down
2 changes: 1 addition & 1 deletion config/crd/v1/crds/crds.go

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions pkg/apis/velero/v1/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,10 @@ type BackupStatus struct {
// +optional
VolumeSnapshotsCompleted int `json:"volumeSnapshotsCompleted,omitempty"`

// FailureReason is an error that caused the entire backup to fail.
// +optional
FailureReason string `json:"failureReason,omitempty"`

// Warnings is a count of all warning messages that were generated during
// execution of the backup. The actual warnings are in the backup's log
// file in object storage.
Expand Down
24 changes: 20 additions & 4 deletions pkg/controller/backup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,13 +147,12 @@ func NewBackupController(
backup := obj.(*velerov1api.Backup)

switch backup.Status.Phase {
case "", velerov1api.BackupPhaseNew:
// only process new backups
case "", velerov1api.BackupPhaseNew, velerov1api.BackupPhaseInProgress:
default:
c.logger.WithFields(logrus.Fields{
"backup": kubeutil.NamespaceAndName(backup),
"phase": backup.Status.Phase,
}).Debug("Backup is not new, skipping")
}).Debug("Backup is not new or in-progress, skipping")
return
}

Expand Down Expand Up @@ -241,7 +240,22 @@ func (c *backupController) processBackup(key string) error {
// this key (even though it was a no-op).
switch original.Status.Phase {
case "", velerov1api.BackupPhaseNew:
// only process new backups
case velerov1api.BackupPhaseInProgress:
// A backup may stay in-progress forever because of
// 1) the controller restarts during the processing of a backup
// 2) the backup with in-progress status isn't updated to completed or failed status successfully
// So we try to mark such Backups as failed to avoid it
updated := original.DeepCopy()
updated.Status.Phase = velerov1api.BackupPhaseFailed
updated.Status.FailureReason = fmt.Sprintf("got a Backup with unexpected status %q, this may be due to a restart of the controller during the backing up, mark it as %q",
velerov1api.BackupPhaseInProgress, updated.Status.Phase)
updated.Status.CompletionTimestamp = &metav1.Time{Time: c.clock.Now()}
_, err = patchBackup(original, updated, c.client)
if err != nil {
return errors.Wrapf(err, "error updating Backup status to %s", updated.Status.Phase)
}
log.Warn(updated.Status.FailureReason)
return nil
default:
return nil
}
Expand All @@ -261,6 +275,7 @@ func (c *backupController) processBackup(key string) error {
if err != nil {
return errors.Wrapf(err, "error updating Backup status to %s", request.Status.Phase)
}

// store ref to just-updated item for creating patch
original = updatedBackup
request.Backup = updatedBackup.DeepCopy()
Expand All @@ -287,6 +302,7 @@ func (c *backupController) processBackup(key string) error {
// result in the backup being Failed.
log.WithError(err).Error("backup failed")
request.Status.Phase = velerov1api.BackupPhaseFailed
request.Status.FailureReason = err.Error()
}

switch request.Status.Phase {
Expand Down
29 changes: 24 additions & 5 deletions pkg/controller/backup_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,6 @@ func TestProcessBackupNonProcessedItems(t *testing.T) {
key: "velero/backup-1",
backup: defaultBackup().Phase(velerov1api.BackupPhaseFailedValidation).Result(),
},
{
name: "InProgress backup is not processed",
key: "velero/backup-1",
backup: defaultBackup().Phase(velerov1api.BackupPhaseInProgress).Result(),
},
{
name: "Completed backup is not processed",
key: "velero/backup-1",
Expand Down Expand Up @@ -140,6 +135,28 @@ func TestProcessBackupNonProcessedItems(t *testing.T) {
}
}

func TestMarkInProgressBackupAsFailed(t *testing.T) {
backup := defaultBackup().Phase(velerov1api.BackupPhaseInProgress).Result()
clientset := fake.NewSimpleClientset(backup)
sharedInformers := informers.NewSharedInformerFactory(clientset, 0)
logger := logging.DefaultLogger(logrus.DebugLevel, logging.FormatText)

c := &backupController{
genericController: newGenericController("backup-test", logger),
client: clientset.VeleroV1(),
lister: sharedInformers.Velero().V1().Backups().Lister(),
clock: &clock.RealClock{},
}
require.NoError(t, sharedInformers.Velero().V1().Backups().Informer().GetStore().Add(backup))

err := c.processBackup(fmt.Sprintf("%s/%s", backup.Namespace, backup.Name))
require.Nil(t, err)

res, err := clientset.VeleroV1().Backups(backup.Namespace).Get(context.TODO(), backup.Name, metav1.GetOptions{})
require.NoError(t, err)
assert.Equal(t, velerov1api.BackupPhaseFailed, res.Status.Phase)
}

func TestProcessBackupValidationFailures(t *testing.T) {
defaultBackupLocation := builder.ForBackupStorageLocation("velero", "loc-1").Result()

Expand Down Expand Up @@ -729,6 +746,7 @@ func TestProcessBackupCompletions(t *testing.T) {
},
Status: velerov1api.BackupStatus{
Phase: velerov1api.BackupPhaseFailed,
FailureReason: "backup already exists in object storage",
Version: 1,
FormatVersion: "1.1.0",
StartTimestamp: &timestamp,
Expand Down Expand Up @@ -766,6 +784,7 @@ func TestProcessBackupCompletions(t *testing.T) {
},
Status: velerov1api.BackupStatus{
Phase: velerov1api.BackupPhaseFailed,
FailureReason: "error checking if backup already exists in object storage: Backup already exists in object storage",
Version: 1,
FormatVersion: "1.1.0",
StartTimestamp: &timestamp,
Expand Down
21 changes: 17 additions & 4 deletions pkg/controller/restore_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,13 +144,12 @@ func NewRestoreController(
restore := obj.(*api.Restore)

switch restore.Status.Phase {
case "", api.RestorePhaseNew:
// only process new restores
case "", api.RestorePhaseNew, api.RestorePhaseInProgress:
default:
c.logger.WithFields(logrus.Fields{
"restore": kubeutil.NamespaceAndName(restore),
"phase": restore.Status.Phase,
}).Debug("Restore is not new, skipping")
}).Debug("Restore is not new or in-progress, skipping")
return
}

Expand Down Expand Up @@ -202,7 +201,21 @@ func (c *restoreController) processQueueItem(key string) error {
// is ("" | New)
switch restore.Status.Phase {
case "", api.RestorePhaseNew:
// only process new restores
case api.RestorePhaseInProgress:
// A restore may stay in-progress forever because of
// 1) the controller restarts during the processing of a restore
// 2) the restore with in-progress status isn't updated to completed or failed status successfully
// So we try to mark such restores as failed to avoid it
updated := restore.DeepCopy()
updated.Status.Phase = api.RestorePhaseFailed
updated.Status.FailureReason = fmt.Sprintf("got a Restore with unexpected status %q, this may be due to a restart of the controller during the restore, mark it as %q",
api.RestorePhaseInProgress, updated.Status.Phase)
_, err = patchRestore(restore, updated, c.restoreClient)
if err != nil {
return errors.Wrapf(err, "error updating Restore status to %s", updated.Status.Phase)
}
log.Warn(updated.Status.FailureReason)
return nil
default:
return nil
}
Expand Down
31 changes: 26 additions & 5 deletions pkg/controller/restore_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"bytes"
"context"
"encoding/json"
"fmt"
"io/ioutil"
"testing"
"time"
Expand Down Expand Up @@ -170,11 +171,6 @@ func TestProcessQueueItemSkips(t *testing.T) {
restoreKey: "foo/bar",
expectError: true,
},
{
name: "restore with phase InProgress does not get processed",
restoreKey: "foo/bar",
restore: builder.ForRestore("foo", "bar").Phase(velerov1api.RestorePhaseInProgress).Result(),
},
{
name: "restore with phase Completed does not get processed",
restoreKey: "foo/bar",
Expand Down Expand Up @@ -226,6 +222,31 @@ func TestProcessQueueItemSkips(t *testing.T) {
}
}

func TestMarkInProgressRestoreAsFailed(t *testing.T) {
var (
restore = builder.ForRestore("velero", "bar").Phase(velerov1api.RestorePhaseInProgress).Result()
client = fake.NewSimpleClientset(restore)
sharedInformers = informers.NewSharedInformerFactory(client, 0)
logger = velerotest.NewLogger()
)

c := restoreController{
genericController: newGenericController("restore-test", logger),
restoreClient: client.VeleroV1(),
restoreLister: sharedInformers.Velero().V1().Restores().Lister(),
}

err := sharedInformers.Velero().V1().Restores().Informer().GetStore().Add(restore)
require.Nil(t, err)

err = c.processQueueItem(fmt.Sprintf("%s/%s", restore.Namespace, restore.Name))
require.Nil(t, err)

res, err := c.restoreClient.Restores(restore.Namespace).Get(context.Background(), restore.Name, metav1.GetOptions{})
require.Nil(t, err)
assert.Equal(t, velerov1api.RestorePhaseFailed, res.Status.Phase)
}

func TestProcessQueueItem(t *testing.T) {

defaultStorageLocation := builder.ForBackupStorageLocation("velero", "default").Provider("myCloud").Bucket("bucket").Result()
Expand Down
2 changes: 2 additions & 0 deletions site/content/docs/main/api-types/backup.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,5 +144,7 @@ status:
warnings: 2
# Number of errors that were logged by the backup.
errors: 0
# An error that caused the entire backup to fail.
failureReason: ""

```