Skip to content

Commit

Permalink
Retry completion status patch for backup and restore resources
Browse files Browse the repository at this point in the history
Signed-off-by: Tiger Kaovilai <[email protected]>

update to design vmware-tanzu#8063

Signed-off-by: Tiger Kaovilai <[email protected]>
  • Loading branch information
kaovilai committed Sep 10, 2024
1 parent b92143d commit 40ac9ab
Show file tree
Hide file tree
Showing 15 changed files with 538 additions and 11 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/7845-kaovilai
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Retry backup/restore completion/finalizing status patching to unstuck inprogress backups/restores
4 changes: 4 additions & 0 deletions design/retry-patching-configuration_design.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ and from above non final phases to
- Completed
- PartiallyFailed

Once backup/restore is in some phase it will already be reconciled again periodically and do not need additional retry
- WaitingForPluginOperations
- WaitingForPluginOperationsPartiallyFailed

## Detailed Design
Relevant reconcilers will have `resourceTimeout time.Duration` added to its struct and to parameters of New[Backup|Restore]XReconciler functions.

Expand Down
27 changes: 27 additions & 0 deletions pkg/client/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ package client

import (
"context"
"math"
"time"

apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
kbclient "sigs.k8s.io/controller-runtime/pkg/client"
)
Expand All @@ -36,3 +39,27 @@ func CreateRetryGenerateName(client kbclient.Client, ctx context.Context, obj kb
return client.Create(ctx, obj, &kbclient.CreateOptions{})
}
}

// CapBackoff provides a backoff with a set backoff cap
func CapBackoff(cap time.Duration) wait.Backoff {
if cap < 0 {
cap = 0
}
return wait.Backoff{
Steps: math.MaxInt,
Duration: 10 * time.Millisecond,
Cap: cap,
Factor: retry.DefaultBackoff.Factor,
Jitter: retry.DefaultBackoff.Jitter,
}
}

// RetryOnRetriableMaxBackOff accepts a patch function param, retrying when the provided retriable function returns true.
func RetryOnRetriableMaxBackOff(maxDuration time.Duration, fn func() error, retriable func(error) bool) error {
return retry.OnError(CapBackoff(maxDuration), func(err error) bool { return retriable(err) }, fn)
}

// RetryOnErrorMaxBackOff accepts a patch function param, retrying when the error is not nil.
func RetryOnErrorMaxBackOff(maxDuration time.Duration, fn func() error) error {
return RetryOnRetriableMaxBackOff(maxDuration, fn, func(err error) bool { return err != nil })
}
2 changes: 2 additions & 0 deletions pkg/cmd/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
backupStoreGetter,
s.logger,
s.metrics,
s.config.resourceTimeout,
)
if err := r.SetupWithManager(s.mgr); err != nil {
s.logger.Fatal(err, "unable to create controller", "controller", constant.ControllerBackupFinalizer)
Expand Down Expand Up @@ -794,6 +795,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
s.config.DefaultItemOperationTimeout,
s.config.DisableInformerCache,
s.crClient,
s.config.resourceTimeout,
)

if err = r.SetupWithManager(s.mgr); err != nil {
Expand Down
16 changes: 13 additions & 3 deletions pkg/controller/backup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,10 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
request.Status.StartTimestamp = &metav1.Time{Time: b.clock.Now()}
}

// update status
// update status to
// BackupPhaseFailedValidation
// BackupPhaseInProgress
// if patch fail, backup can reconcile again as phase would still be "" or New
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "error updating Backup status to %s", request.Status.Phase)
}
Expand Down Expand Up @@ -305,8 +308,15 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
b.metrics.RegisterBackupLastStatus(backupScheduleName, metrics.BackupLastStatusFailure)
}
log.Info("Updating backup's final status")
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
log.WithError(err).Error("error updating backup's final status")
// Phases were updated in runBackup()
// This patch with retry update Phase from InProgress to
// BackupPhaseWaitingForPluginOperations -> backup_operations_controller.go will now reconcile
// BackupPhaseWaitingForPluginOperationsPartiallyFailed -> backup_operations_controller.go will now reconcile
// BackupPhaseFinalizing -> backup_finalizer_controller.go will now reconcile
// BackupPhaseFinalizingPartiallyFailed -> backup_finalizer_controller.go will now reconcile
// BackupPhaseFailed
if err := kubeutil.PatchResourceWithRetriesOnErrors(b.resourceTimeout, original, request.Backup, b.kbClient); err != nil {
log.WithError(err).Errorf("error updating backup's status from InProgress to %v", request.Backup.Status.Phase)
}
return ctrl.Result{}, nil
}
Expand Down
9 changes: 8 additions & 1 deletion pkg/controller/backup_finalizer_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"bytes"
"context"
"os"
"time"

"github.com/pkg/errors"
"github.com/sirupsen/logrus"
Expand All @@ -31,6 +32,7 @@ import (

velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
pkgbackup "github.com/vmware-tanzu/velero/pkg/backup"
"github.com/vmware-tanzu/velero/pkg/client"
"github.com/vmware-tanzu/velero/pkg/itemoperation"
"github.com/vmware-tanzu/velero/pkg/kuberesource"
"github.com/vmware-tanzu/velero/pkg/metrics"
Expand All @@ -51,6 +53,7 @@ type backupFinalizerReconciler struct {
metrics *metrics.ServerMetrics
backupStoreGetter persistence.ObjectBackupStoreGetter
log logrus.FieldLogger
resourceTimeout time.Duration
}

// NewBackupFinalizerReconciler initializes and returns backupFinalizerReconciler struct.
Expand All @@ -64,6 +67,7 @@ func NewBackupFinalizerReconciler(
backupStoreGetter persistence.ObjectBackupStoreGetter,
log logrus.FieldLogger,
metrics *metrics.ServerMetrics,
resourceTimeout time.Duration,
) *backupFinalizerReconciler {
return &backupFinalizerReconciler{
client: client,
Expand Down Expand Up @@ -119,7 +123,10 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
r.backupTracker.Delete(backup.Namespace, backup.Name)
}
// Always attempt to Patch the backup object and status after each reconciliation.
if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
//
// if this patch fails, there may not be another opportunity to update the backup object without external update event.
// so we retry
if err := client.RetriesPhasePatchFuncOnErrors(r.resourceTimeout, func() error { return r.client.Patch(ctx, backup, kbclient.MergeFrom(original)) }); err != nil {

Check failure on line 129 in pkg/controller/backup_finalizer_controller.go

View workflow job for this annotation

GitHub Actions / build

undefined: client.RetriesPhasePatchFuncOnErrors
log.WithError(err).Error("Error updating backup")
return
}
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/backup_finalizer_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ func mockBackupFinalizerReconciler(fakeClient kbclient.Client, fakeGlobalClient
NewFakeSingleObjectBackupStoreGetter(backupStore),
logrus.StandardLogger(),
metrics.NewServerMetrics(),
10*time.Minute,
), backupper
}
func TestBackupFinalizerReconcile(t *testing.T) {
Expand Down
7 changes: 5 additions & 2 deletions pkg/controller/restore_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ type restoreReconciler struct {
newPluginManager func(logger logrus.FieldLogger) clientmgmt.Manager
backupStoreGetter persistence.ObjectBackupStoreGetter
globalCrClient client.Client
resourceTimeout time.Duration
}

type backupInfo struct {
Expand All @@ -130,6 +131,7 @@ func NewRestoreReconciler(
defaultItemOperationTimeout time.Duration,
disableInformerCache bool,
globalCrClient client.Client,
resourceTimeout time.Duration,
) *restoreReconciler {
r := &restoreReconciler{
ctx: ctx,
Expand All @@ -149,7 +151,8 @@ func NewRestoreReconciler(
newPluginManager: newPluginManager,
backupStoreGetter: backupStoreGetter,

globalCrClient: globalCrClient,
globalCrClient: globalCrClient,
resourceTimeout: resourceTimeout,
}

// Move the periodical backup and restore metrics computing logic from controllers to here.
Expand Down Expand Up @@ -276,7 +279,7 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
}
log.Debug("Updating restore's final status")

if err = kubeutil.PatchResource(original, restore, r.kbClient); err != nil {
if err = kubeutil.PatchResourceWithRetriesOnErrors(r.resourceTimeout, original, restore, r.kbClient); err != nil {
log.WithError(errors.WithStack(err)).Info("Error updating restore's final status")
// No need to re-enqueue here, because restore's already set to InProgress before.
// Controller only handle New restore.
Expand Down
5 changes: 5 additions & 0 deletions pkg/controller/restore_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ func TestFetchBackupInfo(t *testing.T) {
60*time.Minute,
false,
fakeGlobalClient,
10*time.Minute,
)

if test.backupStoreError == nil {
Expand Down Expand Up @@ -196,6 +197,7 @@ func TestProcessQueueItemSkips(t *testing.T) {
60*time.Minute,
false,
fakeGlobalClient,
10*time.Minute,
)

_, err := r.Reconcile(context.Background(), ctrl.Request{NamespacedName: types.NamespacedName{
Expand Down Expand Up @@ -498,6 +500,7 @@ func TestRestoreReconcile(t *testing.T) {
60*time.Minute,
false,
fakeGlobalClient,
10*time.Minute,
)

r.clock = clocktesting.NewFakeClock(now)
Expand Down Expand Up @@ -681,6 +684,7 @@ func TestValidateAndCompleteWhenScheduleNameSpecified(t *testing.T) {
60*time.Minute,
false,
fakeGlobalClient,
10*time.Minute,
)

restore := &velerov1api.Restore{
Expand Down Expand Up @@ -776,6 +780,7 @@ func TestValidateAndCompleteWithResourceModifierSpecified(t *testing.T) {
60*time.Minute,
false,
fakeGlobalClient,
10*time.Minute,
)

restore := &velerov1api.Restore{
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/restore_finalizer_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ func (r *restoreFinalizerReconciler) finishProcessing(restorePhase velerov1api.R
}
restore.Status.CompletionTimestamp = &metav1.Time{Time: r.clock.Now()}

return kubeutil.PatchResource(original, restore, r.Client)
return kubeutil.PatchResourceWithRetriesOnErrors(r.resourceTimeout, original, restore, r.Client)
}

// finalizerContext includes all the dependencies required by finalization tasks and
Expand Down
79 changes: 75 additions & 4 deletions pkg/controller/restore_finalizer_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,19 @@ package controller
import (
"context"
"fmt"
"syscall"
"testing"
"time"

"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
corev1api "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
testclocks "k8s.io/utils/clock/testing"
ctrl "sigs.k8s.io/controller-runtime"

"github.com/stretchr/testify/mock"

corev1api "k8s.io/api/core/v1"
crclient "sigs.k8s.io/controller-runtime/pkg/client"

"github.com/vmware-tanzu/velero/internal/hook"
Expand All @@ -44,6 +43,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/plugin/clientmgmt"
pluginmocks "github.com/vmware-tanzu/velero/pkg/plugin/mocks"
velerotest "github.com/vmware-tanzu/velero/pkg/test"
pkgUtilKubeMocks "github.com/vmware-tanzu/velero/pkg/util/kube/mocks"
"github.com/vmware-tanzu/velero/pkg/util/results"
)

Expand Down Expand Up @@ -556,3 +556,74 @@ func TestWaitRestoreExecHook(t *testing.T) {
assert.Equal(t, tc.expectedHooksFailed, updated.Status.HookStatus.HooksFailed)
}
}

// test finishprocessing with mocks of kube client to simulate connection refused
func Test_restoreFinalizerReconciler_finishProcessing(t *testing.T) {
type args struct {
// mockClientActions simulate different client errors
mockClientActions func(*pkgUtilKubeMocks.Client)
// return bool indicating if the client method was called as expected
mockClientAsserts func(*pkgUtilKubeMocks.Client) bool
}
tests := []struct {
name string
args args
wantErr bool
}{
{
name: "restore failed to patch status, should retry on connection refused",
args: args{
mockClientActions: func(client *pkgUtilKubeMocks.Client) {
client.On("Patch", mock.Anything, mock.Anything, mock.Anything).Return(syscall.ECONNREFUSED).Once()
client.On("Patch", mock.Anything, mock.Anything, mock.Anything).Return(nil)
},
mockClientAsserts: func(client *pkgUtilKubeMocks.Client) bool {
return client.AssertNumberOfCalls(t, "Patch", 2)
},
},
},
{
name: "restore failed to patch status, retry on connection refused until max retries",
args: args{
mockClientActions: func(client *pkgUtilKubeMocks.Client) {
client.On("Patch", mock.Anything, mock.Anything, mock.Anything).Return(syscall.ECONNREFUSED)
},
mockClientAsserts: func(client *pkgUtilKubeMocks.Client) bool {
return len(client.Calls) > 3
},
},
wantErr: true,
},
{
name: "restore patch status ok, should not retry",
args: args{
mockClientActions: func(client *pkgUtilKubeMocks.Client) {
client.On("Patch", mock.Anything, mock.Anything, mock.Anything).Return(nil)
},
mockClientAsserts: func(client *pkgUtilKubeMocks.Client) bool {
return client.AssertNumberOfCalls(t, "Patch", 1)
},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
client := pkgUtilKubeMocks.NewClient(t)
// mock client actions
tt.args.mockClientActions(client)
r := &restoreFinalizerReconciler{
Client: client,
metrics: metrics.NewServerMetrics(),
clock: testclocks.NewFakeClock(time.Now()),
resourceTimeout: 1 * time.Second,
}
restore := builder.ForRestore(velerov1api.DefaultNamespace, "restoreName").Result()
if err := r.finishProcessing(velerov1api.RestorePhaseInProgress, restore, restore); (err != nil) != tt.wantErr {
t.Errorf("restoreFinalizerReconciler.finishProcessing() error = %v, wantErr %v", err, tt.wantErr)
}
if !tt.args.mockClientAsserts(client) {
t.Errorf("mockClientAsserts() failed")
}
})
}
}
2 changes: 2 additions & 0 deletions pkg/controller/restore_operations_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ func (r *restoreOperationsReconciler) Reconcile(ctx context.Context, req ctrl.Re
if err != nil {
log.Warnf("Cannot check progress on Restore operations because backup info is unavailable %s; marking restore FinalizingPartiallyFailed", err.Error())
restore.Status.Phase = velerov1api.RestorePhaseFinalizingPartiallyFailed
// Don't need to retry as Reconcile will be called again
err2 := r.updateRestoreAndOperationsJSON(ctx, original, restore, nil, &itemoperationmap.OperationsForRestore{ErrsSinceUpdate: []string{err.Error()}}, false, false)
if err2 != nil {
log.WithError(err2).Error("error updating Restore")
Expand Down Expand Up @@ -183,6 +184,7 @@ func (r *restoreOperationsReconciler) Reconcile(ctx context.Context, req ctrl.Re
restore.Status.Phase = velerov1api.RestorePhaseFinalizingPartiallyFailed
}
}
// No need to retry as Reconcile will be called again
err = r.updateRestoreAndOperationsJSON(ctx, original, restore, backupStore, operations, changes, completionChanges)
if err != nil {
return ctrl.Result{}, errors.Wrap(err, "error updating Restore")
Expand Down
19 changes: 19 additions & 0 deletions pkg/util/kube/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,31 @@ package kube

import (
"context"
"time"

"sigs.k8s.io/controller-runtime/pkg/client"

veleroPkgClient "github.com/vmware-tanzu/velero/pkg/client"
)

func PatchResource(original, updated client.Object, kbClient client.Client) error {
err := kbClient.Patch(context.Background(), updated, client.MergeFrom(original))

return err
}

// PatchResourceWithRetries patches the original resource with the updated resource, retrying when the provided retriable function returns true.
func PatchResourceWithRetries(maxDuration time.Duration, original, updated client.Object, kbClient client.Client, retriable func(error) bool) error {
return veleroPkgClient.RetryOnRetriableMaxBackOff(maxDuration, func() error { return PatchResource(original, updated, kbClient) }, retriable)
}

// PatchResourceWithRetriesOnErrors patches the original resource with the updated resource, retrying when the operation returns an error.
func PatchResourceWithRetriesOnErrors(maxDuration time.Duration, original, updated client.Object, kbClient client.Client) error {
return PatchResourceWithRetries(maxDuration, original, updated, kbClient, func(err error) bool {
// retry using DefaultBackoff to resolve connection refused error that may occur when the server is under heavy load
// TODO: consider using a more specific error type to retry, for now, we retry on all errors
// specific errors:
// - connection refused: https://pkg.go.dev/syscall#:~:text=Errno(0x67)-,ECONNREFUSED,-%3D%20Errno(0x6f
return err != nil
})
}
Loading

0 comments on commit 40ac9ab

Please sign in to comment.