Skip to content

Commit

Permalink
Implement the Repo maintanence Job configuration design.
Browse files Browse the repository at this point in the history
Remove the resource parameters from the velero server CLI.

Signed-off-by: Xun Jiang <[email protected]>
  • Loading branch information
blackpiglet committed Aug 27, 2024
1 parent f5671c7 commit 690d330
Show file tree
Hide file tree
Showing 21 changed files with 690 additions and 293 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/8145-blackpiglet
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Implement the Repo maintenance Job configuration.
4 changes: 1 addition & 3 deletions design/repo_maintenance_job_config.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@ velero server \
### Structure
The data structure for ```repo-maintenance-job-config``` is as below:
```go
type MaintenanceConfigMap map[string]Configs

type Configs struct {
// LoadAffinity is the config for data path load affinity.
LoadAffinity []*LoadAffinity `json:"loadAffinity,omitempty"`
Expand Down Expand Up @@ -80,7 +78,7 @@ type Resources struct {
```

The ConfigMap content is a map.
If there is a key value as `global` in the map, the key's value is applied to all BackupRepositories maintenance jobs that don't their own specific configuration in the ConfigMap.
If there is a key value as `global` in the map, the key's value is applied to all BackupRepositories maintenance jobs that cannot find their own specific configuration in the ConfigMap.
The other keys in the map is the combination of three elements of a BackupRepository:
* The namespace in which BackupRepository backs up volume data
* The BackupRepository referenced BackupStorageLocation's name
Expand Down
8 changes: 0 additions & 8 deletions pkg/cmd/cli/install/install.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"strings"
"time"

"github.com/vmware-tanzu/velero/pkg/repository"
"github.com/vmware-tanzu/velero/pkg/uploader"

"github.com/pkg/errors"
Expand Down Expand Up @@ -85,7 +84,6 @@ type Options struct {
DefaultSnapshotMoveData bool
DisableInformerCache bool
ScheduleSkipImmediately bool
MaintenanceCfg repository.MaintenanceConfig
}

// BindFlags adds command line values to the options struct.
Expand Down Expand Up @@ -130,11 +128,6 @@ func (o *Options) BindFlags(flags *pflag.FlagSet) {
flags.BoolVar(&o.DefaultSnapshotMoveData, "default-snapshot-move-data", o.DefaultSnapshotMoveData, "Bool flag to configure Velero server to move data by default for all snapshots supporting data movement. Optional.")
flags.BoolVar(&o.DisableInformerCache, "disable-informer-cache", o.DisableInformerCache, "Disable informer cache for Get calls on restore. With this enabled, it will speed up restore in cases where there are backup resources which already exist in the cluster, but for very large clusters this will increase velero memory usage. Default is false (don't disable). Optional.")
flags.BoolVar(&o.ScheduleSkipImmediately, "schedule-skip-immediately", o.ScheduleSkipImmediately, "Skip the first scheduled backup immediately after creating a schedule. Default is false (don't skip).")
flags.IntVar(&o.MaintenanceCfg.KeepLatestMaitenanceJobs, "keep-latest-maintenance-jobs", o.MaintenanceCfg.KeepLatestMaitenanceJobs, "Number of latest maintenance jobs to keep each repository. Optional.")
flags.StringVar(&o.MaintenanceCfg.CPURequest, "maintenance-job-cpu-request", o.MaintenanceCfg.CPURequest, "CPU request for maintenance jobs. Default is no limit.")
flags.StringVar(&o.MaintenanceCfg.MemRequest, "maintenance-job-mem-request", o.MaintenanceCfg.MemRequest, "Memory request for maintenance jobs. Default is no limit.")
flags.StringVar(&o.MaintenanceCfg.CPULimit, "maintenance-job-cpu-limit", o.MaintenanceCfg.CPULimit, "CPU limit for maintenance jobs. Default is no limit.")
flags.StringVar(&o.MaintenanceCfg.MemLimit, "maintenance-job-mem-limit", o.MaintenanceCfg.MemLimit, "Memory limit for maintenance jobs. Default is no limit.")
}

// NewInstallOptions instantiates a new, default InstallOptions struct.
Expand Down Expand Up @@ -231,7 +224,6 @@ func (o *Options) AsVeleroOptions() (*install.VeleroOptions, error) {
DefaultSnapshotMoveData: o.DefaultSnapshotMoveData,
DisableInformerCache: o.DisableInformerCache,
ScheduleSkipImmediately: o.ScheduleSkipImmediately,
MaintenanceCfg: o.MaintenanceCfg,
}, nil
}

Expand Down
18 changes: 16 additions & 2 deletions pkg/cmd/cli/nodeagent/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/metrics"
"github.com/vmware-tanzu/velero/pkg/nodeagent"
"github.com/vmware-tanzu/velero/pkg/repository"
velerotypes "github.com/vmware-tanzu/velero/pkg/types"
"github.com/vmware-tanzu/velero/pkg/util/filesystem"
"github.com/vmware-tanzu/velero/pkg/util/logging"

Expand Down Expand Up @@ -292,7 +293,7 @@ func (s *nodeAgentServer) run() {
s.logger.WithError(err).Fatal("Unable to create the pod volume restore controller")
}

var loadAffinity *nodeagent.LoadAffinity
var loadAffinity *velerotypes.LoadAffinity

Check warning on line 296 in pkg/cmd/cli/nodeagent/server.go

View check run for this annotation

Codecov / codecov/patch

pkg/cmd/cli/nodeagent/server.go#L296

Added line #L296 was not covered by tests
if s.dataPathConfigs != nil && len(s.dataPathConfigs.LoadAffinity) > 0 {
loadAffinity = s.dataPathConfigs.LoadAffinity[0]
}
Expand All @@ -302,7 +303,20 @@ func (s *nodeAgentServer) run() {
backupPVCConfig = s.dataPathConfigs.BackupPVCConfig
}

dataUploadReconciler := controller.NewDataUploadReconciler(s.mgr.GetClient(), s.mgr, s.kubeClient, s.csiSnapshotClient.SnapshotV1(), s.dataPathMgr, loadAffinity, backupPVCConfig, clock.RealClock{}, s.nodeName, s.config.dataMoverPrepareTimeout, s.logger, s.metrics)
dataUploadReconciler := controller.NewDataUploadReconciler(
s.mgr.GetClient(),
s.mgr,
s.kubeClient,
s.csiSnapshotClient.SnapshotV1(),
s.dataPathMgr,
loadAffinity,
backupPVCConfig,
clock.RealClock{},
s.nodeName,
s.config.dataMoverPrepareTimeout,
s.logger,
s.metrics,
)

Check warning on line 319 in pkg/cmd/cli/nodeagent/server.go

View check run for this annotation

Codecov / codecov/patch

pkg/cmd/cli/nodeagent/server.go#L306-L319

Added lines #L306 - L319 were not covered by tests
if err = dataUploadReconciler.SetupWithManager(s.mgr); err != nil {
s.logger.WithError(err).Fatal("Unable to create the data upload controller")
}
Expand Down
50 changes: 34 additions & 16 deletions pkg/cmd/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ type serverConfig struct {
defaultSnapshotMoveData bool
disableInformerCache bool
scheduleSkipImmediately bool
maintenanceCfg repository.MaintenanceConfig
backukpRepoConfig string
backupRepoConfig string
repoMaintenanceJobConfig string
}

func NewCommand(f client.Factory) *cobra.Command {
Expand Down Expand Up @@ -172,9 +172,6 @@ func NewCommand(f client.Factory) *cobra.Command {
defaultSnapshotMoveData: false,
disableInformerCache: defaultDisableInformerCache,
scheduleSkipImmediately: false,
maintenanceCfg: repository.MaintenanceConfig{
KeepLatestMaitenanceJobs: repository.DefaultKeepLatestMaitenanceJobs,
},
}
)

Expand Down Expand Up @@ -248,17 +245,20 @@ func NewCommand(f client.Factory) *cobra.Command {
command.Flags().BoolVar(&config.defaultSnapshotMoveData, "default-snapshot-move-data", config.defaultSnapshotMoveData, "Move data by default for all snapshots supporting data movement.")
command.Flags().BoolVar(&config.disableInformerCache, "disable-informer-cache", config.disableInformerCache, "Disable informer cache for Get calls on restore. With this enabled, it will speed up restore in cases where there are backup resources which already exist in the cluster, but for very large clusters this will increase velero memory usage. Default is false (don't disable).")
command.Flags().BoolVar(&config.scheduleSkipImmediately, "schedule-skip-immediately", config.scheduleSkipImmediately, "Skip the first scheduled backup immediately after creating a schedule. Default is false (don't skip).")
command.Flags().IntVar(&config.maintenanceCfg.KeepLatestMaitenanceJobs, "keep-latest-maintenance-jobs", config.maintenanceCfg.KeepLatestMaitenanceJobs, "Number of latest maintenance jobs to keep each repository. Optional.")
command.Flags().StringVar(&config.maintenanceCfg.CPURequest, "maintenance-job-cpu-request", config.maintenanceCfg.CPURequest, "CPU request for maintenance job. Default is no limit.")
command.Flags().StringVar(&config.maintenanceCfg.MemRequest, "maintenance-job-mem-request", config.maintenanceCfg.MemRequest, "Memory request for maintenance job. Default is no limit.")
command.Flags().StringVar(&config.maintenanceCfg.CPULimit, "maintenance-job-cpu-limit", config.maintenanceCfg.CPULimit, "CPU limit for maintenance job. Default is no limit.")
command.Flags().StringVar(&config.maintenanceCfg.MemLimit, "maintenance-job-mem-limit", config.maintenanceCfg.MemLimit, "Memory limit for maintenance job. Default is no limit.")

command.Flags().StringVar(&config.backukpRepoConfig, "backup-repository-config", config.backukpRepoConfig, "The name of configMap containing backup repository configurations.")
command.Flags().StringVar(
&config.backupRepoConfig,
"backup-repository-config",
config.backupRepoConfig,
"The name of configMap containing backup repository configurations.",
)
command.Flags().StringVar(
&config.repoMaintenanceJobConfig,
"repo-maintenance-job-config",
config.repoMaintenanceJobConfig,
"The name of ConfigMap containing repository maintenance Job configurations.",
)

// maintenance job log setting inherited from velero server
config.maintenanceCfg.FormatFlag = config.formatFlag
config.maintenanceCfg.LogLevelFlag = logLevelFlag
return command
}

Expand Down Expand Up @@ -667,7 +667,18 @@ func (s *server) initRepoManager() error {
s.repoLocker = repository.NewRepoLocker()
s.repoEnsurer = repository.NewEnsurer(s.mgr.GetClient(), s.logger, s.config.resourceTimeout)

s.repoManager = repository.NewManager(s.namespace, s.mgr.GetClient(), s.repoLocker, s.repoEnsurer, s.credentialFileStore, s.credentialSecretStore, s.config.maintenanceCfg, s.logger)
s.repoManager = repository.NewManager(
s.namespace,
s.mgr.GetClient(),
s.repoLocker,
s.repoEnsurer,
s.credentialFileStore,
s.credentialSecretStore,
s.config.repoMaintenanceJobConfig,
s.logger,
s.logLevel,
s.config.formatFlag,
)

Check warning on line 681 in pkg/cmd/server/server.go

View check run for this annotation

Codecov / codecov/patch

pkg/cmd/server/server.go#L670-L681

Added lines #L670 - L681 were not covered by tests

return nil
}
Expand Down Expand Up @@ -881,7 +892,14 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
}

if _, ok := enabledRuntimeControllers[controller.BackupRepo]; ok {
if err := controller.NewBackupRepoReconciler(s.namespace, s.logger, s.mgr.GetClient(), s.config.repoMaintenanceFrequency, s.config.backukpRepoConfig, s.repoManager).SetupWithManager(s.mgr); err != nil {
if err := controller.NewBackupRepoReconciler(
s.namespace,
s.logger,
s.mgr.GetClient(),
s.config.repoMaintenanceFrequency,
s.config.backupRepoConfig,
s.repoManager,
).SetupWithManager(s.mgr); err != nil {

Check warning on line 902 in pkg/cmd/server/server.go

View check run for this annotation

Codecov / codecov/patch

pkg/cmd/server/server.go#L895-L902

Added lines #L895 - L902 were not covered by tests
s.logger.Fatal(err, "unable to create controller", "controller", controller.BackupRepo)
}
}
Expand Down
10 changes: 5 additions & 5 deletions pkg/controller/backup_repository_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,19 @@ type BackupRepoReconciler struct {
logger logrus.FieldLogger
clock clocks.WithTickerAndDelayedExecution
maintenanceFrequency time.Duration
backukpRepoConfig string
backupRepoConfig string
repositoryManager repository.Manager
}

func NewBackupRepoReconciler(namespace string, logger logrus.FieldLogger, client client.Client,
maintenanceFrequency time.Duration, backukpRepoConfig string, repositoryManager repository.Manager) *BackupRepoReconciler {
maintenanceFrequency time.Duration, backupRepoConfig string, repositoryManager repository.Manager) *BackupRepoReconciler {
c := &BackupRepoReconciler{
client,
namespace,
logger,
clocks.RealClock{},
maintenanceFrequency,
backukpRepoConfig,
backupRepoConfig,
repositoryManager,
}

Expand Down Expand Up @@ -229,7 +229,7 @@ func (r *BackupRepoReconciler) getIdentiferByBSL(ctx context.Context, req *veler
}

func (r *BackupRepoReconciler) initializeRepo(ctx context.Context, req *velerov1api.BackupRepository, log logrus.FieldLogger) error {
log.WithField("repoConfig", r.backukpRepoConfig).Info("Initializing backup repository")
log.WithField("repoConfig", r.backupRepoConfig).Info("Initializing backup repository")

// confirm the repo's BackupStorageLocation is valid
repoIdentifier, err := r.getIdentiferByBSL(ctx, req)
Expand All @@ -244,7 +244,7 @@ func (r *BackupRepoReconciler) initializeRepo(ctx context.Context, req *velerov1
})
}

config, err := getBackupRepositoryConfig(ctx, r, r.backukpRepoConfig, r.namespace, req.Name, req.Spec.RepositoryType, log)
config, err := getBackupRepositoryConfig(ctx, r, r.backupRepoConfig, r.namespace, req.Name, req.Spec.RepositoryType, log)
if err != nil {
log.WithError(err).Warn("Failed to get repo config, repo config is ignored")
} else if config != nil {
Expand Down
23 changes: 17 additions & 6 deletions pkg/controller/data_upload_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"fmt"
"time"

snapshotter "github.com/kubernetes-csi/external-snapshotter/client/v7/clientset/versioned/typed/volumesnapshot/v1"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
corev1 "k8s.io/api/core/v1"
Expand All @@ -39,8 +40,6 @@ import (
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

snapshotter "github.com/kubernetes-csi/external-snapshotter/client/v7/clientset/versioned/typed/volumesnapshot/v1"

"github.com/vmware-tanzu/velero/pkg/apis/velero/shared"
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
velerov2alpha1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v2alpha1"
Expand All @@ -49,6 +48,7 @@ import (
"github.com/vmware-tanzu/velero/pkg/exposer"
"github.com/vmware-tanzu/velero/pkg/metrics"
"github.com/vmware-tanzu/velero/pkg/nodeagent"
velerotypes "github.com/vmware-tanzu/velero/pkg/types"
"github.com/vmware-tanzu/velero/pkg/uploader"
"github.com/vmware-tanzu/velero/pkg/util/kube"
)
Expand All @@ -71,15 +71,26 @@ type DataUploadReconciler struct {
logger logrus.FieldLogger
snapshotExposerList map[velerov2alpha1api.SnapshotType]exposer.SnapshotExposer
dataPathMgr *datapath.Manager
loadAffinity *nodeagent.LoadAffinity
loadAffinity *velerotypes.LoadAffinity
backupPVCConfig map[string]nodeagent.BackupPVC
preparingTimeout time.Duration
metrics *metrics.ServerMetrics
}

func NewDataUploadReconciler(client client.Client, mgr manager.Manager, kubeClient kubernetes.Interface, csiSnapshotClient snapshotter.SnapshotV1Interface,
dataPathMgr *datapath.Manager, loadAffinity *nodeagent.LoadAffinity, backupPVCConfig map[string]nodeagent.BackupPVC, clock clocks.WithTickerAndDelayedExecution,
nodeName string, preparingTimeout time.Duration, log logrus.FieldLogger, metrics *metrics.ServerMetrics) *DataUploadReconciler {
func NewDataUploadReconciler(
client client.Client,
mgr manager.Manager,
kubeClient kubernetes.Interface,
csiSnapshotClient snapshotter.SnapshotV1Interface,
dataPathMgr *datapath.Manager,
loadAffinity *velerotypes.LoadAffinity,
backupPVCConfig map[string]nodeagent.BackupPVC,
clock clocks.WithTickerAndDelayedExecution,
nodeName string,
preparingTimeout time.Duration,
log logrus.FieldLogger,
metrics *metrics.ServerMetrics,
) *DataUploadReconciler {
return &DataUploadReconciler{
client: client,
mgr: mgr,
Expand Down
16 changes: 14 additions & 2 deletions pkg/controller/data_upload_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,20 @@ func initDataUploaderReconcilerWithError(needError ...error) (*DataUploadReconci
fakeSnapshotClient := snapshotFake.NewSimpleClientset(vsObject, vscObj)
fakeKubeClient := clientgofake.NewSimpleClientset(daemonSet)

return NewDataUploadReconciler(fakeClient, nil, fakeKubeClient, fakeSnapshotClient.SnapshotV1(), dataPathMgr, nil, map[string]nodeagent.BackupPVC{},
testclocks.NewFakeClock(now), "test-node", time.Minute*5, velerotest.NewLogger(), metrics.NewServerMetrics()), nil
return NewDataUploadReconciler(
fakeClient,
nil,
fakeKubeClient,
fakeSnapshotClient.SnapshotV1(),
dataPathMgr,
nil,
map[string]nodeagent.BackupPVC{},
testclocks.NewFakeClock(now),
"test-node",
time.Minute*5,
velerotest.NewLogger(),
metrics.NewServerMetrics(),
), nil
}

func dataUploadBuilder() *builder.DataUploadBuilder {
Expand Down
Loading

0 comments on commit 690d330

Please sign in to comment.