-
Notifications
You must be signed in to change notification settings - Fork 37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CAPT machine power management with Rufio BMCJobs #182
Changes from all commits
11865c2
bffdee2
d2fa712
6e03b90
53666cc
c4c0475
d50b060
e91568d
ebb93c3
038e787
e56542b
222839a
e0415eb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,3 +45,4 @@ jobs: | |
with: | ||
check_filenames: true | ||
check_hidden: true | ||
skip: './.git,./go.mod,./go.sum' |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ import ( | |
"github.com/go-logr/logr" | ||
corev1 "k8s.io/api/core/v1" | ||
apierrors "k8s.io/apimachinery/pkg/api/errors" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/apimachinery/pkg/types" | ||
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" | ||
"sigs.k8s.io/cluster-api/util" | ||
|
@@ -31,6 +32,7 @@ import ( | |
"sigs.k8s.io/controller-runtime/pkg/client" | ||
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" | ||
|
||
rufiov1 "github.com/tinkerbell/rufio/api/v1alpha1" | ||
tinkv1 "github.com/tinkerbell/tink/pkg/apis/core/v1alpha1" | ||
|
||
infrastructurev1 "github.com/tinkerbell/cluster-api-provider-tinkerbell/api/v1beta1" | ||
|
@@ -139,18 +141,7 @@ func (bmrc *baseMachineReconcileContext) MachineScheduledForDeletion() bool { | |
return !bmrc.tinkerbellMachine.ObjectMeta.DeletionTimestamp.IsZero() | ||
} | ||
|
||
func (bmrc *baseMachineReconcileContext) releaseHardware() error { | ||
hardware := &tinkv1.Hardware{} | ||
|
||
namespacedName := types.NamespacedName{ | ||
Name: bmrc.tinkerbellMachine.Spec.HardwareName, | ||
Namespace: bmrc.tinkerbellMachine.Namespace, | ||
} | ||
|
||
if err := bmrc.client.Get(bmrc.ctx, namespacedName, hardware); err != nil { | ||
return fmt.Errorf("getting hardware: %w", err) | ||
} | ||
|
||
func (bmrc *baseMachineReconcileContext) releaseHardware(hardware *tinkv1.Hardware) error { | ||
patchHelper, err := patch.NewHelper(hardware, bmrc.client) | ||
if err != nil { | ||
return fmt.Errorf("initializing patch helper for selected hardware: %w", err) | ||
|
@@ -174,10 +165,103 @@ func (bmrc *baseMachineReconcileContext) releaseHardware() error { | |
return nil | ||
} | ||
|
||
func (bmrc *baseMachineReconcileContext) getHardwareForMachine(hardware *tinkv1.Hardware) error { | ||
namespacedName := types.NamespacedName{ | ||
Name: bmrc.tinkerbellMachine.Spec.HardwareName, | ||
Namespace: bmrc.tinkerbellMachine.Namespace, | ||
} | ||
|
||
if err := bmrc.client.Get(bmrc.ctx, namespacedName, hardware); err != nil { | ||
return fmt.Errorf("getting hardware: %w", err) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// createPowerOffJob creates a BMCJob object with the required tasks for hardware power off. | ||
func (bmrc *baseMachineReconcileContext) createPowerOffJob(hardware *tinkv1.Hardware) error { | ||
powerOffAction := rufiov1.HardPowerOff | ||
controller := true | ||
bmcJob := &rufiov1.BMCJob{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Name: fmt.Sprintf("%s-poweroff", bmrc.tinkerbellMachine.Name), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. having a consistent name seems good so you don't get two running jobs conflicting, but this is never cleaned up. I don't think you need to do that necessarily in this PR, but can you at least open a GH issue stating that it needs to be thought through? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure! Will do, thanks. |
||
Namespace: bmrc.tinkerbellMachine.Namespace, | ||
OwnerReferences: []metav1.OwnerReference{ | ||
{ | ||
APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", | ||
Kind: "TinkerbellMachine", | ||
Name: bmrc.tinkerbellMachine.Name, | ||
UID: bmrc.tinkerbellMachine.ObjectMeta.UID, | ||
Controller: &controller, | ||
}, | ||
}, | ||
}, | ||
Spec: rufiov1.BMCJobSpec{ | ||
BaseboardManagementRef: rufiov1.BaseboardManagementRef{ | ||
Name: hardware.Spec.BMCRef.Name, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we check if a BMC object with the ref exists? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually there’s a check for this on Rufio controller and job would not go to We could add one here as well but currently I have limited the scope of CAPT to only deal with only BMCJob objects and Rufio would deal with the rest. |
||
Namespace: bmrc.tinkerbellMachine.Namespace, | ||
}, | ||
Tasks: []rufiov1.Task{ | ||
{ | ||
PowerAction: &powerOffAction, | ||
}, | ||
}, | ||
}, | ||
} | ||
|
||
if err := bmrc.client.Create(bmrc.ctx, bmcJob); err != nil { | ||
return fmt.Errorf("creating BMCJob: %w", err) | ||
} | ||
|
||
bmrc.log.Info("Created BMCJob to power off hardware", | ||
"Name", bmcJob.Name, | ||
"Namespace", bmcJob.Namespace) | ||
|
||
return nil | ||
} | ||
|
||
// getBMCJob fetches the BMCJob with name JName. | ||
func (bmrc *baseMachineReconcileContext) getBMCJob(jName string, bmj *rufiov1.BMCJob) error { | ||
namespacedName := types.NamespacedName{ | ||
Name: jName, | ||
Namespace: bmrc.tinkerbellMachine.Namespace, | ||
} | ||
|
||
if err := bmrc.client.Get(bmrc.ctx, namespacedName, bmj); err != nil { | ||
return fmt.Errorf("GET BMCJob: %w", err) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// DeleteMachineWithDependencies removes template and workflow objects associated with given machine. | ||
func (bmrc *baseMachineReconcileContext) DeleteMachineWithDependencies() error { | ||
bmrc.log.Info("Removing machine", "hardwareName", bmrc.tinkerbellMachine.Spec.HardwareName) | ||
// Fetch hardware for the machine. | ||
hardware := &tinkv1.Hardware{} | ||
if err := bmrc.getHardwareForMachine(hardware); err != nil { | ||
return err | ||
} | ||
|
||
if err := bmrc.removeDependencies(hardware); err != nil { | ||
return err | ||
} | ||
|
||
// The hardware BMCRef is nil. | ||
// Remove finalizers and let machine object delete. | ||
if hardware.Spec.BMCRef == nil { | ||
bmrc.log.Info("Hardware BMC reference not present; skipping hardware power off", | ||
"BMCRef", hardware.Spec.BMCRef, "Hardware", hardware.Name) | ||
|
||
return bmrc.removeFinalizer() | ||
} | ||
|
||
return bmrc.ensureBMCJobCompletionForDelete(hardware) | ||
} | ||
|
||
// removeDependencies removes the Template, Workflow linked to the machine. | ||
// Deletes the machine hardware labels for the machine. | ||
func (bmrc *baseMachineReconcileContext) removeDependencies(hardware *tinkv1.Hardware) error { | ||
if err := bmrc.removeTemplate(); err != nil { | ||
return fmt.Errorf("removing Template: %w", err) | ||
} | ||
|
@@ -186,17 +270,50 @@ func (bmrc *baseMachineReconcileContext) DeleteMachineWithDependencies() error { | |
return fmt.Errorf("removing Workflow: %w", err) | ||
} | ||
|
||
if err := bmrc.releaseHardware(); err != nil { | ||
if err := bmrc.releaseHardware(hardware); err != nil { | ||
return fmt.Errorf("releasing Hardware: %w", err) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func (bmrc *baseMachineReconcileContext) removeFinalizer() error { | ||
controllerutil.RemoveFinalizer(bmrc.tinkerbellMachine, infrastructurev1.MachineFinalizer) | ||
|
||
bmrc.log.Info("Patching Machine object to remove finalizer") | ||
|
||
return bmrc.patch() | ||
} | ||
|
||
// ensureBMCJobCompletionForDelete ensures the machine power off BMCJob is completed. | ||
// Removes the machint finalizer to let machine delete. | ||
func (bmrc *baseMachineReconcileContext) ensureBMCJobCompletionForDelete(hardware *tinkv1.Hardware) error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the 'ForDelete` part of this name gives me pause. With this in the name i should be able to infer that this method has some behavior around the deletion of a machine (tinkerbellMachine?). I dont see anything in the code that touches machine deletion. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh actually the |
||
// Fetch a poweroff BMCJob for the machine. | ||
// If Job not found, we remove dependencies and create job. | ||
bmcJob := &rufiov1.BMCJob{} | ||
jobName := fmt.Sprintf("%s-poweroff", bmrc.tinkerbellMachine.Name) | ||
|
||
err := bmrc.getBMCJob(jobName, bmcJob) | ||
if err != nil { | ||
if apierrors.IsNotFound(err) { | ||
return bmrc.createPowerOffJob(hardware) | ||
} | ||
|
||
return fmt.Errorf("get bmc job for machine: %w", err) | ||
} | ||
|
||
// Check the Job conditions to ensure the power off job is complete. | ||
if bmcJob.HasCondition(rufiov1.JobCompleted, rufiov1.ConditionTrue) { | ||
return bmrc.removeFinalizer() | ||
} | ||
|
||
if bmcJob.HasCondition(rufiov1.JobFailed, rufiov1.ConditionTrue) { | ||
return fmt.Errorf("bmc job %s/%s failed", bmcJob.Namespace, bmcJob.Name) // nolint:goerr113 | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// IntoMachineReconcileContext implements BaseMachineReconcileContext by building MachineReconcileContext | ||
// from existing fields. | ||
func (bmrc *baseMachineReconcileContext) IntoMachineReconcileContext() (ReconcileContext, error) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,6 +38,7 @@ import ( | |
"sigs.k8s.io/cluster-api/util/patch" | ||
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" | ||
|
||
rufiov1 "github.com/tinkerbell/rufio/api/v1alpha1" | ||
tinkv1 "github.com/tinkerbell/tink/pkg/apis/core/v1alpha1" | ||
|
||
infrastructurev1 "github.com/tinkerbell/cluster-api-provider-tinkerbell/api/v1beta1" | ||
|
@@ -139,9 +140,17 @@ func (mrc *machineReconcileContext) Reconcile() error { | |
return fmt.Errorf("failed to ensure hardware: %w", err) | ||
} | ||
|
||
return mrc.reconcile(hw) | ||
} | ||
|
||
func (mrc *machineReconcileContext) reconcile(hw *tinkv1.Hardware) error { | ||
if !isHardwareReady(hw) { | ||
wf, err := mrc.ensureTemplateAndWorkflow(hw) | ||
|
||
if ensureJobErr := mrc.ensureHardwareProvisionJob(hw); ensureJobErr != nil { | ||
return fmt.Errorf("failed to ensure hardware ready for provisioning: %w", ensureJobErr) | ||
} | ||
|
||
switch { | ||
case errors.Is(err, &errRequeueRequested{}): | ||
return nil | ||
|
@@ -150,6 +159,7 @@ func (mrc *machineReconcileContext) Reconcile() error { | |
} | ||
|
||
s := wf.GetCurrentActionState() | ||
|
||
if s == tinkv1.WorkflowStateFailed || s == tinkv1.WorkflowStateTimeout { | ||
return errWorkflowFailed | ||
} | ||
|
@@ -163,6 +173,8 @@ func (mrc *machineReconcileContext) Reconcile() error { | |
} | ||
} | ||
|
||
mrc.log.Info("Marking TinkerbellMachine as Ready") | ||
|
||
mrc.tinkerbellMachine.Status.Ready = true | ||
|
||
return nil | ||
|
@@ -527,6 +539,104 @@ func byHardwareAffinity(hardware []tinkv1.Hardware, preferred []infrastructurev1 | |
}, nil | ||
} | ||
|
||
// ensureHardwareProvisionJob ensures the hardware is ready to be provisioned. | ||
// Uses the BMCRef from the hardware to create a BMCJob. | ||
// The BMCJob is responsible for getting the machine to desired state for provisioning. | ||
// If a BMCJob is already found and has failed, we error. | ||
func (mrc *machineReconcileContext) ensureHardwareProvisionJob(hardware *tinkv1.Hardware) error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. curious why the use of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I intended it to be like "Job that gets hardware ready for provisioning" since it does multiple actions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The challenge i see is that a |
||
if hardware.Spec.BMCRef == nil { | ||
mrc.log.Info("Hardware BMC reference not present; skipping BMCJob creation", | ||
"BMCRef", hardware.Spec.BMCRef, "Hardware", hardware.Name) | ||
|
||
return nil | ||
} | ||
|
||
bmcJob := &rufiov1.BMCJob{} | ||
jobName := fmt.Sprintf("%s-provision", mrc.tinkerbellMachine.Name) | ||
|
||
err := mrc.getBMCJob(jobName, bmcJob) | ||
if err != nil { | ||
if apierrors.IsNotFound(err) { | ||
// Create a BMCJob for hardware provisioning | ||
return mrc.createHardwareProvisionJob(hardware, jobName) | ||
} | ||
|
||
return err | ||
} | ||
|
||
if bmcJob.HasCondition(rufiov1.JobFailed, rufiov1.ConditionTrue) { | ||
return fmt.Errorf("bmc job %s/%s failed", bmcJob.Namespace, bmcJob.Name) // nolint:goerr113 | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// getBMCJob fetches the BMCJob with name JName. | ||
func (mrc *machineReconcileContext) getBMCJob(jName string, bmj *rufiov1.BMCJob) error { | ||
namespacedName := types.NamespacedName{ | ||
Name: jName, | ||
Namespace: mrc.tinkerbellMachine.Namespace, | ||
} | ||
|
||
if err := mrc.client.Get(mrc.ctx, namespacedName, bmj); err != nil { | ||
return fmt.Errorf("GET BMCJob: %w", err) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// createHardwareProvisionJob creates a BMCJob object with the required tasks for hardware provisioning. | ||
func (mrc *machineReconcileContext) createHardwareProvisionJob(hardware *tinkv1.Hardware, name string) error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This file is already very large. This functionality feels like a good opportunity to create a new file in this package just for bmc/rufio functionality. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was initially going to do that. But i thought we could benefit from a wider refactoring where we move and abstract some of the logic for creating Templates, Workflows and Jobs. And even address reducing the duplication of some of these methods. Is it alright to work on it on followup PRs so we can move forward with the functionality? :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure thing, just wanted to be sure the trade off of growing the tech debt and needed refactor work was brought up. |
||
powerOffAction := rufiov1.HardPowerOff | ||
powerOnAction := rufiov1.PowerOn | ||
bmcJob := &rufiov1.BMCJob{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Name: name, | ||
Namespace: mrc.tinkerbellMachine.Namespace, | ||
OwnerReferences: []metav1.OwnerReference{ | ||
{ | ||
APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", | ||
Kind: "TinkerbellMachine", | ||
Name: mrc.tinkerbellMachine.Name, | ||
UID: mrc.tinkerbellMachine.ObjectMeta.UID, | ||
}, | ||
}, | ||
}, | ||
Spec: rufiov1.BMCJobSpec{ | ||
BaseboardManagementRef: rufiov1.BaseboardManagementRef{ | ||
Name: hardware.Spec.BMCRef.Name, | ||
Namespace: mrc.tinkerbellMachine.Namespace, | ||
}, | ||
Tasks: []rufiov1.Task{ | ||
{ | ||
PowerAction: &powerOffAction, | ||
}, | ||
{ | ||
OneTimeBootDeviceAction: &rufiov1.OneTimeBootDeviceAction{ | ||
Devices: []rufiov1.BootDevice{ | ||
rufiov1.PXE, | ||
}, | ||
EFIBoot: hardware.Spec.Interfaces[0].DHCP.UEFI, | ||
}, | ||
}, | ||
{ | ||
PowerAction: &powerOnAction, | ||
}, | ||
}, | ||
}, | ||
} | ||
|
||
if err := mrc.client.Create(mrc.ctx, bmcJob); err != nil { | ||
return fmt.Errorf("creating BMCJob: %w", err) | ||
} | ||
|
||
mrc.log.Info("Created BMCJob to get hardware ready for provisioning", | ||
"Name", bmcJob.Name, | ||
"Namespace", bmcJob.Namespace) | ||
|
||
return nil | ||
} | ||
|
||
func (mrc *machineReconcileContext) getWorkflow() (*tinkv1.Workflow, error) { | ||
namespacedName := types.NamespacedName{ | ||
Name: mrc.tinkerbellMachine.Name, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we de-duplicate some of this construction and perhaps have a separate function set or builder.
This lets us re-use that construction logic elsewhere in the program like in
ensureHardware
.