CAPT machine power management with Rufio BMCJobs #182

Merged · 13 commits · Jun 15, 2022
1 change: 1 addition & 0 deletions .github/workflows/ci.yaml
@@ -45,3 +45,4 @@ jobs:
with:
check_filenames: true
check_hidden: true
skip: './.git','./go.mod','./go.sum'
2 changes: 1 addition & 1 deletion Dockerfile
@@ -13,7 +13,7 @@
# limitations under the License.

# Build the manager binary
ARG GOVER=1.16.8
ARG GOVER=1.17
FROM golang:${GOVER} as builder

WORKDIR /workspace
9 changes: 9 additions & 0 deletions config/rbac/role.yaml
@@ -13,6 +13,15 @@ rules:
- get
- list
- watch
- apiGroups:
- bmc.tinkerbell.org
resources:
- bmcjobs
verbs:
- create
- get
- list
- watch
- apiGroups:
- cluster.x-k8s.io
resources:
143 changes: 130 additions & 13 deletions controllers/base.go
@@ -23,6 +23,7 @@ import (
"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/util"
@@ -31,6 +32,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

rufiov1 "github.com/tinkerbell/rufio/api/v1alpha1"
tinkv1 "github.com/tinkerbell/tink/pkg/apis/core/v1alpha1"

infrastructurev1 "github.com/tinkerbell/cluster-api-provider-tinkerbell/api/v1beta1"
@@ -139,18 +141,7 @@ func (bmrc *baseMachineReconcileContext) MachineScheduledForDeletion() bool {
return !bmrc.tinkerbellMachine.ObjectMeta.DeletionTimestamp.IsZero()
}

func (bmrc *baseMachineReconcileContext) releaseHardware() error {
hardware := &tinkv1.Hardware{}

namespacedName := types.NamespacedName{
Name: bmrc.tinkerbellMachine.Spec.HardwareName,
Namespace: bmrc.tinkerbellMachine.Namespace,
}

if err := bmrc.client.Get(bmrc.ctx, namespacedName, hardware); err != nil {
return fmt.Errorf("getting hardware: %w", err)
}

func (bmrc *baseMachineReconcileContext) releaseHardware(hardware *tinkv1.Hardware) error {
patchHelper, err := patch.NewHelper(hardware, bmrc.client)
if err != nil {
return fmt.Errorf("initializing patch helper for selected hardware: %w", err)
@@ -174,10 +165,103 @@ func (bmrc *baseMachineReconcileContext) releaseHardware() error {
return nil
}

func (bmrc *baseMachineReconcileContext) getHardwareForMachine(hardware *tinkv1.Hardware) error {
namespacedName := types.NamespacedName{
Name: bmrc.tinkerbellMachine.Spec.HardwareName,
Namespace: bmrc.tinkerbellMachine.Namespace,
}

if err := bmrc.client.Get(bmrc.ctx, namespacedName, hardware); err != nil {
return fmt.Errorf("getting hardware: %w", err)
}

return nil
}

// createPowerOffJob creates a BMCJob object with the required tasks for hardware power off.
func (bmrc *baseMachineReconcileContext) createPowerOffJob(hardware *tinkv1.Hardware) error {
powerOffAction := rufiov1.HardPowerOff
controller := true
bmcJob := &rufiov1.BMCJob{
chrisdoherty4 (Member) · Jun 2, 2022:
Can we de-duplicate some of this construction and perhaps have a separate function set or builder?

func (bmrc *baseMachineReconcileContext) createPowerOffJob(hardware ...) error {
  job := newBMCJobBuilder(bmrc.tinkerbellMachine, hardware)
  job.WithTask(rufiov1.Task{
    PowerAction: &powerOffAction,
  })

  if err := bmrc.client.Create(bmrc.ctx, job.Build()); err != nil {
    ...
  }
}

This lets us re-use that construction logic elsewhere in the program like in ensureHardware.
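A minimal sketch of what that builder could look like (hypothetical: bmcJobBuilder, newBMCJobBuilder, WithTask, and Build are not part of this PR; the field wiring simply mirrors the construction in this diff):

// bmcJobBuilder collects the inputs for a rufiov1.BMCJob so the
// ObjectMeta/OwnerReference boilerplate lives in one place.
type bmcJobBuilder struct {
  machine  *infrastructurev1.TinkerbellMachine
  hardware *tinkv1.Hardware
  suffix   string
  tasks    []rufiov1.Task
}

func newBMCJobBuilder(machine *infrastructurev1.TinkerbellMachine, hardware *tinkv1.Hardware, suffix string) *bmcJobBuilder {
  return &bmcJobBuilder{machine: machine, hardware: hardware, suffix: suffix}
}

// WithTask appends a task; tasks run in the order they were added.
func (b *bmcJobBuilder) WithTask(task rufiov1.Task) *bmcJobBuilder {
  b.tasks = append(b.tasks, task)
  return b
}

// Build assembles the BMCJob, owned by the TinkerbellMachine so it is
// garbage-collected along with it.
func (b *bmcJobBuilder) Build() *rufiov1.BMCJob {
  controller := true

  return &rufiov1.BMCJob{
    ObjectMeta: metav1.ObjectMeta{
      Name:      fmt.Sprintf("%s-%s", b.machine.Name, b.suffix),
      Namespace: b.machine.Namespace,
      OwnerReferences: []metav1.OwnerReference{{
        APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1",
        Kind:       "TinkerbellMachine",
        Name:       b.machine.Name,
        UID:        b.machine.ObjectMeta.UID,
        Controller: &controller,
      }},
    },
    Spec: rufiov1.BMCJobSpec{
      BaseboardManagementRef: rufiov1.BaseboardManagementRef{
        Name:      b.hardware.Spec.BMCRef.Name,
        Namespace: b.machine.Namespace,
      },
      Tasks: b.tasks,
    },
  }
}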

ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("%s-poweroff", bmrc.tinkerbellMachine.Name),
Contributor:
Having a consistent name seems good so you don't get two running jobs conflicting, but this job is never cleaned up. I don't think you necessarily need to do that in this PR, but can you at least open a GH issue stating that it needs to be thought through?

pokearu (Contributor, Author) · Jun 15, 2022:
Sure! Will do, thanks.
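For illustration only, one shape that cleanup could take in ensureBMCJobCompletionForDelete, deleting the completed job before the finalizer is removed (an assumption, not part of this PR):

// Hypothetical cleanup: delete the completed power-off job so a future
// machine with the same name cannot collide with a stale job.
if bmcJob.HasCondition(rufiov1.JobCompleted, rufiov1.ConditionTrue) {
  if err := bmrc.client.Delete(bmrc.ctx, bmcJob); err != nil && !apierrors.IsNotFound(err) {
    return fmt.Errorf("deleting completed BMCJob: %w", err)
  }

  return bmrc.removeFinalizer()
}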

Namespace: bmrc.tinkerbellMachine.Namespace,
OwnerReferences: []metav1.OwnerReference{
{
APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1",
Kind: "TinkerbellMachine",
Name: bmrc.tinkerbellMachine.Name,
UID: bmrc.tinkerbellMachine.ObjectMeta.UID,
Controller: &controller,
},
},
},
Spec: rufiov1.BMCJobSpec{
BaseboardManagementRef: rufiov1.BaseboardManagementRef{
Name: hardware.Spec.BMCRef.Name,
Contributor:

Should we check if a BMC object with the ref exists?

pokearu (Contributor, Author):

Actually, there's a check for this in the Rufio controller, and the job would not reach the Running condition unless the ref was found.

We could add one here as well, but currently I have limited the scope of CAPT to deal only with BMCJob objects; Rufio handles the rest.
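If CAPT did want to fail fast here, the check would be a straightforward Get on the referenced object before creating the job (a sketch, assuming Rufio's v1alpha1 API exposes a BaseboardManagement type matching BaseboardManagementRef; not part of this PR):

// Hypothetical pre-check: confirm the referenced BMC object exists instead
// of relying on Rufio to surface the missing reference later.
bm := &rufiov1.BaseboardManagement{}
key := types.NamespacedName{
  Name:      hardware.Spec.BMCRef.Name,
  Namespace: bmrc.tinkerbellMachine.Namespace,
}

if err := bmrc.client.Get(bmrc.ctx, key, bm); err != nil {
  return fmt.Errorf("getting BaseboardManagement %s: %w", key, err)
}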

Namespace: bmrc.tinkerbellMachine.Namespace,
},
Tasks: []rufiov1.Task{
{
PowerAction: &powerOffAction,
},
},
},
}

if err := bmrc.client.Create(bmrc.ctx, bmcJob); err != nil {
return fmt.Errorf("creating BMCJob: %w", err)
}

bmrc.log.Info("Created BMCJob to power off hardware",
"Name", bmcJob.Name,
"Namespace", bmcJob.Namespace)

return nil
}

// getBMCJob fetches the BMCJob with name jName.
func (bmrc *baseMachineReconcileContext) getBMCJob(jName string, bmj *rufiov1.BMCJob) error {
namespacedName := types.NamespacedName{
Name: jName,
Namespace: bmrc.tinkerbellMachine.Namespace,
}

if err := bmrc.client.Get(bmrc.ctx, namespacedName, bmj); err != nil {
return fmt.Errorf("GET BMCJob: %w", err)
}

return nil
}

// DeleteMachineWithDependencies removes template and workflow objects associated with given machine.
func (bmrc *baseMachineReconcileContext) DeleteMachineWithDependencies() error {
bmrc.log.Info("Removing machine", "hardwareName", bmrc.tinkerbellMachine.Spec.HardwareName)
// Fetch hardware for the machine.
hardware := &tinkv1.Hardware{}
if err := bmrc.getHardwareForMachine(hardware); err != nil {
return err
}

if err := bmrc.removeDependencies(hardware); err != nil {
return err
}

// The hardware BMCRef is nil.
// Remove finalizers and let machine object delete.
if hardware.Spec.BMCRef == nil {
bmrc.log.Info("Hardware BMC reference not present; skipping hardware power off",
"BMCRef", hardware.Spec.BMCRef, "Hardware", hardware.Name)

return bmrc.removeFinalizer()
}

return bmrc.ensureBMCJobCompletionForDelete(hardware)
}

// removeDependencies removes the Template and Workflow linked to the machine.
// It also deletes the hardware labels for the machine.
func (bmrc *baseMachineReconcileContext) removeDependencies(hardware *tinkv1.Hardware) error {
if err := bmrc.removeTemplate(); err != nil {
return fmt.Errorf("removing Template: %w", err)
}
@@ -186,17 +270,50 @@ func (bmrc *baseMachineReconcileContext) DeleteMachineWithDependencies() error {
return fmt.Errorf("removing Workflow: %w", err)
}

if err := bmrc.releaseHardware(); err != nil {
if err := bmrc.releaseHardware(hardware); err != nil {
return fmt.Errorf("releasing Hardware: %w", err)
}

return nil
}

func (bmrc *baseMachineReconcileContext) removeFinalizer() error {
controllerutil.RemoveFinalizer(bmrc.tinkerbellMachine, infrastructurev1.MachineFinalizer)

bmrc.log.Info("Patching Machine object to remove finalizer")

return bmrc.patch()
}

// ensureBMCJobCompletionForDelete ensures the machine power off BMCJob is completed.
// Removes the machine finalizer to let the machine delete.
func (bmrc *baseMachineReconcileContext) ensureBMCJobCompletionForDelete(hardware *tinkv1.Hardware) error {
Member:
The `ForDelete` part of this name gives me pause. With this in the name I should be able to infer that this method has some behavior around the deletion of a machine (tinkerbellMachine?). I don't see anything in the code that touches machine deletion.

pokearu (Contributor, Author):
Oh, actually the `return bmrc.removeFinalizer()` is what allows the TinkerbellMachine objects to get deleted.
That's why I had it in the name.
We check job completion and then remove the finalizer.

// Fetch the power off BMCJob for the machine.
// If the job is not found, create one.
bmcJob := &rufiov1.BMCJob{}
jobName := fmt.Sprintf("%s-poweroff", bmrc.tinkerbellMachine.Name)

err := bmrc.getBMCJob(jobName, bmcJob)
if err != nil {
if apierrors.IsNotFound(err) {
return bmrc.createPowerOffJob(hardware)
}

return fmt.Errorf("get bmc job for machine: %w", err)
}

// Check the Job conditions to ensure the power off job is complete.
if bmcJob.HasCondition(rufiov1.JobCompleted, rufiov1.ConditionTrue) {
return bmrc.removeFinalizer()
}

if bmcJob.HasCondition(rufiov1.JobFailed, rufiov1.ConditionTrue) {
return fmt.Errorf("bmc job %s/%s failed", bmcJob.Namespace, bmcJob.Name) // nolint:goerr113
}

return nil
}

// IntoMachineReconcileContext implements BaseMachineReconcileContext by building MachineReconcileContext
// from existing fields.
func (bmrc *baseMachineReconcileContext) IntoMachineReconcileContext() (ReconcileContext, error) {
110 changes: 110 additions & 0 deletions controllers/machine.go
@@ -38,6 +38,7 @@ import (
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

rufiov1 "github.com/tinkerbell/rufio/api/v1alpha1"
tinkv1 "github.com/tinkerbell/tink/pkg/apis/core/v1alpha1"

infrastructurev1 "github.com/tinkerbell/cluster-api-provider-tinkerbell/api/v1beta1"
@@ -139,9 +140,17 @@ func (mrc *machineReconcileContext) Reconcile() error {
return fmt.Errorf("failed to ensure hardware: %w", err)
}

return mrc.reconcile(hw)
}

func (mrc *machineReconcileContext) reconcile(hw *tinkv1.Hardware) error {
if !isHardwareReady(hw) {
wf, err := mrc.ensureTemplateAndWorkflow(hw)

if ensureJobErr := mrc.ensureHardwareProvisionJob(hw); ensureJobErr != nil {
return fmt.Errorf("failed to ensure hardware ready for provisioning: %w", ensureJobErr)
}

switch {
case errors.Is(err, &errRequeueRequested{}):
return nil
@@ -150,6 +159,7 @@ }
}

s := wf.GetCurrentActionState()

if s == tinkv1.WorkflowStateFailed || s == tinkv1.WorkflowStateTimeout {
return errWorkflowFailed
}
@@ -163,6 +173,8 @@ }
}
}

mrc.log.Info("Marking TinkerbellMachine as Ready")

mrc.tinkerbellMachine.Status.Ready = true

return nil
@@ -527,6 +539,104 @@ func byHardwareAffinity(hardware []tinkv1.Hardware, preferred []infrastructurev1
}, nil
}

// ensureHardwareProvisionJob ensures the hardware is ready to be provisioned.
// Uses the BMCRef from the hardware to create a BMCJob.
// The BMCJob is responsible for getting the machine to desired state for provisioning.
// If a BMCJob is already found and has failed, we error.
func (mrc *machineReconcileContext) ensureHardwareProvisionJob(hardware *tinkv1.Hardware) error {
Member:
Curious why the use of HardwareProvisionJob instead of BMCJob? This requires a translation that feels like it might be unnecessary.

pokearu (Contributor, Author):
I intended it to be like "Job that gets hardware ready for provisioning", since it does multiple actions.
I tried to explain it a bit in the function comment.

Member:
The challenge I see is that a HardwareProvisionJob is just a pointer to a BMCJob. The definition you give in the comments points us to a BMCJob. So to understand the result or behavior of this function we need to go to BMCJob regardless. I know naming can be very subjective and difficult, but I'm having trouble seeing the value in creating this new construct of a HardwareProvisionJob.

if hardware.Spec.BMCRef == nil {
mrc.log.Info("Hardware BMC reference not present; skipping BMCJob creation",
"BMCRef", hardware.Spec.BMCRef, "Hardware", hardware.Name)

return nil
}

bmcJob := &rufiov1.BMCJob{}
jobName := fmt.Sprintf("%s-provision", mrc.tinkerbellMachine.Name)

err := mrc.getBMCJob(jobName, bmcJob)
if err != nil {
if apierrors.IsNotFound(err) {
// Create a BMCJob for hardware provisioning
return mrc.createHardwareProvisionJob(hardware, jobName)
}

return err
}

if bmcJob.HasCondition(rufiov1.JobFailed, rufiov1.ConditionTrue) {
return fmt.Errorf("bmc job %s/%s failed", bmcJob.Namespace, bmcJob.Name) // nolint:goerr113
}

return nil
}

// getBMCJob fetches the BMCJob with name jName.
func (mrc *machineReconcileContext) getBMCJob(jName string, bmj *rufiov1.BMCJob) error {
namespacedName := types.NamespacedName{
Name: jName,
Namespace: mrc.tinkerbellMachine.Namespace,
}

if err := mrc.client.Get(mrc.ctx, namespacedName, bmj); err != nil {
return fmt.Errorf("GET BMCJob: %w", err)
}

return nil
}

// createHardwareProvisionJob creates a BMCJob object with the required tasks for hardware provisioning.
func (mrc *machineReconcileContext) createHardwareProvisionJob(hardware *tinkv1.Hardware, name string) error {
Member:
This file is already very large. This functionality feels like a good opportunity to create a new file in this package just for bmc/rufio functionality.

pokearu (Contributor, Author):
I was initially going to do that. But I thought we could benefit from a wider refactoring where we move and abstract some of the logic for creating Templates, Workflows, and Jobs, and even address reducing the duplication of some of these methods.

Is it alright to work on it in follow-up PRs so we can move forward with the functionality? :)

Member:
Sure thing, just wanted to be sure the trade-off of growing the tech debt and the needed refactor work was brought up.

powerOffAction := rufiov1.HardPowerOff
powerOnAction := rufiov1.PowerOn
bmcJob := &rufiov1.BMCJob{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: mrc.tinkerbellMachine.Namespace,
OwnerReferences: []metav1.OwnerReference{
{
APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1",
Kind: "TinkerbellMachine",
Name: mrc.tinkerbellMachine.Name,
UID: mrc.tinkerbellMachine.ObjectMeta.UID,
},
},
},
Spec: rufiov1.BMCJobSpec{
BaseboardManagementRef: rufiov1.BaseboardManagementRef{
Name: hardware.Spec.BMCRef.Name,
Namespace: mrc.tinkerbellMachine.Namespace,
},
Tasks: []rufiov1.Task{
{
PowerAction: &powerOffAction,
},
{
OneTimeBootDeviceAction: &rufiov1.OneTimeBootDeviceAction{
Devices: []rufiov1.BootDevice{
rufiov1.PXE,
},
EFIBoot: hardware.Spec.Interfaces[0].DHCP.UEFI,
},
},
{
PowerAction: &powerOnAction,
},
},
},
}

if err := mrc.client.Create(mrc.ctx, bmcJob); err != nil {
return fmt.Errorf("creating BMCJob: %w", err)
}

mrc.log.Info("Created BMCJob to get hardware ready for provisioning",
"Name", bmcJob.Name,
"Namespace", bmcJob.Namespace)

return nil
}

func (mrc *machineReconcileContext) getWorkflow() (*tinkv1.Workflow, error) {
namespacedName := types.NamespacedName{
Name: mrc.tinkerbellMachine.Name,
8 changes: 8 additions & 0 deletions controllers/tinkerbellmachine_controller.go
@@ -32,6 +32,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/source"

rufiov1 "github.com/tinkerbell/rufio/api/v1alpha1"
tinkv1 "github.com/tinkerbell/tink/pkg/apis/core/v1alpha1"

infrastructurev1 "github.com/tinkerbell/cluster-api-provider-tinkerbell/api/v1beta1"
@@ -50,6 +51,7 @@ type TinkerbellMachineReconciler struct {
// +kubebuilder:rbac:groups=tinkerbell.org,resources=hardware;hardware/status,verbs=get;list;watch;update;patch
// +kubebuilder:rbac:groups=tinkerbell.org,resources=templates;templates/status,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=tinkerbell.org,resources=workflows;workflows/status,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=bmc.tinkerbell.org,resources=bmcjobs,verbs=get;list;watch;create

// Reconcile ensures that all Tinkerbell machines are aligned with a given spec.
func (tmr *TinkerbellMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
@@ -116,6 +118,12 @@ func (tmr *TinkerbellMachineReconciler) SetupWithManager(
).
Watches(
&source.Kind{Type: &tinkv1.Workflow{}},
&handler.EnqueueRequestForOwner{
OwnerType: &infrastructurev1.TinkerbellMachine{},
IsController: true,
}).
Watches(
&source.Kind{Type: &rufiov1.BMCJob{}},
&handler.EnqueueRequestForOwner{
OwnerType: &infrastructurev1.TinkerbellMachine{},
IsController: true,